### Importing Libraries

In [9]:
# Import necessary libraries
import pickle
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, countDistinct, mean, stddev, corr
# NOTE: the sklearn import below rebinds RandomForestClassifier, shadowing the
# pyspark.ml class of the same name imported above.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Start (or reuse) the Spark session shared by the cells of this notebook
spark = (
    SparkSession.builder
    .appName("Student Performance Analysis")
    .getOrCreate()
)


### **Machine Learning Analysis**

#### EDA

###### PySpark Setup and Dataset Loading

In [None]:
# Read the CSV into a Spark DataFrame: first line is the header, column types are inferred
df = spark.read.csv(
    r'C:\Users\iamim\OneDrive\Desktop\Code\dataset.csv',
    header=True,
    inferSchema=True,
)

# Bring only the first 10 rows to the driver as a pandas DataFrame for display
pandas_df = df.limit(10).toPandas()

# Purple header row for the preview table
preview_header_style = [
    {'selector': 'thead th', 'props': [('background-color', '#6a0dad'), ('color', 'white')]}
]

# Per-column number formats.
# NOTE(review): none of these keys appear in the column lists used later in
# this notebook — confirm they match the dataset's actual columns.
preview_formats = {
    'time_spent_hours': '{:.2f}',
    'assignments_completed': '{:.0f}',
    'quiz_scores': '{:.2f}',
    'previous_gpa': '{:.2f}',
    'attendance_rate': '{:.2f}',
}

styled_table = (
    pandas_df.style
    .background_gradient(cmap="Purples_r")
    .set_table_styles(preview_header_style)
    .format(preview_formats)
)

# Final expression of the cell: rendered as a table by Jupyter
styled_table


###### Get Data Frame Shape

In [None]:
# Shape of the DataFrame: the row count comes from Spark, the column count from the schema
row_count = df.count()
column_count = len(df.columns)

# (A stray "code.ipynb" token that followed this call made the cell a syntax error.)
print(f"Shape of the DataFrame: Rows = {row_count}, Columns = {column_count}")


###### Summary Information

In [None]:
# Print the schema tree of the DataFrame to stdout
df.printSchema()

# Step 1: Data types of each column
# Collect (column name, Spark data type) pairs from the schema
schema_data = [(field.name, field.dataType) for field in df.schema.fields]

# Convert schema information to a Pandas DataFrame for better formatting
schema_df = pd.DataFrame(schema_data, columns=['Column Name', 'Data Type'])

# Style the schema table: dark-blue header, centered cells, caption
schema_styled = schema_df.style.background_gradient(cmap="Blues") \
                                .set_table_styles([{'selector': 'thead th', 'props': [('background-color', '#003366'), ('color', 'white')]}]) \
                                .set_properties(**{'text-align': 'center'}) \
                                .set_caption("Data Types of Each Column")

# Jupyter only auto-renders the LAST expression of a cell, so this mid-cell
# table has to be displayed explicitly or it is silently dropped.
display(schema_styled)

# Step 2: Number of distinct values in each column (a cardinality check; this
# does not measure missing values)
distinct_values = df.select([countDistinct(col(c)).alias(c) for c in df.columns]).collect()

# Convert the single result row to a Pandas DataFrame for better formatting
distinct_values_df = pd.DataFrame(distinct_values[0].asDict().items(), columns=['Column Name', 'Distinct Count'])

# Style the distinct-count table: purple header, centered cells, caption
distinct_values_styled = distinct_values_df.style.background_gradient(cmap="Purples_r") \
                                            .set_table_styles([{'selector': 'thead th', 'props': [('background-color', '#6a0dad'), ('color', 'white')]}]) \
                                            .set_properties(**{'text-align': 'center'}) \
                                            .set_caption("Distinct Values Count for Each Column")

# Final expression of the cell: rendered by Jupyter
distinct_values_styled


###### Descriptive Analysis

In [None]:
# Step 1: Ask Spark for its summary-statistics table of the DataFrame
describe_df = df.describe()

# Step 2: Bring the summary table to the driver as a pandas DataFrame
describe_pandas_df = describe_df.toPandas()

# Step 3: Build the styled table — blue gradient, dark header, centered cells, caption
summary_header_style = [
    {'selector': 'thead th',
     'props': [('background-color', '#003366'), ('color', 'white')]}
]
styled_describe = (
    describe_pandas_df.style
    .background_gradient(cmap="Blues")
    .set_table_styles(summary_header_style)
    .set_properties(**{'text-align': 'center'})
    .set_caption("Statistical Summary of Numerical Columns")
)

# Step 4: Final expression of the cell, rendered by Jupyter
styled_describe


###### Numerical Columns

In [None]:
from pyspark.sql.types import NumericType

# Names of every column whose Spark data type is a NumericType subclass.
# (Computed once; the list used to be rebuilt a second time with identical code.)
numeric_columns = [
    field.name
    for field in df.schema.fields
    if isinstance(field.dataType, NumericType)
]

# How many numeric columns there are
num_numerical_columns = len(numeric_columns)

# Report the count, then the names
print(f"Number of numerical columns: {num_numerical_columns}")
print("Numerical columns:", numeric_columns)


###### Correlation Matrix

In [None]:
# Columns included in the correlation matrix
numeric_cols = ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']

# Map each column to its list of correlations against every column (in
# numeric_cols order). The diagonal is fixed at 1.0 without calling Spark;
# every other entry is one df.corr() call.
correlation_data = {
    first: [
        1.0 if first == second else df.corr(first, second)
        for second in numeric_cols
    ]
    for first in numeric_cols
}

# Tabulate the result with the column names on both axes
correlation_df = pd.DataFrame(correlation_data, index=numeric_cols)

# Diverging color map, dark header, centered cells, caption
corr_header_style = [
    {'selector': 'thead th',
     'props': [('background-color', '#003366'), ('color', 'white')]}
]
styled_corr = (
    correlation_df.style
    .background_gradient(cmap="coolwarm")
    .set_table_styles(corr_header_style)
    .set_properties(**{'text-align': 'center'})
    .set_caption("Correlation Matrix of Numeric Columns")
)

# Final expression of the cell: rendered by Jupyter
styled_corr


###### Correlation Heatmap

In [None]:
# Materialize the whole Spark DataFrame as pandas so seaborn can plot it
pandas_df = df.toPandas()

# Pairwise correlations of the numeric columns, computed by pandas
correlation_matrix = pandas_df[numeric_cols].corr()

# Large square canvas so the annotated cells stay legible
plt.figure(figsize=(10, 10))

# Heatmap appearance: two-decimal annotations, thin grey grid lines, a slightly
# shrunken color bar, square cells, and a color scale pinned to [-1, 1]
heatmap_options = dict(
    annot=True,
    fmt=".2f",
    cmap='RdYlGn',
    linewidths=0.5,
    linecolor='grey',
    cbar_kws={"shrink": .8},
    square=True,
    vmin=-1,
    vmax=1,
)
sns.heatmap(correlation_matrix, **heatmap_options)

# Title and tick styling; tight_layout keeps the rotated labels from being clipped
plt.title("Correlation Heatmap", fontsize=18, fontweight='bold', pad=20)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(rotation=0, fontsize=12)
plt.tight_layout()

# Render the figure
plt.show()


###### Categorical Columns 

In [None]:
from pyspark.sql.types import StringType, BooleanType

# Spark types that this notebook treats as categorical
categorical_types = (StringType, BooleanType)

# Names of every column whose data type is one of those types
categorical_columns = [
    field.name
    for field in df.schema.fields
    if isinstance(field.dataType, categorical_types)
]

# Report the names, then how many there are
print("Categorical columns:", categorical_columns)
print("Number of categorical columns:", len(categorical_columns))


###### Frequency Distribution of Categorical Columns

In [None]:

# Categorical columns to tabulate
categorical_cols = ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']

# Indigo header row shared by every frequency table
freq_header_style = [
    {'selector': 'thead th',
     'props': [('background-color', '#4B0082'), ('color', 'white')]}
]

# Styled table for each column, keyed by column name
styled_freq_dfs = {}

for col_name in categorical_cols:
    # Occurrences of each value, most frequent first, as a pandas DataFrame
    value_counts = df.groupBy(col_name).count()
    freq_df = value_counts.orderBy('count', ascending=False).toPandas()

    # Gradient, header styling, centered cells, per-column caption
    styled_freq = (
        freq_df.style
        .background_gradient(cmap="RdYlGn")
        .set_table_styles(freq_header_style)
        .set_properties(**{'text-align': 'center'})
        .set_caption(f"Frequency Counts for {col_name}")
    )

    # Keep the styled table for later use
    styled_freq_dfs[col_name] = styled_freq

    # Inside a loop nothing is auto-rendered, so print a heading and display explicitly
    print(f"Frequency Counts for {col_name}:")
    display(styled_freq)


###### Missing Values Analysis

In [None]:
# Total number of rows: the denominator for the percentages below
row_count = df.count()

# Guard the denominator so an empty DataFrame reports 0% instead of dividing by zero
total_rows = max(row_count, 1)

# Percentage of missing (null) cells per column, in a single aggregation pass.
# F.when(cond, 1) without otherwise() is null where cond is false, and F.count
# skips nulls, so count(when(isNull, 1)) is the number of null cells.
# (The previous expression returned 100 only when a column had no non-null
# values at all and 0 otherwise, and never used row_count.)
missing_percentage = df.select([
    (F.count(F.when(F.col(c).isNull(), 1)) / total_rows * 100).alias(f'{c}_missing_percentage')
    for c in df.columns
])

# The aggregation produces exactly one row; turn it into a dict
missing_percentage_values = missing_percentage.collect()[0].asDict()

# Convert to Pandas DataFrame for better formatting
missing_percentage_df = pd.DataFrame(list(missing_percentage_values.items()), columns=['Column Name', 'Missing Percentage'])

# Style the missing value percentage table
styled_missing = missing_percentage_df.style.background_gradient(cmap="RdYlGn") \
                                        .set_table_styles([{'selector': 'thead th', 
                                                            'props': [('background-color', '#b22222'), 
                                                                      ('color', 'white')]}]) \
                                        .set_properties(**{'text-align': 'center'}) \
                                        .set_caption("Percentage of Missing Values for Each Column")

# Final expression of the cell: rendered by Jupyter
styled_missing


###### Variance and Standard Deviation of Numerical Values

In [None]:
# Step 1: One aggregate expression per numeric column for the standard deviation
stddev_exprs = [F.stddev(col(name)).alias(f'{name}_stddev') for name in numeric_cols]
stddev_df = df.select(stddev_exprs)

# Step 2: The same for the variance
variance_exprs = [F.variance(col(name)).alias(f'{name}_variance') for name in numeric_cols]
variance_df = df.select(variance_exprs)

# Step 3: Each aggregation yields a single row; convert it to a dict, then to a
# two-column pandas table (label, value)
stddev_values = stddev_df.collect()[0].asDict()
variance_values = variance_df.collect()[0].asDict()

stddev_pandas_df = pd.DataFrame(list(stddev_values.items()), columns=['Column Name', 'Standard Deviation'])
variance_pandas_df = pd.DataFrame(list(variance_values.items()), columns=['Column Name', 'Variance'])

# Step 4: Style both tables (reversed gradient + indigo header for the standard
# deviation, plain gradient + green header for the variance)
stddev_header_style = [
    {'selector': 'thead th',
     'props': [('background-color', '#4B0082'), ('color', 'white')]}
]
styled_stddev = (
    stddev_pandas_df.style
    .background_gradient(cmap="RdYlGn_r")
    .set_table_styles(stddev_header_style)
    .set_properties(**{'text-align': 'center'})
    .set_caption("Standard Deviation for Numeric Columns")
)

variance_header_style = [
    {'selector': 'thead th',
     'props': [('background-color', '#228B22'), ('color', 'white')]}
]
styled_variance = (
    variance_pandas_df.style
    .background_gradient(cmap="RdYlGn")
    .set_table_styles(variance_header_style)
    .set_properties(**{'text-align': 'center'})
    .set_caption("Variance for Numeric Columns")
)

# Step 5: Two tables in one cell, so both are displayed explicitly
display(styled_stddev)
display(styled_variance)


###### Skewness and Kurtosis

In [None]:
# Step 1: One skewness aggregate per numeric column
skewness_exprs = [F.skewness(F.col(name)).alias(f'{name}_skewness') for name in numeric_cols]
skewness_df = df.select(skewness_exprs)

# Step 2: One kurtosis aggregate per numeric column
kurtosis_exprs = [F.kurtosis(F.col(name)).alias(f'{name}_kurtosis') for name in numeric_cols]
kurtosis_df = df.select(kurtosis_exprs)

# Step 3: Each aggregation yields a single row; convert it to a dict, then to a
# two-column pandas table (label, value)
skewness_values = skewness_df.collect()[0].asDict()
kurtosis_values = kurtosis_df.collect()[0].asDict()

skewness_pandas_df = pd.DataFrame(list(skewness_values.items()), columns=['Column Name', 'Skewness'])
kurtosis_pandas_df = pd.DataFrame(list(kurtosis_values.items()), columns=['Column Name', 'Kurtosis'])

# Step 4: Style both tables (blue theme for skewness, orange theme for kurtosis)
skewness_header_style = [
    {'selector': 'thead th',
     'props': [('background-color', '#1E90FF'), ('color', 'white')]}
]
styled_skewness = (
    skewness_pandas_df.style
    .background_gradient(cmap="Blues")
    .set_table_styles(skewness_header_style)
    .set_properties(**{'text-align': 'center'})
    .set_caption("Skewness for Numeric Columns")
)

kurtosis_header_style = [
    {'selector': 'thead th',
     'props': [('background-color', '#FF4500'), ('color', 'white')]}
]
styled_kurtosis = (
    kurtosis_pandas_df.style
    .background_gradient(cmap="Oranges")
    .set_table_styles(kurtosis_header_style)
    .set_properties(**{'text-align': 'center'})
    .set_caption("Kurtosis for Numeric Columns")
)

# Step 5: Two tables in one cell, so both are displayed explicitly
display(styled_skewness)
display(styled_kurtosis)


###### Outliers Detection using Z-Score Analysis

In [None]:


# List of numeric columns
numeric_cols = ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']

# Mean and standard deviation of every numeric column in ONE aggregation,
# instead of launching a separate Spark job per column.
stats = df.select(
    *[F.mean(F.col(name)).alias(f'{name}_mean') for name in numeric_cols],
    *[F.stddev(F.col(name)).alias(f'{name}_std') for name in numeric_cols],
).first()

# Append a '<column>_zscore' column for every numeric column in a single select
# (same resulting columns as chaining withColumn once per column).
zscore_df = df.select(
    '*',
    *[
        ((F.col(name) - stats[f'{name}_mean']) / stats[f'{name}_std']).alias(f'{name}_zscore')
        for name in numeric_cols
    ],
)

# Keep the raw values and their z-scores, and bring them to pandas for display
pandas_zscore_df = zscore_df.select(*numeric_cols, *[f'{name}_zscore' for name in numeric_cols]).toPandas()

# Set display options for better formatting
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.expand_frame_repr', False)  # Don't wrap to next line

# Heading for the table rendered below
print("Top 20 rows of the Z-Scores DataFrame:")

# Style the first 20 rows
styled_zscore = pandas_zscore_df.head(20).style.background_gradient(cmap="viridis") \
                                        .set_table_styles([{'selector': 'thead th', 
                                                            'props': [('background-color', '#4B0082'), 
                                                                      ('color', 'white')]}]) \
                                        .set_properties(**{'text-align': 'center'}) \
                                        .set_caption("Z-Scores for Numeric Columns (Top 20 Rows)")

# Final expression of the cell: rendered by Jupyter
styled_zscore


###### Quantile Analysis

In [None]:
# Probabilities to evaluate (min, quartiles, max) and the labels shown for them
quantile_probabilities = [0.0, 0.25, 0.5, 0.75, 1.0]
quantile_labels = ["0%", "25%", "50%", "75%", "100%"]

# Approximate quantiles for every numeric column, with a relative error of 0.05
quantiles = df.approxQuantile(numeric_cols, quantile_probabilities, 0.05)

# One row per column, one column per quantile
quantiles_df = pd.DataFrame(quantiles, columns=quantile_labels, index=numeric_cols)

# Gradient, indigo header, centered cells, caption
quantile_header_style = [
    {'selector': 'thead th',
     'props': [('background-color', '#4B0082'), ('color', 'white')]}
]
styled_quantiles = (
    quantiles_df.style
    .background_gradient(cmap="viridis")
    .set_table_styles(quantile_header_style)
    .set_properties(**{'text-align': 'center'})
    .set_caption("Quantiles for Numeric Columns")
)

# Final expression of the cell: rendered by Jupyter
styled_quantiles


###### Covariance

In [None]:

# Covariance of every ordered pair of distinct numeric columns, stored as
# [column 1, column 2, covariance] rows.
# NOTE(review): this issues one df.stat.cov() call per ordered pair although
# only 3 to 5 rows are shown below — consider sampling the pairs first if
# nothing else needs the full list.
covariance_results = [
    [first, second, df.stat.cov(first, second)]
    for first in numeric_cols
    for second in numeric_cols
    if first != second
]

# Pick 3 to 5 of those rows at random (never more than are available)
random_sample = random.sample(covariance_results, min(len(covariance_results), random.randint(3, 5)))

# Tabulate the sampled rows in pandas so they can be styled
covariance_df = pd.DataFrame(random_sample, columns=["Column 1", "Column 2", "Covariance"])

# Gradient, indigo header, centered cells, caption
covariance_header_style = [
    {'selector': 'thead th',
     'props': [('background-color', '#4B0082'), ('color', 'white')]}
]
styled_covariance = (
    covariance_df.style
    .background_gradient(cmap="viridis")
    .set_table_styles(covariance_header_style)
    .set_properties(**{'text-align': 'center'})
    .set_caption("Sampled Covariance Between Numeric Columns")
)

# Final expression of the cell: rendered by Jupyter
styled_covariance


###### Ratio of Variance (F-test for Feature Importance)

In [None]:

# Numeric columns to analyze
numeric_cols = ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 
                'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']

# [column name, variance] rows; each variance is its own single-value aggregation
variance_results = [
    [name, df.select(F.variance(F.col(name))).collect()[0][0]]
    for name in numeric_cols
]

# Tabulate the results in pandas for display
variance_df = pd.DataFrame(variance_results, columns=["Column Name", "Variance"])

# Gradient, indigo header, centered cells, caption
variance_table_header_style = [
    {'selector': 'thead th',
     'props': [('background-color', '#4B0082'), ('color', 'white')]}
]
styled_variance = (
    variance_df.style
    .background_gradient(cmap="viridis")
    .set_table_styles(variance_table_header_style)
    .set_properties(**{'text-align': 'center'})
    .set_caption("Variance of Numeric Columns")
)

# Final expression of the cell: rendered by Jupyter
styled_variance


---------

### Data Visualization

###### Plot distribution for each numeric column

In [None]:
# Materialize the Spark DataFrame as pandas; plot_distribution reads this global
pandas_df = df.toPandas()

# Global seaborn look: white grid, slightly larger fonts, Set2 palette
sns.set(style="whitegrid", font_scale=1.2)
sns.set_palette("Set2")


def plot_distribution(col_name):
    """Show a density histogram with a KDE curve for one column of `pandas_df`.

    A dashed red line marks the mean and a solid green line marks the median;
    both values appear in the legend. The figure is displayed immediately.

    Args:
        col_name: Name of the column in the global `pandas_df` to plot.
    """
    series = pandas_df[col_name]
    mean_value = series.mean()
    median_value = series.median()

    plt.figure(figsize=(12, 6))
    sns.histplot(series, kde=True, color='dodgerblue', bins=30, stat='density', alpha=0.7)

    # Reference lines for the two measures of central tendency
    plt.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.2f}')
    plt.axvline(median_value, color='green', linestyle='-', label=f'Median: {median_value:.2f}')

    # Title, axis labels, ticks, legend and grid
    plt.title(f'Distribution of {col_name}', fontsize=20, fontweight='bold', color='navy')
    plt.xlabel(col_name, fontsize=14, labelpad=10, color='navy')
    plt.ylabel('Density', fontsize=14, labelpad=10, color='navy')
    plt.xticks(fontsize=12, rotation=45, color='darkblue')
    plt.yticks(fontsize=12, color='darkblue')
    plt.legend(fontsize=12, loc='upper right', frameon=True, shadow=True)
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()

    # Hide the top/right spines and grey out the remaining two
    axis = plt.gca()
    for side in ('top', 'right'):
        axis.spines[side].set_visible(False)
    for side in ('left', 'bottom'):
        axis.spines[side].set_color('grey')

    plt.show()


# One figure per numeric column
for col_name in numeric_cols:
    plot_distribution(col_name)


###### Box plot for Numeric columns

In [None]:


# Materialize the Spark DataFrame as pandas; plot_boxplot reads this global
pandas_df = df.toPandas()

# Global seaborn look: white grid, slightly larger fonts, Set2 palette
sns.set(style="whitegrid", font_scale=1.2)
sns.set_palette("Set2")


def plot_boxplot(col_name):
    """Show a horizontal box plot for one column of `pandas_df` with its median highlighted.

    The median is drawn as a dashed orange line, listed in the legend, and
    pointed at by an annotation arrow. The figure is displayed immediately.

    Args:
        col_name: Name of the column in the global `pandas_df` to plot.
    """
    series = pandas_df[col_name]
    median_value = series.median()
    median_label = f'Median: {median_value:.2f}'

    plt.figure(figsize=(10, 6))
    sns.boxplot(x=series, color='#6A5ACD', width=0.4, fliersize=5, linewidth=1.5)

    # Title, axis label, ticks and grid
    plt.title(f'Box Plot of {col_name}', fontsize=20, fontweight='bold', color='#333')
    plt.xlabel(col_name, fontsize=16, color='#555')
    plt.xticks(fontsize=14, color='#333')
    plt.grid(True, linestyle='--', linewidth=0.6, alpha=0.7)

    # Median reference line (appears in the legend)
    plt.axvline(x=median_value, color='orange', linestyle='--', label=median_label)

    # Arrow annotation pointing at the median, with the text offset from it
    plt.annotate(
        median_label,
        xy=(median_value, 0.05),
        xytext=(median_value + 0.5, 0.2),
        arrowprops=dict(facecolor='orange', shrink=0.05),
        fontsize=12,
        color='darkorange',
    )

    plt.legend(fontsize=12, loc='best', frameon=True, shadow=True)

    # Drop the top and right spines for a cleaner frame
    axis = plt.gca()
    for side in ('top', 'right'):
        axis.spines[side].set_visible(False)

    plt.tight_layout()
    plt.show()


# One figure per numeric column
for col_name in numeric_cols:
    plot_boxplot(col_name)



###### Pair plot

In [None]:
# Pair plot of the numeric columns: scatter plots below the diagonal,
# correlation values above it, and KDE curves on the diagonal.

# Convert only the numeric columns of the Spark DataFrame to pandas
pandas_df = df.select(numeric_cols).toPandas()

# Set the aesthetic style of the plots
sns.set(style="whitegrid", font_scale=1.2)

# Build an EMPTY grid and fill each region exactly once. sns.pairplot() draws
# every panel itself, so the map_* calls below used to paint on top of already
# drawn plots (e.g. correlation text over a scatter plot). The former
# palette="coolwarm" argument is dropped: no hue is given, so it had no effect.
pair_plot = sns.PairGrid(pandas_df[numeric_cols], height=3)

# Figure-level title, placed slightly above the grid
pair_plot.fig.suptitle("Enhanced Pair Plot of Numeric Columns", fontsize=22, fontweight='bold', y=1.03)

# Lower triangle: scatter plots with outlined points
pair_plot.map_lower(sns.scatterplot, color='teal', alpha=0.7, edgecolor="black")


def annotate_correlation(x, y, **kwargs):
    """Write the correlation between `x` and `y` in the center of the current axes.

    Args:
        x: pandas Series plotted on the panel's x-axis.
        y: pandas Series plotted on the panel's y-axis.
        **kwargs: Extra keyword arguments passed by the grid; ignored.
    """
    corr_value = x.corr(y)
    ax = plt.gca()
    ax.annotate(f'Corr: {corr_value:.2f}', xy=(0.5, 0.5), xycoords=ax.transAxes,
                ha='center', va='center', fontsize=12, color='darkred')


# Upper triangle: correlation values only
pair_plot.map_upper(annotate_correlation)

# Diagonal: filled KDE curves. `fill=True` replaces the deprecated `shade=True`
# and matches the other kdeplot calls in this notebook.
pair_plot.map_diag(sns.kdeplot, color='darkblue', lw=2, fill=True)

# Leave room for the title and keep the panels evenly spaced
plt.subplots_adjust(top=0.92, wspace=0.2, hspace=0.2)

# Show the final plot
plt.show()


###### Columns Names

In [None]:
# Show the column labels of the pandas DataFrame currently bound to `pandas_df`.
# Several earlier cells rebind this name, so the output depends on which of
# them ran last.
print(pandas_df.columns)


###### Density Heatmap

In [None]:

# Numeric columns pulled from the Spark DataFrame for the density plots
numeric_cols = ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 
                'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 
                'absences', 'G1', 'G2', 'G3']

# Select those columns in Spark, then convert to pandas for seaborn
heatmap_df = df.select(numeric_cols)
pandas_heatmap_df = heatmap_df.toPandas()

# 3 x 2 grid of panels
fig, axes = plt.subplots(3, 2, figsize=(16, 18))

# One entry per panel:
# (grid position, x column, y column, color map, title, x label, y label)
density_panels = [
    ((0, 0), 'age', 'G1', 'Blues',
     'Density Heatmap: Age vs G1 Score', 'Age', 'G1 Score'),
    ((0, 1), 'Medu', 'G2', 'Greens',
     "Density Heatmap: Mother's Education vs G2 Score", "Mother's Education (Medu)", 'G2 Score'),
    ((1, 0), 'studytime', 'G3', 'Reds',
     'Density Heatmap: Study Time vs G3 Score', 'Study Time', 'G3 Score'),
    ((1, 1), 'failures', 'G1', 'Purples',
     'Density Heatmap: Failures vs G1 Score', 'Failures', 'G1 Score'),
    ((2, 0), 'absences', 'G2', 'Oranges',
     'Density Heatmap: Absences vs G2 Score', 'Absences', 'G2 Score'),
    ((2, 1), 'freetime', 'G3', 'Blues',
     'Density Heatmap: Free Time vs G3 Score', 'Free Time', 'G3 Score'),
]

# Draw a filled 2-D KDE on each panel and label it
for (grid_row, grid_col), x_col, y_col, cmap_name, title, x_label, y_label in density_panels:
    panel = axes[grid_row, grid_col]
    sns.kdeplot(data=pandas_heatmap_df, x=x_col, y=y_col, cmap=cmap_name, fill=True, ax=panel)
    panel.set_title(title, fontsize=16)
    panel.set_xlabel(x_label, fontsize=14)
    panel.set_ylabel(y_label, fontsize=14)

# Prevent neighbouring panels from overlapping, then render
plt.tight_layout()
plt.show()


###### Student's Sex


In [None]:
# Expose the DataFrame to Spark SQL as the temporary view "students"
df.createOrReplaceTempView("students")

# Count female students
female_count = spark.sql("SELECT COUNT(*) AS female_count FROM students WHERE sex = 'F'")
print('Number of female students:', female_count.collect()[0]['female_count'])

# Count male students
male_count = spark.sql("SELECT COUNT(*) AS male_count FROM students WHERE sex = 'M'")
print('Number of male students:', male_count.collect()[0]['male_count'])

# Per-sex totals, aggregated in Spark and brought to pandas (one row per sex)
gender_count = spark.sql("SELECT sex, COUNT(*) AS count FROM students GROUP BY sex")
gender_count_pd = gender_count.toPandas()

# Set the style
sns.set_style('whitegrid')

# The data is ALREADY aggregated, so plot the stored totals with barplot.
# countplot would count the rows of gender_count_pd (one per sex) and draw
# every bar with height 1.
plt.figure(figsize=(8, 6))
sns.barplot(x='sex', y='count', data=gender_count_pd, palette='plasma')

# Add title and labels
plt.title('Count of Male and Female Students', fontsize=16)
plt.xlabel('Sex', fontsize=14)
plt.ylabel('Count', fontsize=14)

# Show the plot
plt.show()


Observation: The gender distribution is pretty even.


###### Age of Students


In [None]:
# Bring just the 'age' column to the driver as a pandas DataFrame
age_data = df.select("age").toPandas()

# White-grid seaborn theme
sns.set_style('whitegrid')

# Filled kernel-density estimate of the ages; kdeplot returns the Axes it drew on
plt.figure(figsize=(8, 6))
b = sns.kdeplot(age_data['age'], fill=True)

# Title and axis labels
b.set_title('Ages of Students', fontsize=16)
b.set_xlabel('Age', fontsize=14)
b.set_ylabel('Density', fontsize=14)

# Render the figure
plt.show()


Observation: Student ages mostly range from 15 to 19, and the gender distribution is fairly even within each age group.


###### Students from Urban & Rural Areas


In [None]:
# Register the DataFrame as the SQL view "students" (replacing any earlier registration)
df.createOrReplaceTempView("students")

# Count Urban students: rows whose address is 'U'
urban_count = spark.sql("SELECT COUNT(*) AS urban_count FROM students WHERE address = 'U'")
u_stud = urban_count.first()['urban_count']
print('Number of Urban students:', u_stud)

# Count Rural students: rows whose address is 'R'
rural_count = spark.sql("SELECT COUNT(*) AS rural_count FROM students WHERE address = 'R'")
r_stud = rural_count.first()['rural_count']
print('Number of Rural students:', r_stud)


In [None]:

# Bar chart of the number of students per address type.
# (The existing `spark` session is reused. Calling getOrCreate() again only
# returned that same session, so the extra builder call was removed.)

# Group by 'address' and count occurrences
address_counts = df.groupBy('address').count()

# Convert to Pandas DataFrame for plotting
address_counts_pd = address_counts.toPandas()

# 'magma' colormap resampled to one color per bar. plt.get_cmap is used because
# the `cm` module was never imported in this notebook (NameError).
colors = plt.get_cmap('magma', len(address_counts_pd))

# Plotting
plt.figure(figsize=(8, 6))
plt.bar(address_counts_pd['address'], address_counts_pd['count'], color=colors(range(len(address_counts_pd))))
plt.title('Number of Students by Address')
plt.xlabel('Address')
plt.ylabel('Count')
plt.grid(axis='y', linestyle='--')
plt.show()


Observation: Approximately 77.72% of students come from urban areas and 22.28% from rural areas.


In [None]:


# Grouped bar chart: number of students for every (address, G3) combination.
# (The existing `spark` session is reused. Calling getOrCreate() again only
# returned that same session, so the extra builder call was removed.)

# Group by 'address' and 'G3' and count occurrences
address_g3_counts = df.groupBy('address', 'G3').count()

# Convert to Pandas DataFrame for plotting
address_g3_counts_pd = address_g3_counts.toPandas()

# Set the style of seaborn
sns.set(style="whitegrid")

# Reshape to one row per address and one column per G3 value (sorted), with 0
# where a combination never occurs.
counts_by_address = (
    address_g3_counts_pd
    .pivot(index='address', columns='G3', values='count')
    .fillna(0)
    .astype(int)
    .sort_index(axis=1)
)
g3_values = list(counts_by_address.columns)

# Legend labels in the same 'G3 = <value>' form as before
counts_by_address.columns = [f'G3 = {g3_value}' for g3_value in g3_values]

# One color per G3 value
palette = sns.color_palette("magma", len(g3_values))

# Draw the bars SIDE BY SIDE within each address group. Previously every G3
# series was drawn at the same x position, so bars and their labels hid each other.
plt.figure(figsize=(12, 8))
ax = plt.gca()
counts_by_address.plot(kind='bar', ax=ax, color=palette, alpha=0.7, width=0.9)

# Add titles and labels
plt.title('Number of Students by Address and G3', fontsize=18, weight='bold')
plt.xlabel('Address', fontsize=14)
plt.ylabel('Count of Students', fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=12)  # Rotate x-axis labels for better visibility
plt.legend(title='G3 Values', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Label each non-empty bar with its count, slightly above the bar
for bar in ax.patches:
    bar_height = bar.get_height()
    if bar_height > 0:
        ax.text(x=bar.get_x() + bar.get_width() / 2,
                y=bar_height + 1,
                s=f'{int(bar_height)}',
                ha='center', va='bottom', fontsize=10)

# Show the plot
plt.tight_layout()  # Adjust the layout to prevent clipping
plt.show()


###### Does age affect final grade?

In [None]:


# Obtain the Spark session for this cell
spark = SparkSession.builder.appName("Age vs Final Grade Boxplot").getOrCreate()

# Seaborn works on Pandas data, so materialise the Spark DataFrame locally
df_pd = df.toPandas()

# Whitegrid theme, then a 12x6 canvas for the plot
sns.set(style="whitegrid")
plt.figure(figsize=(12, 6))

# One box of final grades (G3) per age value
b = sns.boxplot(data=df_pd, x='age', y='G3', palette='gist_heat')

# Axis labels first, then the headline
b.set_xlabel('Age', fontsize=14)
b.set_ylabel('Final Grade (G3)', fontsize=14)
b.set_title('Age vs Final Grade', fontsize=18, weight='bold')

# Tighten margins and render
plt.tight_layout()
plt.show()


Observation: 
* Plotting the distribution rather than statistics would help us better understand the data.
* The plot above shows that the median grades of three age groups (15, 16, 17) are similar. Note the skewness of age group 19 (possibly due to sample size). Age group 20 appears to score the highest grades of all.

In [None]:
# Obtain the Spark session for this cell
spark = SparkSession.builder.appName("Age vs Final Grade Swarm Plot").getOrCreate()

# Seaborn works on Pandas data, so materialise the Spark DataFrame locally
df_pd = df.toPandas()

# Whitegrid theme, then a 12x6 canvas for the plot
sns.set(style="whitegrid")
plt.figure(figsize=(12, 6))

# One point per student: grade against age, split (dodged) by sex
b = sns.swarmplot(data=df_pd, x='age', y='G3', hue='sex', dodge=True, palette='PiYG')

# Axis labels first, then the headline
b.set_xlabel('Age', fontsize=14)
b.set_ylabel('Final Grade (G3)', fontsize=14)
b.set_title('Does Age Affect Final Grade?', fontsize=18, weight='bold')

# Tighten margins and render
plt.tight_layout()
plt.show()


###### Do urban students perform better than rural students?

In [None]:


# Obtain the Spark session (getOrCreate returns the active session if one exists)
spark = SparkSession.builder \
    .appName("Grade Distribution by Address") \
    .getOrCreate()

# Pull the final grades (G3) of urban ('U') and rural ('R') students from `df`
# into Pandas; only the single column needed for the plot is transferred.
urban_grades = df.filter(df['address'] == 'U').select('G3').toPandas()
rural_grades = df.filter(df['address'] == 'R').select('G3').toPandas()

# Set the seaborn style
sns.set(style="whitegrid")

# Create the KDE plots.
# `fill=True` replaces the deprecated `shade=True` and matches the age KDE
# plot earlier in the notebook.
plt.figure(figsize=(10, 6))
sns.kdeplot(urban_grades['G3'], label='Urban', fill=True)
sns.kdeplot(rural_grades['G3'], label='Rural', fill=True)

# Set title and labels
plt.title('Do Urban Students Score Higher Than Rural Students?', fontsize=18)
plt.xlabel('Grade', fontsize=14)
plt.ylabel('Density', fontsize=14)
plt.legend()
plt.show()


Observations: The above graph clearly shows there is not much difference between the grades based on location.


###### Student status by alcohol consumption:

In [None]:


# `F` is not among the names imported in the notebook's import cell, so it is
# imported here to keep this cell runnable on its own.
from pyspark.sql import functions as F

# Obtain the Spark session (getOrCreate returns the active session if one exists)
spark = SparkSession.builder \
    .appName("Weekend Alcohol Consumption Impact") \
    .getOrCreate()

# Crosstab in long form: number of students for each (G3, Walc) pair in `df`
alc_tab = df.groupBy('G3', 'Walc').count()

# Total number of students for each 'G3' value (the denominator)
total_counts = alc_tab.groupBy('G3').agg(F.sum('count').alias('total_count'))

# Join the totals back and express each (G3, Walc) count as a percentage of
# its G3 group. Columns are referenced by name: after the join 'count' and
# 'total_count' are unique, which avoids relying on parent-DataFrame column
# handles when one side of the join is derived from the other.
alc_perc = alc_tab.join(total_counts, on='G3') \
    .withColumn('percentage', (F.col('count') / F.col('total_count')) * 100) \
    .select('G3', 'Walc', 'percentage')

# Convert to Pandas DataFrame for plotting
alc_perc_pd = alc_perc.toPandas()

# Plotting: one bar cluster per G3 value, one bar per Walc level
plt.figure(figsize=(14, 6))
alc_perc_pd.pivot(index='G3', columns='Walc', values='percentage').plot(kind='bar', colormap="Dark2_r", ax=plt.gca(), fontsize=16)

plt.title('Student Status by Weekend Alcohol Consumption', fontsize=20)
plt.xlabel('Student Status', fontsize=16)
plt.ylabel('Percentage of Students', fontsize=16)
plt.xticks(rotation=0)  # Keep x-axis labels horizontal
plt.grid(axis='y', linestyle='--')
plt.show()


Observation: Between 40% and 45% of students consume alcohol weekly.

###### Student status by internet accessibility:

In [None]:


# `F` is not among the names imported in the notebook's import cell, so it is
# imported here to keep this cell runnable on its own.
from pyspark.sql import functions as F

# Obtain the Spark session (getOrCreate returns the active session if one exists)
spark = SparkSession.builder \
    .appName("Internet Accessibility Impact") \
    .getOrCreate()

# Crosstab in long form: number of students for each (G3, internet) pair in `df`.
# NOTE: the variable names (alc_tab, alc_perc, ...) are the same ones used by
# the alcohol cell; here they hold the internet-access figures.
alc_tab = df.groupBy('G3', 'internet').count()

# Total number of students for each 'G3' value (the denominator)
total_counts = alc_tab.groupBy('G3').agg(F.sum('count').alias('total_count'))

# Join the totals back and express each (G3, internet) count as a percentage
# of its G3 group. Columns are referenced by name: after the join 'count' and
# 'total_count' are unique, which avoids relying on parent-DataFrame column
# handles when one side of the join is derived from the other.
alc_perc = alc_tab.join(total_counts, on='G3') \
    .withColumn('percentage', (F.col('count') / F.col('total_count')) * 100) \
    .select('G3', 'internet', 'percentage')

# Convert to Pandas DataFrame for plotting
alc_perc_pd = alc_perc.toPandas()

# Plotting: one bar cluster per G3 value, one bar per internet value
plt.figure(figsize=(14, 6))
alc_perc_pd.pivot(index='G3', columns='internet', values='percentage').plot(kind='bar', colormap="Dark2_r", ax=plt.gca(), fontsize=16)

plt.title('Student Status by Internet Accessibility', fontsize=20)
plt.xlabel('Student Status', fontsize=16)
plt.ylabel('Percentage of Students', fontsize=16)
plt.xticks(rotation=0)  # Keep x-axis labels horizontal
plt.grid(axis='y', linestyle='--')
plt.show()


Observation: The majority of students have an internet connection.

###### Going Out with Friends Attribute

In [None]:


# Obtain the Spark session (getOrCreate returns the active session if one exists)
spark = SparkSession.builder \
    .appName("Go Out vs Final Grade") \
    .getOrCreate()

# Number of students for each 'goout' value in `df`.
# GroupedData.count() already yields a column named 'count', so no helper
# from pyspark.sql.functions is needed.
goout_counts = df.groupBy('goout').count()

# Convert to Pandas DataFrame for plotting
goout_counts_pd = goout_counts.toPandas()

# Plotting
plt.figure(figsize=(10, 6))
plt.bar(goout_counts_pd['goout'], goout_counts_pd['count'], color='orangered', alpha=0.7)

# Setting the title and labels. The chart shows student counts per going-out
# frequency only; G3 is not part of it, so the title no longer mentions it.
plt.title('Number of Students by Go Out Frequency', fontsize=20)
plt.xlabel('Go Out Frequency', fontsize=16)
plt.ylabel('Count of Students', fontsize=16)
plt.xticks(rotation=0)  # Keep x-axis labels horizontal
plt.grid(axis='y', linestyle='--')

plt.show()


Observation: Students tend to have an average (mid-scale) score when it comes to going out with friends.



## Machine Learning

Numeric columns in the dataset:

`numeric_cols = ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']`

###### Conversion of numerical columns to feature columns

In [6]:
# Obtain the Spark session used by the modelling cells
spark = (
    SparkSession.builder
    .appName("ML_Model_Implementation")
    .getOrCreate()
)

# Read the CSV with a header row and inferred column types
new = spark.read.csv("dataset.csv", header=True, inferSchema=True)

# All numeric columns; the last three (G1, G2, G3) are the grade columns
numerical_columns = ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures',
                     'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']

# Pack every numeric column except the three grade columns into one vector
assembler = VectorAssembler(outputCol="features", inputCols=numerical_columns[:-3])

# Add the feature vector, then the binary label: 1 when G3 > 10, otherwise 0
new = assembler.transform(new)
new = new.withColumn("label", (col("G3") > 10).cast("int"))

# 80/20 train/test split with a fixed seed
train_data, test_data = new.randomSplit([0.8, 0.2], seed=42)

# Container for per-model results
results = []




###### Logistic Regression 

In [None]:


# 1. Logistic Regression Model: fit on the training split
lr = LogisticRegression(labelCol="label", featuresCol="features")
lr_model = lr.fit(train_data)

# Accuracy evaluator shared by the test and training measurements below
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy", labelCol="label", predictionCol="prediction"
)

# Test-set predictions and accuracy
lr_predictions = lr_model.transform(test_data)
lr_accuracy = evaluator.evaluate(lr_predictions)

# Training-set predictions and accuracy
train_predictions_lr = lr_model.transform(train_data)
train_accuracy_lr = evaluator.evaluate(train_predictions_lr)

# Confusion Matrix and Classification Report for Logistic Regression
def compute_confusion_matrix_and_report(predictions):
    """Build a confusion matrix and classification report from Spark predictions.

    Args:
        predictions: Spark DataFrame containing "prediction" and "label" columns.

    Returns:
        A ``(conf_matrix, class_report)`` tuple: the result of
        ``confusion_matrix(y_true, y_pred)`` and of
        ``classification_report(y_true, y_pred)``.
    """
    # Collect both columns in ONE action. Collecting each column separately
    # runs the prediction pipeline twice and relies on both jobs returning the
    # rows in the same order to keep predictions aligned with their labels.
    rows = predictions.select("prediction", "label").collect()

    # Extract predicted and actual labels
    y_pred = np.array([row["prediction"] for row in rows])
    y_true = np.array([row["label"] for row in rows])

    # Confusion Matrix
    conf_matrix = confusion_matrix(y_true, y_pred)

    # Classification Report
    class_report = classification_report(y_true, y_pred)

    return conf_matrix, class_report

# Summarise the test-set predictions as a confusion matrix plus a text report
conf_matrix_lr, class_report_lr = compute_confusion_matrix_and_report(
    predictions=lr_predictions
)

# Accuracies first, then the detailed breakdown
print(f"Logistic Regression Training Accuracy: {train_accuracy_lr}")
print(f"Logistic Regression Testing Accuracy: {lr_accuracy}")
print(
    "Confusion Matrix for Logistic Regression:\n",
    conf_matrix_lr,
)
print(
    "Classification Report for Logistic Regression:\n",
    class_report_lr,
)

# Heatmap helper for confusion matrices
def plot_confusion_matrix(conf_matrix, model_name):
    """Draw `conf_matrix` as an annotated heatmap titled with `model_name`."""
    class_labels = ['Class 0', 'Class 1']  # tick labels for both axes

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        conf_matrix,
        cmap='Blues',
        annot=True,
        fmt='d',
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix: {model_name}')
    plt.show()

# Show the Logistic Regression confusion matrix as a heatmap
plot_confusion_matrix(conf_matrix=conf_matrix_lr, model_name="Logistic Regression")


###### Random Forest Classifier

In [None]:


# 2. Random Forest Classifier
# The notebook's import cell imports RandomForestClassifier from
# pyspark.ml.classification and afterwards from sklearn.ensemble, so the bare
# name refers to scikit-learn's class, which does not accept
# featuresCol/labelCol/numTrees. Import the Spark class under an alias so this
# cell gets the right estimator without rebinding the name that the
# scikit-learn cells rely on.
from pyspark.ml.classification import (
    RandomForestClassifier as SparkRandomForestClassifier,
)

rf = SparkRandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100)
rf_model = rf.fit(train_data)
rf_predictions = rf_model.transform(test_data)

# Evaluating Random Forest (accuracy on the test split)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
rf_accuracy = evaluator.evaluate(rf_predictions)

# Training Accuracy for Random Forest
train_predictions_rf = rf_model.transform(train_data)
train_accuracy_rf = evaluator.evaluate(train_predictions_rf)

# Confusion Matrix and Classification Report for Random Forest
def compute_confusion_matrix_and_report(predictions):
    """Build a confusion matrix and classification report from Spark predictions.

    Args:
        predictions: Spark DataFrame containing "prediction" and "label" columns.

    Returns:
        A ``(conf_matrix, class_report)`` tuple: the result of
        ``confusion_matrix(y_true, y_pred)`` and of
        ``classification_report(y_true, y_pred)``.
    """
    # Collect both columns in ONE action. Collecting each column separately
    # runs the prediction pipeline twice and relies on both jobs returning the
    # rows in the same order to keep predictions aligned with their labels.
    rows = predictions.select("prediction", "label").collect()

    # Extract predicted and actual labels
    y_pred = np.array([row["prediction"] for row in rows])
    y_true = np.array([row["label"] for row in rows])

    # Confusion Matrix
    conf_matrix = confusion_matrix(y_true, y_pred)

    # Classification Report
    class_report = classification_report(y_true, y_pred)

    return conf_matrix, class_report

# Summarise the test-set predictions as a confusion matrix plus a text report
conf_matrix_rf, class_report_rf = compute_confusion_matrix_and_report(
    predictions=rf_predictions
)

# Accuracies first, then the detailed breakdown
print(f"Random Forest Training Accuracy: {train_accuracy_rf}")
print(f"Random Forest Testing Accuracy: {rf_accuracy}")
print(
    "Confusion Matrix for Random Forest:\n",
    conf_matrix_rf,
)
print(
    "Classification Report for Random Forest:\n",
    class_report_rf,
)

# Heatmap helper for confusion matrices
def plot_confusion_matrix(conf_matrix, model_name):
    """Draw `conf_matrix` as an annotated heatmap titled with `model_name`."""
    class_labels = ['Class 0', 'Class 1']  # tick labels for both axes

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        conf_matrix,
        cmap='Blues',
        annot=True,
        fmt='d',
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix: {model_name}')
    plt.show()

# Show the Random Forest confusion matrix as a heatmap
plot_confusion_matrix(conf_matrix=conf_matrix_rf, model_name="Random Forest")


###### Gradient Boosting Classifier

In [None]:


# 3. Gradient Boosting Classifier
# GBTClassifier is not among the names imported in the notebook's import cell
# (only LogisticRegression and RandomForestClassifier are imported from
# pyspark.ml.classification there), so import it here.
from pyspark.ml.classification import GBTClassifier

gbt = GBTClassifier(featuresCol="features", labelCol="label", maxIter=100)
gbt_model = gbt.fit(train_data)
gbt_predictions = gbt_model.transform(test_data)

# Evaluating Gradient Boosting (accuracy on the test split)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
gbt_accuracy = evaluator.evaluate(gbt_predictions)

# Training Accuracy for Gradient Boosting
train_predictions_gbt = gbt_model.transform(train_data)
train_accuracy_gbt = evaluator.evaluate(train_predictions_gbt)

# Confusion Matrix and Classification Report for Gradient Boosting
def compute_confusion_matrix_and_report(predictions):
    """Build a confusion matrix and classification report from Spark predictions.

    Args:
        predictions: Spark DataFrame containing "prediction" and "label" columns.

    Returns:
        A ``(conf_matrix, class_report)`` tuple: the result of
        ``confusion_matrix(y_true, y_pred)`` and of
        ``classification_report(y_true, y_pred)``.
    """
    # Collect both columns in ONE action. Collecting each column separately
    # runs the prediction pipeline twice and relies on both jobs returning the
    # rows in the same order to keep predictions aligned with their labels.
    rows = predictions.select("prediction", "label").collect()

    # Extract predicted and actual labels
    y_pred = np.array([row["prediction"] for row in rows])
    y_true = np.array([row["label"] for row in rows])

    # Confusion Matrix
    conf_matrix = confusion_matrix(y_true, y_pred)

    # Classification Report
    class_report = classification_report(y_true, y_pred)

    return conf_matrix, class_report

# Summarise the test-set predictions as a confusion matrix plus a text report
conf_matrix_gbt, class_report_gbt = compute_confusion_matrix_and_report(
    predictions=gbt_predictions
)

# Accuracies first, then the detailed breakdown
print(f"Gradient Boosting Training Accuracy: {train_accuracy_gbt}")
print(f"Gradient Boosting Testing Accuracy: {gbt_accuracy}")
print(
    "Confusion Matrix for Gradient Boosting:\n",
    conf_matrix_gbt,
)
print(
    "Classification Report for Gradient Boosting:\n",
    class_report_gbt,
)

# Heatmap helper for confusion matrices
def plot_confusion_matrix(conf_matrix, model_name):
    """Draw `conf_matrix` as an annotated heatmap titled with `model_name`."""
    class_labels = ['Class 0', 'Class 1']  # tick labels for both axes

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        conf_matrix,
        cmap='Blues',
        annot=True,
        fmt='d',
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix: {model_name}')
    plt.show()

# Show the Gradient Boosting confusion matrix as a heatmap
plot_confusion_matrix(conf_matrix=conf_matrix_gbt, model_name="Gradient Boosting")



###### All Results

In [None]:
# Print all results at the end.
# `results` is created as an empty list in the feature-setup cell and none of
# the model cells append to it, so without this step the loop prints nothing.
# Fill it from the test accuracies computed in the three model cells.
if not results:
    results.extend([
        {'model': 'Logistic Regression', 'accuracy': lr_accuracy},
        {'model': 'Random Forest', 'accuracy': rf_accuracy},
        {'model': 'Gradient Boosting', 'accuracy': gbt_accuracy},
    ])

for res in results:
    print(f"Model: {res['model']}, Accuracy: {res['accuracy']}")


###### Training and Validation Accuracy 

In [None]:
# Names shown on the x-axis, in the order the accuracies are listed below
model_names = ["Logistic Regression", "Random Forest", "Gradient Boosting"]

# One accuracy evaluator shared by all models
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy", labelCol="label", predictionCol="prediction"
)

# Logistic Regression: accuracy on the training split and on the held-out predictions
train_predictions_lr = lr_model.transform(train_data)
train_accuracy_lr = evaluator.evaluate(train_predictions_lr)
validation_accuracy_lr = evaluator.evaluate(lr_predictions)

# Random Forest: same two measurements
train_predictions_rf = rf_model.transform(train_data)
train_accuracy_rf = evaluator.evaluate(train_predictions_rf)
validation_accuracy_rf = evaluator.evaluate(rf_predictions)

# Gradient Boosting: same two measurements
train_predictions_gbt = gbt_model.transform(train_data)
train_accuracy_gbt = evaluator.evaluate(train_predictions_gbt)
validation_accuracy_gbt = evaluator.evaluate(gbt_predictions)

# Per-model accuracies, aligned with model_names
train_accuracies = [train_accuracy_lr, train_accuracy_rf, train_accuracy_gbt]
validation_accuracies = [
    validation_accuracy_lr,
    validation_accuracy_rf,
    validation_accuracy_gbt,
]

# Line chart comparing training and validation accuracy across the models
plt.figure(figsize=(10, 6))
plt.plot(model_names, train_accuracies, marker='o', label='Training Accuracy')
plt.plot(model_names, validation_accuracies, marker='o', label='Validation Accuracy')

plt.title("Training and Validation Accuracy for ML Models")
plt.xlabel("Model")
plt.ylabel("Accuracy")
plt.ylim(0, 1)  # accuracy is a fraction in [0, 1]
plt.legend()
plt.grid(True)
plt.show()

# Numeric values behind the chart
print(f"Logistic Regression Training Accuracy: {train_accuracy_lr}")
print(f"Logistic Regression Validation Accuracy: {validation_accuracy_lr}")
print(f"Random Forest Training Accuracy: {train_accuracy_rf}")
print(f"Random Forest Validation Accuracy: {validation_accuracy_rf}")
print(f"Gradient Boosting Training Accuracy: {train_accuracy_gbt}")
print(f"Gradient Boosting Validation Accuracy: {validation_accuracy_gbt}")


###### Model Creation

In [None]:
# Load the dataset into Pandas for the scikit-learn workflow
data = pd.read_csv("dataset.csv")

# Feature columns used by the model
numerical_columns = ['age', 'studytime', 'failures',
                     'Medu', 'G1', 'G2']

# Prepare the features and label
X = data[numerical_columns]  # Use all 6 features (including G1 and G2)
y = (data['G3'] > 10).astype(int)  # Label: 1 when G3 > 10, otherwise 0

# Split the data (80% train / 20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features: fit on the training split only, then apply to both
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the Random Forest model.
# NOTE: this rebinds `rf_model`, the name the Spark cells above use for the
# Spark random-forest model.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Save the entire Random Forest model
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)

# The model was trained on scaled features, so new inputs must go through the
# same fitted scaler before prediction; persist it next to the model.
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print("Random Forest model saved to 'random_forest_model.pkl'")

# Loading the model for prediction later (if needed).
# Only unpickle files from a trusted source; this one was written just above.
with open('random_forest_model.pkl', 'rb') as f:
    loaded_rf_model = pickle.load(f)

print("Loaded model parameters:", loaded_rf_model)


###### Hyperparameter Tuning and Additional Optimization

In [None]:
# Read the raw data into Pandas
data = pd.read_csv("dataset.csv")

# Columns fed to the model
numerical_columns = ['age', 'studytime', 'failures', 'Medu', 'G1', 'G2']

# Feature matrix and binary target (1 when G3 > 10, otherwise 0)
X = data[numerical_columns]
y = (data['G3'] > 10).astype(int)

# Stratified 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardise: statistics come from the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Search space for the forest's hyperparameters
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# Base estimator handed to the grid search
rf_model = RandomForestClassifier(random_state=42)

# Exhaustive 5-fold search scored on accuracy, using all available cores
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Winning parameter combination
print("Best parameters found: ", grid_search.best_params_)

# Estimator refitted with the best parameters
best_rf_model = grid_search.best_estimator_

# Held-out evaluation
y_pred = best_rf_model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

# Persist the tuned model first, then the scaler it was trained with
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(best_rf_model, f)

with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print("Random Forest model and scaler saved.")
