In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

# Read the raw student records; the path is resolved against the working directory.
print("Loading dataset...")
df = pd.read_csv("Students_Grading_Dataset.csv")

# Orientation pass: dimensions first, then three labelled views of the frame.
print("\n--- Dataset Overview ---")
print(f"Shape: {df.shape}")
for heading, view in (
    ("\nFirst 5 rows:", df.head()),
    ("\nData types:", df.dtypes),
    ("\nSummary statistics:", df.describe()),
):
    print(heading)
    display(view)



Loading dataset...

--- Dataset Overview ---
Shape: (5000, 23)

First 5 rows:


Unnamed: 0,Student_ID,First_Name,Last_Name,Email,Gender,Age,Department,Attendance (%),Midterm_Score,Final_Score,...,Projects_Score,Total_Score,Grade,Study_Hours_per_Week,Extracurricular_Activities,Internet_Access_at_Home,Parent_Education_Level,Family_Income_Level,Stress_Level (1-10),Sleep_Hours_per_Night
0,S1000,Omar,Williams,student0@university.com,Female,22,Engineering,52.29,55.03,57.82,...,85.9,56.09,F,6.2,No,Yes,High School,Medium,5,4.7
1,S1001,Maria,Brown,student1@university.com,Male,18,Engineering,97.27,97.23,45.8,...,55.65,50.64,A,19.0,No,Yes,,Medium,4,9.0
2,S1002,Ahmed,Jones,student2@university.com,Male,24,Business,57.19,67.05,93.68,...,73.79,70.3,D,20.7,No,Yes,Master's,Low,6,6.2
3,S1003,Omar,Williams,student3@university.com,Female,24,Mathematics,95.15,47.79,80.63,...,92.12,61.63,A,24.8,Yes,Yes,High School,High,3,6.7
4,S1004,John,Smith,student4@university.com,Female,23,CS,54.18,46.59,78.89,...,68.42,66.13,F,15.4,Yes,Yes,High School,High,2,7.1



Data types:


Student_ID                     object
First_Name                     object
Last_Name                      object
Email                          object
Gender                         object
Age                             int64
Department                     object
Attendance (%)                float64
Midterm_Score                 float64
Final_Score                   float64
Assignments_Avg               float64
Quizzes_Avg                   float64
Participation_Score           float64
Projects_Score                float64
Total_Score                   float64
Grade                          object
Study_Hours_per_Week          float64
Extracurricular_Activities     object
Internet_Access_at_Home        object
Parent_Education_Level         object
Family_Income_Level            object
Stress_Level (1-10)             int64
Sleep_Hours_per_Night         float64
dtype: object


Summary statistics:


Unnamed: 0,Age,Attendance (%),Midterm_Score,Final_Score,Assignments_Avg,Quizzes_Avg,Participation_Score,Projects_Score,Total_Score,Study_Hours_per_Week,Stress_Level (1-10),Sleep_Hours_per_Night
count,5000.0,4484.0,5000.0,5000.0,4483.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,21.0484,75.431409,70.326844,69.640788,74.798673,74.910728,4.980024,74.92486,75.121804,17.65886,5.4808,6.48814
std,1.989786,14.372446,17.213209,17.238744,14.411799,14.504281,2.890136,14.423415,14.399941,7.275864,2.86155,1.452283
min,18.0,50.01,40.0,40.0,50.0,50.03,0.0,50.01,50.02,5.0,1.0,4.0
25%,19.0,63.265,55.4575,54.6675,62.09,62.49,2.44,62.32,62.835,11.4,3.0,5.2
50%,21.0,75.725,70.51,69.735,74.81,74.695,4.955,74.98,75.395,17.5,5.0,6.5
75%,23.0,87.4725,84.97,84.5,86.97,87.63,7.5,87.3675,87.6525,24.1,8.0,7.7
max,24.0,100.0,99.98,99.98,99.98,99.96,10.0,100.0,99.99,30.0,10.0,9.0


In [None]:
print("\n--- Missing Values Analysis ---")
# Per-column count of null cells; only columns that actually have gaps are printed.
missing_values = df.isna().sum()
print(missing_values.loc[missing_values > 0])

# The same counts expressed as a percentage of the number of rows.
missing_percentage = missing_values / len(df) * 100
print("\nMissing values percentage:")
print(missing_percentage.loc[missing_percentage > 0])




--- Missing Values Analysis ---
Attendance (%)             516
Assignments_Avg            517
Parent_Education_Level    1794
dtype: int64

Missing values percentage:
Attendance (%)            10.32
Assignments_Avg           10.34
Parent_Education_Level    35.88
dtype: float64


In [16]:
print("\n--- Handling Missing Values ---")
# Numeric columns: fill gaps with the column median.
# `numeric_columns` is also consumed by the outlier and scaling cells below.
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
for col in numeric_columns:
    if not df[col].isna().any():
        continue
    median_value = df[col].median()
    if pd.isna(median_value):
        # Every value in the column is missing, so there is no median to fill with.
        print(f"Skipped {col}: no observed values to compute a median from")
        continue
    df[col] = df[col].fillna(median_value)
    print(f"Imputed missing values in {col} with median: {median_value}")

# Text columns: fill gaps with the most frequent value (first one on ties).
# NOTE(review): pandas' default NA parsing treats the literal string "None" as
# missing. If the raw file uses "None" as a real category (e.g. for
# Parent_Education_Level), it is being counted as missing and overwritten here;
# confirm against the CSV and consider `keep_default_na=False` when loading.
# NOTE(review): mode-filling a column where roughly a third of the values are
# missing assigns the same label to all of those rows; confirm this is intended.
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    if not df[col].isna().any():
        continue
    mode_candidates = df[col].mode()
    if mode_candidates.empty:
        # Every value in the column is missing; indexing [0] would raise IndexError.
        print(f"Skipped {col}: no observed values to compute a mode from")
        continue
    mode_value = mode_candidates[0]
    df[col] = df[col].fillna(mode_value)
    print(f"Imputed missing values in {col} with mode: {mode_value}")


--- Handling Missing Values ---
Imputed missing values in Attendance (%) with median: 75.725
Imputed missing values in Assignments_Avg with median: 74.81
Imputed missing values in Parent_Education_Level with mode: PhD


In [None]:
print("\n--- Removing Duplicates ---")
# A row is flagged when all of its columns repeat an earlier row.
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")
if duplicate_count:
    # Dropped in place, so `df` stays bound to the same object.
    df.drop_duplicates(inplace=True)
    print(f"Removed {duplicate_count} duplicate rows.")




--- Removing Duplicates ---
Number of duplicate rows: 0


In [None]:
print("\n--- Checking Data Consistency ---")
# Print the distinct values of each text column so inconsistent labels can be
# spotted by eye. At most ten values are shown; slicing a shorter array simply
# returns all of it.
for col in categorical_columns:
    unique_values = df[col].unique()
    print(f"Column {col} has {len(unique_values)} unique values: {unique_values[:10]}")




--- Checking Data Consistency ---
Column Student_ID has 5000 unique values: ['S1000' 'S1001' 'S1002' 'S1003' 'S1004' 'S1005' 'S1006' 'S1007' 'S1008'
 'S1009']
Column First_Name has 8 unique values: ['Omar' 'Maria' 'Ahmed' 'John' 'Liam' 'Sara' 'Emma' 'Ali']
Column Last_Name has 6 unique values: ['Williams' 'Brown' 'Jones' 'Smith' 'Davis' 'Johnson']
Column Email has 5000 unique values: ['student0@university.com' 'student1@university.com'
 'student2@university.com' 'student3@university.com'
 'student4@university.com' 'student5@university.com'
 'student6@university.com' 'student7@university.com'
 'student8@university.com' 'student9@university.com']
Column Gender has 2 unique values: ['Female' 'Male']
Column Department has 4 unique values: ['Engineering' 'Business' 'Mathematics' 'CS']
Column Grade has 5 unique values: ['F' 'A' 'D' 'B' 'C']
Column Extracurricular_Activities has 2 unique values: ['No' 'Yes']
Column Internet_Access_at_Home has 2 unique values: ['Yes' 'No']
Column Parent_Educa

In [None]:
    print("\n--- Outlier Detection ---")
    def detect_outliers(df, column, iqr_multiplier=1.5):
        """Find values of one column that fall outside the IQR fences.

        The fences are ``Q1 - iqr_multiplier * IQR`` and
        ``Q3 + iqr_multiplier * IQR``, where Q1 and Q3 are the 25th and 75th
        percentiles of the column and ``IQR = Q3 - Q1``.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame that contains ``column``.
        column : str
            Name of the column to inspect.
        iqr_multiplier : float, default 1.5
            Width of the fences in units of the IQR. The default reproduces
            the previously hard-coded factor.

        Returns
        -------
        tuple
            ``(outliers, lower_bound, upper_bound)``: the values strictly
            below the lower fence or strictly above the upper fence (original
            index preserved), followed by the two fence values.
        """
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - iqr_multiplier * iqr
        upper_bound = q3 + iqr_multiplier * iqr
        # Values exactly on a fence are not treated as outliers.
        outliers = values[(values < lower_bound) | (values > upper_bound)]
        return outliers, lower_bound, upper_bound

    # For each numeric column that has IQR outliers, print a short report and
    # write a boxplot image to the working directory.
    for col in numeric_columns:
        outliers, lower_bound, upper_bound = detect_outliers(df, col)
        if len(outliers) == 0:
            # Nothing to report or plot for this column.
            continue

        print(f"Column {col} has {len(outliers)} outliers")
        print(f"  - Outlier values: {outliers.values[:5]}{'...' if len(outliers) > 5 else ''}")
        print(f"  - Bounds: [{lower_bound}, {upper_bound}]")

        # One figure per column, closed right after saving so figures do not
        # accumulate in memory.
        plt.figure(figsize=(10, 6))
        sns.boxplot(x=df[col])
        plt.title(f'Boxplot of {col}')
        plt.tight_layout()
        plt.savefig(f'boxplot_{col}.png')
        plt.close()


--- Outlier Detection ---


In [10]:
print("\n--- Normalizing and Standardizing Data ---")
# Both scaled frames start as independent copies of `df`, so the cleaned data
# itself is left unscaled; only the columns in `numeric_columns` are replaced.

# Variant 1: min-max scaling of each numeric column to the 0-1 range.
df_scaled = df.copy()
print("Applying Min-Max scaling to numeric columns...")
scaler = MinMaxScaler()
min_max_values = scaler.fit_transform(df.loc[:, numeric_columns])
df_scaled[numeric_columns] = min_max_values

# Variant 2: z-score standardization of each numeric column.
df_standardized = df.copy()
print("Applying Z-score standardization to numeric columns...")
std_scaler = StandardScaler()
z_score_values = std_scaler.fit_transform(df.loc[:, numeric_columns])
df_standardized[numeric_columns] = z_score_values


--- Normalizing and Standardizing Data ---
Applying Min-Max scaling to numeric columns...
Applying Z-score standardization to numeric columns...


In [11]:
# Save the processed datasets: cleaned, min-max normalized and z-score
# standardized, each as a CSV without the index column.
print("\n--- Saving Processed Datasets ---")
df.to_csv("students_cleaned.csv", index=False)
print("Saved cleaned dataset to 'students_cleaned.csv'")

df_scaled.to_csv("students_normalized.csv", index=False)
print("Saved normalized dataset to 'students_normalized.csv'")

df_standardized.to_csv("students_standardized.csv", index=False)
print("Saved standardized dataset to 'students_standardized.csv'")

print("\n--- Data Preprocessing Complete ---")
# `df` has been modified since it was loaded, so its current shape cannot
# stand in for the original one. Read the raw shape back from the source file
# so the two lines below are a real before/after comparison.
original_shape = pd.read_csv("Students_Grading_Dataset.csv").shape
print(f"Original shape: {original_shape}")
print(f"Final shape after preprocessing: {df.shape}")
print("The data is now ready for analysis or modeling.")


--- Saving Processed Datasets ---
Saved cleaned dataset to 'students_cleaned.csv'
Saved normalized dataset to 'students_normalized.csv'
Saved standardized dataset to 'students_standardized.csv'

--- Data Preprocessing Complete ---
Original shape: (5000, 23)
Final shape after preprocessing: (5000, 23)
The data is now ready for analysis or modeling.
