# Project: Students Performance in Exams

In [None]:
# Step 1: Import Libraries & Load Dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw exam results, then show a five-row preview followed by the
# column/dtype summary that DataFrame.info() prints.
df = pd.read_csv("StudentsPerformance.csv")
print(df.head())
df.info()

In [None]:
# Step 2: Data Cleaning

# Rename columns to one consistent Title_Case_With_Underscores scheme.
df.rename(
    columns={
        'gender': 'Gender',
        'race/ethnicity': 'Ethnicity',
        'parental level of education': 'Parental_Education',
        'lunch': 'Lunch',
        'test preparation course': 'Test_Preparation',
        'math score': 'Maths_Score',
        'reading score': 'Reading_Score',
        'writing score': 'Writing_Score',
    },
    inplace=True,
)

# Check for missing values: report the number of NaNs in every column.
# The counts are printed explicitly because a bare expression in the middle
# of a cell/script is evaluated and then silently discarded.
print("\nMissing values per column:")
print(df.isna().sum())

# Convert the categorical columns to the pandas `category` dtype.
# The names are kept in a flat list so the loop converts one column per pass.
categorical_cols = ['Gender', 'Ethnicity', 'Parental_Education', 'Lunch', 'Test_Preparation']
for col in categorical_cols:
    df[col] = df[col].astype('category')

print("\nData Types after Conversion:")
print(df.dtypes)

In [None]:
# Step 3: Feature Engineering

# The three per-subject score columns every derived feature is built from.
score_cols = ["Maths_Score", "Reading_Score", "Writing_Score"]

# Total Score: row-wise sum of the three subject scores.
df['Total_Score'] = df[score_cols].sum(axis=1)
# Average Score: row-wise mean of the three subject scores.
df['Average_Score'] = df[score_cols].mean(axis=1)
# Result (pass/fail): an average of 60 or more is a pass.
df['Result'] = np.where(df['Average_Score'] >= 60, 'Pass', 'Fail')

# Grade bands on the average score, listed from highest to lowest.
conditions = [
    df['Average_Score'] >= 90,
    (df['Average_Score'] >= 80) & (df['Average_Score'] < 90),
    (df['Average_Score'] >= 70) & (df['Average_Score'] < 80),
    (df['Average_Score'] >= 60) & (df['Average_Score'] < 70),
    df['Average_Score'] < 60,
]
# Grade labels, positionally matched to `conditions`.
grades = ['A', 'B', 'C', 'D', 'E']

# Create the grade column. A string default is passed explicitly: np.select's
# implicit default is the integer 0, which cannot share a dtype with the
# string labels (NumPy 2.x raises TypeError for that mix). The default is only
# used for rows matching no band, e.g. a NaN average.
df['Grades'] = np.select(conditions, grades, default='N/A')

print(df[['Total_Score', 'Average_Score', 'Result', 'Grades']].head())


In [None]:
# Step 4: Exploratory Data Analysis (EDA)

# The grouping columns below were converted to the `category` dtype earlier.
# `observed=False` is passed explicitly so every declared category is kept in
# the result (the behaviour the original call relied on by default) and newer
# pandas releases do not warn about the omitted argument.

# Average score by Gender
grouped = df.groupby('Gender', observed=False)['Average_Score'].mean()
# Average score by Ethnicity (as_index=False keeps 'Ethnicity' as a regular column)
ethnicity_perf = df.groupby('Ethnicity', as_index=False, observed=False)['Average_Score'].mean()
# Effect of Test Preparation on Performance
test_prep_effect = df.groupby('Test_Preparation', observed=False)['Average_Score'].mean()

print("Average Score based on Test Preparation:")
print(test_prep_effect)

In [None]:
# Step 5: Visualization Dashboard
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram - Distribution of Average Scores
# 20 bins with a KDE curve overlaid, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['Average_Score'], bins=20, kde=True, color='skyblue', ax=ax)
ax.set_title('Distribution of Average Scores', fontsize=13, fontweight='bold')
ax.set_xlabel('Average_Score')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

# Bar Plot - Average Score by Gender
# One bar per gender; the Axes returned by seaborn is used for labelling.
plt.figure(figsize=(6, 4))
gender_ax = sns.barplot(data=df, x='Gender', y='Average_Score', color='yellow')
gender_ax.set_title('Average Score by Gender', fontsize=13, fontweight='bold')
gender_ax.set_xlabel('Gender')
gender_ax.set_ylabel('Average_Score')
plt.tight_layout()
plt.show()

# Bar Plot - Average Score by Ethnicity
# The title names what the bars show (an average per group, not a distribution)
# and uses the same font styling as the other plot titles in this dashboard.
plt.figure(figsize=(6,4))
sns.barplot(x='Ethnicity',y='Average_Score',data=df,color='black')
plt.title('Average Score by Ethnicity', fontsize=13, fontweight='bold')
plt.xlabel('Ethnicity')
plt.ylabel('Average_Score')
plt.tight_layout()
plt.show()

# Box Plot - Score Spread by Gender
# One box per gender showing the spread of the per-student average score.
plt.figure(figsize=(6,4))
sns.boxplot(x='Gender', y='Average_Score', data=df, palette='Accent')
plt.title('Score Distribution by Gender', fontsize=13, fontweight='bold')
plt.xlabel('Gender')
plt.ylabel('Average_Score')
# Same layout call as the other plots, so labels are not clipped.
plt.tight_layout()
plt.show()

# Bar plot - Effect of Test Preparation
# Plots the pre-computed group means in `test_prep_effect`, one bar per
# test-preparation status.
plt.figure(figsize=(6,4))  # set fig size

# plot bar chart on the figure created above
test_prep_effect.plot(kind='bar', color=['orange','skyblue'], edgecolor='black')
# Title uses the dashboard's common 13 pt bold style; at the previous 17 pt the
# long title was wider than the 6-inch figure.
plt.title("Effect of Test Preparation on Student Performance", fontsize=13, fontweight='bold')
plt.xlabel("Test Preparation Status", fontsize=11)
plt.ylabel("Average_Score", fontsize=11)
plt.xticks(rotation=0)  # keep the category labels horizontal
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Correlation Heatmap - Relationship Between Numeric Columns
# Pairwise correlations of the numeric columns, annotated to two decimals.
corr_matrix = df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(corr_matrix, annot=True, cmap='YlGnBu', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap', fontsize=13, fontweight='bold')
plt.show()

# Pass/Fail Distribution
# Count of students per value of the 'Result' column.
plt.figure(figsize=(5, 4))
result_ax = sns.countplot(data=df, x='Result', palette='pastel', edgecolor='black')
result_ax.set_title('Pass/Fail Distribution', fontsize=13, fontweight='bold')
result_ax.set_xlabel('Result')
result_ax.set_ylabel('Number of Students')
plt.show()

In [None]:
# Step 6: Export Cleaned Data
# Write the cleaned, feature-enriched frame to CSV without the index column.
# The file name lives in one variable so the confirmation message always
# reports the name that was actually written.
output_path = 'StudentsPerformance_Cleaned.csv'
df.to_csv(output_path, index=False)
print(f"Cleaned dataset saved as '{output_path}'")