# 📓 Student Performance Prediction Project

## 📝 Step 1. Import Libraries

In [None]:

# Data Handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from xgboost import XGBClassifier, XGBRegressor

# Metrics
from sklearn.metrics import (
    confusion_matrix, classification_report,
    precision_score, recall_score, f1_score, accuracy_score,
    mean_squared_error, r2_score
)


## 📝 Step 2. Load Dataset

In [None]:

# Load the "Students Performance in Exams" dataset.
# Download from Kaggle: https://www.kaggle.com/datasets/spscientist/students-performance-in-exams
# The CSV is read via a relative path, i.e. from the current working directory.
df = pd.read_csv("StudentsPerformance.csv")

print("First 5 rows:\n", df.head())
print("\nInfo:\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must be called directly; wrapping it in print() emits a stray "None" line.
df.info()
print("\nSummary:\n")
print(df.describe())


## 📝 Step 3. Data Cleaning & Feature Engineering

In [None]:

# Encode categorical features as integer codes.
# Each column gets its OWN fitted LabelEncoder, stored in `encoders`, so the
# category <-> code mapping of every column stays available afterwards
# (e.g. encoders['lunch'].inverse_transform(...)). Re-fitting one shared
# encoder would only retain the mapping of the last column processed.
categorical_cols = ['gender', 'race/ethnicity', 'parental level of education',
                    'lunch', 'test preparation course']
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

# Create new target variables
# average_score: row-wise mean of the three exam scores (regression target).
score_cols = ['math score', 'reading score', 'writing score']
df['average_score'] = df[score_cols].mean(axis=1)
# pass_fail: binary classification target derived from average_score.
df['pass_fail'] = np.where(df['average_score'] >= 40, 1, 0)  # 1=Pass, 0=Fail

print(df.head())


## 📝 Step 4. Exploratory Data Analysis (Statistics + Plots)

In [None]:

# Descriptive statistics of the per-student average score, printed one per line.
avg_score = df['average_score']
summary_stats = [
    ("Mean Score:", avg_score.mean()),
    ("Median Score:", avg_score.median()),
    ("Mode Score:", avg_score.mode()[0]),  # mode() returns a Series; take its first entry
    ("Variance:", avg_score.var()),
    ("Standard Deviation:", avg_score.std()),
    ("Skewness:", avg_score.skew()),
]
for label, value in summary_stats:
    print(label, value)

# Pairwise correlation of all columns, drawn as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

# Boxplot of the average score (quartiles / IQR whiskers expose outliers).
ax = sns.boxplot(x=avg_score)
ax.set_title("Boxplot of Average Score (IQR for Outliers)")
plt.show()

# Histogram of the average score with a kernel density estimate overlaid.
ax = sns.histplot(avg_score, kde=True)
ax.set_title("Distribution of Average Scores")
plt.show()


## 📝 Step 5. Classification: Pass/Fail Prediction

In [None]:

# Features for pass/fail prediction.
# pass_fail is a threshold on average_score, which is itself the mean of the
# three exam scores. Leaving any of those columns in X leaks the target into
# the features, so they are all removed and only the encoded background
# attributes remain as predictors.
target_derived_cols = ['math score', 'reading score', 'writing score',
                       'average_score', 'pass_fail']
X = df.drop(columns=target_derived_cols)
y = df['pass_fail']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression
log_clf = LogisticRegression(max_iter=500)
log_clf.fit(X_train, y_train)
y_pred_log = log_clf.predict(X_test)

# XGBoost Classifier
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)
y_pred_xgb = xgb_clf.predict(X_test)

# Evaluation
# labels=[0, 1] keeps both classes in every report/matrix even if one of them
# is absent from the predictions; zero_division=0 reports 0 (without a
# warning) for a class that is never predicted.
class_labels = [0, 1]  # 0=Fail, 1=Pass
print("Logistic Regression Report:\n",
      classification_report(y_test, y_pred_log, labels=class_labels, zero_division=0))
print("Confusion Matrix (Logistic Regression):\n",
      confusion_matrix(y_test, y_pred_log, labels=class_labels))

print("\nXGBoost Classifier Report:\n",
      classification_report(y_test, y_pred_xgb, labels=class_labels, zero_division=0))
print("Confusion Matrix (XGBoost):\n",
      confusion_matrix(y_test, y_pred_xgb, labels=class_labels))


## 📝 Step 6. Regression: Predict Final Score

In [None]:

# Features for average-score regression.
# average_score is the mean of the three exam scores, so keeping those columns
# in X lets the model reconstruct the target exactly (target leakage). They are
# removed together with pass_fail, which is derived from the target as well.
target_derived_cols = ['math score', 'reading score', 'writing score',
                       'average_score', 'pass_fail']
X = df.drop(columns=target_derived_cols)
y = df['average_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred_lin = lin_reg.predict(X_test)

# XGBoost Regressor
xgb_reg = XGBRegressor()
xgb_reg.fit(X_train, y_train)
y_pred_xgb = xgb_reg.predict(X_test)

# Evaluation on the held-out 20% split
print("Linear Regression MSE:", mean_squared_error(y_test, y_pred_lin))
print("Linear Regression R²:", r2_score(y_test, y_pred_lin))

print("\nXGBoost Regressor MSE:", mean_squared_error(y_test, y_pred_xgb))
print("XGBoost Regressor R²:", r2_score(y_test, y_pred_xgb))


## 📝 Step 7. Visualization of Predictions

In [None]:

# Confusion Matrix Heatmap for Classification
# At this point y_test / y_pred_xgb hold the *regression* target and
# predictions from Step 6 (continuous values), which confusion_matrix rejects.
# The classification labels and predictions are therefore rebuilt here for the
# held-out rows. Steps 5 and 6 split with the same test_size and random_state,
# so y_test.index also identifies the rows the classifier never trained on.
test_idx = y_test.index
# Read the classifier's own feature list from the fitted booster so the
# prediction uses exactly the columns it was trained on.
clf_feature_names = xgb_clf.get_booster().feature_names
y_test_clf = df.loc[test_idx, 'pass_fail']
y_pred_clf = xgb_clf.predict(df.loc[test_idx, clf_feature_names])

# labels=[0, 1] forces a 2x2 matrix even if one class is missing.
sns.heatmap(confusion_matrix(y_test_clf, y_pred_clf, labels=[0, 1]),
            annot=True, fmt='d', cmap='Blues')
plt.title("XGBoost Classification Confusion Matrix")
plt.show()

# Regression Predictions vs Actual (XGBoost regressor, Step 6 test split)
plt.scatter(y_test, y_pred_xgb, alpha=0.7)
plt.xlabel("Actual Scores")
plt.ylabel("Predicted Scores")
plt.title("Actual vs Predicted Scores (Regression)")
plt.show()
