# Student Performance Prediction

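This notebook trains an XGBoost regressor (`XGBRegressor`) to predict each student's `total_score` from the numeric features in the student performance dataset, then evaluates it on a held-out test split, with cross-validation, and with a grid search over hyperparameters.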

## Importing the libraries

In [None]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

## Importing the dataset

In [13]:
# NOTE: 'student_performance_1.csv' and 'student_performance_2.csv' are written by the
# dataset-splitting cell located further down in this notebook; on a fresh environment
# that cell has to be run before this one.
# index_col=0 turns the first CSV column into the row index, and ignore_index=True then
# replaces that index with a fresh 0..n-1 range, so the first CSV column is not kept
# as a data column in `dataset`.
dataset1 = pd.read_csv('student_performance_1.csv', index_col=0)
dataset2 = pd.read_csv('student_performance_2.csv', index_col=0)
dataset = pd.concat([dataset1, dataset2], ignore_index=True)

# Select features and target by column name rather than by position, so that no
# feature is silently dropped if the column layout shifts.
# The target is `total_score` (the second-to-last column); `grade` is excluded from
# the features (it is a letter label, presumably derived from the score -- TODO confirm).
feature_columns = ['weekly_self_study_hours', 'attendance_percentage', 'class_participation']
target_column = 'total_score'
X = dataset[feature_columns].values
y = dataset[target_column].values

dataset.head()

Unnamed: 0,weekly_self_study_hours,attendance_percentage,class_participation,total_score,grade
0,18.5,95.6,3.8,97.9,A
1,14.0,80.0,2.5,83.9,B
2,19.5,86.3,5.3,100.0,A
3,25.7,70.2,7.0,100.0,A
4,13.4,81.9,6.9,92.0,A


## Splitting the dataset

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the XGBRegressor

In [None]:
# Hyperparameters for the gradient-boosted regressor, collected in one place so they
# are easy to compare with the grid searched later in the notebook.
xgb_params = {
    'max_depth': 3,
    'min_child_weight': 2.2,
    'colsample_bytree': 0.61,
    'subsample': 0.48,
    'learning_rate': 0.56,
    'n_estimators': 6,
    'random_state': 0,
}

# The variable is called `classifier` although it holds a regressor; the name is kept
# because the evaluation, cross-validation and grid-search cells refer to it.
classifier = XGBRegressor(**xgb_params)
classifier.fit(X_train, y_train)

## Predictions and evaluation

In [None]:
# Predict on the held-out test split and score the predictions.
y_pred = classifier.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# The R² label uses the real superscript-two character (not a mis-decoded byte pair).
print(f"R²   = {r2:.3f}")
print(f"MAE  = {mae:.3f}")
print(f"MSE  = {mse:.3f}")

## Cross-validation

In [None]:
# 10-fold cross-validation on the training split. The scorer returns *negative* MSE,
# so the sign is flipped before reporting.
cv_scores = cross_val_score(
    classifier,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=10,
)
mean_cv_mse = -cv_scores.mean()
print("CV Mean MSE:", mean_cv_mse)

## Grid Search for hyperparameter tuning

In [None]:
# Candidate values for each hyperparameter: 3 values for each of 6 parameters,
# i.e. 3**6 = 729 combinations, each evaluated with 5-fold cross-validation.
param_grid = {
    'max_depth': [2, 3, 4],
    'min_child_weight': [2.1, 2.15, 2.2],
    'subsample': [0.475, 0.48, 0.485],
    'colsample_bytree': [0.565, 0.57, 0.575],
    'learning_rate': [0.555, 0.56, 0.565],
    'n_estimators': [5, 6, 7],
}

# Search settings: score by negative MSE, use all available cores, log each fit.
search_options = {
    'scoring': 'neg_mean_squared_error',
    'cv': 5,
    'n_jobs': -1,
    'verbose': 2,
}

grid_search = GridSearchCV(classifier, param_grid, **search_options)
grid_search.fit(X_train, y_train)

# best_score_ is a negative MSE (see the scoring option), so negate it for display.
best_mse = -grid_search.best_score_
print("Best MSE:", best_mse)
print("Best parameters:", grid_search.best_params_)

In [None]:
# Data preparation (Google Colab only): mount Google Drive, read the full dataset and
# write it back out as two halves. These two CSV files are the ones read by the
# "Importing the dataset" cell, so this cell has to run first on a fresh environment.
from google.colab import drive

drive.mount('/content/drive')

source_csv = '/content/drive/MyDrive/AI/1. Machine learning/ML projects/Student Performance Dataset/student_performance.csv'
dataset = pd.read_csv(source_csv)

# Integer midpoint: when the row count is odd, the second half gets the extra row.
midpoint = len(dataset) // 2
df_half1, df_half2 = dataset.iloc[:midpoint], dataset.iloc[midpoint:]

# index=False: do not write the DataFrame's row index as an extra column.
df_half1.to_csv('student_performance_1.csv', index=False)
df_half2.to_csv('student_performance_2.csv', index=False)

print("Dataset successfully split into two halves and saved as 'student_performance_1.csv' and 'student_performance_2.csv'.")

Dataset successfully split into two halves and saved as 'student_performance_1.csv' and 'student_performance_2.csv'.


In [None]:
# Sanity check: read the two half-files back from disk to confirm they can be loaded.
first_half_csv = 'student_performance_1.csv'
second_half_csv = 'student_performance_2.csv'

reloaded_df_half1 = pd.read_csv(first_half_csv)
reloaded_df_half2 = pd.read_csv(second_half_csv)

print("Successfully reloaded 'student_performance_1.csv' into reloaded_df_half1.")
print("Successfully reloaded 'student_performance_2.csv' into reloaded_df_half2.")
