## XGBoost Model

In [6]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from xgboost import XGBRegressor

In [7]:
# Load the cleaned student-performance table from its pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load files
# that come from a trusted source.
df = pd.read_pickle(filepath_or_buffer='data/clean/StudentPerformanceFactors.pkl')

# Bare last expression: rendered as the cell's output.
df

Unnamed: 0,Hours_Studied,Attendance,Previous_Scores,Tutoring_Sessions,Exam_Score
0,23,84,73,0,67
1,19,64,59,2,61
2,24,98,91,2,74
3,29,89,98,1,71
4,19,92,65,3,70
...,...,...,...,...,...
6602,25,69,76,1,68
6603,23,76,81,3,69
6604,20,90,65,3,68
6605,10,86,91,2,68


In [8]:
# Target is the exam score; the features are every remaining column.
y = df['Exam_Score']
X = df.drop(columns='Exam_Score')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Preview the first ten training rows.
x_train.head(10)

Unnamed: 0,Hours_Studied,Attendance,Previous_Scores,Tutoring_Sessions
2356,18,95,96,2
4758,16,89,58,2
2281,16,69,55,1
485,11,65,78,2
4200,21,95,57,1
3958,27,63,92,0
3890,24,80,51,3
4306,16,66,68,2
6232,19,75,96,2
3235,21,63,54,1


In [9]:
# Baseline: an untuned regressor fitted on the training split.
# NOTE(review): none of the visible cells read `model` afterwards — confirm
# whether this fit is still needed.
model = XGBRegressor(random_state=42)
model.fit(x_train, y_train)

# Two candidate values for each of six hyper-parameters -> 64 combinations.
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.1],
    max_depth=[3, 6],
    min_child_weight=[1, 5],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Exhaustive search over the grid with 5-fold cross-validation, ranked by
# negative mean squared error.
# NOTE(review): the estimator searched here has no explicit random_state,
# unlike the baseline above — confirm that is intended.
grid_search = GridSearchCV(
    estimator=XGBRegressor(),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
)
grid_search.fit(x_train, y_train)

# Keep the estimator the search selected; later cells evaluate and save it.
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 100, 'subsample': 0.8}


In [10]:
# Score the tuned model on the held-out test split.
y_pred = best_model.predict(x_test)

# Compute both metrics first, then report them (MSE, then R²).
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

Mean Squared Error: 6.219224418489642
R² Score: 0.5997751355171204


In [11]:
# Serialize the tuned estimator to disk with pickle.
model_path = Path('models/xgboost_model.pkl')

# Opening a file for writing does not create missing parent directories, so
# make sure 'models/' exists first (no-op when it is already there).
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open('wb') as model_file:
    pickle.dump(best_model, model_file)