In [1]:
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

In [2]:
# Read the labelled training set and the unlabelled submission set, in that order.
train_df, sub_df = map(pd.read_csv, ('dataset/train.csv', 'dataset/test.csv'))

In [3]:
# Preview the first five rows of the training data.
train_df.head(n=5)

Unnamed: 0,id,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score
0,0,21,female,b.sc,7.91,98.8,no,4.9,average,online videos,low,easy,78.3
1,1,18,other,diploma,4.95,94.8,yes,4.7,poor,self-study,medium,moderate,46.7
2,2,20,female,b.sc,4.68,92.6,yes,5.8,poor,coaching,high,moderate,99.0
3,3,19,male,b.sc,2.0,49.5,yes,8.3,average,group study,high,moderate,63.9
4,4,23,male,bca,7.65,86.9,yes,9.6,good,self-study,high,easy,100.0


In [4]:
# Keep the submission ids for the final CSV before the column is removed.
sub_ids = sub_df['id']

# Rebind both frames to new frames without `id` (drop returns a new frame,
# so no in-place mutation is involved).
train_df, sub_df = (frame.drop('id', axis='columns') for frame in (train_df, sub_df))

In [5]:
from sklearn.model_selection import train_test_split

# Split the labelled data into a training part and a held-out evaluation part;
# `exam_score` is the target, every other column is a feature.
# A fixed random_state makes the split reproducible, so the tuned parameters and
# the metrics reported further down are the same on every Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(
    train_df.drop(columns='exam_score'),
    train_df['exam_score'],
    random_state=42,
)

In [6]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# columns_to_remove = [
#   'gender',
#   'course',
#   'internet_access',
#   'exam_difficulty',
#   'age'  #maybe keep
# ]
# X_train = X_train.drop(columns=columns_to_remove)
# X_test= X_test.drop(columns=columns_to_remove)

# String-valued feature columns that must be turned into integer codes
# before they are passed to the model.
categorical_columns = [
  'gender', 'course', 'internet_access', 'sleep_quality',
  'study_method', 'facility_rating', 'exam_difficulty',
]

# One encoder per column, fitted on the training split only. The fitted
# encoders are kept so the same mapping can be applied to other frames later.
label_encoders = {name: LabelEncoder().fit(X_train[name]) for name in categorical_columns}

# Replace the string columns of both splits with their integer codes.
for col, fitted_encoder in label_encoders.items():
  X_train[col] = fitted_encoder.transform(X_train[col])
  X_test[col] = fitted_encoder.transform(X_test[col])

X_train.head()

Unnamed: 0,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty
457102,22,2,5,1.54,45.5,1,5.3,2,1,2,2
433080,18,1,4,6.85,47.3,1,9.4,2,4,2,1
475720,17,0,5,7.91,53.0,1,9.4,0,3,1,0
23388,22,1,1,3.39,71.4,0,8.1,0,2,1,0
569506,19,1,2,5.64,53.5,1,7.4,2,2,0,2


In [7]:
from xgboost import XGBRegressor

# Baseline: an XGBoost regressor with the squared-error objective, fitted on the
# training split (fit returns the estimator itself, so the call can be chained).
xg_boost = XGBRegressor(objective='reg:squarederror').fit(X_train, Y_train)

# Baseline predictions for the held-out split.
predictions = xg_boost.predict(X_test)

In [8]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter values explored exhaustively by the grid searches below.
param_grid = dict(
    max_depth=[3, 5, 6, 7, 10],
    learning_rate=[0.01, 0.1, 0.3],
    n_estimators=[20, 50, 70, 100, 120, 150],
)


def _fit_grid_search(n_folds):
  """Grid-search `param_grid` for `xg_boost` on the training split.

  Candidates are scored by negative RMSE using `n_folds`-fold cross-validation.
  Returns the fitted GridSearchCV object.
  """
  search = GridSearchCV(
      estimator=xg_boost,
      param_grid=param_grid,
      scoring='neg_root_mean_squared_error',
      cv=n_folds,
  )
  return search.fit(X_train, Y_train)


grid_search_cv_5 = _fit_grid_search(5)
print("===Best params for 5===")
print(grid_search_cv_5.best_params_)

grid_search_cv_10 = _fit_grid_search(10)
print("===Best params for 10===")
print(grid_search_cv_10.best_params_)

# Held-out predictions come from the 5-fold search.
predictions = grid_search_cv_5.predict(X_test)

===Best params for 5===
{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150}
===Best params for 10===
{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150}


In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error, r2_score

# Each entry pairs the banner to print with the metric evaluated on the
# held-out targets and the current `predictions`.
metric_report = (
    ('===MAE===', mean_absolute_error),
    ('\n===MSE===', mean_squared_error),
    ('\n===RMSE (RMSD)===', root_mean_squared_error),
    ('\n===R2===', r2_score),
)

for banner, metric_fn in metric_report:
  print(banner)
  print(metric_fn(Y_test, predictions))

===MAE===
7.0124076524088075

===MSE===
77.3527824428769

===RMSE (RMSD)===
8.795043060888156

===R2===
0.7835744238239736


In [10]:
# sub_df = sub_df.drop(columns=columns_to_remove)

# Encode the submission features with the encoders fitted on the training split.
# The result goes into a new frame instead of overwriting `sub_df`: the cell can
# then be re-run safely, whereas re-encoding an already encoded `sub_df` would
# fail because the integer codes are not labels the encoders know.
# Indexing `label_encoders[col]` (rather than `.get`) reports a missing encoder
# as a KeyError naming the column.
sub_encoded = sub_df.assign(**{
  col: label_encoders[col].transform(sub_df[col]) for col in categorical_columns
})

test_predictions = grid_search_cv_5.predict(sub_encoded)

# Pair ids and predictions by position (plain arrays) rather than by index
# alignment, then write the two-column submission file without the index.
submission = pd.DataFrame({'id': sub_ids.to_numpy(), 'exam_score': test_predictions})
submission.to_csv("predictions.csv", index=False)