In [None]:
import pandas as pd
import warnings

# Silence every warning for the rest of the kernel session.
# NOTE(review): no category or module is given, so this also hides deprecation
# and data-quality warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Read the training data and the submission ("test") data in one step;
# the first path feeds train_df, the second sub_df.
train_df, sub_df = map(pd.read_csv, ('dataset/train.csv', 'dataset/test.csv'))

In [None]:
# Show the first rows of the training data as the cell output.
train_df.head()

In [None]:
# Keep the submission ids aside before the column is removed; they are needed
# again when the prediction file is written.
sub_ids = sub_df['id']

# Remove 'id' from both frames. Re-assignment is used here; inplace=True would
# be the alternative.
train_df, sub_df = (frame.drop(columns=['id']) for frame in (train_df, sub_df))

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so that "Restart & Run All" reproduces the same hold-out split.
# Without it the split is reshuffled on every run, so the metrics and the
# hyper-parameters picked by the grid search are not comparable between runs.
SPLIT_SEED = 42

# Features are every column except the target 'exam_score'. The default split
# proportions are kept (test_size is not overridden).
X_train, X_test, Y_train, Y_test = train_test_split(
  train_df.drop(columns='exam_score'),
  train_df['exam_score'],
  random_state=SPLIT_SEED,
)

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Features that are excluded from the model.
columns_to_remove = [
  'gender',
  'course',
  'internet_access',
  'exam_difficulty',
  'age',  # maybe keep
]
X_train = X_train.drop(columns=columns_to_remove)
X_test = X_test.drop(columns=columns_to_remove)

# Remaining columns that are label-encoded to integer codes.
# ('gender', 'course', 'internet_access' and 'exam_difficulty' were candidates
# for this list as well, but they are dropped above.)
categorical_columns = [
  'sleep_quality',
  'study_method',
  'facility_rating',
]

# One fitted encoder per column, kept so that the submission data can later be
# encoded with exactly the same mapping.
label_encoders = {}
for col in categorical_columns:
  label_encoder = LabelEncoder()
  # Fit on the values of both splits: LabelEncoder.transform raises ValueError
  # for labels it did not see during fit, so fitting on X_train alone crashes
  # whenever a rare category happens to land only in X_test. When X_train
  # already contains every category the resulting mapping is unchanged.
  label_encoder.fit(pd.concat([X_train[col], X_test[col]]))
  X_train[col] = label_encoder.transform(X_train[col])
  X_test[col] = label_encoder.transform(X_test[col])
  label_encoders[col] = label_encoder

X_train.head()

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Baseline tree with default hyper-parameters. random_state is pinned because
# scikit-learn permutes the candidate features at every split, so ties between
# equally good splits would otherwise be broken differently from run to run.
# The same estimator object is handed to the grid search further down.
decision_tree = DecisionTreeRegressor(random_state=42)
decision_tree.fit(X_train, Y_train)

# Baseline hold-out predictions. In a top-to-bottom run `predictions` is
# reassigned by the grid-search cell before the metrics cell reads it.
predictions = decision_tree.predict(X_test)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the tree depth that the search tries.
parameters = dict(max_depth=[None, 2, 5, 10, 15, 20])

# Cross-validated search scored by negative RMSE; fit() returns the fitted
# search object, so construction and fitting are chained.
grid_search_cv = GridSearchCV(
  estimator=decision_tree,
  param_grid=parameters,
  scoring='neg_root_mean_squared_error',
).fit(X_train, Y_train)

print("===Best params===", grid_search_cv.best_params_, sep="\n")

# Hold-out predictions from the search object (these replace any earlier
# `predictions`).
predictions = grid_search_cv.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error, r2_score

# Heading / metric pairs, printed in this order. Every metric is called as
# metric(Y_test, predictions) on the hold-out split.
metric_report = [
  ('===MAE===', mean_absolute_error),
  ('\n===MSE===', mean_squared_error),
  ('\n===RMSE (RMSD)===', root_mean_squared_error),
  ('\n===R2===', r2_score),
]

for heading, metric in metric_report:
  print(heading)
  print(metric(Y_test, predictions))

In [None]:
# Give the submission features the same treatment as the training features:
# drop the excluded columns, then encode each categorical column with the
# encoder that was fitted for it.
sub_df = sub_df.drop(columns=columns_to_remove)
sub_df = sub_df.assign(**{
  col: label_encoders.get(col).transform(sub_df[col])
  for col in categorical_columns
})

test_predictions = grid_search_cv.predict(sub_df)

# Pair the saved ids with the predictions (aligned on the index) and write the
# two columns as 'id' and 'exam_score'.
submission = pd.concat([sub_ids, pd.Series(test_predictions)], axis=1)
submission.columns = ['id', 'exam_score']
submission.to_csv("predictions.csv", index=False)