# Trees

## Importing libraries

In [20]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

from sklearn.metrics import root_mean_squared_error

random_state = 1
np.random.seed(random_state)

## Preparing the dataset

In [3]:
def prepare_train(train_data):  
    train_dict = train_data.to_dict(orient='records')
    dv = DictVectorizer(sparse=True)
    X_train = dv.fit_transform(train_dict)
    return X_train, dv

def prepare_val(val_data, dv):
    val_dict = val_data.to_dict(orient='records')
    X_val = dv.transform(val_dict)
    return X_val

In [4]:
df = pd.read_csv('jamb_exam_results.csv')
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.drop('student_id', axis=1, inplace=True)
df.fillna(0, inplace=True)
train_data, temp_data = train_test_split(df, test_size=0.4, random_state=random_state)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=random_state)
print(train_data.shape, val_data.shape, test_data.shape)
X_train = train_data.drop('jamb_score', axis=1)
y_train = train_data.jamb_score
X_val = val_data.drop('jamb_score', axis=1)
y_val = val_data.jamb_score
X_test = test_data.drop('jamb_score', axis=1)
y_test = test_data.jamb_score

X_train_vec, dv = prepare_train(X_train)
X_val_vec = prepare_val(X_val, dv)
X_test_vec = prepare_val(X_test, dv)

(3000, 16) (1000, 16) (1000, 16)


## Question 1

In [40]:
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train_vec, y_train)
split_feature_index = dt.tree_.feature[0]
features_names = dv.get_feature_names_out()
f"'{features_names[split_feature_index]}' used for splitting data"


"'study_hours_per_week' used for splitting data"

## Question 2

In [64]:
rf = RandomForestRegressor(n_estimators=10, random_state=random_state, n_jobs=-1)
rf.fit(X_train_vec, y_train)
y_pred = rf.predict(X_val_vec)
rmse = root_mean_squared_error(y_val, y_pred)
f"RMSE is {round(rmse, 2)}"

'RMSE is 43.16'

## Question 3

In [67]:
rmse = float('+inf')
for n_estimators in range(10, 210, 10):
    rf = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state, n_jobs=-1)
    rf.fit(X_train_vec, y_train)
    y_pred = rf.predict(X_val_vec)
    new_rmse = round(root_mean_squared_error(y_val, y_pred), 3)
    if new_rmse >= rmse:
        print(f'RMSE {new_rmse} stopped improving')
        break
    print(f'{new_rmse} RMSE for {n_estimators} n_estimators')
    rmse = new_rmse

43.158 RMSE for 10 n_estimators
41.79 RMSE for 20 n_estimators
41.556 RMSE for 30 n_estimators
41.076 RMSE for 40 n_estimators
40.957 RMSE for 50 n_estimators
40.774 RMSE for 60 n_estimators
40.588 RMSE for 70 n_estimators
40.503 RMSE for 80 n_estimators
40.435 RMSE for 90 n_estimators
40.365 RMSE for 100 n_estimators
40.348 RMSE for 110 n_estimators
40.302 RMSE for 120 n_estimators
40.286 RMSE for 130 n_estimators
40.263 RMSE for 140 n_estimators
40.254 RMSE for 150 n_estimators
40.2 RMSE for 160 n_estimators
40.187 RMSE for 170 n_estimators
40.136 RMSE for 180 n_estimators
RMSE 40.152 stopped improving


## Question 4

depths = [10, 15, 20, 25]
rmses = []
for max_depth in depths:
    max_depth_rmses = []
    for n_estimators in range(10, 210, 10):
        rf = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state, max_depth=max_depth, n_jobs=-1)
        rf.fit(X_train_vec, y_train)
        y_pred = rf.predict(X_val_vec)
        rmse = root_mean_squared_error(y_val, y_pred)
        max_depth_rmses.append(rmse)
        print(f"RMSE {rmse}, max_depth {max_depth}, n_estimators {n_estimators}")
    mean_rmse = np.mean(max_depth_rmses)
    rmses.append(mean_rmse)
    print(f"Mean RMSE {mean_rmse}, max_depth {max_depth}")
rmses

In [10]:
depth = depths[np.array(rmses).argmax()]
f"{depth} best max_depth"

'25 best max_depth'

## Question 5

In [18]:
rf = RandomForestRegressor(n_estimators=10, random_state=random_state, max_depth=20, n_jobs=-1)
rf.fit(X_train_vec, y_train)
feature = dv.get_feature_names_out()[rf.feature_importances_.argmax()]
f"'{feature}' most important feature" 

"'study_hours_per_week' most important feature"

## Question 6

In [22]:
dtrain = xgb.DMatrix(X_train_vec, label=y_train, feature_names=dv.feature_names_)
dval = xgb.DMatrix(X_val_vec, label=y_val, feature_names=dv.feature_names_)
watchlist = [(dtrain, 'train'), (dval, 'val')]
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

In [23]:
model = xgb.train(xgb_params, dtrain,
                  num_boost_round=100,
                  evals=watchlist, verbose_eval=10)

[0]	train-rmse:42.84835	val-rmse:44.52338
[10]	train-rmse:31.73818	val-rmse:40.83759
[20]	train-rmse:27.52551	val-rmse:41.28381
[30]	train-rmse:25.19051	val-rmse:41.61678
[40]	train-rmse:22.21691	val-rmse:42.04865
[50]	train-rmse:19.90566	val-rmse:42.31672
[60]	train-rmse:17.78802	val-rmse:42.54742
[70]	train-rmse:16.24964	val-rmse:42.89325
[80]	train-rmse:14.65331	val-rmse:43.21506
[90]	train-rmse:13.37213	val-rmse:43.24201
[99]	train-rmse:12.29305	val-rmse:43.34291


In [24]:
xgb_params['eta'] = 0.1
model = xgb.train(xgb_params, dtrain,
                  num_boost_round=100,
                  evals=watchlist, verbose_eval=10)

[0]	train-rmse:45.64414	val-rmse:46.63724
[10]	train-rmse:37.26338	val-rmse:41.35829
[20]	train-rmse:33.94837	val-rmse:40.40272
[30]	train-rmse:31.97085	val-rmse:40.20269
[40]	train-rmse:30.48894	val-rmse:40.26732
[50]	train-rmse:29.42464	val-rmse:40.42217
[60]	train-rmse:28.40566	val-rmse:40.50575
[70]	train-rmse:27.14408	val-rmse:40.58776
[80]	train-rmse:26.19196	val-rmse:40.64646
[90]	train-rmse:25.38224	val-rmse:40.72848
[99]	train-rmse:24.58526	val-rmse:40.83188


eta 0.1 leads to the best RMSE score on the validation dataset