In [None]:
import asyncio
import os
import sys

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

# On Windows, switch asyncio to the selector-based event loop policy.
# NOTE(review): presumably a workaround for the Jupyter kernel on Windows —
# confirm it is still required for the Python/Jupyter versions in use.
if sys.platform == 'win32':
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())



In [None]:
# Location of the JAMB exam results CSV. Set the JAMB_DATA_PATH environment
# variable to run the notebook on another machine; when it is unset, the
# original local path is used so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "JAMB_DATA_PATH",
    r"C:\Users\hillarik\desktop\MLzoomcamp\06 credit scoring\jamb_exam_results.csv",
)

df = pd.read_csv(DATA_PATH)
df.head()


In [None]:
# Normalise column headers: lower-case them and swap spaces for underscores
# so every column can be referenced with a consistent snake_case name.
df.columns = [header.lower().replace(' ', '_') for header in df.columns]
df.head()

In [None]:
# Drop the identifier column when it is present, then replace every missing
# value with 0, in a single chained expression.
df = (
    df
    .drop(columns=['student_id'], errors='ignore')
    .fillna(0)
)


In [None]:
# Preview the frame after the identifier drop and the zero-fill above.
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split: hold out 20% of the rows for testing, then take 25% of the
# remaining 80% (i.e. 20% of the total) for validation -> 60/20/20 overall.
# random_state is fixed so the split is reproducible.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=1,
)



In [None]:
# Target vectors: the 'jamb_score' column of each split as a NumPy array.
y_train, y_val, y_test = (
    split['jamb_score'].values for split in (df_train, df_val, df_test)
)

# Feature frames: copies of each split without the target column.
X_train, X_val, X_test = (
    split.drop(columns='jamb_score') for split in (df_train, df_val, df_test)
)


In [None]:
# Remove the target from the split frames so it cannot leak into features.
# drop(..., errors='ignore') is used instead of `del` so the cell can be
# re-run without raising KeyError once the column is already gone.
df_train = df_train.drop(columns='jamb_score', errors='ignore')
df_val = df_val.drop(columns='jamb_score', errors='ignore')
df_test = df_test.drop(columns='jamb_score', errors='ignore')

# Show a small preview rather than dumping the whole training frame.
df_train.head()

In [None]:
from sklearn.feature_extraction import DictVectorizer

# Convert each feature frame into a list of per-row dicts for DictVectorizer.
train_records = X_train.to_dict(orient='records')
val_records = X_val.to_dict(orient='records')
test_records = X_test.to_dict(orient='records')

# Fit the vectorizer on the training rows only, then reuse the fitted mapping
# for validation and test so all three matrices share the same columns.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_records)
X_val = dv.transform(val_records)
X_test = dv.transform(test_records)


QUESTION 1

In [None]:
from sklearn.tree import DecisionTreeRegressor

# A depth-1 tree has a single split, so its root node identifies the one
# feature the tree uses.
dt = DecisionTreeRegressor(max_depth=1, random_state=1).fit(X_train, y_train)

# Node 0 is the root; map its feature index back to the vectorizer's name.
root_feature_index = dt.tree_.feature[0]
feature_used = dv.feature_names_[root_feature_index]
print("Feature used for splitting:", feature_used)

QUESTION 2

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Baseline forest: 10 trees, fixed seed, all CPU cores.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(X_train, y_train)

# Validation RMSE is the square root of the mean squared error.
y_pred = rf.predict(X_val)
val_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(val_mse)
print("RMSE:", rmse)


QUESTION 3

In [None]:
# Validation RMSE for forests of increasing size, stored as
# (n_estimators, rmse) pairs. The list must be named `scores` because that is
# the name the loop appends to and the final print reads.
scores = []

for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_val)
    # Rounded to 3 decimals so runs that differ only by noise compare equal.
    rmse = round(np.sqrt(mean_squared_error(y_val, y_pred)), 3)
    scores.append((n, rmse))

    print(f"n: {n}, rmse: {rmse:.3f}")

print(scores)


QUESTION 4

In [None]:
# Grid over max_depth x n_estimators, recording the validation RMSE of every
# combination as an (n_estimators, max_depth, rmse) tuple.
scores = []
for d in [10, 15, 20, 25]:
    for n in range(10, 201, 10):
        # n_jobs=-1 only parallelises training; with random_state fixed the
        # fitted forest is the same as a single-threaded run.
        rf = RandomForestRegressor(n_estimators=n, max_depth=d,
                                   random_state=1, n_jobs=-1)
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_val)
        rmse = round(np.sqrt(mean_squared_error(y_val, y_pred)), 3)
        # Append inside the inner loop so every (n, d) pair is kept; appending
        # after the loops would record only the final combination.
        scores.append((n, d, rmse))

In [None]:
# Tabulate the collected score tuples; the column order follows the
# (n_estimators, max_depth, rmse) layout of each tuple.
columns = ['n_estimators', 'max_depth', 'rmse']
df_scores = pd.DataFrame(data=scores, columns=columns)

In [None]:
# One RMSE-vs-n_estimators curve per max_depth, drawn on a single set of axes.
fig, ax = plt.subplots()

for d in [10, 15, 20, 25]:
    df_subset = df_scores[df_scores.max_depth == d]
    ax.plot(df_subset.n_estimators, df_subset.rmse,
            label='max_depth=%d' % d)

# Label the figure so it can be read on its own, and build the legend once
# after all curves are drawn instead of on every iteration.
ax.set_xlabel('n_estimators')
ax.set_ylabel('validation RMSE')
ax.set_title('Random forest: validation RMSE by max_depth')
ax.legend();

QUESTION 5

In [None]:
# Fit a 10-tree forest capped at depth 20 and read off which input feature
# carries the largest importance score.
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1).fit(X_train, y_train)

# Importances are aligned with the vectorizer's output feature names.
importances = rf.feature_importances_
feature_names = dv.get_feature_names_out()
most_important_feature = feature_names[importances.argmax()]
print("Most important feature:", most_important_feature)


QUESTION 6

In [None]:
import xgboost as xgb

# Wrap the vectorised training and validation matrices, together with their
# targets, in the DMatrix containers passed to xgb.train.
dtrain, dval = (
    xgb.DMatrix(features, label=target)
    for features, target in ((X_train, y_train), (X_val, y_val))
)


In [None]:
# Base XGBoost settings shared by both runs; only `eta` differs between them.
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    # Set explicitly so the evaluation history is always keyed by 'rmse'.
    'eval_metric': 'rmse',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}
watchlist = [(dtrain, 'train'), (dval, 'val')]


def train_xgb_with_history(params, num_boost_round, verbose_eval=False):
    """Train a booster on ``dtrain`` and collect its per-round RMSE.

    The evaluation history is captured through ``evals_result`` rather than
    by parsing captured console output.

    Parameters
    ----------
    params : dict
        XGBoost training parameters.
    num_boost_round : int
        Number of boosting rounds to run.
    verbose_eval : bool or int, default False
        Forwarded to ``xgb.train`` to control how often progress is printed.

    Returns
    -------
    tuple
        ``(booster, history)``, where ``history`` is a DataFrame with the
        columns ``num_iter``, ``train_rmse`` and ``val_rmse``.
    """
    evals_result = {}
    booster = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=watchlist,
        evals_result=evals_result,
        verbose_eval=verbose_eval,
    )
    # evals_result is keyed by the names given in `watchlist`, then by metric.
    train_rmse = evals_result['train']['rmse']
    val_rmse = evals_result['val']['rmse']
    history = pd.DataFrame({
        'num_iter': range(len(val_rmse)),
        'train_rmse': train_rmse,
        'val_rmse': val_rmse,
    })
    return booster, history


# Per-eta evaluation histories, keyed "eta=<value>". This is deliberately a
# separate name from `scores`, which is a list of tuples in the random-forest
# cells and cannot be indexed by a string key.
xgb_scores = {}

# Train with eta = 0.3
# NOTE(review): this run keeps its original 200 rounds while the eta=0.1 run
# keeps its original 100; compare the curves over the shared first 100
# iterations, or align the two round counts if a like-for-like run is wanted.
model, history_0_3 = train_xgb_with_history(
    xgb_params, num_boost_round=200, verbose_eval=5)
xgb_scores[f"eta={xgb_params['eta']}"] = history_0_3

# Train with eta = 0.1
# A copy of the parameters is used so the shared dict is not mutated and the
# cell gives the same result when re-run.
xgb_params_0_1 = {**xgb_params, 'eta': 0.1}
model_0_1, history_0_1 = train_xgb_with_history(
    xgb_params_0_1, num_boost_round=100)
xgb_scores[f"eta={xgb_params_0_1['eta']}"] = history_0_1

# Validation RMSE per boosting round, one curve per learning rate.
fig, ax = plt.subplots()
for eta, df_score in xgb_scores.items():
    ax.plot(df_score.num_iter, df_score.val_rmse, label=eta)

ax.set_xlabel('boosting round')
ax.set_ylabel('validation RMSE')
ax.set_title('XGBoost: validation RMSE by eta')
ax.legend();