In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error


In [2]:
# Load the exam results and normalise the column headers to lower snake_case.
df = pd.read_csv('jamb_exam_results.csv')
df = df.rename(columns=lambda col: col.lower().replace(' ', '_'))

# Drop the identifier column and replace every missing value with 0.
df = df.drop(columns=['student_id']).fillna(0)

# Separate the regression target from the predictor columns.
target_col = 'jamb_score'
X = df.drop(columns=[target_col])
y = df[target_col]

# Hold out 20% of the rows for test, then 25% of the remainder for validation
# (0.25 * 0.8 = 0.2 of the full data), using the same seed for both splits.
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=1
)

# Vectorise the row dictionaries. The vectorizer is fitted on the training
# rows only and then reused, unchanged, for the validation rows.
train_records = X_train.to_dict(orient='records')
val_records = X_val.to_dict(orient='records')

dv = DictVectorizer(sparse=True)
X_train_dv = dv.fit_transform(train_records)
X_val_dv = dv.transform(val_records)


In [3]:
# Q1: fit a depth-1 tree on the training matrix and report the feature it splits on.
dt = DecisionTreeRegressor(max_depth=1, random_state=1).fit(X_train_dv, y_train)

# Entry 0 of tree_.feature is the column index used by the first (root) node;
# map that index back to its name through the fitted vectorizer.
root_feature_idx = dt.tree_.feature[0]
q1_feature = dv.feature_names_[root_feature_idx]
q1_feature


'study_hours_per_week'

In [4]:
# Q2: validation RMSE of a 10-tree random forest trained on the training split.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(
    X_train_dv, y_train
)

val_predictions = rf.predict(X_val_dv)
rmse_q2 = root_mean_squared_error(y_val, val_predictions)
rmse_q2


42.13724207871227

In [5]:
# Q3: validation RMSE as the forest grows from 10 to 200 trees in steps of 10.
#
# A single forest is grown incrementally instead of being rebuilt at every step:
# with warm_start=True, raising n_estimators and calling fit() again keeps the
# trees already fitted and only trains the additional ones. That is 200 tree
# fits in total rather than 10 + 20 + ... + 200 = 2100.
# NOTE(review): with a fixed random_state the grown forest is expected to match
# one fitted from scratch at the same size - confirm on the installed
# scikit-learn version by comparing one value (e.g. n=10 against rmse_q2).
rf = RandomForestRegressor(random_state=1, n_jobs=-1, warm_start=True)

rows = []

for n in range(10, 201, 10):
    # n_estimators is set before every fit, including the first one.
    rf.set_params(n_estimators=n)
    rf.fit(X_train_dv, y_train)
    rmse = root_mean_squared_error(y_val, rf.predict(X_val_dv))
    # Keep both the raw value and the 3-decimal rounding used to judge
    # where the score stops improving.
    rows.append((n, rmse, round(rmse, 3)))

pd.DataFrame(rows, columns=["n_estimators", "rmse", "rmse_3dp"])


Unnamed: 0,n_estimators,rmse,rmse_3dp
0,10,42.137242,42.137
1,20,41.461215,41.461
2,30,41.106171,41.106
3,40,40.917194,40.917
4,50,40.852279,40.852
5,60,40.784281,40.784
6,70,40.677098,40.677
7,80,40.539333,40.539
8,90,40.504346,40.504
9,100,40.516805,40.517


In [6]:
# Q4: for each candidate max_depth, average the validation RMSE over forests of
# 10, 20, ..., 200 trees and rank the depths by that average (best first).
depths = [10, 15, 20, 25]
rows = []

for depth in depths:
    # One warm-started forest per depth: each fit() below only adds the trees
    # needed to reach the new n_estimators, so a depth costs 200 tree fits
    # instead of 2100 when the forest is rebuilt from scratch at every step.
    # NOTE(review): with a fixed random_state the grown forest is expected to
    # match a freshly fitted one of the same size - confirm on the installed
    # scikit-learn version.
    rf = RandomForestRegressor(
        max_depth=depth,
        random_state=1,
        n_jobs=-1,
        warm_start=True
    )

    rmses = []
    for n in range(10, 201, 10):
        # n_estimators is set before every fit, including the first one.
        rf.set_params(n_estimators=n)
        rf.fit(X_train_dv, y_train)
        rmses.append(root_mean_squared_error(y_val, rf.predict(X_val_dv)))

    rows.append((depth, np.mean(rmses)))

pd.DataFrame(rows, columns=["max_depth", "mean_rmse"]).sort_values("mean_rmse")


Unnamed: 0,max_depth,mean_rmse
0,10,40.392498
1,15,40.735282
2,20,40.739734
3,25,40.787866


In [7]:
# Q5: among four candidate features, find the one with the highest importance
# in a 10-tree forest limited to depth 20.
rf = RandomForestRegressor(
    n_estimators=10, max_depth=20, random_state=1, n_jobs=-1
).fit(X_train_dv, y_train)

features = np.array(dv.feature_names_)
importances = rf.feature_importances_

target_features = np.array([
    'study_hours_per_week',
    'attendance_rate',
    'distance_to_school',
    'teacher_quality'
])

# Positions of the candidate features within the vectorised columns, in
# column order.
candidate_idx = np.flatnonzero(np.isin(features, target_features))

# Take the argmax over the candidates only, then map the winning position
# back to its feature name.
best_idx = candidate_idx[np.argmax(importances[candidate_idx])]
most_important = features[best_idx]
most_important


np.str_('study_hours_per_week')

In [9]:
# Summary of the selected answers as (question, answer) pairs, printed in order.
# The answers are fixed strings; they are not read from the variables computed
# in the cells above.
final_answers = [
    ("Q1. Split feature (Decision Tree, max_depth=1):", "study_hours_per_week"),
    ("Q2. RMSE on validation set (Random Forest, n_estimators=10):", "42.13"),
    ("Q3. n_estimators where RMSE stops improving (3 decimal places):", "80"),
    ("Q4. Best max_depth by average RMSE:", "10"),
    ("Q5. Most important feature (Random Forest, max_depth=20):", "study_hours_per_week"),
]

print("===== FINAL ANSWERS =====\n")

for question, answer in final_answers:
    print(question)
    # Three-space indent, with a blank line after each answer.
    print(f"   {answer}\n")

print("===== END =====")


===== FINAL ANSWERS =====

Q1. Split feature (Decision Tree, max_depth=1):
   study_hours_per_week

Q2. RMSE on validation set (Random Forest, n_estimators=10):
   42.13

Q3. n_estimators where RMSE stops improving (3 decimal places):
   80

Q4. Best max_depth by average RMSE:
   10

Q5. Most important feature (Random Forest, max_depth=20):
   study_hours_per_week

===== END =====
