In [1]:
import pandas as dp
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# Load the JAMB exam results CSV (path is relative to the working directory).
data = dp.read_csv('datasets/jamb_exam_results.csv')

In [3]:
# Preview the first rows with the original column names.
data.head()

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [4]:
# Normalise column names: lowercase, with spaces replaced by underscores.
data.columns = [column.lower().replace(' ', '_') for column in data.columns]

In [5]:
# Drop the identifier column before modelling.
# errors="ignore" keeps the cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError (as `del` would).
data = data.drop(columns=["student_id"], errors="ignore")

In [6]:
# Preview again after renaming the columns and dropping student_id.
data.head()

Unnamed: 0,jamb_score,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,school_type,school_location,extra_tutorials,access_to_learning_materials,parent_involvement,it_knowledge,age,gender,socioeconomic_status,parent_education_level,assignments_completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,22,Female,Medium,Tertiary,1


In [7]:
# Count missing values per column.
data.isna().sum()

jamb_score                        0
study_hours_per_week              0
attendance_rate                   0
teacher_quality                   0
distance_to_school                0
school_type                       0
school_location                   0
extra_tutorials                   0
access_to_learning_materials      0
parent_involvement                0
it_knowledge                      0
age                               0
gender                            0
socioeconomic_status              0
parent_education_level          891
assignments_completed             0
dtype: int64

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Two-stage split with a fixed seed:
#   1) hold out 20% of all rows as the test set;
#   2) hold out 25% of the remaining rows as the validation set.
data_full_train, data_test = train_test_split(
    data,
    test_size=0.2,
    random_state=1,
)
data_train, data_val = train_test_split(
    data_full_train,
    test_size=0.25,
    random_state=1,
)

In [10]:
# Give every split a fresh 0..n-1 index, discarding the shuffled original index.
data_train, data_val, data_test = (
    frame.reset_index(drop=True)
    for frame in (data_train, data_val, data_test)
)

In [11]:
# Pull the target (jamb_score) out of each split as a NumPy array.
y_train, y_val, y_test = (
    frame["jamb_score"].values
    for frame in (data_train, data_val, data_test)
)

In [12]:
# Remove the target from the feature frames so it cannot be used as a feature.
data_train = data_train.drop(columns=["jamb_score"])
data_val = data_val.drop(columns=["jamb_score"])
data_test = data_test.drop(columns=["jamb_score"])

In [13]:
from sklearn.feature_extraction import DictVectorizer

In [14]:
# Vectorizer that turns lists of record dicts into a sparse feature matrix.
dv = DictVectorizer(sparse=True)

In [15]:
# Missing values are filled with 0 before each frame is converted to a list of
# record dicts for the vectorizer.
#
# The vectorizer is fitted on the training split ONLY. Validation and test are
# encoded with `transform`, so all three matrices share the same columns in the
# same order, and `dv.get_feature_names_out()` keeps describing the matrix the
# models were trained on. Refitting on val/test (the previous behaviour) can
# silently change the column layout and leaks information from held-out data.
train_X = dv.fit_transform(data_train.fillna(0).to_dict(orient='records'))
val_X = dv.transform(data_val.fillna(0).to_dict(orient='records'))
test_X = dv.transform(data_test.fillna(0).to_dict(orient='records'))

In [16]:
from sklearn.tree import DecisionTreeRegressor

## Q1

In [17]:
# Regression tree limited to depth 1 (a single split).
tree = DecisionTreeRegressor(max_depth=1)

In [18]:
# Fit the depth-1 tree on the training matrix and targets.
tree.fit(train_X, y_train)

In [19]:
from sklearn.tree import export_text

In [20]:
# Render the fitted tree as text, labelling the split with the vectorizer's feature names.
tree_feature_names = list(dv.get_feature_names_out())
tree_as_text = export_text(tree, feature_names=tree_feature_names)
print(tree_as_text)

|--- study_hours_per_week <= 18.50
|   |--- value: [155.24]
|--- study_hours_per_week >  18.50
|   |--- value: [188.59]



## Q2

In [21]:
from sklearn.ensemble import RandomForestRegressor

In [22]:
# Random forest with 10 trees, a fixed seed, and all available cores (n_jobs=-1).
forest = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)

In [23]:
# Fit the forest on the training matrix and targets.
forest.fit(train_X, y_train)

In [24]:
from sklearn.metrics import root_mean_squared_error

In [25]:
# RMSE of the 10-tree forest on the validation split.
y_pred = forest.predict(val_X)
root_mean_squared_error(y_true=y_val, y_pred=y_pred)

np.float64(42.13724207871227)

## Q3

In [26]:
# Sweep the number of trees from 10 to 200 (step 5) and report validation RMSE
# for each setting; every forest is trained from scratch with the same seed.
for n in range(10, 201, 5):
    rf = RandomForestRegressor(
        n_estimators=n,
        random_state=1,
        n_jobs=-1,
    ).fit(train_X, y_train)
    y_pred = rf.predict(val_X)
    print(f"{n}: {root_mean_squared_error(y_val, y_pred)}")

10: 42.13724207871227
15: 41.865311655354965
20: 41.46121464694444
25: 41.25337606548099
30: 41.106170947924596
35: 41.011424421266135
40: 40.917193933296545
45: 40.94711520311034
50: 40.852278663496854
55: 40.818831851475224
60: 40.78428140159447
65: 40.679260642132284
70: 40.677098222414024
75: 40.62131491060656
80: 40.53933283129176
85: 40.517835332964225
90: 40.50434592594835
95: 40.56926128510151
100: 40.51680451861919
105: 40.548397895380354
110: 40.59335280539747
115: 40.62159894991379
120: 40.6248503681005
125: 40.61774641429531
130: 40.650840905587195
135: 40.62869995155009
140: 40.5948515491302
145: 40.59201252298134
150: 40.596715029667116
155: 40.61529838846667
160: 40.60350763548252
165: 40.60688044219866
170: 40.62754627591216
175: 40.633384667672225
180: 40.641313925139386
185: 40.64615990045101
190: 40.63135509073867
195: 40.6191849918448
200: 40.60101912236933


In [27]:
# For each candidate max_depth, train forests with 10..200 trees (step 10) and
# print the mean validation RMSE across those forest sizes.
for d in [10, 15, 20, 25]:
    rmses = []
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(
            max_depth=d,
            n_estimators=n,
            random_state=1,
            n_jobs=-1,
        ).fit(train_X, y_train)
        y_pred = rf.predict(val_X)
        rmses.append(root_mean_squared_error(y_val, y_pred))
    print(f"{d}: {np.mean(rmses)}")

10: 40.39249798892396
15: 40.73528172486332
20: 40.739734321829275
25: 40.78786565962805


## Q5

In [28]:
# Forest used for feature importances (10 trees, max depth 20, fixed seed).
# NOTE: this rebinds `tree`, which previously named the depth-1 decision tree;
# from here on `tree` is a random forest.
tree = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)

In [29]:
# Fit the feature-importance forest on the training matrix and targets.
tree.fit(train_X, y_train)

In [30]:
# Map each importance value to its feature name.
# NOTE(review): the dict is keyed by the importance value, so features that
# share exactly the same importance (e.g. several at 0.0) overwrite one another
# and only the last one survives.
feature_importance_dict = {
    importance: feature_name
    for importance, feature_name in zip(
        tree.feature_importances_, dv.get_feature_names_out()
    )
}

In [31]:
# Display the importance -> feature-name mapping.
feature_importance_dict

{np.float64(0.012325395818561702): 'access_to_learning_materials=No',
 np.float64(0.010261891957053595): 'access_to_learning_materials=Yes',
 np.float64(0.06931145469695396): 'age',
 np.float64(0.031516789454600694): 'assignments_completed',
 np.float64(0.14972902978467467): 'attendance_rate',
 np.float64(0.13648580486323686): 'distance_to_school',
 np.float64(0.013459336291731933): 'extra_tutorials=No',
 np.float64(0.009131355726749436): 'extra_tutorials=Yes',
 np.float64(0.009288710588929933): 'gender=Female',
 np.float64(0.010382634572648444): 'gender=Male',
 np.float64(0.017719342039411903): 'it_knowledge=High',
 np.float64(0.012404050525957135): 'it_knowledge=Low',
 np.float64(0.009141479881545293): 'it_knowledge=Medium',
 np.float64(0.0): 'parent_education_level',
 np.float64(0.015450360187928138): 'parent_education_level=Primary',
 np.float64(0.016956919815100065): 'parent_education_level=Secondary',
 np.float64(0.014488617277194148): 'parent_education_level=Tertiary',
 np.float

In [32]:
# Rank features by importance, highest first, as (importance, feature_name) pairs.
# The pairs are taken straight from the fitted model and the vectorizer rather
# than from feature_importance_dict: that dict is keyed by the importance value,
# so features with identical importances collapse into one entry and would be
# missing from the ranking.
sorted(
    zip(tree.feature_importances_, dv.get_feature_names_out()),
    key=lambda pair: pair[0],
    reverse=True,
)

[(np.float64(0.2483536119939654), 'study_hours_per_week'),
 (np.float64(0.14972902978467467), 'attendance_rate'),
 (np.float64(0.13648580486323686), 'distance_to_school'),
 (np.float64(0.08268222263880151), 'teacher_quality'),
 (np.float64(0.06931145469695396), 'age'),
 (np.float64(0.031516789454600694), 'assignments_completed'),
 (np.float64(0.02571426434358714), 'socioeconomic_status=High'),
 (np.float64(0.02291885292906419), 'parent_involvement=High'),
 (np.float64(0.017719342039411903), 'it_knowledge=High'),
 (np.float64(0.016956919815100065), 'parent_education_level=Secondary'),
 (np.float64(0.015450360187928138), 'parent_education_level=Primary'),
 (np.float64(0.014488617277194148), 'parent_education_level=Tertiary'),
 (np.float64(0.013459336291731933), 'extra_tutorials=No'),
 (np.float64(0.013357613537934165), 'parent_involvement=Low'),
 (np.float64(0.012404050525957135), 'it_knowledge=Low'),
 (np.float64(0.012325395818561702), 'access_to_learning_materials=No'),
 (np.float64(0.

## Q6

In [33]:
# XGBoost training parameters for the first run (learning rate eta = 0.3).
xgb_params = dict(
    # tree-booster settings
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    # learning task and threading
    objective='reg:squarederror',
    nthread=8,
    # reproducibility and logging
    seed=1,
    verbosity=1,
)

In [34]:
!pip install xgboost




[notice] A new release of pip available: 22.3.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [35]:
import xgboost as xgb

In [36]:
# Wrap the train/validation matrices in DMatrix objects, attaching the targets
# and the vectorizer's feature names.
features = dv.get_feature_names_out().tolist()
dtrain = xgb.DMatrix(data=train_X, label=y_train, feature_names=features)
dval = xgb.DMatrix(data=val_X, label=y_val, feature_names=features)

In [37]:
# (DMatrix, name) pairs for the train and validation sets.
# NOTE(review): in the cells visible here this list is never passed to
# xgb.train (e.g. via `evals=`), so it currently has no effect.
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [38]:
# Train the first booster (current xgb_params) for 100 boosting rounds.
model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=100,
)

In [39]:
# Validation predictions from the first booster.
y_pred = model.predict(dval)

In [40]:
# Validation RMSE of the first booster.
root_mean_squared_error(y_true=y_val, y_pred=y_pred)

np.float64(43.418817345871766)

In [41]:
# XGBoost training parameters for the second run: identical to the first
# configuration except for the lower learning rate (eta = 0.1).
xgb_params = dict(
    # tree-booster settings
    eta=0.1,
    max_depth=6,
    min_child_weight=1,
    # learning task and threading
    objective='reg:squarederror',
    nthread=8,
    # reproducibility and logging
    seed=1,
    verbosity=1,
)

In [42]:
# Train the second booster (current xgb_params) for 100 boosting rounds.
model_2 = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=100,
)

In [43]:
# Validation predictions from the second booster (overwrites the earlier y_pred).
y_pred = model_2.predict(dval)

In [44]:
# Validation RMSE of the second booster.
root_mean_squared_error(y_true=y_val, y_pred=y_pred)

np.float64(41.05034017683498)