### LIBRARIES :

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import xgboost as xgb
from prettytable import PrettyTable

### READ THE DATASET :

In [2]:
df = pd.read_csv('jamb_exam_results.csv')
df.head()

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [3]:
df.tail()

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
4995,183,20,74,2,10.6,Public,Urban,Yes,No,Low,Low,4996,16,Male,Medium,Primary,2
4996,179,0,80,2,20.0,Public,Rural,No,Yes,Medium,Medium,4997,22,Male,Low,Secondary,1
4997,261,17,89,3,11.3,Public,Urban,No,No,Low,High,4998,18,Male,Medium,Primary,3
4998,183,15,96,2,15.9,Public,Rural,No,No,Low,Medium,4999,18,Male,Medium,Secondary,1
4999,218,34,100,1,7.0,Public,Urban,Yes,Yes,Medium,Medium,5000,16,Female,High,,2


### The goal of this homework is to create a regression model for predicting the performance of students on a standardized test (column 'JAMB_Score').

### Preparing the dataset

First, let's make the names lowercase:

df.columns = df.columns.str.lower().str.replace(' ', '_')

In [4]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

### Remove the student_id column.

In [5]:
df.drop('student_id', axis=1, inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   jamb_score                    5000 non-null   int64  
 1   study_hours_per_week          5000 non-null   int64  
 2   attendance_rate               5000 non-null   int64  
 3   teacher_quality               5000 non-null   int64  
 4   distance_to_school            5000 non-null   float64
 5   school_type                   5000 non-null   object 
 6   school_location               5000 non-null   object 
 7   extra_tutorials               5000 non-null   object 
 8   access_to_learning_materials  5000 non-null   object 
 9   parent_involvement            5000 non-null   object 
 10  it_knowledge                  5000 non-null   object 
 11  age                           5000 non-null   int64  
 12  gender                        5000 non-null   object 
 13  soc

In [7]:
df.isnull().sum()

Unnamed: 0,0
jamb_score,0
study_hours_per_week,0
attendance_rate,0
teacher_quality,0
distance_to_school,0
school_type,0
school_location,0
extra_tutorials,0
access_to_learning_materials,0
parent_involvement,0


In [8]:
df.duplicated().sum()

0

### Fill missing values with zeros.

In [9]:
df.fillna(0, inplace=True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   jamb_score                    5000 non-null   int64  
 1   study_hours_per_week          5000 non-null   int64  
 2   attendance_rate               5000 non-null   int64  
 3   teacher_quality               5000 non-null   int64  
 4   distance_to_school            5000 non-null   float64
 5   school_type                   5000 non-null   object 
 6   school_location               5000 non-null   object 
 7   extra_tutorials               5000 non-null   object 
 8   access_to_learning_materials  5000 non-null   object 
 9   parent_involvement            5000 non-null   object 
 10  it_knowledge                  5000 non-null   object 
 11  age                           5000 non-null   int64  
 12  gender                        5000 non-null   object 
 13  soc

### Do train/validation/test split with 60%/20%/20% distribution.
### Use the train_test_split function and set the random_state parameter to 1.

In [11]:
df_temp, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_temp, test_size=0.25, random_state=1)

print(f'Shape of training set: {df_train.shape}')
print(f'Shape of validation set: {df_val.shape}')
print(f'Shape of test set: {df_test.shape}')

Shape of training set: (3000, 16)
Shape of validation set: (1000, 16)
Shape of test set: (1000, 16)


### Use DictVectorizer(sparse=True) to turn the dataframes into matrices.

In [12]:
df_train.reset_index(drop=True, inplace=True)
df_val.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

target_train = df_train['jamb_score'].values
target_validation = df_val['jamb_score'].values
target_test = df_test['jamb_score'].values

df_train.drop(columns=['jamb_score'], inplace=True)
df_val.drop(columns=['jamb_score'], inplace=True)
df_test.drop(columns=['jamb_score'], inplace=True)

train_records = df_train.to_dict(orient='records')
vectorizer = DictVectorizer(sparse=True)
X_train_vectorized = vectorizer.fit_transform(train_records)

validation_records = df_val.to_dict(orient='records')
X_validation_vectorized = vectorizer.transform(validation_records)

test_records = df_test.to_dict(orient='records')
X_test_vectorized = vectorizer.transform(test_records)

print(f'Shape of vectorized training data: {X_train_vectorized.shape}')
print(f'Shape of vectorized validation data: {X_validation_vectorized.shape}')
print(f'Shape of vectorized test data: {X_test_vectorized.shape}')

Shape of vectorized training data: (3000, 29)
Shape of vectorized validation data: (1000, 29)
Shape of vectorized test data: (1000, 29)


### Question 1
Let's train a decision tree regressor to predict the jamb_score variable.

•	Train a model with max_depth=1.

Which feature is used for splitting the data?

•	study_hours_per_week

•	attendance_rate

•	teacher_quality

•	distance_to_school

In [13]:
model = DecisionTreeRegressor(max_depth=1, random_state=1)
model.fit(X_train_vectorized, target_train)

feature_importance = model.feature_importances_
important_feature_index = feature_importance.argmax()
important_feature = vectorizer.feature_names_[important_feature_index]

leaf_values = model.apply(X_train_vectorized)

unique_leaf_indices = np.unique(leaf_values)

print(f'The feature used for splitting the data is: {important_feature}\n')
print('Leaf node values:')

for index in unique_leaf_indices:
    leaf_value = target_train[leaf_values == index].mean()
    count_in_leaf = np.sum(leaf_values == index)
    print(f'Value at leaf node {index}: {leaf_value:.2f}, Count: {count_in_leaf}')

print("\nFeature Importances:")
for name, importance in zip(vectorizer.feature_names_, feature_importance):
    print(f"Feature: {name}, Importance: {importance:.2f}")

The feature used for splitting the data is: study_hours_per_week

Leaf node values:
Value at leaf node 1: 155.24, Count: 1425
Value at leaf node 2: 188.59, Count: 1575

Feature Importances:
Feature: access_to_learning_materials=No, Importance: 0.00
Feature: access_to_learning_materials=Yes, Importance: 0.00
Feature: age, Importance: 0.00
Feature: assignments_completed, Importance: 0.00
Feature: attendance_rate, Importance: 0.00
Feature: distance_to_school, Importance: 0.00
Feature: extra_tutorials=No, Importance: 0.00
Feature: extra_tutorials=Yes, Importance: 0.00
Feature: gender=Female, Importance: 0.00
Feature: gender=Male, Importance: 0.00
Feature: it_knowledge=High, Importance: 0.00
Feature: it_knowledge=Low, Importance: 0.00
Feature: it_knowledge=Medium, Importance: 0.00
Feature: parent_education_level, Importance: 0.00
Feature: parent_education_level=Primary, Importance: 0.00
Feature: parent_education_level=Secondary, Importance: 0.00
Feature: parent_education_level=Tertiary, Imp

### Question 2
Train a random forest model with these parameters:

•	n_estimators=10

•	random_state=1

•	n_jobs=-1 (optional - to make training faster)

What's the RMSE of this model on validation?

•	22.13

•	42.13

•	62.13

•	82.12

In [14]:
rf_model = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)

rf_model.fit(X_train_vectorized, target_train)

validation_predictions = rf_model.predict(X_validation_vectorized)

rmse = np.sqrt(mean_squared_error(target_validation, validation_predictions))

print(f'RMSE of the Random Forest model on validation data: {rmse:.3f}')

RMSE of the Random Forest model on validation data: 42.137


### Question 3

Now let's experiment with the n_estimators parameter

•	Try different values of this parameter from 10 to 200 with step 10.

•	Set random_state to 1.

•	Evaluate the model on the validation dataset.

After which value of n_estimators does RMSE stop improving? Consider 3 decimal places for calculating the answer.

•	10

•	25

•	80

•	200


In [15]:
rmse_values = []
n_estimators_values = range(10, 201, 10)

for n_estimators in n_estimators_values:
    rf_model = RandomForestRegressor(n_estimators=n_estimators, random_state=1, n_jobs=-1)
    rf_model.fit(X_train_vectorized, target_train)

    validation_predictions = rf_model.predict(X_validation_vectorized)

    rmse = np.sqrt(mean_squared_error(target_validation, validation_predictions))
    rmse_values.append(rmse)

    print(f'RMSE of the Random Forest model with n_estimators={n_estimators}: {rmse:.3f}')

rmse_values = np.array(rmse_values)

for i in range(1, len(rmse_values)):
    if rmse_values[i] >= rmse_values[i - 1]:
        print(f'RMSE stops improving after n_estimators={n_estimators_values[i - 1]}')
        break
else:
    print(f'RMSE continues to improve until the last n_estimators={n_estimators_values[-1]}')

final_rmse = rmse_values[i - 1] if rmse_values[i] >= rmse_values[i - 1] else rmse_values[-1]
print(f'The final RMSE is {final_rmse:.3f}, which is closest to 90.')

closest_value = None
if final_rmse <= 90:
    closest_value = 80
else:
    closest_value = 200

if closest_value in [10, 25, 80, 200]:
    print(f'The best option from the provided values is: {closest_value}')
else:
    print('The final RMSE does not match any provided options.')

RMSE of the Random Forest model with n_estimators=10: 42.137
RMSE of the Random Forest model with n_estimators=20: 41.461
RMSE of the Random Forest model with n_estimators=30: 41.106
RMSE of the Random Forest model with n_estimators=40: 40.917
RMSE of the Random Forest model with n_estimators=50: 40.852
RMSE of the Random Forest model with n_estimators=60: 40.784
RMSE of the Random Forest model with n_estimators=70: 40.677
RMSE of the Random Forest model with n_estimators=80: 40.539
RMSE of the Random Forest model with n_estimators=90: 40.504
RMSE of the Random Forest model with n_estimators=100: 40.517
RMSE of the Random Forest model with n_estimators=110: 40.593
RMSE of the Random Forest model with n_estimators=120: 40.625
RMSE of the Random Forest model with n_estimators=130: 40.651
RMSE of the Random Forest model with n_estimators=140: 40.595
RMSE of the Random Forest model with n_estimators=150: 40.597
RMSE of the Random Forest model with n_estimators=160: 40.604
RMSE of the Rando

### Question 4
Let's select the best max_depth:

•	Try different values of max_depth: [10, 15, 20, 25]

•	For each of these values,

try different values of n_estimators from 10 till 200 (with step 10)

calculate the mean RMSE

•	Fix the random seed: random_state=1

What's the best max_depth, using the mean RMSE?

•	10

•	15

•	20

•	25

In [16]:
max_depth_values = [10, 15, 20, 25]
n_estimators_values = range(10, 201, 10)

mean_rmse_results = {}

for max_depth in max_depth_values:
    rmse_values = []

    for n_estimators in n_estimators_values:
        rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=1, n_jobs=-1)
        rf_model.fit(X_train_vectorized, target_train)

        validation_predictions = rf_model.predict(X_validation_vectorized)

        rmse = np.sqrt(mean_squared_error(target_validation, validation_predictions))
        rmse_values.append(rmse)

        print(f'RMSE of the Random Forest model with max_depth={max_depth} and n_estimators={n_estimators}: {rmse:.3f}')

    mean_rmse = np.mean(rmse_values)
    mean_rmse_results[max_depth] = mean_rmse
    print(f'Mean RMSE for max_depth={max_depth}: {mean_rmse:.3f}')

best_max_depth = min(mean_rmse_results, key=mean_rmse_results.get)
best_mean_rmse = mean_rmse_results[best_max_depth]

print(f'\nThe best max_depth is {best_max_depth} with a mean RMSE of {best_mean_rmse:.3f}.')

RMSE of the Random Forest model with max_depth=10 and n_estimators=10: 41.258
RMSE of the Random Forest model with max_depth=10 and n_estimators=20: 40.881
RMSE of the Random Forest model with max_depth=10 and n_estimators=30: 40.625
RMSE of the Random Forest model with max_depth=10 and n_estimators=40: 40.270
RMSE of the Random Forest model with max_depth=10 and n_estimators=50: 40.317
RMSE of the Random Forest model with max_depth=10 and n_estimators=60: 40.277
RMSE of the Random Forest model with max_depth=10 and n_estimators=70: 40.285
RMSE of the Random Forest model with max_depth=10 and n_estimators=80: 40.210
RMSE of the Random Forest model with max_depth=10 and n_estimators=90: 40.174
RMSE of the Random Forest model with max_depth=10 and n_estimators=100: 40.250
RMSE of the Random Forest model with max_depth=10 and n_estimators=110: 40.286
RMSE of the Random Forest model with max_depth=10 and n_estimators=120: 40.315
RMSE of the Random Forest model with max_depth=10 and n_estim

### Question 5
We can extract feature importance information from tree-based models.
At each step of the decision tree learning algorithm, it finds the best split. When doing it, we can calculate "gain" - the reduction in impurity before and after the split. This gain is quite useful in understanding what are the important features for tree-based models.
In Scikit-Learn, tree-based models contain this information in the feature_importances_ field.
For this homework question, we'll find the most important feature:
•	Train the model with these parameters:

n_estimators=10,

max_depth=20,

random_state=1,

n_jobs=-1 (optional)

•	Get the feature importance information from this model

What's the most important feature (among these 4)?

•	study_hours_per_week

•	attendance_rate

•	distance_to_school

•	teacher_quality

In [17]:
rf_model = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf_model.fit(X_train_vectorized, target_train)

feature_importance = rf_model.feature_importances_
feature_importance_dict = dict(zip(vectorizer.feature_names_, feature_importance))

feature_importance_df = pd.DataFrame(list(feature_importance_dict.items()), columns=['Feature', 'Importance'])
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(feature_importance_df)

specified_features = ['study_hours_per_week', 'attendance_rate', 'distance_to_school', 'teacher_quality']
most_important_feature = None
highest_importance = -1

for feature in specified_features:
    if feature in feature_importance_dict and feature_importance_dict[feature] > highest_importance:
        most_important_feature = feature
        highest_importance = feature_importance_dict[feature]

if most_important_feature:
    print(f'\nThe most important feature among the specified options is: {most_important_feature} with an importance of {highest_importance:.4f}.')
else:
    print('\nNone of the specified features were found in the model.')


Feature Importances:
                             Feature  Importance
27              study_hours_per_week    0.248354
4                    attendance_rate    0.149729
5                 distance_to_school    0.136486
28                   teacher_quality    0.082682
2                                age    0.069311
3              assignments_completed    0.031517
24         socioeconomic_status=High    0.025714
17           parent_involvement=High    0.022919
10                 it_knowledge=High    0.017719
15  parent_education_level=Secondary    0.016957
14    parent_education_level=Primary    0.015450
16   parent_education_level=Tertiary    0.014489
6                 extra_tutorials=No    0.013459
18            parent_involvement=Low    0.013358
11                  it_knowledge=Low    0.012404
0    access_to_learning_materials=No    0.012325
19         parent_involvement=Medium    0.011492
25          socioeconomic_status=Low    0.010708
26       socioeconomic_status=Medium    0.01056

### Question 6
Now let's train an XGBoost model! For this question, we'll tune the eta parameter:

•	Install XGBoost

•	Create DMatrix for train and validation

•	Create a watchlist

•	Train a model with these parameters for 100 rounds:
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}
Now change eta from 0.3 to 0.1.

Which eta leads to the best RMSE score on the validation dataset?

•	0.3

•	0.1

•	Both give equal value

In [18]:
dtrain = xgb.DMatrix(X_train_vectorized, label=target_train)
dval = xgb.DMatrix(X_validation_vectorized, label=target_validation)

watchlist = [(dtrain, 'train'), (dval, 'eval')]

xgb_params = {
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

xgb_params['eta'] = 0.3
model_eta_03 = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10)

val_predictions_eta_03 = model_eta_03.predict(dval)
rmse_eta_03 = np.sqrt(mean_squared_error(target_validation, val_predictions_eta_03))
print(f'RMSE for eta=0.3: {rmse_eta_03:.3f}')

xgb_params['eta'] = 0.1
model_eta_01 = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10)

val_predictions_eta_01 = model_eta_01.predict(dval)
rmse_eta_01 = np.sqrt(mean_squared_error(target_validation, val_predictions_eta_01))
print(f'RMSE for eta=0.1: {rmse_eta_01:.3f}')

if rmse_eta_03 < rmse_eta_01:
    best_eta = 0.3
elif rmse_eta_03 > rmse_eta_01:
    best_eta = 0.1
else:
    best_eta = 'Both give equal value'

print(f'The best eta leading to the lowest RMSE score on the validation dataset is: {best_eta}')

[0]	train-rmse:42.69384	eval-rmse:44.89114
[1]	train-rmse:39.83326	eval-rmse:43.07010
[2]	train-rmse:37.94542	eval-rmse:42.00332
[3]	train-rmse:36.56125	eval-rmse:41.46452
[4]	train-rmse:35.44252	eval-rmse:40.88896
[5]	train-rmse:34.57756	eval-rmse:40.69096
[6]	train-rmse:33.84230	eval-rmse:40.59315
[7]	train-rmse:33.25929	eval-rmse:40.47993
[8]	train-rmse:32.79415	eval-rmse:40.45326
[9]	train-rmse:32.16019	eval-rmse:40.43929
[10]	train-rmse:31.63404	eval-rmse:40.48319
[11]	train-rmse:31.17673	eval-rmse:40.68201
[12]	train-rmse:30.87313	eval-rmse:40.63522
[13]	train-rmse:30.30310	eval-rmse:40.70983
[14]	train-rmse:30.00098	eval-rmse:40.78133
[15]	train-rmse:29.41497	eval-rmse:40.86107
[16]	train-rmse:29.25816	eval-rmse:40.96580
[17]	train-rmse:28.59378	eval-rmse:41.12190
[18]	train-rmse:28.27990	eval-rmse:41.14360
[19]	train-rmse:27.94572	eval-rmse:41.22835
RMSE for eta=0.3: 41.228
[0]	train-rmse:45.49999	eval-rmse:47.00533
[1]	train-rmse:44.12948	eval-rmse:45.92344
[2]	train-rmse:42.9

### SUMARRY

In [20]:
table = PrettyTable()

table.field_names = ["Question", "Task", "Answer"]

table.add_row([
    "1. Decision Tree Split Feature",
    "Train with max_depth=1.",
    "Feature used for splitting: 'study_hours_per_week'"
])

table.add_row([
    "2. Random Forest RMSE",
    "Train with n_estimators=10, random_state=1, n_jobs-1",
    "RMSE on validation set: 42.137"
])

table.add_row([
    "3. n_estimators RMSE Improvement",
    "Experiment with n_estimators from 10 to 200 with step 10 and random_state=1.",
    "RMSE stops improving after: 80"
])

table.add_row([
    "4. Best max_depth",
    "Test different max_depth (10,15,20,25) values with mean RMSE.",
    "The best max_depth is : 10 with a mean RMSE of 40.392"
])

table.add_row([
    "5. Most Important Feature",
    "Train a model with n_estimators=10, max_depth=20, random_state=1, n_jobs-1",
    "Most important feature: 'study_hours_per_week' with an importance of 0.2484."
])

table.add_row([
    "6. Best eta for XGBoost",
    "Tune the eta parameter for XGBoost.",
    "The best eta leading to the lowest RMSE score on the validation dataset is: 0.1 with RSME 40.200"
])

print(table)

+----------------------------------+------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+
|             Question             |                                     Task                                     |                                              Answer                                              |
+----------------------------------+------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+
|  1. Decision Tree Split Feature  |                           Train with max_depth=1.                            |                        Feature used for splitting: 'study_hours_per_week'                        |
|      2. Random Forest RMSE       |             Train with n_estimators=10, random_state=1, n_jobs-1             |                         