## Homework

## Download the Dataset:

In [2]:
import pandas as pd

url = "https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv"
df = pd.read_csv(url)

# Afficher les premières lignes du fichier pour vérifier le contenu
df.head()


Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


## Load and Clean the Data:

Read the dataset and rename columns to lowercase and replace spaces with underscores:

In [4]:
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.head()

Unnamed: 0,jamb_score,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,school_type,school_location,extra_tutorials,access_to_learning_materials,parent_involvement,it_knowledge,student_id,age,gender,socioeconomic_status,parent_education_level,assignments_completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


## Drop the Unnecessary Column:

Drop student_id since it’s not useful for prediction:

In [5]:
df = df.drop(columns=['student_id'])
df.head()

Unnamed: 0,jamb_score,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,school_type,school_location,extra_tutorials,access_to_learning_materials,parent_involvement,it_knowledge,age,gender,socioeconomic_status,parent_education_level,assignments_completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,22,Female,Medium,Tertiary,1


## Fill Missing Values:

Fill missing values with zero

In [6]:
df = df.fillna(0)


## Split the Dataset:

Split the dataset into train/validation/test with 60%/20%/20% proportions:

In [7]:
from sklearn.model_selection import train_test_split

df_train, df_temp = train_test_split(df, test_size=0.4, random_state=1)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=1)


## Prepare Data for Modeling:

Separate target (jamb_score) and features, then vectorize the features:

In [8]:
from sklearn.feature_extraction import DictVectorizer

y_train = df_train.jamb_score.values
y_val = df_val.jamb_score.values
y_test = df_test.jamb_score.values

dv = DictVectorizer(sparse=True)

X_train = dv.fit_transform(df_train.drop(columns=['jamb_score']).to_dict(orient='records'))
X_val = dv.transform(df_val.drop(columns=['jamb_score']).to_dict(orient='records'))
X_test = dv.transform(df_test.drop(columns=['jamb_score']).to_dict(orient='records'))


## Question 1

Let's train a decision tree regressor to predict the jamb_score variable.

Train a model with max_depth=1.

Which feature is used for splitting the data?

study_hours_per_week
attendance_rate
teacher_quality
distance_to_school

In [9]:
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train, y_train)

split_feature = dv.feature_names_[dt.tree_.feature[0]]
print(f"The feature used for splitting: {split_feature}")


The feature used for splitting: study_hours_per_week


## Question 2: Random Forest Regressor with n_estimators=10

Train a random forest regressor and calculate the RMSE on validation data:

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"RMSE of the model on validation data: {rmse}")


RMSE of the model on validation data: 43.157758977963624


## Question 3: Experiment with n_estimators Parameter

Train random forest models with n_estimators values from 10 to 200 (step 10), and find the point where RMSE stops improving:

In [11]:
best_rmse = float('inf')
n_estimator_optimal = None

for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    
    if rmse < best_rmse:
        best_rmse = rmse
        n_estimator_optimal = n

print(f"Optimal n_estimators where RMSE stops improving: {n_estimator_optimal}")


Optimal n_estimators where RMSE stops improving: 180


## Question 4: Select the Best max_depth

Test different max_depth values and calculate the mean RMSE for each configuration:

In [12]:
depths = [10, 15, 20, 25]
best_depth = None
best_mean_rmse = float('inf')

for depth in depths:
    mean_rmse_for_depth = 0
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(n_estimators=n, max_depth=depth, random_state=1, n_jobs=-1)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        mean_rmse_for_depth += rmse

    mean_rmse_for_depth /= len(range(10, 201, 10))
    
    if mean_rmse_for_depth < best_mean_rmse:
        best_mean_rmse = mean_rmse_for_depth
        best_depth = depth

print(f"The best max_depth based on mean RMSE: {best_depth}")


The best max_depth based on mean RMSE: 10


## Question 5: Feature Importance

Train a random forest model with n_estimators=10, max_depth=20, and extract feature importance:

In [13]:
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

importances = rf.feature_importances_
important_feature = dv.feature_names_[np.argmax(importances)]
print(f"The most important feature: {important_feature}")


The most important feature: study_hours_per_week


## Question 6: XGBoost Model and Tuning eta

Train an XGBoost model and compare eta=0.3 vs. eta=0.1:

In [16]:
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
watchlist = [(dtrain, 'train'), (dval, 'val')]

# Parameters for eta=0.3
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1
}
model = xgb.train(xgb_params, dtrain, 100, watchlist, early_stopping_rounds=10, verbose_eval=False)

# Validation RMSE for eta=0.3
rmse_03 = model.eval(dval)

# Parameters for eta=0.1
xgb_params['eta'] = 0.1
model = xgb.train(xgb_params, dtrain, 100, watchlist, early_stopping_rounds=10, verbose_eval=False)

# Validation RMSE for eta=0.1
rmse_01 = model.eval(dval)

print(f"Best RMSE for eta=0.3: {rmse_03}, eta=0.1: {rmse_01}")


Best RMSE for eta=0.3: [0]	eval-rmse:41.15979266591052266, eta=0.1: [0]	eval-rmse:40.25735712687057344


