In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
import xgboost as xgb

### Preparing the dataset

In [2]:
url = "https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv"

df = pd.read_csv(url)
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.head()

Unnamed: 0,jamb_score,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,school_type,school_location,extra_tutorials,access_to_learning_materials,parent_involvement,it_knowledge,student_id,age,gender,socioeconomic_status,parent_education_level,assignments_completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [3]:
df = df.drop(columns=['student_id'])

In [4]:
df.isnull().sum()

Unnamed: 0,0
jamb_score,0
study_hours_per_week,0
attendance_rate,0
teacher_quality,0
distance_to_school,0
school_type,0
school_location,0
extra_tutorials,0
access_to_learning_materials,0
parent_involvement,0


In [5]:
df = df.fillna(0)
df.isnull().sum()

Unnamed: 0,0
jamb_score,0
study_hours_per_week,0
attendance_rate,0
teacher_quality,0
distance_to_school,0
school_type,0
school_location,0
extra_tutorials,0
access_to_learning_materials,0
parent_involvement,0


In [6]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [7]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.jamb_score.values
y_val = df_val.jamb_score.values
y_test = df_test.jamb_score.values

df_train = df_train.drop(columns=['jamb_score'])
df_val = df_val.drop(columns=['jamb_score'])
df_test = df_test.drop(columns=['jamb_score'])

In [8]:
df_train.shape, df_val.shape, df_test.shape

((3000, 15), (1000, 15), (1000, 15))

In [9]:
dv = DictVectorizer(sparse=False)
X_train_dict = df_train.to_dict(orient='records')
X_val_dict = df_val.to_dict(orient='records')

X_train_encoded = dv.fit_transform(X_train_dict)
X_val_encoded = dv.transform(X_val_dict)

### Question 1

In [10]:
model = DecisionTreeRegressor(max_depth=1, random_state=1)
model.fit(X_train_encoded, y_train)

In [11]:
feature_importances = model.feature_importances_

# feature importance
sorted_indices = feature_importances.argsort()[::-1]
sorted_feature_names = [dv.feature_names_[i] for i in sorted_indices]
sorted_importances = feature_importances[sorted_indices]

########
print(f"The feature used for splitting the data is: {sorted_feature_names[0]}")

The feature used for splitting the data is: study_hours_per_week


### Question 2

In [12]:
model_rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
model_rf.fit(X_train_encoded, y_train)

In [13]:
y_pred = model_rf.predict(X_val_encoded)
print(f"The RMSE on the validation data is: {round(root_mean_squared_error(y_val, y_pred),3)}")

The RMSE on the validation data is: 42.137


### Question 3

In [14]:
# Lista para almacenar los resultados
rmse_values = []

# Probar diferentes valores de n_estimators
for n in range(10, 201, 10):
    model_rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    model_rf.fit(X_train_encoded, y_train)
    y_pred = model_rf.predict(X_val_encoded)
    rmse = root_mean_squared_error(y_val, y_pred)
    rmse_values.append((n, rmse))
    print(f"n_estimators={n}, RMSE={rmse:.3f}")

n_estimators=10, RMSE=42.137
n_estimators=20, RMSE=41.461
n_estimators=30, RMSE=41.106
n_estimators=40, RMSE=40.917
n_estimators=50, RMSE=40.852
n_estimators=60, RMSE=40.784
n_estimators=70, RMSE=40.677
n_estimators=80, RMSE=40.539
n_estimators=90, RMSE=40.504
n_estimators=100, RMSE=40.517
n_estimators=110, RMSE=40.593
n_estimators=120, RMSE=40.625
n_estimators=130, RMSE=40.651
n_estimators=140, RMSE=40.595
n_estimators=150, RMSE=40.597
n_estimators=160, RMSE=40.604
n_estimators=170, RMSE=40.628
n_estimators=180, RMSE=40.641
n_estimators=190, RMSE=40.631
n_estimators=200, RMSE=40.601


### Question 4

In [15]:
# Valores de max_depth a probar
max_depths = [10, 15, 20, 25]
# Lista para almacenar los resultados
mean_rmse_values = {}

# Probar diferentes valores de max_depth
for max_depth in max_depths:
    rmse_list = []
    # Probar diferentes valores de n_estimators
    for n in range(10, 201, 10):
        model_rf = RandomForestRegressor(n_estimators=n, max_depth=max_depth, random_state=1, n_jobs=-1)
        model_rf.fit(X_train_encoded, y_train)
        y_pred = model_rf.predict(X_val_encoded)
        rmse = root_mean_squared_error(y_val, y_pred)
        rmse_list.append(rmse)
    # Calcular la media del RMSE para este max_depth
    mean_rmse = np.mean(rmse_list)
    mean_rmse_values[max_depth] = mean_rmse
    print(f"max_depth={max_depth}, Mean RMSE={mean_rmse:.3f}")

# Encontrar el mejor max_depth con el menor RMSE
best_max_depth = min(mean_rmse_values, key=mean_rmse_values.get)
print(f"The best max_depth based on mean RMSE is: {best_max_depth}")

max_depth=10, Mean RMSE=40.392
max_depth=15, Mean RMSE=40.735
max_depth=20, Mean RMSE=40.740
max_depth=25, Mean RMSE=40.788
The best max_depth based on mean RMSE is: 10


### Question 5

In [16]:
# Entrenar el modelo de Random Forest
rf_model = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf_model.fit(X_train_encoded, y_train)

In [17]:
# Obtener las importancias de las características
feature_importances = rf_model.feature_importances_

importance_df = pd.DataFrame({
    'feature': dv.feature_names_,
    'importance': feature_importances
})

# Ordenar el DataFrame por importancia
importance_df = importance_df.sort_values(by='importance', ascending=False)
most_important_feature = importance_df.iloc[0]
print(f"The most important feature is: {most_important_feature['feature']} with importance: {most_important_feature['importance']:.4f}")

The most important feature is: study_hours_per_week with importance: 0.2484


### Question 6

In [18]:
# DMatrix para train y val
dtrain = xgb.DMatrix(X_train_encoded, label=y_train)
dval = xgb.DMatrix(X_val_encoded, label=y_val)

# Crear un watchlist
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [19]:
# Definir los parámetros del modelo
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

In [20]:
# Entrenar el modelo con eta=0.3
model_0_3 = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10)

# Hacer predicciones en el conjunto de validación y calcular RMSE
y_pred_0_3 = model_0_3.predict(dval)
rmse_0_3 = root_mean_squared_error(y_val, y_pred_0_3)
print(f"RMSE with eta=0.3: {rmse_0_3:.4f}")

[0]	train-rmse:42.69552	val-rmse:44.86028
[1]	train-rmse:39.85005	val-rmse:43.06070
[2]	train-rmse:37.95285	val-rmse:41.83772
[3]	train-rmse:36.52203	val-rmse:41.26987
[4]	train-rmse:35.36066	val-rmse:41.02965
[5]	train-rmse:34.43646	val-rmse:40.87186
[6]	train-rmse:33.71528	val-rmse:40.69132
[7]	train-rmse:33.07449	val-rmse:40.65074
[8]	train-rmse:32.41772	val-rmse:40.72710
[9]	train-rmse:31.91000	val-rmse:40.82972
[10]	train-rmse:31.55119	val-rmse:40.83684
[11]	train-rmse:30.99534	val-rmse:40.98243
[12]	train-rmse:30.33163	val-rmse:41.05515
[13]	train-rmse:30.04347	val-rmse:41.18635
[14]	train-rmse:29.44279	val-rmse:41.32160
[15]	train-rmse:29.08791	val-rmse:41.37832
[16]	train-rmse:28.84106	val-rmse:41.45737
[17]	train-rmse:28.28883	val-rmse:41.51113
RMSE with eta=0.3: 41.5111


In [21]:
# Cambiar eta a 0.1 y repetir el proceso
xgb_params['eta'] = 0.1

# Entrenar el modelo con eta=0.1
model_0_1 = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10)

# Hacer predicciones en el conjunto de validación y calcular RMSE
y_pred_0_1 = model_0_1.predict(dval)
rmse_0_1 = root_mean_squared_error(y_val, y_pred_0_1)
print(f"RMSE with eta=0.1: {rmse_0_1:.4f}")

[0]	train-rmse:45.50072	val-rmse:46.99373
[1]	train-rmse:44.12583	val-rmse:45.89771
[2]	train-rmse:42.92815	val-rmse:45.01746
[3]	train-rmse:41.90445	val-rmse:44.27569
[4]	train-rmse:40.94238	val-rmse:43.62411
[5]	train-rmse:40.15460	val-rmse:43.05644
[6]	train-rmse:39.43451	val-rmse:42.64605
[7]	train-rmse:38.77902	val-rmse:42.28889
[8]	train-rmse:38.17103	val-rmse:42.00035
[9]	train-rmse:37.60436	val-rmse:41.73633
[10]	train-rmse:37.11353	val-rmse:41.55631
[11]	train-rmse:36.65501	val-rmse:41.33670
[12]	train-rmse:36.26337	val-rmse:41.18307
[13]	train-rmse:35.84634	val-rmse:41.03057
[14]	train-rmse:35.45796	val-rmse:40.89024
[15]	train-rmse:35.07766	val-rmse:40.70892
[16]	train-rmse:34.73181	val-rmse:40.60601
[17]	train-rmse:34.40922	val-rmse:40.47982
[18]	train-rmse:34.13466	val-rmse:40.43780
[19]	train-rmse:33.88018	val-rmse:40.38390
[20]	train-rmse:33.57997	val-rmse:40.37859
[21]	train-rmse:33.33993	val-rmse:40.33974
[22]	train-rmse:33.08144	val-rmse:40.31700
[23]	train-rmse:32.90

In [22]:
# Comparar RMSE
if rmse_0_3 < rmse_0_1:
    print("The best eta is 0.3")
elif rmse_0_1 < rmse_0_3:
    print("The best eta is 0.1")
else:
    print("Both eta values give equal RMSE")

The best eta is 0.1
