In [7]:
import pandas as pd

# Read the CSV file, then normalise every column label:
# lower-case it and turn spaces into underscores.
df = (
    pd.read_csv('jamb_exam_results.csv')
    .rename(columns=lambda label: label.lower().replace(' ', '_'))
)

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
import xgboost as xgb


In [15]:
# Remove the student identifier so it is not fed to the models as a feature.
# errors='ignore' keeps this cell re-runnable: `df` is rebound here, so on a
# second execution the column is already gone and a plain drop would raise
# KeyError.
df = df.drop(columns=['student_id'], errors='ignore')

# Replace every missing value with 0. This applies to all columns, so gaps in
# non-numeric columns also receive the number 0.
df = df.fillna(0)

# Two-step split: hold out 20% of the rows for test, then 25% of the remaining
# rows for validation. Both splits use a fixed seed.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Pull the target out as arrays before it is removed from the feature frames.
y_train = df_train.jamb_score.values
y_val = df_val.jamb_score.values
y_test = df_test.jamb_score.values

# Drop the target from every feature frame so it cannot leak into X.
df_train = df_train.drop(columns=['jamb_score'])
df_val = df_val.drop(columns=['jamb_score'])
df_test = df_test.drop(columns=['jamb_score'])

# Vectorise the row dictionaries into sparse matrices. The vectorizer is
# fitted on the training rows only and then reused for validation and test.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(df_train.to_dict(orient="records"))
X_val = dv.transform(df_val.to_dict(orient="records"))
X_test = dv.transform(df_test.to_dict(orient="records"))

# Question 1

In [16]:
# Fit a depth-1 regression tree on the training data and print its rules
# as text, labelled with the vectorizer's feature names.
dt = DecisionTreeRegressor(random_state=1, max_depth=1).fit(X_train, y_train)

tree_rules = export_text(dt, feature_names=dv.get_feature_names_out())
print(tree_rules)

|--- study_hours_per_week <= 18.50
|   |--- value: [155.24]
|--- study_hours_per_week >  18.50
|   |--- value: [188.59]



# Question 2

In [17]:

# Random forest with 10 trees and a fixed seed, fitted on the training data
# and scored on the validation set.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_val)

# scikit-learn metrics take (y_true, y_pred) in that order. RMSE is symmetric,
# so the value is unchanged, but the call now matches the documented signature
# and the other cells of this notebook.
rmse = root_mean_squared_error(y_val, y_pred)
print(f"RMSE on validation set: {rmse:.2f}")

RMSE on validation set: 42.14


# Question 3

In [18]:
# Sweep n_estimators from 10 to 200 in steps of 10 and record the validation
# RMSE of each forest as (n_estimators, rmse) pairs.
rmse_results = []

for n in range(10, 201, 10):
    # n_jobs=-1 spreads the work over the available cores, matching the other
    # random-forest cells in this notebook; the seed stays fixed.
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    rmse_results.append((n, rmse))
    print(f"n_estimators: {n}, RMSE: {rmse:.3f}")

# Pick the pair with the smallest RMSE (second element of each tuple).
min_rmse = min(rmse_results, key=lambda x: x[1])
print(f"Minimum RMSE: {min_rmse[1]:.3f} with n_estimators: {min_rmse[0]}")

n_estimators: 10, RMSE: 42.137
n_estimators: 20, RMSE: 41.461
n_estimators: 30, RMSE: 41.106
n_estimators: 40, RMSE: 40.917
n_estimators: 50, RMSE: 40.852
n_estimators: 60, RMSE: 40.784
n_estimators: 70, RMSE: 40.677
n_estimators: 80, RMSE: 40.539
n_estimators: 90, RMSE: 40.504
n_estimators: 100, RMSE: 40.517
n_estimators: 110, RMSE: 40.593
n_estimators: 120, RMSE: 40.625
n_estimators: 130, RMSE: 40.651
n_estimators: 140, RMSE: 40.595
n_estimators: 150, RMSE: 40.597
n_estimators: 160, RMSE: 40.604
n_estimators: 170, RMSE: 40.628
n_estimators: 180, RMSE: 40.641
n_estimators: 190, RMSE: 40.631
n_estimators: 200, RMSE: 40.601
Minimum RMSE: 40.504 with n_estimators: 90


# Question 4

In [19]:
# For each candidate max_depth, fit forests over the n_estimators grid
# (10..200, step 10) and store the mean validation RMSE for that depth.
mean_rmse_per_depth = {}

for depth in [10, 15, 20, 25]:
    rmse_values = []

    for n in range(10, 201, 10):
        # n_jobs=-1 matches the other random-forest cells; the seed stays fixed.
        rf = RandomForestRegressor(max_depth=depth, n_estimators=n, random_state=1, n_jobs=-1)
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_val)

        # Use root_mean_squared_error (already imported and used elsewhere in
        # this notebook) rather than mean_squared_error(..., squared=False):
        # the `squared` argument is deprecated and removed in newer
        # scikit-learn releases.
        rmse = root_mean_squared_error(y_val, y_pred)
        rmse_values.append(rmse)

    mean_rmse_per_depth[depth] = np.mean(rmse_values)

# The best depth is the key with the lowest mean RMSE.
best_depth = min(mean_rmse_per_depth, key=mean_rmse_per_depth.get)

print(f"Best max_depth: {best_depth} with Mean RMSE: {mean_rmse_per_depth[best_depth]:.3f}")

Best max_depth: 10 with Mean RMSE: 40.392


# Question 5

In [20]:
# Fit a 10-tree forest capped at depth 20, then rank the vectorised features
# by the forest's importance scores and show the top one.
rf = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1,
)
rf.fit(X_train, y_train)

importances = rf.feature_importances_

# One row per feature, ordered from most to least important.
feature_importances = pd.DataFrame(
    {'feature': dv.get_feature_names_out(), 'importance': importances}
).sort_values(by='importance', ascending=False)

print(feature_importances.head(1))

                 feature  importance
27  study_hours_per_week    0.248354


# Question 6

In [None]:
# Booster settings for a squared-error regression model.
xgb_params = {
    'eta': 0.3,                       # learning rate
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',  # regression objective
    'nthread': 8,                     # fixed thread count (hard-coded, not auto-detected)
    'seed': 1,
    'verbosity': 1,
}

# Wrap the train and validation matrices, with their targets, for xgboost.
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_val, label=y_val)

# Datasets to evaluate after each boosting round, with their display names.
watchlist = [
    (dtrain, 'train'),
    (dvalid, 'valid'),
]

# Train for 100 boosting rounds, evaluating on the watchlist as it goes.
model = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist)