In [19]:
import numpy as np 
import pandas as pd 
import random 
# Seed both Python's and NumPy's global generators with the same value so
# that anything drawing from them is repeatable across kernel restarts.
SEED = 42
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(SEED)

In [20]:
# Load the development set from the working directory and show its
# (rows, columns) shape as the cell output.
df_train = pd.read_csv(filepath_or_buffer="development.csv")
df_train.shape

(5712, 22)

In [21]:
# Preview the first rows (head() shows 5 by default).
df_train.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_12,var_13,var_14,var_15,var_16,var_17,var_18,var_19,var_20,target
0,,,71.086074,0.007284,47.109485,1322.311127,,-174.266297,-678.159587,-273.88966,...,val_3,val_1,val_1,dc12d,4c8db,e62e2,cfcb8,d34b0,,-1.052352
1,,-224.620718,,0.007744,-0.076009,3915.768234,,-16.490485,-232.795611,188.108128,...,val_0,val_0,val_1,dc12d,f1bdb,e62e2,2fa6e,2adde,,-1.477522
2,0.0235,306.374047,,0.005558,7.110974,-1733.038983,,38.970001,0.392658,70.28805,...,val_2,,val_1,8cf49,,1da64,8a3e2,2ea90,,-1.430001
3,,,4333.222247,0.011852,-15.727447,-3388.461853,,77.309234,,-109.032918,...,val_2,val_1,val_2,8cf49,,d73de,8a3e2,d34b0,,0.852819
4,,,,0.007584,-5.373907,3060.014081,,94.684765,-45.916707,-19.217713,...,val_0,val_0,val_2,de4bc,f1bdb,e62e2,8a3e2,7ef86,,0.484446


In [22]:
# Number of missing entries in var_6.
df_train["var_6"].isnull().sum()

np.int64(3998)

In [23]:
# Number of missing entries in var_20.
df_train["var_20"].isnull().sum()

np.int64(3998)

In [24]:
# Separate the regression target from the feature columns; df_train itself
# is left unmodified.
y = df_train["target"]
X = df_train.drop("target", axis=1)

In [None]:
# Column groups routed to the numerical / ordinal / categorical preprocessing
# pipelines, identified by their var_<index> names.
num_cols = ["var_" + str(idx) for idx in range(10)]
ord_cols = ["var_" + str(idx) for idx in range(10, 15)]

# var_20 sits in the categorical index range, but most of its values are
# missing, so it is excluded from the categorical group.
cat_cols = [
    name
    for name in ("var_" + str(idx) for idx in range(15, 21))
    if name != "var_20"
]

In [26]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the final test set ...
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# ... then carve a quarter of the remainder off for validation
# (0.25 * 0.8 = 0.2 of all rows), i.e. a 60/20/20 train/val/test split.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42
)

In [27]:
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

In [28]:
# Numerical columns: fill gaps with the column median, then standardise.
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Ordinal columns: fill gaps with the most frequent value, then map each
# category to an integer code. Categories that were not present at fit time
# are encoded as -1 instead of raising at transform time.
# NOTE(review): OrdinalEncoder orders categories by sorted value; confirm that
# this matches the intended ordinal order of these columns.
ord_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. A category unseen at fit time becomes an all-zero row for that
# column rather than an error at transform time.
cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Route each column group through its pipeline. Columns not listed in any
# group are dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipe, num_cols),
    ("ord", ord_pipe, ord_cols),
    ("cat", cat_pipe, cat_cols)
])

In [29]:
from sklearn.metrics import r2_score
from sklearn.model_selection import ParameterGrid

# Random-forest hyper-parameter grid. n_jobs is a single-valued entry, so it
# is applied to every configuration rather than being tuned.
params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    n_jobs=[-1],
)

# Validation R2 of every configuration, stored in ParameterGrid iteration
# order so a position in `scores` identifies the configuration it belongs to.
scores = []
for config in ParameterGrid(params):
    model = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("regressor", RandomForestRegressor(random_state=42, **config)),
    ])
    model.fit(X_train, y_train)

    # Score on the validation split only; the test split is not touched here.
    y_pred = model.predict(X_val)
    score = r2_score(y_val, y_pred)
    print(config, "R2 ==> ", score)
    scores.append(score)
    



{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100, 'n_jobs': -1} R2 ==>  0.8444025285903181
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200, 'n_jobs': -1} R2 ==>  0.8451726956452776
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300, 'n_jobs': -1} R2 ==>  0.8440008729637416
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100, 'n_jobs': -1} R2 ==>  0.8439021959004206
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200, 'n_jobs': -1} R2 ==>  0.8433852784343324
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300, 'n_jobs': -1} R2 ==>  0.8420157598542022
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100, 'n_jobs': -1} R2 ==>  0.8374200179027985
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200, 'n_jobs

In [30]:
# Pick the configuration with the highest validation R2 (first one on ties).
# `scores` was filled in ParameterGrid iteration order, so the index of the
# best score selects the matching entry of the re-materialised grid.
best_score = max(scores)
best_index = scores.index(best_score)
candidate_configs = list(ParameterGrid(params))
best_config = candidate_configs[best_index]
print("Best config: ", best_config)

Best config:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200, 'n_jobs': -1}


In [31]:
# Refit the selected configuration on train + validation data and evaluate it
# once on the held-out test split.
# The preprocessor is cloned (a fresh, unfitted copy with the same parameters)
# because Pipeline fits its steps in place: reusing the shared `preprocessor`
# object would silently refit the transformer embedded in the grid-search
# pipelines and leave them inconsistent with their already-fitted regressors.
final_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("regressor", RandomForestRegressor(**best_config, random_state=42))
])

final_model.fit(X_train_val, y_train_val)
y_test_pred = final_model.predict(X_test)
test_score = r2_score(y_test, y_test_pred)
print("Test R2 ==> ", test_score)

Test R2 ==>  0.8327861991761006
