In [26]:
import numpy as np 
import pandas as pd 
import random 
# Fix the global RNG state of both the stdlib and NumPy generators so that
# anything drawing from them gives the same numbers on every fresh run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Read the training table (path is relative to the working directory) and
# echo its (rows, columns) shape as the cell output.
df_train = pd.read_csv(filepath_or_buffer="train_dataset.csv")
df_train.shape

(6000, 59)

In [28]:
# Preview the first five rows (five is the default for head()).
df_train.head()

Unnamed: 0,cont_0,cont_1,cont_2,cont_3,cont_4,cont_5,cont_6,cont_7,cont_8,cont_9,...,ord_19,cat_0,cat_1,cat_2,cat_3,cat_4,cat_5,cat_6,cat_7,target
0,-0.029933,19.494106,-4.330267,,-6.403516,-12.47462,,-0.180768,440.930299,,...,ord_7_val_5,cat_0_val_9,cat_1_val_2,cat_2_val_1,cat_3_val_10,cat_4_val_0,cat_5_val_0,cat_6_val_0,cat_7_val_1,0.510454
1,0.004747,1.930336,2.370512,0.014357,-2.381447,2.54512,-652.050389,,,,...,ord_7_val_4,cat_0_val_2,cat_1_val_1,,cat_3_val_9,,cat_5_val_0,cat_6_val_0,cat_7_val_5,-0.476601
2,,18.865632,1.826231,-0.03369,-7.17725,8.423474,-3322.816093,6.550298,-137.358945,,...,ord_7_val_4,cat_0_val_6,cat_1_val_3,,,cat_4_val_0,cat_5_val_0,cat_6_val_0,cat_7_val_1,-0.918673
3,-0.017541,,9.851084,0.035294,1.077285,4.897107,-647.925452,-2.024097,,11.097455,...,ord_7_val_4,cat_0_val_2,,cat_2_val_2,cat_3_val_18,,,cat_6_val_0,cat_7_val_5,0.139556
4,0.005074,-32.895531,-12.880423,0.025106,,1.029729,-3938.124133,,172.214384,9.974146,...,ord_7_val_1,cat_0_val_3,cat_1_val_0,cat_2_val_0,cat_3_val_13,,cat_5_val_0,cat_6_val_0,cat_7_val_3,-1.616743


In [29]:
# Display the complete set of column labels; the head() preview elides the
# middle columns with "...".
df_train.columns

Index(['cont_0', 'cont_1', 'cont_2', 'cont_3', 'cont_4', 'cont_5', 'cont_6',
       'cont_7', 'cont_8', 'cont_9', 'cont_10', 'cont_11', 'cont_12',
       'cont_13', 'cont_14', 'cont_15', 'cont_16', 'cont_17', 'cont_18',
       'cont_19', 'cont_20', 'cont_21', 'cont_22', 'cont_23', 'cont_24',
       'cont_25', 'cont_26', 'cont_27', 'cont_28', 'cont_29', 'ord_0', 'ord_1',
       'ord_2', 'ord_3', 'ord_4', 'ord_5', 'ord_6', 'ord_7', 'ord_8', 'ord_9',
       'ord_10', 'ord_11', 'ord_12', 'ord_13', 'ord_14', 'ord_15', 'ord_16',
       'ord_17', 'ord_18', 'ord_19', 'cat_0', 'cat_1', 'cat_2', 'cat_3',
       'cat_4', 'cat_5', 'cat_6', 'cat_7', 'target'],
      dtype='object')

In [30]:
# Pull the label column out as the regression target; every other column
# stays in the feature frame. df_train itself is left unmodified.
y = df_train["target"]
X = df_train.drop("target", axis="columns")

In [31]:
# Partition the feature names by storage dtype: int64/float64 columns are
# numeric, object-dtype columns are the string-valued ones.
num_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)
cat_cols = list(X.select_dtypes(include=["object"]).columns)

In [32]:
# Split the object-dtype columns by name prefix: "ord_*" columns go to the
# ordinal encoder, "cat_*" columns go to the one-hot encoder.
ord_cols = list(filter(lambda name: name.startswith("ord_"), cat_cols))
nom_cols = list(filter(lambda name: name.startswith("cat_"), cat_cols))

In [33]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 20% of all rows as the final test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Stage 2: hold out 25% of the remaining 80% for validation
# (0.25 * 0.80 = 20% of all rows), which leaves 60% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    random_state=42,
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor

In [35]:
# Preprocessing for the three feature groups. Every sub-pipeline imputes
# missing values first, so the scaler/encoders only ever see complete columns.

# Numeric features: fill gaps with the column median, then standardize.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Ordinal features: fill gaps with the most frequent level, then map each level
# to an integer code. Levels that were not present when the encoder was fitted
# are encoded as -1 instead of raising at predict time (the default is "error").
# NOTE(review): with the default `categories="auto"` the levels are ordered by
# sorting the strings, so a level like "..._val_10" would sort before
# "..._val_2". Pass explicit `categories=` if the true order matters — confirm
# against the actual level names in the data.
ord_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
])

# Nominal features: fill gaps with the most frequent level, then one-hot encode.
# Levels unseen at fit time produce an all-zero indicator row instead of
# raising, so validation/test rows with rare categories can still be scored.
nom_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column list to its sub-pipeline. Columns not named in any of the
# three lists are dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_cols),
    ("ord", ord_pipeline, ord_cols),
    ("nom", nom_pipeline, nom_cols),
])

In [36]:
from sklearn.base import clone
from sklearn.metrics import r2_score
from sklearn.model_selection import ParameterGrid

# Exhaustive hold-out search: every combination in `params` is fitted on the
# training split and scored with R^2 on the validation split.
params = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10],
    "max_features": [None, "sqrt", "log2"],
    "n_jobs": [-1],  # single value: not tuned, just forwarded to the regressor
}

# `scores` is filled in ParameterGrid iteration order; the selection cell
# relies on that alignment when it indexes back into the grid.
scores = []
for config in ParameterGrid(params):
    model = Pipeline(steps=[
        # Pipeline.fit fits its steps in place, so each candidate gets its own
        # unfitted copy rather than sharing (and refitting) one global
        # `preprocessor` instance across all candidates and later cells.
        ("preprocessor", clone(preprocessor)),
        ("regressor", RandomForestRegressor(**config, random_state=42)),
    ])

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    score = r2_score(y_val, y_pred)
    print(config, "R2 -->", score)
    scores.append(score)

{'max_depth': None, 'max_features': None, 'min_samples_split': 2, 'n_estimators': 100, 'n_jobs': -1} R2 --> 0.7789618065799037
{'max_depth': None, 'max_features': None, 'min_samples_split': 2, 'n_estimators': 200, 'n_jobs': -1} R2 --> 0.7819827165705897
{'max_depth': None, 'max_features': None, 'min_samples_split': 2, 'n_estimators': 300, 'n_jobs': -1} R2 --> 0.7835557223187402
{'max_depth': None, 'max_features': None, 'min_samples_split': 5, 'n_estimators': 100, 'n_jobs': -1} R2 --> 0.7789390169612556
{'max_depth': None, 'max_features': None, 'min_samples_split': 5, 'n_estimators': 200, 'n_jobs': -1} R2 --> 0.7817911132907636
{'max_depth': None, 'max_features': None, 'min_samples_split': 5, 'n_estimators': 300, 'n_jobs': -1} R2 --> 0.7829986264805502
{'max_depth': None, 'max_features': None, 'min_samples_split': 10, 'n_estimators': 100, 'n_jobs': -1} R2 --> 0.7760801779597276
{'max_depth': None, 'max_features': None, 'min_samples_split': 10, 'n_estimators': 200, 'n_jobs': -1} R2 --> 0

In [37]:
# Locate the highest validation R^2 (the first one wins on ties) and look up
# the hyper-parameter combination at the same position in the grid.
best_index, best_score = max(enumerate(scores), key=lambda pair: pair[1])
grid_configs = list(ParameterGrid(params))
best_config = grid_configs[best_index]
print("Best config:", best_config)

Best config: {'max_depth': None, 'max_features': None, 'min_samples_split': 2, 'n_estimators': 300, 'n_jobs': -1}


In [38]:
from sklearn.base import clone

# Refit the selected configuration on the combined train + validation rows,
# then score it once on the test split, which was not used during the search.
final_model = Pipeline(steps=[
    # Use a fresh copy of the preprocessing recipe: Pipeline.fit fits its steps
    # in place, so wrapping the shared `preprocessor` object here would also
    # refit the transformer held by the last grid-search `model`.
    ("preprocessor", clone(preprocessor)),
    ("regressor", RandomForestRegressor(**best_config, random_state=42)),
])

final_model.fit(X_train_val, y_train_val)
y_test_pred = final_model.predict(X_test)
test_score = r2_score(y_test, y_test_pred)
print("Test R2 score:", test_score)

Test R2 score: 0.8090189776481103
