In [9]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, train_test_split
from tqdm import tqdm

In [3]:
from typing import Tuple


def prepare_dataset(DATA_PATH: str) -> Tuple[np.ndarray, np.ndarray]:
    """
    Prepare dataset.
    Load data, split into X and y, one-hot encode categorical

    Parameters
    ----------
    DATA_PATH: str :
        path to the dataset

    Returns
    -------
    Tuple[np.ndarray, np.ndarray] :
        X and y
    """
    df = pd.read_csv(DATA_PATH)
    df = df.drop(["ID"], axis=1)
    y = df.pop("y").values

    # select only numeric columns
    X_num = df.select_dtypes(include="number")

    # select only categorical columns and one-hot encode them
    X_cat = df.select_dtypes(exclude="number")
    X_cat = pd.get_dummies(X_cat)

    # combine numeric and categorical
    X = pd.concat([X_num, X_cat], axis=1)
    X = X.fillna(0).values

    return X, y

In [75]:
data_path = "train.csv.zip"
X, y = prepare_dataset(data_path)
params_list = [
    {"max_depth": 10},  # baseline
    {"max_depth": 2},
    # {"max_depth": 3},
    {"max_depth": 4},
    # {"max_depth": 5},
    {"max_depth": 9},
    # {"max_depth": 11},
    {"max_depth": 12},
    {"max_depth": 15},
]

In [11]:


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=200)
params_list = [
    {"max_depth": 10},  # baseline
    {"max_depth": 2},
    # {"max_depth": 3},
    {"max_depth": 4},
    # {"max_depth": 5},
    {"max_depth": 9},
    # {"max_depth": 11},
    {"max_depth": 12},
    {"max_depth": 15},
]
kf = KFold(n_splits=5)
all_scores = []
for n,params in tqdm(enumerate(params_list)):
    # fit
    model.set_params(**params)
    model_scores = []
    for train_index, test_index in kf.split(X):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, np.log1p(y_train))
    # predict
        y_pred = np.expm1(model.predict(X_test))
    #evaluate
        score = r2_score(y_test, y_pred)
        model_scores.append(score)
    all_scores.append(model_scores)
print(all_scores)

6it [03:43, 37.30s/it]

[[0.581683898800942, 0.4267465598901331, 0.5949987604440335, 0.5384463348178283, 0.6470269437990647], [0.5104896196506281, 0.40527786506308905, 0.5239997198396518, 0.49178391672386956, 0.56331438697447], [0.5996461409852857, 0.43408897211585784, 0.596770739830049, 0.5464681966780134, 0.6481774250686216], [0.5769582080242993, 0.42686598119688124, 0.5943729078875144, 0.5416860401397199, 0.6499092553385605], [0.5745752302679907, 0.4267420061096918, 0.5926948733292232, 0.5328928829444601, 0.6455285651389933], [0.5699904066784973, 0.42496350346921097, 0.5911914029879353, 0.5248566920014059, 0.6468154312039048]]





In [92]:
from typing import Callable, Dict

from typing import List


def cross_val_score(
    model: Callable,
    X: np.ndarray,
    y: np.ndarray,
    cv: int,
    params_list: List[Dict],
    scoring: Callable,
    random_state: int = 42,
    show_progress: bool = False,
) -> np.ndarray:
    kf = KFold(n_splits=cv,shuffle=True, random_state=random_state)
    all_scores = []
    for n,params in tqdm(enumerate(params_list)):
        # fit
        model.set_params(**params)
        model_scores = []
        for train_index, test_index in kf.split(X):

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            model.fit(X_train, np.log1p(y_train))
        # predict
            y_pred = np.expm1(model.predict(X_test))
        #evaluate
            score = scoring(y_test, y_pred)
            model_scores.append(score)
        all_scores.append(model_scores)
    return np.array(all_scores)

In [93]:
all_scores = cross_val_score(RandomForestRegressor(n_estimators=200), X=X, y=y, cv=4, params_list=params_list,random_state=42, scoring=r2_score)

0it [00:02, ?it/s]


KeyboardInterrupt: 

In [81]:
all_scores = np.array(all_scores)

In [82]:
all_scores_mean = all_scores.mean(axis=1)

In [101]:
compare_list = []
for i ,value in enumerate(all_scores_mean[1:]):
    compare_dict = {}
    compare_dict['model_index'] = i+1
    compare_dict['avg_score'] = value
    compare_dict['effect_sign'] = np.sign(value - all_scores_mean[0])
    compare_list.append(compare_dict)

sorted(compare_list, key=lambda x: x['avg_score'], reverse=True)

[{'model_index': 2, 'avg_score': 0.5580414196539715, 'effect_sign': 1.0},
 {'model_index': 3, 'avg_score': 0.549461402052844, 'effect_sign': 1.0},
 {'model_index': 4, 'avg_score': 0.5430076277693139, 'effect_sign': -1.0},
 {'model_index': 5, 'avg_score': 0.5344665533277149, 'effect_sign': -1.0},
 {'model_index': 1, 'avg_score': 0.4977098949041202, 'effect_sign': -1.0}]