In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
import os
# Folder (relative to the working directory) that holds the CSV inputs.
data_folder_path = "./data"
# List its contents to confirm the expected files are present.
available_files = os.listdir(data_folder_path)
print(available_files)

['test.csv', 'train.csv']


In [3]:
# train.csv is semicolon-separated and uses a comma as the decimal mark.
df_train = pd.read_csv(
    f'{data_folder_path}/train.csv',
    sep=';',
    decimal=",",
)
# Column names, dtypes and non-null counts of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7639 entries, 0 to 7638
Data columns (total 92 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   target         7639 non-null   int64  
 1   v2a1           7639 non-null   float64
 2   hacdor         7639 non-null   int64  
 3   rooms          7639 non-null   int64  
 4   hacapo         7639 non-null   int64  
 5   v14a           7639 non-null   int64  
 6   refrig         7639 non-null   int64  
 7   v18q1          7639 non-null   int64  
 8   r4h1           7639 non-null   int64  
 9   r4h2           7639 non-null   int64  
 10  r4m1           7639 non-null   int64  
 11  r4m2           7639 non-null   int64  
 12  escolari       7639 non-null   int64  
 13  pared1         7639 non-null   int64  
 14  pared2         7639 non-null   int64  
 15  pared3         7639 non-null   int64  
 16  pared4         7639 non-null   int64  
 17  pared5         7639 non-null   int64  
 18  pared6  

In [4]:
# Split the training frame into label and features by column *name* rather than by
# position, so the split stays correct even if the column order of train.csv changes.
y_train = df_train['target']
x_train = df_train.drop(columns='target')


In [5]:
# test.csv is read with the same dialect as train.csv: ';' separator, ',' decimal mark.
x_test = pd.read_csv(
    f'{data_folder_path}/test.csv',
    sep=';',
    decimal=",",
)
# Column names, dtypes and non-null counts of the test frame.
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1910 entries, 0 to 1909
Data columns (total 91 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   v2a1           1910 non-null   float64
 1   hacdor         1910 non-null   int64  
 2   rooms          1910 non-null   int64  
 3   hacapo         1910 non-null   int64  
 4   v14a           1910 non-null   int64  
 5   refrig         1910 non-null   int64  
 6   v18q1          1910 non-null   int64  
 7   r4h1           1910 non-null   int64  
 8   r4h2           1910 non-null   int64  
 9   r4m1           1910 non-null   int64  
 10  r4m2           1910 non-null   int64  
 11  escolari       1910 non-null   int64  
 12  pared1         1910 non-null   int64  
 13  pared2         1910 non-null   int64  
 14  pared3         1910 non-null   int64  
 15  pared4         1910 non-null   int64  
 16  pared5         1910 non-null   int64  
 17  pared6         1910 non-null   int64  
 18  pared7  

In [10]:
from sklearn.model_selection import StratifiedKFold

def kfold_cv(X, y, k, H, cv_fun, random_state):
    """
    Do stratified k-fold cross-validation with a dataset, to check how a model behaves as a function
    of the values in H (eg. a hyperparameter such as tree depth, or polynomial degree).

    :param X: feature matrix (indexed positionally with ``.iloc``).
    :param y: response column (indexed positionally with ``.iloc``).
    :param k: number of folds.
    :param H: iterable with the values of the hyperparameter to cross-validate. Any iterable is
        accepted, including one-shot generators; it is materialised into a list internally.
    :param cv_fun: function of the form (X_train, y_train, X_valid, y_valid, h) to evaluate the model in one split,
        as a function of h. It must return a dictionary with metric score values.
    :param random_state: controls the pseudo random number generation for splitting the data.
    :return: a Pandas dataframe with one row per value in H: the metric scores averaged across
        the k folds, plus a column ``_h`` holding the corresponding value of H.
    """
    # H is iterated, measured with len() and stored as a column below; a one-shot
    # iterable (e.g. a generator) would be exhausted after the first use.
    H = list(H)

    kf = StratifiedKFold(n_splits = k, shuffle = True, random_state = random_state)
    # Compute the folds once so that every value of H is scored on the same partitions.
    folds = list(kf.split(X, y))
    pr = []  # to store global results

    # for each value h in H, do CV
    for i, h in enumerate(H):
        scores = []  # to store the k results for this h
        # for each fold 1..K
        for train_index, valid_index in folds:
            # partition the data in training and validation
            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

            # call cv_fun to train the model and compute performance
            scores.append(cv_fun(X_train, y_train, X_valid, y_valid, h))

        pr.append(pd.DataFrame(scores).mean(axis = 0))  # average scores across folds
        # '\r' returns the cursor to the start of the line so the progress message is overwritten in place
        print(f'Training models: {i+1}/{len(H)}, with hyperparameters: {h}', end='\t\t\r')

    return pd.DataFrame(pr).assign(_h = H)

In [26]:
def cv_rf_ntrees(X_train, y_train, X_valid, y_valid, h: dict):
    """
    Fit a random forest on one training split and score it on the matching validation split.

    :param X_train: features used to fit the model.
    :param y_train: labels used to fit the model.
    :param X_valid: features to predict on.
    :param y_valid: true labels the predictions are compared against.
    :param h: hyperparameter dictionary. Recognised keys are 'n_estimators', 'criterion' and
        'max_features'; a missing key falls back to 100, "gini" and "sqrt" respectively.
    :return: a dictionary with keys 'accuracy', 'precision', 'recall' and 'f'. The last three
        are macro-averaged.
    """
    c = RandomForestClassifier(
        n_estimators = h.get('n_estimators', 100),
        criterion = h.get('criterion', "gini"),
        max_features = h.get('max_features', "sqrt"),
        random_state = 54321,  # fixed seed, so scores differ only because of h and the data split
        n_jobs = -1,  # set n_jobs to speed up by using more CPU cores
    )
    m = c.fit(X_train, y_train)
    p = m.predict(X_valid)

    # Use the same averaging and the same zero_division policy for all three metrics, so an
    # undefined per-class score is counted as 0 consistently instead of only for precision.
    macro = {'average': 'macro', 'zero_division': 0}
    return {'accuracy': accuracy_score(y_valid, p),
            'precision': precision_score(y_valid, p, **macro),
            'recall': recall_score(y_valid, p, **macro),
            'f': f1_score(y_valid, p, **macro)}

In [27]:
from itertools import product

def get_H():
    """
    Lazily enumerate the random-forest hyperparameter grid.

    Yields one dictionary per combination of 'n_estimators', 'criterion' and
    'max_features' (cartesian product, with the last key varying fastest).
    """
    grid = {
        "n_estimators": range(5, 305, 20),
        "criterion": ["gini", "entropy", "log_loss"],
        "max_features": ["sqrt", "log2", None],
    }
    names = list(grid)
    # product() walks the value lists in key order, so zipping with the key names
    # rebuilds one complete hyperparameter dictionary per combination.
    for combo in product(*grid.values()):
        yield dict(zip(names, combo))

In [28]:
# Score every hyperparameter combination produced by get_H() with 10-fold stratified CV.
pr = kfold_cv(
    X = x_train,
    y = y_train,
    k = 10,
    H = list(get_H()),
    cv_fun = cv_rf_ntrees,
    random_state = 12345,
)
# Display the table of fold-averaged scores, one row per combination.
pr

Training models: 135/135, with hyperparameters: {'n_estimators': 285, 'criterion': 'log_loss', 'max_features': None}				

Unnamed: 0,accuracy,precision,recall,f,_h
0,0.886897,0.883692,0.874380,0.878416,"{'n_estimators': 5, 'criterion': 'gini', 'max_..."
1,0.873020,0.870142,0.857671,0.862873,"{'n_estimators': 5, 'criterion': 'gini', 'max_..."
2,0.932976,0.931844,0.925323,0.928191,"{'n_estimators': 5, 'criterion': 'gini', 'max_..."
3,0.886240,0.883210,0.873653,0.877720,"{'n_estimators': 5, 'criterion': 'entropy', 'm..."
4,0.874328,0.870504,0.860281,0.864688,"{'n_estimators': 5, 'criterion': 'entropy', 'm..."
...,...,...,...,...,...
130,0.930619,0.935831,0.916794,0.924706,"{'n_estimators': 285, 'criterion': 'entropy', ..."
131,0.959419,0.960491,0.953153,0.956532,"{'n_estimators': 285, 'criterion': 'entropy', ..."
132,0.936380,0.941042,0.923863,0.931113,"{'n_estimators': 285, 'criterion': 'log_loss',..."
133,0.930619,0.935831,0.916794,0.924706,"{'n_estimators': 285, 'criterion': 'log_loss',..."


In [37]:
# idxmax() returns an index *label*, so rows are looked up with .loc (label-based) rather
# than .iloc (position-based); the columns are selected by name for the same reason.
# NOTE(review): `id` shadows the Python builtin; the name is kept in case later cells read it.
id = pr['accuracy'].idxmax()
print(pr.loc[id, ['accuracy', '_h']])
id = pr['f'].idxmax()
# NOTE(review): this reports 'accuracy' for the row with the best 'f' — confirm whether
# printing 'f' was intended here.
print(pr.loc[id, ['accuracy', '_h']])
# Full (untruncated) hyperparameter dictionary of the row with the best 'f'.
print(pr.loc[id, '_h'])


accuracy                                             0.959419
_h          {'n_estimators': 285, 'criterion': 'entropy', ...
Name: 131, dtype: object
accuracy                                             0.959419
_h          {'n_estimators': 285, 'criterion': 'entropy', ...
Name: 131, dtype: object
{'n_estimators': 285, 'criterion': 'entropy', 'max_features': None}
