In [1]:
%load_ext autoreload
%autoreload 3

In [None]:
import os
import sys

sys.path.append(os.path.dirname(os.path.abspath(os.path.abspath(""))))

# Implementación de K-folds cross validation

[Cross-validation: evaluating estimator performance](https://scikit-learn.org/stable/modules/cross_validation.html#k-fold)

[Cross-Val-Score](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html#sklearn.model_selection.cross_val_score)

Se hizo una implementacion básica de Cross validation para calcular un score.

- Determinamos en cuantas partes queremos dividir nuestro dataset.

- De cada uno obtenemos una lista de scores, estos son errores medios cuadraticos de cada split.

- Sacamos la media de esos errores

- Obtenemos el valor final real sacando su valor absoluto.


## Importamos librerías


In [7]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error

## Cargamos datos:


In [2]:
dataset = pd.read_csv("../data/felicidad.csv")
data = dataset.drop(["country", "score", "rank"], axis=1)
target = dataset["score"]

## Implementación básica:


In [4]:
models = {"DecisionTreeRegressor": DecisionTreeRegressor()}
print("---- Easy Implementation ----")

for name, model in models.items():
    score = cross_val_score(model, data, target, cv=3, scoring="neg_mean_squared_error")
    print("Scores: ", score)
    print("Mean score: ", np.abs(np.mean(score)))

print("=" * 65)

---- Easy Implementation ----
Scores:  [-0.90219629 -0.15716254 -1.05852282]
Mean score:  0.7059605516801576


[Selection KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html?highlight=kfold#sklearn.model_selection.KFold)

Se hizo una implementación más sofisticada de K Fold Cross validation para calcular separar los datos en sus respectivos dataset y poder entrenar a nuestro modelo manualmente.


In [6]:
def train_test_split_kf(
    data: np.array, target: np.array, train: np.array, test: np.array
) -> np.array:
    X_train = data[train]
    X_test = data[test]
    y_train = target[train]
    y_test = target[test]
    return X_train, X_test, y_train, y_test

In [8]:
def evaluate_model(model, metric, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = metric(y_test, y_pred)
    return score

In [9]:
kf = KFold(n_splits=3, shuffle=True, random_state=42)
scores = []
print("---- Full Implementation ----")
for name, model in models.items():
    print(f"I'm evaluating {name}")
    for n_fold, (train, test) in enumerate(kf.split(data)):
        print(f"\tI'm running fold {n_fold + 1}")
        X_train, X_test, y_train, y_test = train_test_split_kf(
            data.values, target.values, train, test
        )
        score = evaluate_model(
            model, mean_squared_error, X_train, X_test, y_train, y_test
        )
        print("\t\t- Score: ", score)
        scores.append(score)
    print("=" * 65)
    print("Score: ", scores)
    print("Mean score: ", np.mean(scores))

---- Full Implementation ----
I'm evaluating DecisionTreeRegressor
	I'm running fold 1
		- Score:  0.007391197122189316
	I'm running fold 2
		- Score:  0.009079884722485148
	I'm running fold 3
		- Score:  0.006202626635767081
Score:  [0.007391197122189316, 0.009079884722485148, 0.006202626635767081]
Mean score:  0.007557902826813848
