In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,KFold, LeaveOneOut, TimeSeriesSplit, RepeatedKFold

# Modelos de regresion lineal
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
from sklearn.svm import SVR


## Metricas
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error, classification_report,accuracy_score

## Modelos de clasificacion
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

### SELECCION ENTRE DIEZ MODELOS DE CLASIFICACION CON VALIDACIONES

### Recoleccion y division de datos de datos

In [2]:
df = pd.read_csv('../4.3 csv_prep_ml/df_ml.csv')
target_clas = 'next_class_encoded'
target_reg = 'next_score'
X = df.drop(columns=[target_clas, target_reg], axis = 1)
y = df[target_clas]

### Lista de modelos a probar

In [13]:
models = [
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42, n_jobs=-1)),
    ('SVC', SVC(random_state=42)),
    ('KNN', KNeighborsClassifier(n_jobs=-1)),
    ('Logistic Regression', LogisticRegression(random_state=42, max_iter=500, n_jobs=-1)),
    ('Naive Bayes', GaussianNB()),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('MLP', MLPClassifier(random_state=42, max_iter=1000)),
    ('Extra Trees', ExtraTreesClassifier(random_state=42, n_jobs=-1)),
    ('Ridge Classifier', RidgeClassifier(random_state=42))
]

### Funcion para evaluar modelo con diferentes tecnicas de validacion

In [14]:
def evaluate_model(model, X, y):
    results = {}
    
    # Holdout
    accuracies = []
    for i in range(50):
        X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.80)
        model.fit(X_train, y_train)
        y_pred_test = model.predict(X_test)
        accuracy_test = accuracy_score(y_test, y_pred_test)
        accuracies.append(accuracy_test)
    results['Holdout'] = np.mean(accuracies)
    
    # K-Fold Cross Validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracies = []
    for train_index, val_index in kf.split(X):
        X_train_kf, X_val_kf = X.iloc[train_index], X.iloc[val_index]
        y_train_kf, y_val_kf = y.iloc[train_index], y.iloc[val_index]
        model.fit(X_train_kf, y_train_kf)
        y_pred_kf = model.predict(X_val_kf)
        accuracies.append(accuracy_score(y_val_kf, y_pred_kf))
    results['K-Fold'] = np.mean(accuracies)
    
    # Leave-One-Out Cross Validation (DEMASIADO TIEMPO DE PROCESAMIENTO!)
    #loo = LeaveOneOut()
    #accuracies = []
    #for train_index, val_index in loo.split(X):
    #    X_train_loo, X_val_loo = X.iloc[train_index], X.iloc[val_index]
    #    y_train_loo, y_val_loo = y.iloc[train_index], y.iloc[val_index]
    #    model.fit(X_train_loo, y_train_loo)
    #    y_pred_loo = model.predict(X_val_loo)
    #    accuracies.append(accuracy_score(y_val_loo, y_pred_loo))
    #results['Leave-One-Out'] = np.mean(accuracies)
    
    # Time Series Split
    tscv = TimeSeriesSplit()
    accuracies = []
    for train_index, val_index in tscv.split(X):
        X_train_tscv, X_val_tscv = X.iloc[train_index], X.iloc[val_index]
        y_train_tscv, y_val_tscv = y.iloc[train_index], y.iloc[val_index]
        model.fit(X_train_tscv, y_train_tscv)
        y_pred_tscv = model.predict(X_val_tscv)
        accuracies.append(accuracy_score(y_val_tscv, y_pred_tscv))
    results['TimeSeriesSplit'] = np.mean(accuracies)
    
    return results

## Evaluar todos los modelos

In [15]:
model_results = {}
for name, model in models:
    results = evaluate_model(model, X, y)
    model_results[name] = results
    print(f"{name}: {results}")

Decision Tree: {'Holdout': 0.8937101449275363, 'K-Fold': 0.9039469089838243, 'TimeSeriesSplit': 0.9013937282229965}
Random Forest: {'Holdout': 0.9244347826086957, 'K-Fold': 0.9245484949832775, 'TimeSeriesSplit': 0.9240418118466899}
SVC: {'Holdout': 0.9142028985507247, 'K-Fold': 0.9149727603542205, 'TimeSeriesSplit': 0.9076655052264808}
KNN: {'Holdout': 0.9189565217391304, 'K-Fold': 0.9184556488083968, 'TimeSeriesSplit': 0.9153310104529616}
Logistic Regression: {'Holdout': 0.9199130434782608, 'K-Fold': 0.9216465787425591, 'TimeSeriesSplit': 0.9080139372822298}
Naive Bayes: {'Holdout': 0.5627246376811594, 'K-Fold': 0.5679039145158915, 'TimeSeriesSplit': 0.2337979094076655}
Gradient Boosting: {'Holdout': 0.9173913043478259, 'K-Fold': 0.9193256347152984, 'TimeSeriesSplit': 0.9149825783972126}
MLP: {'Holdout': 0.9064927536231884, 'K-Fold': 0.9085858522117751, 'TimeSeriesSplit': 0.8979094076655052}
Extra Trees: {'Holdout': 0.925710144927536, 'K-Fold': 0.9259986117246166, 'TimeSeriesSplit': 0

## Seleccion de los dos mejores modelos segun el puntaje

In [16]:
average_accuracies = {name: np.mean(list(scores.values())) for name, scores in model_results.items()}
best_models = sorted(average_accuracies, key=average_accuracies.get, reverse=True)[:2]
print(f"Los dos mejores modelos: {best_models}")

Los dos mejores modelos: ['Extra Trees', 'Random Forest']


## SELECCION ENTRE DIEZ MODELOS DE REGRESION CON VALIDACIONES

In [17]:
%%time
df_preprocessed = pd.read_csv('../4.3 csv_prep_ml/df_ml.csv')
X = df_preprocessed.drop(columns=['next_score']).values
y = df_preprocessed['next_score'].values

# Lista de modelos a probar
models = [
    ('Linear Regression', LinearRegression(n_jobs=-1)),
    ('Ridge', Ridge(random_state=42)),
    ('Lasso', Lasso(random_state=42)),
    ('ElasticNet', ElasticNet(random_state=42)),
    ('Bayesian Ridge', BayesianRidge()),
    ('Random Forest', RandomForestRegressor(random_state=42, n_jobs=-1)),
    ('SVR', SVR()),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42)),
    ('Hist Gradient Boosting', HistGradientBoostingRegressor(random_state=42)),
    ('Extra Trees', ExtraTreesRegressor(random_state=42, n_jobs=-1))
]
# Función para evaluar el modelo con diferentes técnicas de validación
def evaluate_model(model, X, y):
    results = {}
    
    # Holdout
    accuracies = []
    for i in range(50):
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, random_state=i)
        model.fit(X_train, y_train)
        y_pred_test = model.predict(X_test)
        r2_test = r2_score(y_test, y_pred_test)
        mae_test = mean_absolute_error(y_test, y_pred_test)
        accuracies.append((r2_test, mae_test))
    results['Holdout'] = {'R2': np.mean([a[0] for a in accuracies]), 'MAE': np.mean([a[1] for a in accuracies])}
    
    # K-Fold Cross Validation
    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    r2_scores = []
    mae_scores = []
    for train_index, val_index in kf.split(X):
        X_train_kf, X_val_kf = X[train_index], X[val_index]
        y_train_kf, y_val_kf = y[train_index], y[val_index]
        model.fit(X_train_kf, y_train_kf)
        y_pred_kf = model.predict(X_val_kf)
        r2_scores.append(r2_score(y_val_kf, y_pred_kf))
        mae_scores.append(mean_absolute_error(y_val_kf, y_pred_kf))
    results['K-Fold'] = {'R2': np.mean(r2_scores), 'MAE': np.mean(mae_scores)}
    
    # Repeated K-Fold Cross Validation
    rkf = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)
    r2_scores_rkf = []
    mae_scores_rkf = []
    for train_index, val_index in rkf.split(X):
        X_train_rkf, X_val_rkf = X[train_index], X[val_index]
        y_train_rkf, y_val_rkf = y[train_index], y[val_index]
        model.fit(X_train_rkf, y_train_rkf)
        y_pred_rkf = model.predict(X_val_rkf)
        r2_scores_rkf.append(r2_score(y_val_rkf, y_pred_rkf))
        mae_scores_rkf.append(mean_absolute_error(y_val_rkf, y_pred_rkf))
    results['Repeated K-Fold'] = {'R2': np.mean(r2_scores_rkf), 'MAE': np.mean(mae_scores_rkf)}
    
    return results

# Evaluar todos los modelos
model_results = {}
for name, model in models:
    results = evaluate_model(model, X, y)
    model_results[name] = results
    print(f"{name}: {results}")

# Seleccionar los dos mejores modelos basados en el R2 promedio (mayor es mejor)
average_r2 = {name: np.mean([scores['K-Fold']['R2'], scores['Repeated K-Fold']['R2']]) for name, scores in model_results.items()}
best_models = sorted(average_r2, key=average_r2.get, reverse=True)[:2]
print(f"Los dos mejores modelos: {best_models}")

Linear Regression: {'Holdout': {'R2': -1.8602320112067177e+18, 'MAE': 277300798.2430181}, 'K-Fold': {'R2': -1.442444024833319e+17, 'MAE': 95428550.5573503}, 'Repeated K-Fold': {'R2': -8.396359857595414e+18, 'MAE': 457347688.0636404}}
Ridge: {'Holdout': {'R2': 0.6501197041477269, 'MAE': 6.5390555623663715}, 'K-Fold': {'R2': 0.6539770049131249, 'MAE': 6.476879080410025}, 'Repeated K-Fold': {'R2': 0.6535813427719188, 'MAE': 6.469693274321204}}
Lasso: {'Holdout': {'R2': 0.5177204328911443, 'MAE': 7.606174804124723}, 'K-Fold': {'R2': 0.5251808154536912, 'MAE': 7.535211326854449}, 'Repeated K-Fold': {'R2': 0.523624657577252, 'MAE': 7.543520487643691}}
ElasticNet: {'Holdout': {'R2': 0.4080862550833421, 'MAE': 8.13218402226516}, 'K-Fold': {'R2': 0.41384970692326994, 'MAE': 8.087915595968711}, 'Repeated K-Fold': {'R2': 0.41341038712543104, 'MAE': 8.089355044042913}}
Bayesian Ridge: {'Holdout': {'R2': 0.6502666964276881, 'MAE': 6.542433154917656}, 'K-Fold': {'R2': 0.6549560935629435, 'MAE': 6.47