In [1]:
from sklearn.model_selection import train_test_split,cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor,HistGradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import seaborn as sns
import numpy as np
import pandas as pd
import xgboost as xgb

In [6]:
match_data = pd.read_csv('..\\Data\\matchData2.csv')
wins = pd.read_csv('..\\Data\\matchDataWins.csv')

In [73]:
# Realizo el split de los datos con un random state de 42 para que siempre se divida igual
predictors_train, predictors_test, wins_team1_train, wins_team1_test, wins_team2_train, wins_team2_test = train_test_split(predictors, win_team1, win_team2, test_size=0.2, random_state=42)

# Transformo los datos a formato csv
predictors_train.to_csv('..\\Data\\predictors_train.csv', index=False)
predictors_test.to_csv('..\\Data\\predictors_test.csv', index=False)
wins_team1_train.to_csv('..\\Data\\wins_team1_train.csv', index=False)
wins_team1_test.to_csv('..\\Data\\wins_team1_test.csv', index=False)
wins_team2_train.to_csv('..\\Data\\wins_team2_train.csv', index=False)
wins_team2_test.to_csv('..\\Data\\wins_team2_test.csv', index=False)

In [None]:
match_data = match_data.drop(['win_team1', 'win_team2'], axis=1)

In [7]:
match_data = match_data.drop(['firstBaronTeam1', 'firstBaronTeam2'], axis=1)
predictors = match_data
win_team1 = pd.DataFrame(wins['win_team1'])
win_team2 = pd.DataFrame(wins['win_team2'])

In [8]:
# Modelos utilizados
RF_classifier = RandomForestClassifier()
RF_regressor = RandomForestRegressor()
B_classifier = xgb.XGBClassifier()
B_regressor = HistGradientBoostingRegressor()

In [5]:
# Aplicamos Cross Validation a los modelos
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Realizar la validación cruzada y obtener los puntajes de rendimiento para la predicción de partidas
RF_scores_win = cross_val_score(RF_classifier, predictors, wins, cv=kfold, n_jobs=-1)
B_scores_win1 = cross_val_score(B_classifier, predictors, win_team1, cv=kfold, n_jobs=-1)
B_scores_win2 = cross_val_score(B_classifier, predictors, win_team2, cv=kfold, n_jobs=-1)


# Imprimir los resultados
print("Precisión de Validación Cruzada en RF:", RF_scores_win)
print("Precisión Promedio en RF: {:.2f}%".format(100 * RF_scores_win.mean()))
print("Precisión de Validación Cruzada en Boosting con victorias del equipo 1:", B_scores_win1)
print("Precisión Promedio en Boosting con victorias del equipo 1: {:.2f}%".format(100 * B_scores_win1.mean()))
print("Precisión de Validación Cruzada en Boosting con victorias del equipo 2:", B_scores_win2)
print("Precisión Promedio en Boosting con victorias del equipo 2: {:.2f}%".format(100 * B_scores_win2.mean()))

Precisión de Validación Cruzada en RF: [0.74705324 0.74904033 0.7479113  0.74402746 0.74600307]
Precisión Promedio en RF: 74.68%
Precisión de Validación Cruzada en Boosting con victorias del equipo 1: [0.75351127 0.75608544 0.75734995 0.75184031 0.75467437]
Precisión Promedio en Boosting con victorias del equipo 1: 75.47%
Precisión de Validación Cruzada en Boosting con victorias del equipo 2: [0.75351127 0.75608544 0.75734995 0.75184031 0.75467437]
Precisión Promedio en Boosting con victorias del equipo 2: 75.47%


In [9]:
game_duration = match_data['gameDuration']
match_data = match_data.drop(['gameDuration'], axis=1)
predictors = match_data

In [10]:
# Aplicamos Cross Validation a los modelos para la predicción de la duración de las partidas
RF_scores_time = cross_val_score(RF_regressor, predictors, game_duration, cv=kfold, n_jobs=-1, scoring='neg_mean_squared_error')
B_scores_time = cross_val_score(B_regressor, predictors, game_duration, cv=kfold, n_jobs=-1, scoring='neg_mean_squared_error')

# Imprimir los resultados
print("Error cuadratico medio de Validación Cruzada en RF:", -1 * RF_scores_time)
print("Error cuadratico medio Promedio en RF: {:.2f}".format(-1 * RF_scores_time.mean()))
print("Error cuadratico medio de Validación Cruzada en Boosting:", -1 * B_scores_time)
print("Error cuadratico medio Promedio en Boosting: {:.2f}".format(-1 * B_scores_time.mean()))

Error cuadratico medio de Validación Cruzada en RF: [103449.03486657 104784.67593615 104292.78388293 104216.90298291
 103801.5763174 ]
Error cuadratico medio Promedio en RF: 104108.99
Error cuadratico medio de Validación Cruzada en Boosting: [102085.94799173 102713.90984556 102927.14190557 102568.64864911
 101321.97685493]
Error cuadratico medio Promedio en Boosting: 102323.53


In [11]:
# Buscamos los mejores parametros para el modelo de Random Forest para clasificacion (te detona la compu si lo ejecutas)
param_distributions = { 
    'n_estimators': randint(50, 500),
    'max_depth': randint(1, 20),
    'max_features': randint(1, 20),
    'min_samples_leaf': randint(1, 20),
    'min_samples_split': randint(2, 20),
    'bootstrap': [True, False]
}

random_search = RandomizedSearchCV(RF_classifier, param_distributions=param_distributions, n_iter=25, cv=3, verbose=2, random_state=42, n_jobs=-1)
random_search.fit(predictors, wins)
random_search.best_params_

Fitting 3 folds for each of 25 candidates, totalling 75 fits


{'bootstrap': True,
 'max_depth': 15,
 'max_features': 11,
 'min_samples_leaf': 8,
 'min_samples_split': 8,
 'n_estimators': 171}

In [18]:
# Buscamos los mejores parametros para el modelo de Random Forest para regresion (te detona la compu si lo ejecutas)
param_distributions = { 
    'n_estimators': randint(50, 500),
    'max_depth': randint(1, 20),
    'max_features': randint(1, 20),
    'min_samples_leaf': randint(1, 20),
    'min_samples_split': randint(2, 20),
    'bootstrap': [True, False]
}

random_search = RandomizedSearchCV(RF_regressor, param_distributions=param_distributions, n_iter=25, cv=3, verbose=2, random_state=42, n_jobs=-1)
random_search.fit(predictors, game_duration)
random_search.best_params_

Fitting 3 folds for each of 25 candidates, totalling 75 fits


{'bootstrap': False,
 'max_depth': 18,
 'max_features': 8,
 'min_samples_leaf': 4,
 'min_samples_split': 3,
 'n_estimators': 439}

In [19]:
# Buscamos los mejores parametros para el modelo de Boosting para clasificacion (El resultado encontrado es peor que los parametros por defecto)
param_distributions = { 
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': randint(1, 20),
    'n_estimators': randint(50, 500),
    'gamma': [0, 0.1, 0.5, 1],
    'reg_alpha': [0, 0.1, 0.5, 1],
    'eta': [0.01, 0.1, 0.5, 1],
}

random_search = RandomizedSearchCV(B_classifier, param_distributions=param_distributions, n_iter=150, cv=5, verbose=2, random_state=42, n_jobs=-1)
random_search.fit(predictors, wins)
random_search.best_params_

Fitting 5 folds for each of 150 candidates, totalling 750 fits


{'eta': 0.5,
 'gamma': 1,
 'learning_rate': 0.01,
 'max_depth': 15,
 'n_estimators': 156,
 'reg_alpha': 1}

In [22]:
# Buscamos los mejores parametros para el modelo de Boosting para regresion 
param_distributions = { 
    'learning_rate': [0.01, 0.1, 0.5, 1],
    'max_depth': randint(1, 20),
    'max_iter': randint(1, 20),
    'max_leaf_nodes': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'l2_regularization': [0, 0.1, 0.5, 1]
}

random_search = RandomizedSearchCV(B_regressor, param_distributions=param_distributions, n_iter=500, cv=5, verbose=2, random_state=42, n_jobs=-1)
random_search.fit(predictors, game_duration)
random_search.best_params_

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


{'l2_regularization': 0.5,
 'learning_rate': 0.5,
 'max_depth': 10,
 'max_iter': 16,
 'max_leaf_nodes': 16,
 'min_samples_leaf': 15}