# GENERACIÓN DE MODELOS

##### Autor: 
* Javier Tomás Fernández Martín

## 0. Preliminares

Antes de empezar con el código, importamos todas las librerias que vamos a necesitar.

In [1]:
import pandas as pd
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
from sklearn.linear_model import Perceptron
from sklearn.metrics import check_scoring
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, train_test_split
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier


from sklearn.compose import make_column_transformer


from keras import layers
from keras import models
import tensorflow.keras as keras
import tensorflow as tf



Además de definir una semilla para asegurar que sea reproducible

In [2]:
random_state = 42


Y definimos un par de funciones que usaremos para evaluar módelos con distintos parámetros y compararlos entre sí

In [3]:
def optimize_params(estimator, X, y, cv, scoring=None, refit=True, **param_grid):
    """Exhaustive search over specified parameter values for an estimator."""
    grid_search_cv = GridSearchCV(estimator,
                                  param_grid,
                                  scoring=scoring,
                                  refit=refit,
                                  cv=cv,
                                  return_train_score=True).fit(X, y)

    cv_results = pd.DataFrame(grid_search_cv.cv_results_)

    # Drop the results for each validation split and sort by the refit metric
    labels = cv_results.filter(regex="split")
    by = cv_results.filter(regex="rank_test").columns[0]
    cv_results = cv_results.drop(labels, axis=1).sort_values(by)

    display(cv_results)

    return grid_search_cv

## 1.Carga de datos

In [4]:
df = pd.read_csv('bbdd.csv', index_col=0)


  df = pd.read_csv('bbdd.csv', index_col=0)


Mostramos una pequeña muestra para asegurar que se ha cargado bien y de paso echarle un vistazo a nuestra BBDD

In [5]:
df.sample(5, random_state=random_state)

Unnamed: 0,P1_TFT8_Ashe_Tier,P1_TFT8_Ashe_Obj1,P1_TFT8_Ashe_Obj2,P1_TFT8_Ashe_Obj3,P1_TFT8_Blitzcrank_Tier,P1_TFT8_Blitzcrank_Obj1,P1_TFT8_Blitzcrank_Obj2,P1_TFT8_Blitzcrank_Obj3,P1_TFT8_Galio_Tier,P1_TFT8_Galio_Obj1,...,P2_TFT8_Syndra_Obj2,P2_TFT8_Syndra_Obj3,P2_TFT8_Urgot_Tier,P2_TFT8_Urgot_Obj1,P2_TFT8_Urgot_Obj2,P2_TFT8_Urgot_Obj3,P2_Augment1,P2_Augment2,P2_Augment3,P1_Win
3883,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,TFT6_Augment_SecondWind1,TFT6_Augment_TradeSectorPlus,TFT8_Augment_KaisaCarry,1
2439,0,0,0,0,3,TFT_Item_BrambleVest,TFT_Item_IonicSpark,TFT8_Item_GenAEEmblemItem,0,0,...,TFT_Item_StatikkShiv,0,0,0,0,0,TFT8_Augment_HeartTrait,TFT8_Augment_AnnieSupport,TFT8_Augment_GenAEEmblem,0
1786,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,TFT6_Augment_SecondWind2,TFT6_Augment_Electrocharge2,TFT8_Augment_NunuSupport,1
2329,0,0,0,0,2,0,0,0,0,0,...,0,0,2,0,0,0,TFT6_Augment_ClearMind,TFT6_Augment_PortableForge,TFT8_Augment_ApheliosCarry,1
2567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,TFT6_Augment_GrandGambler,TFT7_Augment_Preparation2,TFT8_Augment_SonaExile,0


Convertimos todas las columnas categóricas en tipo de datos de cadena de texto y separar las columnas numéricas y categóricas, para después tenerlas accesibles


In [6]:

cat_cols = [col for col in df.columns if 'Obj' in col or 'Augment' in col]
df[cat_cols] = df[cat_cols].astype(str)



num_cols = [col for col in df.columns if col not in cat_cols + ['P1_Win']]
cat_cols_idx = [df.columns.get_loc(col) for col in cat_cols]


Divido los datos

In [7]:

X = df.drop('P1_Win', axis=1)
X = X.filter(like='Tier')
y = df['P1_Win']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state, stratify=y)

In [8]:

cat_cols = [col for col in X.columns if 'Obj' in col or 'Augment' in col]



num_cols = [col for col in X.columns if col not in cat_cols + ['P1_Win']]

Y comprobamos que se han cargado bien

In [9]:
X.shape

(4263, 118)

In [10]:
X_train.sample(5, random_state=random_state)

Unnamed: 0,P1_TFT8_Ashe_Tier,P1_TFT8_Blitzcrank_Tier,P1_TFT8_Galio_Tier,P1_TFT8_Gangplank_Tier,P1_TFT8_Kayle_Tier,P1_TFT8_Lulu_Tier,P1_TFT8_Nasus_Tier,P1_TFT8_Lux_Tier,P1_TFT8_Poppy_Tier,P1_TFT8_Renekton_Tier,...,P2_TFT8_Zac_Tier,P2_TFT8_Zed_Tier,P2_TFT8_Aphelios_Tier,P2_TFT8_Fiddlesticks_Tier,P2_TFT8_Janna_Tier,P2_TFT8_Leona_Tier,P2_TFT8_Mordekaiser_Tier,P2_TFT8_Nunu_Tier,P2_TFT8_Syndra_Tier,P2_TFT8_Urgot_Tier
1248,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,2,0
3730,0,0,0,2,0,0,0,0,0,0,...,2,0,0,1,0,1,0,2,0,1
879,2,0,0,0,0,0,0,0,0,2,...,0,2,0,2,1,0,0,0,0,0
4143,0,2,0,0,0,0,0,0,0,0,...,0,0,1,2,0,1,0,0,0,1


In [11]:
y_train.sample(5, random_state=random_state)

1248    0
12      1
3730    1
879     0
4143    1
Name: P1_Win, dtype: int64

In [12]:
X_test.sample(5, random_state=random_state)

Unnamed: 0,P1_TFT8_Ashe_Tier,P1_TFT8_Blitzcrank_Tier,P1_TFT8_Galio_Tier,P1_TFT8_Gangplank_Tier,P1_TFT8_Kayle_Tier,P1_TFT8_Lulu_Tier,P1_TFT8_Nasus_Tier,P1_TFT8_Lux_Tier,P1_TFT8_Poppy_Tier,P1_TFT8_Renekton_Tier,...,P2_TFT8_Zac_Tier,P2_TFT8_Zed_Tier,P2_TFT8_Aphelios_Tier,P2_TFT8_Fiddlesticks_Tier,P2_TFT8_Janna_Tier,P2_TFT8_Leona_Tier,P2_TFT8_Mordekaiser_Tier,P2_TFT8_Nunu_Tier,P2_TFT8_Syndra_Tier,P2_TFT8_Urgot_Tier
1531,0,2,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
3155,0,0,2,3,0,0,3,0,0,0,...,0,0,1,0,2,1,0,0,0,0
1264,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
637,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
968,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,1,0,2,0


In [13]:
y_test.sample(5, random_state=random_state)

1531    1
3155    1
1264    0
637     1
968     1
Name: P1_Win, dtype: int64

## 2. Generación de modelos

Primero vamos a crear el preprocesamiento que vamos a aplicar a los datos. 

In [14]:
preprocessor = make_column_transformer(
    ((MinMaxScaler(feature_range=(0, 1))), num_cols),
    (OneHotEncoder(handle_unknown="ignore", sparse=False), cat_cols),
    remainder="passthrough"
)

Voy a utilizar la técnica de validación cruzada estratificada de 1 x 10

In [15]:
n_splits = 10
n_repeats = 1

cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)


### 2.1 DECISION TREE 

Crear el preprocesador y el pipeline

In [20]:
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', DecisionTreeClassifier())
    ])


Entrenar, predecir y evaluar

In [58]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('minmaxscaler',
                                                  MinMaxScaler(),
                                                  ['P1_TFT8_Ashe_Tier',
                                                   'P1_TFT8_Blitzcrank_Tier',
                                                   'P1_TFT8_Galio_Tier',
                                                   'P1_TFT8_Gangplank_Tier',
                                                   'P1_TFT8_Kayle_Tier',
                                                   'P1_TFT8_Lulu_Tier',
                                                   'P1_TFT8_Nasus_Tier',
                                                   'P1_TFT8_Lux_Tier',
                                                   'P1_TFT8_Poppy_Tier',
                                                   'P1_TFT8_Renekton_Tier',
                                

In [22]:
accTree = pipeline.score(X_test, y_test)
print('Train score:', pipeline.score(X_train, y_train))
print('Test score:', accTree)

Train score: 1.0
Test score: 0.5592028135990621


In [60]:
y_pred = pipeline.predict(X_test)

In [61]:
print(sklearn.metrics.confusion_matrix(y_test,y_pred))

[[254 173]
 [197 229]]


Una vez tenemos la estructura de un arbol básico, vamos a probar a evaluarlo con distintos parámetros

In [23]:
pipeline = make_pipeline(preprocessor, DecisionTreeClassifier(random_state=random_state))

In [24]:
criterion = ["gini", "entropy"]
max_depth = [1, 2, 3, 4, 5, 6, None]
ccp_alpha = [0.0, 0.01, 0.02, 0.03, 0.04, 0.05]

decision_tree_classifier = optimize_params(pipeline, X_train, y_train, cv, decisiontreeclassifier__criterion=criterion, decisiontreeclassifier__max_depth=max_depth, decisiontreeclassifier__ccp_alpha=ccp_alpha)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_decisiontreeclassifier__ccp_alpha,param_decisiontreeclassifier__criterion,param_decisiontreeclassifier__max_depth,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
3,0.017366,0.001001,0.002407,0.000497,0.0,gini,4,"{'decisiontreeclassifier__ccp_alpha': 0.0, 'de...",0.571261,0.022094,1,0.591919,0.008649
10,0.019621,0.001241,0.002802,0.000400,0.0,entropy,4,"{'decisiontreeclassifier__ccp_alpha': 0.0, 'de...",0.567449,0.021076,2,0.588759,0.008446
12,0.024181,0.001408,0.002902,0.000300,0.0,entropy,6,"{'decisiontreeclassifier__ccp_alpha': 0.0, 'de...",0.567155,0.018370,3,0.614109,0.010874
13,0.052745,0.002628,0.002849,0.000553,0.0,entropy,,"{'decisiontreeclassifier__ccp_alpha': 0.0, 'de...",0.564223,0.027858,4,1.000000,0.000000
5,0.023322,0.001419,0.002403,0.000490,0.0,gini,6,"{'decisiontreeclassifier__ccp_alpha': 0.0, 'de...",0.562170,0.019935,5,0.629651,0.011379
...,...,...,...,...,...,...,...,...,...,...,...,...,...
47,0.024104,0.001141,0.002803,0.000601,0.03,gini,6,"{'decisiontreeclassifier__ccp_alpha': 0.03, 'd...",0.500880,0.001173,22,0.500880,0.000130
48,0.049881,0.002888,0.002901,0.000700,0.03,gini,,"{'decisiontreeclassifier__ccp_alpha': 0.03, 'd...",0.500880,0.001173,22,0.500880,0.000130
49,0.015400,0.001267,0.002602,0.000490,0.03,entropy,1,"{'decisiontreeclassifier__ccp_alpha': 0.03, 'd...",0.500880,0.001173,22,0.500880,0.000130
51,0.019409,0.001322,0.002903,0.000831,0.03,entropy,3,"{'decisiontreeclassifier__ccp_alpha': 0.03, 'd...",0.500880,0.001173,22,0.500880,0.000130


In [62]:
decision_tree_classifier

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=1, n_splits=10, random_state=42),
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('minmaxscaler',
                                                                         MinMaxScaler(),
                                                                         ['P1_TFT8_Ashe_Tier',
                                                                          'P1_TFT8_Blitzcrank_Tier',
                                                                          'P1_TFT8_Galio_Tier',
                                                                          'P1_TFT8_Gangplank_Tier',
                                                                          'P1_TFT8_Kayle_Tier',
                                                                          'P1_TFT8_Lulu_Tier',
           

### 2.2 CAT BOOST

In [25]:
catBoost = CatBoostClassifier(
    learning_rate=0.1,
    random_state=random_state,
    l2_leaf_reg = 0.2,
    cat_features=cat_cols)



catBoost.fit(X_train, y_train)

0:	learn: 0.6887639	total: 150ms	remaining: 2m 29s
1:	learn: 0.6837628	total: 152ms	remaining: 1m 15s
2:	learn: 0.6785010	total: 154ms	remaining: 51.3s
3:	learn: 0.6729765	total: 156ms	remaining: 39s
4:	learn: 0.6693966	total: 159ms	remaining: 31.6s
5:	learn: 0.6655104	total: 160ms	remaining: 26.6s
6:	learn: 0.6618946	total: 163ms	remaining: 23.1s
7:	learn: 0.6583248	total: 165ms	remaining: 20.5s
8:	learn: 0.6551885	total: 167ms	remaining: 18.4s
9:	learn: 0.6513089	total: 173ms	remaining: 17.1s
10:	learn: 0.6479270	total: 175ms	remaining: 15.7s
11:	learn: 0.6445619	total: 177ms	remaining: 14.6s
12:	learn: 0.6407275	total: 179ms	remaining: 13.6s
13:	learn: 0.6380312	total: 181ms	remaining: 12.8s
14:	learn: 0.6363535	total: 183ms	remaining: 12s
15:	learn: 0.6340796	total: 185ms	remaining: 11.4s
16:	learn: 0.6314014	total: 187ms	remaining: 10.8s
17:	learn: 0.6290790	total: 189ms	remaining: 10.3s
18:	learn: 0.6269138	total: 191ms	remaining: 9.87s
19:	learn: 0.6243665	total: 193ms	remaining

<catboost.core.CatBoostClassifier at 0x262492b9510>

In [26]:
accCat = catBoost.score(X_test, y_test)

print('Train score:', catBoost.score(X_train, y_train))
print('Test score:', accCat)

Train score: 1.0
Test score: 0.6412661195779601


### 2.3 Adaptive boosting

In [16]:
adaboost_model = AdaBoostClassifier(random_state=random_state)


pipelineAdaBoost = make_pipeline(preprocessor, adaboost_model)

In [17]:
pipelineAdaBoost.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('minmaxscaler',
                                                  MinMaxScaler(),
                                                  ['P1_TFT8_Ashe_Tier',
                                                   'P1_TFT8_Blitzcrank_Tier',
                                                   'P1_TFT8_Galio_Tier',
                                                   'P1_TFT8_Gangplank_Tier',
                                                   'P1_TFT8_Kayle_Tier',
                                                   'P1_TFT8_Lulu_Tier',
                                                   'P1_TFT8_Nasus_Tier',
                                                   'P1_TFT8_Lux_Tier',
                                                   'P1_TFT8_Poppy_Tier',
                                                   'P1_TFT8_Renekton_Tier',
                                

In [18]:
accAdaBoost = pipelineAdaBoost.score(X_test, y_test)

print('Train score:', pipelineAdaBoost.score(X_train, y_train))
print('Test score:', accAdaBoost)

Train score: 0.6777126099706745
Test score: 0.6565064478311841


In [49]:

base_estimator = DecisionTreeClassifier(random_state=random_state)
base_estimator = [base_estimator]
n_estimators = [20, 50, 100]
learning_rate = [0.95, 1.0]
max_depth = [1, 2, 3]
criterion = ["gini", "entropy"]


adaboost_classifier = optimize_params(pipelineAdaBoost, X_train, y_train, cv, adaboostclassifier__base_estimator=base_estimator, adaboostclassifier__n_estimators=n_estimators, adaboostclassifier__learning_rate=learning_rate, adaboostclassifier__base_estimator__criterion = criterion, adaboostclassifier__base_estimator__max_depth=max_depth)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_adaboostclassifier__base_estimator,param_adaboostclassifier__base_estimator__criterion,param_adaboostclassifier__base_estimator__max_depth,param_adaboostclassifier__learning_rate,param_adaboostclassifier__n_estimators,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
5,0.393981,0.00448,0.014693,0.000615,"DecisionTreeClassifier(max_depth=1, random_sta...",gini,1,1.0,100,{'adaboostclassifier__base_estimator': Decisio...,0.654311,0.024568,1,0.697797,0.004278
2,0.405893,0.020678,0.015014,0.001416,"DecisionTreeClassifier(max_depth=1, random_sta...",gini,1,0.95,100,{'adaboostclassifier__base_estimator': Decisio...,0.652962,0.024158,2,0.697009,0.003853
23,0.395099,0.00736,0.014673,0.000587,"DecisionTreeClassifier(max_depth=1, random_sta...",entropy,1,1.0,100,{'adaboostclassifier__base_estimator': Decisio...,0.652903,0.024674,3,0.697322,0.003697
20,0.393197,0.003925,0.014633,0.000525,"DecisionTreeClassifier(max_depth=1, random_sta...",entropy,1,0.95,100,{'adaboostclassifier__base_estimator': Decisio...,0.652317,0.02389,4,0.697133,0.003492
1,0.206189,0.011798,0.008608,0.00104,"DecisionTreeClassifier(max_depth=1, random_sta...",gini,1,0.95,50,{'adaboostclassifier__base_estimator': Decisio...,0.641701,0.023582,5,0.678873,0.004661
4,0.204126,0.005625,0.008548,0.000499,"DecisionTreeClassifier(max_depth=1, random_sta...",gini,1,1.0,50,{'adaboostclassifier__base_estimator': Decisio...,0.641173,0.026411,6,0.677856,0.005158
19,0.200922,0.003818,0.008368,0.000481,"DecisionTreeClassifier(max_depth=1, random_sta...",entropy,1,0.95,50,{'adaboostclassifier__base_estimator': Decisio...,0.640528,0.025787,7,0.678488,0.004846
25,0.288382,0.004216,0.008428,0.000494,"DecisionTreeClassifier(max_depth=1, random_sta...",entropy,2,0.95,50,{'adaboostclassifier__base_estimator': Decisio...,0.640352,0.021161,8,0.740085,0.005489
22,0.200502,0.002151,0.008488,0.000539,"DecisionTreeClassifier(max_depth=1, random_sta...",entropy,1,1.0,50,{'adaboostclassifier__base_estimator': Decisio...,0.639531,0.027733,9,0.677067,0.005386
7,0.29061,0.005712,0.008748,0.000845,"DecisionTreeClassifier(max_depth=1, random_sta...",gini,2,0.95,50,{'adaboostclassifier__base_estimator': Decisio...,0.638827,0.025358,10,0.742092,0.005296


### 2.4 GRADIENT BOOSTING


In [31]:
gradient_boosting_model = GradientBoostingClassifier(random_state=random_state)


pipelineGradientBoost = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', gradient_boosting_model)
    ])


In [32]:
pipelineGradientBoost.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('minmaxscaler',
                                                  MinMaxScaler(),
                                                  ['P1_TFT8_Ashe_Tier',
                                                   'P1_TFT8_Blitzcrank_Tier',
                                                   'P1_TFT8_Galio_Tier',
                                                   'P1_TFT8_Gangplank_Tier',
                                                   'P1_TFT8_Kayle_Tier',
                                                   'P1_TFT8_Lulu_Tier',
                                                   'P1_TFT8_Nasus_Tier',
                                                   'P1_TFT8_Lux_Tier',
                                                   'P1_TFT8_Poppy_Tier',
                                                   'P1_TFT8_Renekton_Tier',
                                     

In [33]:
accGradientBoost = pipelineGradientBoost.score(X_test, y_test)


print('Train score:', pipelineGradientBoost.score(X_train, y_train))
print('Test score:', accGradientBoost)

Train score: 0.7390029325513197
Test score: 0.6471277842907386


### 2.5 HISTOGRAM GRADIENT BOOSTING

In [34]:
hist_gradient_boosting_model = HistGradientBoostingClassifier(random_state=random_state)


pipelineHist = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', hist_gradient_boosting_model)
    ])


In [35]:
pipelineHist.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('minmaxscaler',
                                                  MinMaxScaler(),
                                                  ['P1_TFT8_Ashe_Tier',
                                                   'P1_TFT8_Blitzcrank_Tier',
                                                   'P1_TFT8_Galio_Tier',
                                                   'P1_TFT8_Gangplank_Tier',
                                                   'P1_TFT8_Kayle_Tier',
                                                   'P1_TFT8_Lulu_Tier',
                                                   'P1_TFT8_Nasus_Tier',
                                                   'P1_TFT8_Lux_Tier',
                                                   'P1_TFT8_Poppy_Tier',
                                                   'P1_TFT8_Renekton_Tier',
                                     

In [36]:
accHist = pipelineHist.score(X_test, y_test)


print('Train score:', pipelineHist.score(X_train, y_train))
print('Test score:', accHist)

Train score: 0.895307917888563
Test score: 0.6283704572098476


### 2.6 VECINOS MÁS CERCANOS

In [43]:
n_neighbors = 5
k_neighbors_model = KNeighborsClassifier(n_neighbors)

pipelineKNN = make_pipeline(preprocessor, k_neighbors_model)

In [44]:
pipelineKNN.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('minmaxscaler',
                                                  MinMaxScaler(),
                                                  ['P1_TFT8_Ashe_Tier',
                                                   'P1_TFT8_Blitzcrank_Tier',
                                                   'P1_TFT8_Galio_Tier',
                                                   'P1_TFT8_Gangplank_Tier',
                                                   'P1_TFT8_Kayle_Tier',
                                                   'P1_TFT8_Lulu_Tier',
                                                   'P1_TFT8_Nasus_Tier',
                                                   'P1_TFT8_Lux_Tier',
                                                   'P1_TFT8_Poppy_Tier',
                                                   'P1_TFT8_Renekton_Tier',
                                

In [45]:
accKNN = pipelineKNN.score(X_test, y_test)

print('Train score:', pipelineKNN.score(X_train, y_train))
print('Test score:', accKNN)

Train score: 0.7281524926686217
Test score: 0.5967174677608441


Como el número de vecinos puede ser muy relevante a la hora de evaluar este tipo de modelos, merece la pena comprobar que no se puede mejorar el resultado obtenido optimizando dicho parámetro

In [46]:
n_splits = 10
n_repeats = 5

cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

In [48]:
n_neighbors = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
weights = ["uniform", "distance"]

k_neighbors_classifier = optimize_params(pipelineKNN, X_train, y_train,cv, kneighborsclassifier__weights=weights, kneighborsclassifier__n_neighbors=n_neighbors)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsclassifier__n_neighbors,param_kneighborsclassifier__weights,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
11,0.009745,0.001175,0.023698,0.003333,10,distance,"{'kneighborsclassifier__n_neighbors': 10, 'kne...",0.584282,0.02275,1,1.0,0.0
7,0.009699,0.001097,0.025045,0.002095,8,distance,"{'kneighborsclassifier__n_neighbors': 8, 'knei...",0.579765,0.0214,2,1.0,0.0
4,0.009506,0.001005,0.029902,0.003236,7,uniform,"{'kneighborsclassifier__n_neighbors': 7, 'knei...",0.579589,0.02181,3,0.700124,0.004062
9,0.010199,0.002021,0.024839,0.00179,9,distance,"{'kneighborsclassifier__n_neighbors': 9, 'knei...",0.579355,0.021447,4,1.0,0.0
5,0.009312,0.00083,0.025602,0.003309,7,distance,"{'kneighborsclassifier__n_neighbors': 7, 'knei...",0.579238,0.02175,5,1.0,0.0
15,0.009448,0.00085,0.023372,0.001326,12,distance,"{'kneighborsclassifier__n_neighbors': 12, 'kne...",0.57824,0.022592,6,1.0,0.0
13,0.009718,0.001257,0.023236,0.00184,11,distance,"{'kneighborsclassifier__n_neighbors': 11, 'kne...",0.577713,0.021693,7,1.0,0.0
8,0.009045,0.000762,0.02893,0.00213,9,uniform,"{'kneighborsclassifier__n_neighbors': 9, 'knei...",0.577654,0.021498,8,0.684503,0.004511
3,0.009098,0.000695,0.025738,0.002751,6,distance,"{'kneighborsclassifier__n_neighbors': 6, 'knei...",0.577126,0.022457,9,1.0,0.0
12,0.009387,0.000942,0.027304,0.001372,11,uniform,"{'kneighborsclassifier__n_neighbors': 11, 'kne...",0.576246,0.021552,10,0.67334,0.004757


### 2.7 RANDOM FOREST

In [41]:
random_forest_model = RandomForestClassifier(random_state=random_state)

pipelineRandom = make_pipeline(preprocessor, random_forest_model)

In [42]:
pipelineRandom.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('minmaxscaler',
                                                  MinMaxScaler(),
                                                  ['P1_TFT8_Ashe_Tier',
                                                   'P1_TFT8_Blitzcrank_Tier',
                                                   'P1_TFT8_Galio_Tier',
                                                   'P1_TFT8_Gangplank_Tier',
                                                   'P1_TFT8_Kayle_Tier',
                                                   'P1_TFT8_Lulu_Tier',
                                                   'P1_TFT8_Nasus_Tier',
                                                   'P1_TFT8_Lux_Tier',
                                                   'P1_TFT8_Poppy_Tier',
                                                   'P1_TFT8_Renekton_Tier',
                                

In [43]:
accRandom = pipelineRandom.score(X_test, y_test)

print('Train score:', pipelineRandom.score(X_train, y_train))
print('Test score:', accRandom)

Train score: 1.0
Test score: 0.6307151230949589


### 2.8 PERCEPTRON

In [44]:
perceptron_model = Perceptron(random_state = random_state)
pipelinePerceptron = make_pipeline(preprocessor, perceptron_model)

In [45]:
pipelinePerceptron.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('minmaxscaler',
                                                  MinMaxScaler(),
                                                  ['P1_TFT8_Ashe_Tier',
                                                   'P1_TFT8_Blitzcrank_Tier',
                                                   'P1_TFT8_Galio_Tier',
                                                   'P1_TFT8_Gangplank_Tier',
                                                   'P1_TFT8_Kayle_Tier',
                                                   'P1_TFT8_Lulu_Tier',
                                                   'P1_TFT8_Nasus_Tier',
                                                   'P1_TFT8_Lux_Tier',
                                                   'P1_TFT8_Poppy_Tier',
                                                   'P1_TFT8_Renekton_Tier',
                                

In [46]:
accPerceptron = pipelinePerceptron.score(X_test, y_test)

print('Train score:', pipelinePerceptron.score(X_train, y_train))
print('Test score:', accPerceptron)

Train score: 0.6099706744868035
Test score: 0.5849941383352872


### 2.9 REDES NEURONALES

In [50]:
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [100]:
callbacks1 = [
    tf.keras.callbacks.ModelCheckpoint(
        filepath="callbacks1.keras",
        save_best_only=True,
        monitor="val_loss")
]


In [101]:
model = models.Sequential()

model.add(layers.Dense(units=32, activation='relu', input_dim=X_train_preprocessed.shape[1]))

model.add(layers.Dropout(0.15))
model.add(layers.Dense(units=1, activation='sigmoid'))


In [102]:
model.compile(optimizer="Adam",
    loss="binary_crossentropy",
    metrics=["accuracy"])

In [103]:
history1 = model.fit(X_train_preprocessed, y_train, epochs=20, batch_size=32, validation_data=(X_test_preprocessed, y_test), callbacks=callbacks1)

Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [104]:
best1 = keras.models.load_model('callbacks1.keras')


In [119]:
model2 = models.Sequential()

model2.add(layers.Dense(units=32, activation='relu', input_dim=X_train_preprocessed.shape[1]))
model2.add(layers.Dropout(0.15))
model2.add(layers.Dense(units=64, activation='relu'))
model2.add(layers.Dropout(0.15))
model2.add(layers.Dense(units=128, activation='relu'))
model2.add(layers.Dropout(0.15))
model2.add(layers.Dense(units=64, activation='relu'))
model2.add(layers.Dense(units=1, activation='sigmoid'))


In [120]:
model2.compile(optimizer="Adam",
    loss="binary_crossentropy",
    metrics=["accuracy"])

In [121]:
callbacks2 = [
    tf.keras.callbacks.ModelCheckpoint(
        filepath="callbacks2.keras",
        save_best_only=True,
        monitor="val_loss")
]


In [122]:
history2 = model2.fit(X_train_preprocessed, y_train, epochs=20, batch_size=32, validation_data=(X_test_preprocessed, y_test), callbacks=callbacks2)

Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [109]:
best2 = keras.models.load_model('callbacks2.keras')


## 3. EVALUACIÓN DE MODELOS

In [53]:
models = pd.DataFrame({
    'Model': ['Decision Tree', 'CatBoost', 'AdaBoost', 'Grad boost', 'Histogram Grad','KNN', 'Random Forest', 'Perceptron'],
    'Score': [accTree, accCat, accAdaBoost, accGradientBoost, accHist, accKNN, accRandom, accPerceptron]})

models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
2,AdaBoost,0.656506
3,Grad boost,0.647128
1,CatBoost,0.641266
6,Random Forest,0.630715
4,Histogram Grad,0.62837
5,KNN,0.596717
7,Perceptron,0.584994
0,Decision Tree,0.559203


Como se puede observar, no se obtienen buenas predicciones, y puede ser debido a la gran cantidad de 0 que tiene la base de datos. Lo comprobamos

In [123]:
print((df == "0").sum(axis="rows").to_string())


P1_TFT8_Ashe_Tier               0
P1_TFT8_Ashe_Obj1            4219
P1_TFT8_Ashe_Obj2            4248
P1_TFT8_Ashe_Obj3            4256
P1_TFT8_Blitzcrank_Tier         0
P1_TFT8_Blitzcrank_Obj1      4105
P1_TFT8_Blitzcrank_Obj2      4166
P1_TFT8_Blitzcrank_Obj3      4200
P1_TFT8_Galio_Tier              0
P1_TFT8_Galio_Obj1           4218
P1_TFT8_Galio_Obj2           4226
P1_TFT8_Galio_Obj3           4229
P1_TFT8_Gangplank_Tier          0
P1_TFT8_Gangplank_Obj1       4044
P1_TFT8_Gangplank_Obj2       4160
P1_TFT8_Gangplank_Obj3       4200
P1_TFT8_Kayle_Tier              0
P1_TFT8_Kayle_Obj1           4239
P1_TFT8_Kayle_Obj2           4245
P1_TFT8_Kayle_Obj3           4249
P1_TFT8_Lulu_Tier               0
P1_TFT8_Lulu_Obj1            4083
P1_TFT8_Lulu_Obj2            4136
P1_TFT8_Lulu_Obj3            4158
P1_TFT8_Nasus_Tier              0
P1_TFT8_Nasus_Obj1           4184
P1_TFT8_Nasus_Obj2           4230
P1_TFT8_Nasus_Obj3           4234
P1_TFT8_Lux_Tier                0
P1_TFT8_Lux_Ob

Y efectivamente, el 90% de los valores de los objetos de los campeones, al menos de coste 1, son valores nulos, introduciendo mucho ruido a los modelos y haciendo imposible crear modelos eficientes.