In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest

import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
df = pd.read_csv('./Bases de datos utilizadas/howtowin_10min.csv')
df.head()

Unnamed: 0,gameId,blueWins,blueWardsPlaced,blueWardsDestroyed,blueFirstBlood,blueKills,blueDeaths,blueAssists,blueEliteMonsters,blueDragons,...,redTowersDestroyed,redTotalGold,redAvgLevel,redTotalExperience,redTotalMinionsKilled,redTotalJungleMinionsKilled,redGoldDiff,redExperienceDiff,redCSPerMin,redGoldPerMin
0,4519157822,0,28,2,1,9,6,11,0,0,...,0,16567,6.8,17047,197,55,-643,8,19.7,1656.7
1,4523371949,0,12,1,0,5,5,5,0,0,...,1,17620,6.8,17438,240,52,2908,1173,24.0,1762.0
2,4521474530,0,15,0,0,7,11,4,1,1,...,0,17285,6.8,17254,203,28,1172,1033,20.3,1728.5
3,4524384067,0,43,1,0,4,5,5,1,0,...,0,16478,7.0,17961,235,47,1321,7,23.5,1647.8
4,4436033771,0,75,4,0,6,6,6,0,0,...,0,17404,7.0,18313,225,67,1004,-230,22.5,1740.4


In [4]:
try:
    df.drop(['gameId'], axis=1, inplace=True)
except ValueError:
    pass

In [5]:
df.head()

Unnamed: 0,blueWins,blueWardsPlaced,blueWardsDestroyed,blueFirstBlood,blueKills,blueDeaths,blueAssists,blueEliteMonsters,blueDragons,blueHeralds,...,redTowersDestroyed,redTotalGold,redAvgLevel,redTotalExperience,redTotalMinionsKilled,redTotalJungleMinionsKilled,redGoldDiff,redExperienceDiff,redCSPerMin,redGoldPerMin
0,0,28,2,1,9,6,11,0,0,0,...,0,16567,6.8,17047,197,55,-643,8,19.7,1656.7
1,0,12,1,0,5,5,5,0,0,0,...,1,17620,6.8,17438,240,52,2908,1173,24.0,1762.0
2,0,15,0,0,7,11,4,1,1,0,...,0,17285,6.8,17254,203,28,1172,1033,20.3,1728.5
3,0,43,1,0,4,5,5,1,0,1,...,0,16478,7.0,17961,235,47,1321,7,23.5,1647.8
4,0,75,4,0,6,6,6,0,0,0,...,0,17404,7.0,18313,225,67,1004,-230,22.5,1740.4


In [6]:
df.isnull().any().sum()

0

In [7]:
X = df.copy()
y = X.pop('blueWins')

print('blueWins' in X.columns)

False


In [8]:
# X = df[['blueGoldDiff', 'redGoldDiff','blueExperienceDiff','redExperienceDiff','blueTotalGold','redTotalGold',
#                         'blueGoldPerMin','redGoldPerMin','blueTotalExperience','redTotalExperience']]
# y = df['blueWins']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=24, train_size=.8)

In [10]:
#haciendo distintos pipes:

log_reg = Pipeline(steps=[
                        ("scaler",StandardScaler()),
                        ("reglog",LogisticRegression())])

rand_forest = RandomForestClassifier()

svm = Pipeline(steps=[
                    ("scaler",StandardScaler()),
                    ("selectkbest",SelectKBest()),
                    ('svm',SVC())])

# knclass = Pipeline(steps=[
#                         ("scaler",StandardScaler()),                        
#                         ("kneighbors", KNeighborsClassifier())])

# haciendo los parámetros:

log_reg_param = {"reglog__penalty": ["l1","l2"], 
                 "reglog__C": np.logspace(0, 4, 10)}

rand_forest_param = {'n_estimators':[50,100,150],
                    'min_samples_leaf':[10,30,50]}

# svm_param = {'C':[1.0, 0.5, 0.1]}

# knclass_params = {'n_neighbors':[5,10,15],
#                 'leaf_size':[10,30,50]}

# hacemos los Gridsearchs

gs_log_reg = GridSearchCV(log_reg,
                            log_reg_param,
                            cv=10,
                            scoring="accuracy",
                            verbose=1,
                            n_jobs=-1)

gs_rand_forest = GridSearchCV(rand_forest,
                            rand_forest_param,
                            cv=10,
                            scoring="accuracy",
                            verbose=1,
                            n_jobs=-1)

# gs_svm = GridSearchCV(svm,
#                         svm_param,
#                         cv=10,
#                         scoring="accuracy",
#                         verbose=1,
#                         n_jobs=-1)

# gs_knclass = GridSearchCV(knclass,
#                         knclass_params,
#                         cv=10,
#                         scoring="accuracy",
#                         verbose=1,
#                         n_jobs=-1)

grids = {"gs_reg_log":gs_log_reg,
         "gs_rand_forest":gs_rand_forest,   #"gs_svm":gs_svm, "gs_knclass":gs_knclass
         }

In [11]:
grids.items()

dict_items([('gs_reg_log', GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('reglog', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'reglog__C': array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04]),
                         'reglog__penalty': ['l1', 'l2']},
             scoring='accuracy', verbose=1)), ('gs_rand_forest', GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'min_samples_leaf': [10, 30, 50],
                         'n_estimators': [50, 100, 150]},
             scoring='accuracy', verbose=1))])

In [12]:
for nombre, grid_search in grids.items():
    grid_search.fit(X_train,y_train)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


100 fits failed out of a total of 200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\usuario\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\usuario\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\usuario\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)

Fitting 10 folds for each of 9 candidates, totalling 90 fits


In [13]:
KNeighborsClassifier().get_params().keys()

dict_keys(['algorithm', 'leaf_size', 'metric', 'metric_params', 'n_jobs', 'n_neighbors', 'p', 'weights'])

In [14]:
best_grid = [(i, j.best_score_) for i, j in grids.items()]
best_grid

[('gs_reg_log', 0.730737409784122), ('gs_rand_forest', 0.732130454960073)]