In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("high_diamond_ranked_10min.csv")

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error 
from sklearn.linear_model import LinearRegression

In [6]:
df.head(15)

Unnamed: 0,gameId,blueWins,blueWardsPlaced,blueWardsDestroyed,blueFirstBlood,blueKills,blueDeaths,blueAssists,blueEliteMonsters,blueDragons,...,redTowersDestroyed,redTotalGold,redAvgLevel,redTotalExperience,redTotalMinionsKilled,redTotalJungleMinionsKilled,redGoldDiff,redExperienceDiff,redCSPerMin,redGoldPerMin
0,4519157822,0,28,2,1,9,6,11,0,0,...,0,16567,6.8,17047,197,55,-643,8,19.7,1656.7
1,4523371949,0,12,1,0,5,5,5,0,0,...,1,17620,6.8,17438,240,52,2908,1173,24.0,1762.0
2,4521474530,0,15,0,0,7,11,4,1,1,...,0,17285,6.8,17254,203,28,1172,1033,20.3,1728.5
3,4524384067,0,43,1,0,4,5,5,1,0,...,0,16478,7.0,17961,235,47,1321,7,23.5,1647.8
4,4436033771,0,75,4,0,6,6,6,0,0,...,0,17404,7.0,18313,225,67,1004,-230,22.5,1740.4
5,4475365709,1,18,0,0,5,3,6,1,1,...,0,15201,7.0,18060,221,59,-698,-101,22.1,1520.1
6,4493010632,1,18,3,1,7,6,7,1,1,...,0,14463,6.4,15404,164,35,-2411,-1563,16.4,1446.3
7,4496759358,0,16,2,0,5,13,3,0,0,...,0,17920,6.6,16938,157,54,2615,800,15.7,1792.0
8,4443048030,0,16,3,0,7,7,8,0,0,...,0,18380,7.2,19298,240,53,1979,771,24.0,1838.0
9,4509433346,1,13,1,1,4,5,5,1,1,...,0,16605,6.8,18379,247,43,1548,1574,24.7,1660.5


In [14]:
df.shape

(9879, 40)

In [15]:
df['blueWins'].value_counts(normalize=True)

0    0.500962
1    0.499038
Name: blueWins, dtype: float64

In [16]:
train = df
test = df.drop('blueWins',axis=1)

In [17]:
train, val = train_test_split(train, train_size=0.80, test_size=0.20, stratify=df['blueWins'], random_state=42)

In [18]:
target = 'blueWins'
train_features = train.drop(columns=[target])
numeric_features = train_features.select_dtypes(include='number').columns.tolist()
features = numeric_features

In [19]:
X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]
X_test = test[features]

In [13]:
%%time
pipeline = make_pipeline(
    SelectKBest(),
    StandardScaler(),
    RandomForestClassifier(
        n_jobs=1,
        random_state=42,
        n_estimators=9,
    )
)

param_distributions = {
    'randomforestclassifier__criterion': ('gini', 'entropy'),
    'randomforestclassifier__max_depth': (14,15,16,20,25),
    'randomforestclassifier__max_features': (9,12,13,14,15,16),
    'randomforestclassifier__min_samples_split': (2,4,6,8,10)
}

search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=150,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
    return_train_score=True,
    n_jobs=-1,
)

search.fit(X_train,y_train)

print('Best Score:',search.best_score_)
print('Best param:',search.best_params_)
print('Best estimaator:', search.best_estimator_)

Fitting 5 folds for each of 150 candidates, totalling 750 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 727 out of 750 | elapsed:   11.7s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 750 out of 750 | elapsed:   13.7s finished
Best Score: -0.2943167679485024
Best param: {'randomforestclassifier__min_samples_split': 10, 'randomforestclassifier__max_features': 9, 'randomforestclassifier__max_depth': 14, 'randomforestclassifier__criterion': 'entropy'}
Best estimaator: Pipeline(memory=None,
         steps=[('selectkbest',
                 SelectKBest(k=10,
                             score_func=<function f_classif at 0x000001D440D8DC18>)),
                ('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('randomforestclassifier',
            

In [20]:
%%time
pipe = make_pipeline(
    SelectKBest(k=10),
    StandardScaler(),
    RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='entropy',
                                        max_depth=14, max_features=9,
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1,
                                        min_samples_split=10,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=999, n_jobs=1,
                                        oob_score=False, random_state=42,
                                        verbose=0, warm_start=False)
)
pipe.fit(X_train,y_train)

Wall time: 1min 5s


Pipeline(memory=None,
         steps=[('selectkbest',
                 SelectKBest(k=10,
                             score_func=<function f_classif at 0x0000022B666E7AF8>)),
                ('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='entropy',
                                        max_depth=14, max_features=9,
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1,
                                        min_samples_split=10,
                                        min_weight_fraction_leaf=0.0,
                                        n_estim

In [21]:
print('training accuracy:', pipe.score(X_train, y_train))
print('validation accuracy:', pipe.score(X_val, y_val))

training accuracy: 0.8600531443755536
validation accuracy: 0.7074898785425101


In [25]:
X_train, y_train = pd.concat([X_train,X_val]), pd.concat([y_train,y_val])

In [26]:
pipe.fit(X_train,y_train)
print('training accuracy:', pipe.score(X_train, y_train))
print('validation accuracy:', pipe.score(X_val, y_val))

training accuracy: 0.8788696752425137
validation accuracy: 0.9367408906882592


In [27]:
y_pred = pipe.predict(X_test)

In [28]:
mean_squared_error(df[target], y_pred)

0.13270573944731248