In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [3]:
chess = pd.read_csv('chessTrimmed.csv')

# Creating a model with ratings, turns and victory_status

In [4]:
x = chess[['black_rating', 'white_rating', 'turns', 'victory_statusR']]
y = chess['winnerR']

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3)

In [6]:
forest = RandomForestClassifier(n_estimators=100)
forest.fit(x_train, y_train)

RandomForestClassifier()

In [7]:
forestPredictions = forest.predict(x_test)
print(confusion_matrix(y_test, forestPredictions))
print(classification_report(y_test, forestPredictions))

[[2159  857    0]
 [ 900 1794    1]
 [   5    9  293]]
              precision    recall  f1-score   support

           1       0.70      0.72      0.71      3016
           2       0.67      0.67      0.67      2695
           3       1.00      0.95      0.98       307

    accuracy                           0.71      6018
   macro avg       0.79      0.78      0.79      6018
weighted avg       0.71      0.71      0.71      6018



## A lot more accuracy than the model with only predictors known beforehand. 

## Tuning hyperparameters

In [8]:
from sklearn.model_selection import RandomizedSearchCV

In [9]:
n_estimators_array = [1, 4, 5, 8, 10, 20, 50, 75, 100]
results = []
for n in n_estimators_array:
    forest = RandomForestClassifier(n_estimators=n)
    forest.fit(x_train, y_train)
    result = accuracy_score(y_test, forest.predict(x_test))
    results.append(result) 
    print(n, ':', result)

1 : 0.6415752741774676
4 : 0.6681621801262878
5 : 0.6674975074775673
8 : 0.6753074111000332
10 : 0.6824526420737786
20 : 0.693419740777667
50 : 0.7033898305084746
75 : 0.7038883349950149
100 : 0.7042206713193752


## 100 is most accurate. Checking other parameters

In [10]:
max_features = ['auto', None, 'log2']
max_depth = [10, 20, 30, 40, 50, 60, 70, 80, 90, None]
min_samples_leaf = [1, 2, 4]
random_grid = {'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_leaf': min_samples_leaf}

In [11]:
rf = RandomForestClassifier(n_estimators=100)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 90, cv = 3)

In [12]:
rf_random.fit(x_train, y_train)

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=90,
                   param_distributions={'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, None],
                                        'max_features': ['auto', None, 'log2'],
                                        'min_samples_leaf': [1, 2, 4]})

In [13]:
rf_random.best_params_

{'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 80}

In [14]:
forest = RandomForestClassifier(n_estimators=100, min_samples_leaf=2, max_features="auto", max_depth=80)
forest.fit(x_train, y_train)

RandomForestClassifier(max_depth=80, min_samples_leaf=2)

In [15]:
forestPredictions = forest.predict(x_test)
print(confusion_matrix(y_test, forestPredictions))
print(classification_report(y_test, forestPredictions))

[[2171  845    0]
 [ 908 1787    0]
 [   4   10  293]]
              precision    recall  f1-score   support

           1       0.70      0.72      0.71      3016
           2       0.68      0.66      0.67      2695
           3       1.00      0.95      0.98       307

    accuracy                           0.71      6018
   macro avg       0.79      0.78      0.79      6018
weighted avg       0.71      0.71      0.71      6018



## Only slightly better than model before hyperparameter tuning. But I'll take it. 

## Checking feature importance

In [16]:
feature_importances = pd.Series(forest.feature_importances_, index=x.columns)
feature_importances

black_rating       0.292797
white_rating       0.286393
turns              0.247189
victory_statusR    0.173622
dtype: float64

## It looks like black's rating is the most significant predictor, with white rating close behind. 