In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
chess = pd.read_csv('chessTrimmed.csv')

In [3]:
chess.head()

Unnamed: 0.1,Unnamed: 0,turns,victory_statusR,winnerR,black_rating,white_rating
0,0,-1.413916,1,1,1191,1500
1,1,-1.324552,2,2,1261,1322
2,2,0.015907,3,1,1500,1496
3,3,0.015907,3,1,1454,1439
4,4,1.028698,3,1,1469,1523


# First, trying models with predictors that can be known beforehand. Starting with just ratings, then adding in player ids, then adding in opening move. 

In [4]:
x = chess[['black_rating', 'white_rating']]
y = chess['winnerR']

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3)

In [6]:
forest = RandomForestClassifier(n_estimators=100)
forest.fit(x_train, y_train)

RandomForestClassifier()

In [7]:
forestPredictions = forest.predict(x_test)
print(confusion_matrix(y_test, forestPredictions))
print(classification_report(y_test, forestPredictions))

[[1955 1018   47]
 [1003 1668   40]
 [ 126  125   36]]
              precision    recall  f1-score   support

           1       0.63      0.65      0.64      3020
           2       0.59      0.62      0.60      2711
           3       0.29      0.13      0.18       287

    accuracy                           0.61      6018
   macro avg       0.51      0.46      0.47      6018
weighted avg       0.60      0.61      0.60      6018



## Not very accurate

## Adding in player ids

In [8]:
chess1 = pd.read_csv('ChessRecodedWinnerAndVictory_status.csv')

In [9]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
chess1["white_id_cat"] = encoder.fit_transform(chess1["white_id"])
chess1["black_id_cat"] = encoder.fit_transform(chess1["black_id"])


In [10]:
chess1.white_id_cat.value_counts()

8213    72
1629    53
30      49
1196    48
7974    48
        ..
4180     1
8274     1
2119     1
70       1
2039     1
Name: white_id_cat, Length: 9438, dtype: int64

In [11]:
x1 = chess1[['black_rating', 'white_rating', 'black_id_cat', 'white_id_cat']]
y1 = chess1['winnerR']

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x1,y1, test_size=0.3)

In [13]:
forest = RandomForestClassifier(n_estimators=100)
forest.fit(x_train, y_train)

RandomForestClassifier()

In [14]:
forestPredictions = forest.predict(x_test)
print(confusion_matrix(y_test, forestPredictions))
print(classification_report(y_test, forestPredictions))

[[2061  996   11]
 [ 929 1707   13]
 [ 136  141   24]]
              precision    recall  f1-score   support

           1       0.66      0.67      0.67      3068
           2       0.60      0.64      0.62      2649
           3       0.50      0.08      0.14       301

    accuracy                           0.63      6018
   macro avg       0.59      0.47      0.47      6018
weighted avg       0.63      0.63      0.62      6018



## Slightly better. Now adding in opening move. 

In [15]:
chess1["opening_name_cat"] = encoder.fit_transform(chess1["opening_name"])

In [16]:
x2 = chess1[['black_rating', 'white_rating', 'black_id_cat', 'white_id_cat', 'opening_name_cat']]
y2 = chess1['winnerR']

In [17]:
x_train, x_test, y_train, y_test = train_test_split(x2,y2, test_size=0.3)

In [18]:
forest = RandomForestClassifier(n_estimators=100)
forest.fit(x_train, y_train)

RandomForestClassifier()

In [19]:
forestPredictions = forest.predict(x_test)
print(confusion_matrix(y_test, forestPredictions))
print(classification_report(y_test, forestPredictions))

[[2085  868    2]
 [1051 1730    9]
 [ 148  102   23]]
              precision    recall  f1-score   support

           1       0.63      0.71      0.67      2955
           2       0.64      0.62      0.63      2790
           3       0.68      0.08      0.15       273

    accuracy                           0.64      6018
   macro avg       0.65      0.47      0.48      6018
weighted avg       0.64      0.64      0.63      6018



## Slightly better still. Will run hyperparameter tuning to see if can get better. 

In [20]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

In [21]:
n_estimators_array = [1, 4, 5, 8, 10, 20, 50, 75, 100, 250, 500]
results = []
for n in n_estimators_array:
    forest = RandomForestClassifier(n_estimators=n)
    forest.fit(x_train, y_train)
    result = accuracy_score(y_test, forest.predict(x_test))
    results.append(result) 
    print(n, ':', result)

1 : 0.53954802259887
4 : 0.5837487537387837
5 : 0.6048521103356597
8 : 0.6153207045530077
10 : 0.6100033233632436
20 : 0.6249584579594549
50 : 0.6321036889332003
75 : 0.6272848122299768
100 : 0.6402459288800266
250 : 0.6331006979062812
500 : 0.638251910933865


## Most accurate is 250 trees, but am going with 75 (only slightly less accurate and my machine got hung up while running 250)

In [22]:
max_features = ['auto', None, 'log2']
max_depth = [10, 20, 30, 40, 50, 60, 70, 80, 90, None]
min_samples_leaf = [1, 2, 4]
random_grid = {'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_leaf': min_samples_leaf}

In [25]:
rf = RandomForestClassifier(n_estimators=50)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 90, cv = 3)

In [26]:
rf_random.fit(x_train, y_train)

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(n_estimators=50),
                   n_iter=90,
                   param_distributions={'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, None],
                                        'max_features': ['auto', None, 'log2'],
                                        'min_samples_leaf': [1, 2, 4]})

In [27]:
rf_random.best_params_

{'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 10}

In [28]:
forest = RandomForestClassifier(n_estimators=50, min_samples_leaf=2, max_features="auto", max_depth=10)
forest.fit(x_train, y_train)

RandomForestClassifier(max_depth=10, min_samples_leaf=2, n_estimators=50)

In [29]:
forestPredictions = forest.predict(x_test)
print(confusion_matrix(y_test, forestPredictions))
print(classification_report(y_test, forestPredictions))

[[2156  799    0]
 [1188 1602    0]
 [ 173   99    1]]
              precision    recall  f1-score   support

           1       0.61      0.73      0.67      2955
           2       0.64      0.57      0.61      2790
           3       1.00      0.00      0.01       273

    accuracy                           0.62      6018
   macro avg       0.75      0.44      0.43      6018
weighted avg       0.64      0.62      0.61      6018



## Best model using these variables. The good news is it's 100 accurate at predicting a draw. The bad news is it's only 61% accurate at predicting a white win and 64% accurate at predicting a black one. Not reliable enough for real-world use. Will now create a machine learning model including more variables, though the additions are variables not known until after the game is played. Not much use for betting, etc., but could have applications. Anyway, I will do it (in a separate Python document I will call MachineLearning_RandomForest2). 