In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [2]:
from sklearn.metrics import confusion_matrix

In [3]:
# Creating our test dataframe
test_df = pd.read_csv(Path('games.csv'))

In [4]:
# Creating our train dataframe
chess_df = pd.read_csv(Path('games_edited.csv'))
chess_df = chess_df.drop('increment_code', axis = 1)
chess_df = chess_df.drop('black_rating', axis = 1)
chess_df = chess_df.drop('white_rating', axis = 1)
chess_df = chess_df.drop('opening_name', axis = 1)
chess_df = chess_df.drop('opening_ply', axis = 1)
chess_df = chess_df.drop('turns', axis = 1)
chess_df.head()

Unnamed: 0,id,rated,created_at,last_move_at,victory_status,winner,white_id,black_id,opening_eco,Move1A,Move1B,Move2A,Move2B,Move3A,Move3B
0,TZJHLljE,False,1500000000000.0,1500000000000.0,outoftime,white,bourgris,a-00,D10,d4,d5,c4,c6,cxd5,e6
1,l1NXvwaE,True,1500000000000.0,1500000000000.0,resign,black,a-00,skinnerua,B00,d4,Nc6,e4,e5,f4,f6
2,mIICvQHh,True,1500000000000.0,1500000000000.0,mate,white,ischia,a-00,C20,e4,e5,d3,d6,Be3,c6
3,kWKvrqYL,True,1500000000000.0,1500000000000.0,mate,white,daniamurashov,adivanov2009,D02,d4,d5,Nf3,Bf5,Nc3,Nf6
4,9tXo1AUZ,True,1500000000000.0,1500000000000.0,mate,white,nik221107,adivanov2009,C41,e4,e5,Nf3,d6,d4,Nc6


In [5]:
# Dropping the column that I want to predict on the training data
X_train = chess_df.drop('id',axis = 1)

In [6]:
# Creating training data X and Y
X_dummies = pd.get_dummies(X_train)
y_train_label = LabelEncoder().fit_transform(chess_df['winner'])

In [7]:
# Creating testing data X and Y
X_test = test_df.drop('winner',axis = 1)
X_dummies_test = pd.get_dummies(X_test).reindex(columns=X_dummies.columns,fill_value=0)
y_test_label = LabelEncoder().fit_transform(chess_df['winner'])

In [8]:
# Performing Logistic Regression on unscaled data
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_dummies, y_train_label)

LogisticRegression()

In [9]:
# Scale the data
scaler = StandardScaler().fit(X_dummies)
X_train_scaled = scaler.transform(X_dummies)
X_test_scaled = scaler.transform(X_dummies_test)
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train_label)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test_label)}")
# Guess Logistic Regression isn't the best method 

Training Data Score: 0.3486389470535447
Testing Data Score: 0.35297636853125935


In [10]:
# Train a Random Forest Classifier model and print the model score
# Train a Random Forest Classifier model and print the model score
# For this example, I use 500 as my default n of estimators
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_dummies, y_train_label)
print(f'Training Score: {clf.score(X_dummies, y_train_label)}')
print(f'Testing Score: {clf.score(X_dummies_test, y_test_label)}')

Training Score: 1.0
Testing Score: 0.727091434838967


In [11]:
predictions = classifier.predict(X_dummies_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test_label})

Unnamed: 0,Prediction,Actual
0,2,2
1,2,0
2,2,2
3,2,2
4,2,2
...,...,...
20053,2,2
20054,2,0
20055,2,2
20056,2,2


In [12]:
from sklearn.metrics import confusion_matrix, classification_report

y_true = y_test_label
y_pred = clf.predict(X_dummies_test)
confusion_matrix(y_true, y_pred)

array([[7041,    0, 2066],
       [  34,  906,   10],
       [3364,    0, 6637]], dtype=int64)