In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [2]:
from sklearn.metrics import confusion_matrix

In [3]:
# Creating our test dataframe
test_df = pd.read_csv(Path('games.csv'))
test_df = test_df[test_df.winner != "draw"]

In [4]:
# Creating our train dataframe
chess_df = pd.read_csv(Path('games_edited.csv'))


In [5]:
# Since we had 3 potential outcomes in our target data, we need to remove all rows containing "draw" so that we have only 2 possible outcomes
chess_df = chess_df[chess_df.winner != "draw"]

In [6]:
# Dropping the column that I want to predict on the training data
X_train = chess_df.drop('id',axis = 1)

In [7]:
# Creating training data X and Y
X_dummies = pd.get_dummies(X_train)
y_train_label = LabelEncoder().fit_transform(chess_df['winner'])

In [8]:
# Creating testing data X and Y
X_test = test_df.drop('winner',axis = 1)
X_dummies_test = pd.get_dummies(X_test).reindex(columns=X_dummies.columns,fill_value=0)
y_test_label = LabelEncoder().fit_transform(chess_df['winner'])

In [9]:
# Performing Logistic Regression on unscaled data
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_dummies, y_train_label)

LogisticRegression()

In [10]:
# Scale the data
scaler = StandardScaler().fit(X_dummies)
X_train_scaled = scaler.transform(X_dummies)
X_test_scaled = scaler.transform(X_dummies_test)
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train_label)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test_label)}")
# Guess Logistic Regression isn't the best method 

Training Data Score: 0.5014653548252042
Testing Data Score: 0.5013083525225036


In [11]:
# Train a Random Forest Classifier model and print the model score
# Train a Random Forest Classifier model and print the model score
# For this example, I use 500 as my default n of estimators
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_dummies, y_train_label)
print(f'Training Score: {clf.score(X_dummies, y_train_label)}')
print(f'Testing Score: {clf.score(X_dummies_test, y_test_label)}')

Training Score: 1.0
Testing Score: 0.8147372828134812


In [12]:
predictions = classifier.predict(X_dummies_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test_label})

Unnamed: 0,Prediction,Actual
0,1,1
1,1,0
2,1,1
3,1,1
4,1,1
...,...,...
19103,1,1
19104,1,0
19105,1,1
19106,1,1


In [13]:
from sklearn.metrics import confusion_matrix, classification_report

y_true = y_test_label
y_pred = clf.predict(X_dummies_test)
confusion_matrix(y_true, y_pred)

array([[8681,  426],
       [3114, 6887]], dtype=int64)

In [14]:
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
# Utilizing our understanding of components in a confusion matrix, apply the formula for accuracy
accuracy = (tp + tn) / (tp + fp + tn + fn)
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.95      0.83      9107
           1       0.94      0.69      0.80     10001

    accuracy                           0.81     19108
   macro avg       0.84      0.82      0.81     19108
weighted avg       0.84      0.81      0.81     19108



In [15]:
print(f"Accuracy: {accuracy}")

Accuracy: 0.8147372828134812
