In [10]:
import pandas as pd
import os

chess_games = pd.read_csv(os.path.join("chess_games.csv"))

chess_games.head()

Unnamed: 0,rated,turns,victory_status,winner,time_increment_1,time_increment_2,white_rating,black_rating,opening_moves
0,0,13,3,1,15,2,1500,1191,5
1,1,16,2,0,5,10,1322,1261,4
2,1,61,1,1,5,10,1496,1500,3
3,1,61,1,1,20,0,1439,1454,3
4,1,95,1,1,30,3,1523,1469,5


In [11]:
X = chess_games[:]
y = chess_games[:]

X.drop(["winner"], axis=1, inplace=True)
y.drop(["rated", "turns", "victory_status", "time_increment_1", "time_increment_2", "white_rating", "black_rating", "opening_moves"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(["winner"], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y.drop(["rated", "turns", "victory_status", "time_increment_1", "time_increment_2", "white_rating", "black_rating", "opening_moves"], axis=1, inplace=True)


In [12]:
print(len(X[X.isna().any(axis=1)]))
print(len(y[y.isna().any(axis=1)]))

0
0


In [13]:
from sklearn import preprocessing

standard_scaler = preprocessing.StandardScaler().fit(X[["turns", "time_increment_1", "time_increment_2", "white_rating", "black_rating", "opening_moves"]])

def scale(dataframe, columns, scaler):
    scaled = scaler.transform(dataframe[columns])
    dataframe[columns] = pd.DataFrame(scaled, columns=columns)
    return dataframe

X = scale(X, ["turns", "time_increment_1", "time_increment_2", "white_rating", "black_rating", "opening_moves"], standard_scaler)
X.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[columns] = pd.DataFrame(scaled, columns=columns)


Unnamed: 0,rated,turns,victory_status,time_increment_1,time_increment_2,white_rating,black_rating,opening_moves
0,0,-1.413952,3,0.068526,-0.227418,-0.331788,-1.366985,0.065432
1,1,-1.324585,2,-0.514233,0.332444,-0.942955,-1.126459,-0.292083
2,1,0.015907,1,-0.514233,0.332444,-0.345522,-0.305234,-0.649599
3,1,0.015907,1,0.359906,-0.367383,-0.541233,-0.463294,-0.649599
4,1,1.028724,1,0.942665,-0.157435,-0.252817,-0.411753,0.065432


In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

len(X_train), len(X_test)

(16046, 4012)

In [15]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

algorithms = {
    "Nearest Neighbors" : KNeighborsClassifier(3),
    "Stochastic Gradient Descent" : SGDClassifier(),
    "Linear SVM" : SVC(kernel="linear", C=0.025),
    "Decision Tree" : DecisionTreeClassifier(max_depth=5),
    "Random Forest" : RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    "Neural Net" : MLPClassifier(alpha=1, max_iter=1000),
    "Naive Bayes" : GaussianNB(),
    "LDA" : LinearDiscriminantAnalysis(),
}

for name, algorithm in algorithms.items():
    algorithm.fit(X_train, y_train.values.ravel())
    score = algorithm.score(X_test, y_test)
    print(name, round(score, 2))

Nearest Neighbors 0.62
Stochastic Gradient Descent 0.67
Linear SVM 0.67
Decision Tree 0.63
Random Forest 0.6
Neural Net 0.67
Naive Bayes 0.66
LDA 0.67


Der Algorithmus "Neural Net" ist am besten geeignet, da der Score bei ihm am höchsten ist.

In [16]:
import joblib
joblib.dump(algorithms["Neural Net"], 'chess_games_model.joblib')
model = joblib.load('chess_games_model.joblib')
model

MLPClassifier(alpha=1, max_iter=1000)

In [17]:
test1 = pd.DataFrame([[0, 49, 2, 15, 0, 1500, 978, 3]], columns=X_train.columns)
test1 = scale(test1, ["turns", "time_increment_1", "time_increment_2", "white_rating", "black_rating", "opening_moves"], standard_scaler)
test1.head()

test2 = pd.DataFrame([[1, 40, 1, 10, 0, 1523, 1523, 5]], columns=X_train.columns)
test2 = scale(test2, ["turns", "time_increment_1", "time_increment_2", "white_rating", "black_rating", "opening_moves"], standard_scaler)
test2.head()

test3 = pd.DataFrame([[0, 56, 2, 20, 5, 1587, 1500, 4]], columns=X_train.columns)
test3 = scale(test3, ["turns", "time_increment_1", "time_increment_2", "white_rating", "black_rating", "opening_moves"], standard_scaler)
test3.head()

Unnamed: 0,rated,turns,victory_status,time_increment_1,time_increment_2,white_rating,black_rating,opening_moves
0,0,-0.133036,2,0.359906,-0.01747,-0.033071,-0.305234,-0.292083


In [18]:
predictions = model.predict(pd.concat([test1, test2, test3]))
for game, prediction in zip(["Test1", "Test2", "Test3"], predictions):
    if prediction == 0:
        print(game + ": Schwarz gewinnt")
    elif prediction == 1:
        print(game + ": Weiss gewinnt")
    else:
        print(game + ": Unentschieden")

Test1: Weiss gewinnt
Test2: Weiss gewinnt
Test3: Weiss gewinnt


# Erkenntnisse
Ich habe einige Tests durchgeführt, indem ich mehrere Zeilen in der Tabelle zufällig ausgewählt und die Ergebnisse manuell überprüft habe. Mehrheitlich wird das Ergebnis korrekt vorhergesagt, aber man merkt trotzdem, dass die Wahrscheinlichkeit für ein korrektes Resultat nicht sehr hoch ist. Als ich die Algorithmen ausgewählt habe, war die Wahrscheinlichkeit des Algorithmus "Neural Net" auf 67% - 68%.