In [1]:
#load all necessary libraries
import pandas as pd 
import numpy as np 
import scipy as scp
import sklearn
import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn import metrics 
from sklearn.metrics import confusion_matrix

# Data Preparation

In [2]:
data = pd.read_csv('games.csv')

Splitting of moves to white and black first moves

In [None]:
data['white_first_move'] = data['moves'].str.split().str[0]
data['black_first_move'] = data['moves'].str.split().str[1]

Conversion of categorical values ('e4', 'e5', 'TRUE' etc.) to numeric values for regression; functions to extract and drop variables to test different sets of variables with the model

In [3]:
move_to_int = {
        'a3': 0,
        'a4': 1,
        'b3': 2,
        'b4': 3,
        'c3': 4,
        'c4': 5,
        'd3': 6,
        'd4': 7,
        'e3': 8,
        'e4': 9,
        'f3': 10,
        'f4': 11,
        'g3': 12,
        'g4': 13,
        'h3': 14,
        'h4': 15,
        'Na3': 16,
        'Nc3': 17,
        'Nf3': 18,
        'Nh3': 19
    }

def WhiteMoveConvert(MOVE):    
    return move_to_int[MOVE]

def WhiteIntConvert(INT):
    for move, i in move_to_int.items():
        if i == INT:
            return move
    
    return ''

def RatedConvert(IS_RATED):
    return 1 if IS_RATED == 'TRUE' else 0

#default already has 'white_first_move', 'black_first_move'
def get_extract_data(add_columns=[]):
    extract_columns = ['white_first_move', 'black_first_move'] + add_columns
    extract_data = pd.DataFrame(data[extract_columns])
    extract_data['white_first_move_int'] = extract_data['white_first_move'].apply(WhiteMoveConvert)
    if 'rated' in extract_columns:
        extract_data['rated'] = extract_data['rated'].apply(RatedConvert)

    extract_data = extract_data.dropna()
    
    return extract_data

#default already drops 'white_first_move', 'black_first_move'
def get_pred_resp(extract_data, more_drop_columns=[]):
    return extract_data.drop(['black_first_move', 'white_first_move'] + more_drop_columns, axis=1), extract_data['black_first_move']

# Model Training and Results
80:20 train and test split (non-random for score comparison); training of MLR model; generation of output .csv

In [4]:
def do_model(pred, resp):
    pred_train, pred_test, resp_train, resp_test = sklearn.model_selection.train_test_split(pred, resp, test_size = 0.20, shuffle=False)

    model = LogisticRegression(random_state=0, multi_class='multinomial', solver='saga', C=0.7, max_iter=10000).fit(pred_train, resp_train)

    prob_arr = model.predict_proba(pred_test)

    total_score = 0
    cases = 0
    
    data_dict = {
        'white_first_move': [],
        'black_move_1': [],
        'black_move_1_prob': [],
        'black_move_2': [],
        'black_move_2_prob': [],
        'black_move_3': [],
        'black_move_3_prob': [],
        'black_move_4': [],
        'black_move_4_prob': [],
        'black_move_5': [],
        'black_move_5_prob': [],
        'black_first_move': [],
        'score': [],
        'rating_diff': [],
        'turns': []
    }
    
    for white_move_int, rating_diff, turns, probs, black_move in zip(pred_test['white_first_move_int'], pred_test['rating_diff'], pred_test['turns'], prob_arr, resp_test):
        cases += 1

        black_probs, black_moves = zip(*sorted(zip(probs, model.classes_)))


        data_dict['white_first_move'].append(WhiteIntConvert(white_move_int))
        score = 0
        move_count = 1
        for move, prob in zip(black_moves[19:14:-1], black_probs[19:14:-1]):
            if move == black_move:
                score = 100 * (prob/sum(black_probs[15:]))
                total_score += score
            
            data_dict['black_move_{}'.format(move_count)].append(move)
            data_dict['black_move_{}_prob'.format(move_count)].append(prob)
            move_count += 1
            
        data_dict['black_first_move'].append(black_move)
        data_dict['score'].append(score)
        data_dict['rating_diff'].append(rating_diff)
        data_dict['turns'].append(turns)
    
    print('Final average score:', round(total_score/cases, 2), '/100')
    return pd.DataFrame(data=data_dict)

# Different Variable Cases

In [5]:
#Rating Diff, Turns
extract_data = get_extract_data(add_columns=['turns', 'white_rating', 'black_rating'])
extract_data['rating_diff'] = extract_data['white_rating'] - extract_data['black_rating']
pred, resp = get_pred_resp(extract_data, more_drop_columns=['white_rating', 'black_rating'])

do_model(pred, resp).to_csv('output/ratingDiff_turns.csv')

Final average score: 22.3 /100


In [6]:
#Ratings, Rating Diff, Turns
extract_data = get_extract_data(add_columns=['turns', 'white_rating', 'black_rating'])
extract_data['rating_diff'] = extract_data['white_rating'] - extract_data['black_rating']
pred, resp = get_pred_resp(extract_data, more_drop_columns=[])

do_model(pred, resp).to_csv('output/ratings_ratingDiff_turns.csv')

Final average score: 22.5 /100


In [7]:
#Ratings, Rating Diff, Turns, Is Rated
extract_data = get_extract_data(add_columns=['turns', 'white_rating', 'black_rating', 'rated'])
extract_data['rating_diff'] = extract_data['white_rating'] - extract_data['black_rating']
pred, resp = get_pred_resp(extract_data, more_drop_columns=[])

do_model(pred, resp).to_csv('output/ratings_ratingDiff_turns_rated.csv')

Final average score: 22.5 /100
