In [0]:
import pandas as pd
import math

from pyspark.ml.classification import GBTClassifier

In [0]:
from datetime import datetime

import numpy as np

##Pre-Processing
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler

##Models
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import LogisticRegression

##Evaluation
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder

In [0]:
spark = SparkSession.builder.config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1").getOrCreate()

## Loading Data from MongoDB as SparkDFs

In [0]:
database = "MongoDBAtlas"
user_name = "praharajhp"
password = "Fuq58CAMd6k0sPT8"
ip_address = "chesscluster.ar0uw.mongodb.net"
collection_pos_eval = "pos_evals"
collection_elo_eval = "elo_eval"
connection_string_pos = f"mongodb+srv://{user_name}:{password}@{ip_address}/{database}.{collection_pos_eval}"
connection_string_elo = f"mongodb+srv://{user_name}:{password}@{ip_address}/{database}.{collection_elo_eval}"

In [0]:
df_pos = spark.read.format("mongo").option("uri",connection_string_pos).load()
df_eval = spark.read.format("mongo").option("uri",connection_string_elo).load()

## Data Processing and Feature Engineering

Creating a column 'elo_diff' which calculates the difference in ELO of the player with White pieces and black pieces

In [0]:
df_eval = df_eval.withColumn('elo_diff',df_eval['White Elo'] - df_eval['Black Elo'])

Creating a User Defined Function (UDF) to record the Expected scores. This metric is estimated using a formula that FIDE(Governing body of chess) uses to define the expected score of a game.

In [0]:
def calculate_fide_expected_score(x):
    return math.erfc(-x / ((2000.0/7) * math.sqrt(2))) / 2 ## Formula that FIDE(Governing body of chess) uses to calculate expected score of a game.

xScore = udf(calculate_fide_expected_score, FloatType())

In [0]:
df_eval = df_eval.select('Black Elo', 'White Elo', 'Result', 'Time Class', 'Time Control','elo_diff',xScore("elo_diff").alias("expected_score_fide"))

Transforming categorical variables through StringIndexing followed by OneHotEncoding

In [0]:
def indexStringColumns(df, cols):
    # variable newdf will be updated several times
    newdf = df
    
    for c in cols:
        # For each given colum, fits StringIndexerModel.
        si = StringIndexer(inputCol=c, outputCol=c+"-num").setHandleInvalid("keep")
        sm = si.fit(newdf)
        
        # Creates a DataFame by putting the transformed values in the new colum with suffix "-num" 
        # and then drops the original columns.
        # and drop the "-num" suffix. 
        newdf = sm.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c+"-num", c)
    return newdf

def oneHotEncodeColumns(df, cols):
    newdf = df
    for c in cols:
        # For each given colum, create OneHotEncoder. 
        # dropLast : Whether to drop the last category in the encoded vector (default: true)
        ohe = OneHotEncoder(inputCol=c, outputCol=c+"-onehot", dropLast=False)
        ohe_model = ohe.fit(newdf)
        #Creates a DataFame by putting the transformed values in the new colum with suffix "-onehot" 
        #and then drops the original columns.
        #and drop the "-onehot" suffix. 
        newdf = ohe_model.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c+"-onehot", c)
    return newdf

In [0]:
categorical_cols = ["Time Class","Time Control"]
df_eval_sti = indexStringColumns(df_eval, categorical_cols)

In [0]:
df_eval_ohe = oneHotEncodeColumns(df_eval_sti, categorical_cols)

Creating a UDF to convert string target variable to FloatType

In [0]:
def convert_res_to_binary(x):
    if x == '1-0':
        return 1
    elif x == '0-1':
        return 0
    elif '5' in x:
        return 2

result_conv = udf(convert_res_to_binary, IntegerType())
    

In [0]:
df_eval = df_eval_ohe.withColumn('result_int',result_conv('Result'))

Converting two string features("Black Elo" and "White Elo") to Integer

In [0]:
def convert_to_int(x):
    try:
        return int(x)
    except ValueError:
        return None
    
int_conv = udf(convert_to_int, IntegerType())
    

In [0]:
df_eval = df_eval.select(int_conv('Black Elo').alias('Black Elo'), int_conv('White Elo').alias('White Elo'), 'Result', 'elo_diff', 'expected_score_fide', 'Time Class', 'Time Control', 'result_int')

In [0]:
df_eval = df_eval.where(df_eval.result_int != 2)

In [0]:
df_eval.show()

Create a dataframe with features and label

In [0]:
# va = VectorAssembler(outputCol="features", inputCols=["Black Elo", "White Elo","elo_diff", "Time Class", "Time Control"])
va = VectorAssembler(outputCol="features", inputCols=["elo_diff"])
# va = VectorAssembler(outputCol="features", inputCols=["Black Elo","White Elo"])
va_df = va.transform(df_eval).select("features", "result_int").withColumnRenamed("result_int", "label")

In [0]:
va_df.show()

## Modeling Using Decision Trees

Splitting Data into train and validation/test sets

In [0]:
splits = va_df.randomSplit([0.8, 0.2])

train = splits[0].cache()
validation = splits[1].cache()

Defining Decision Tree model and fitting train data to it.

In [0]:
dt = DecisionTreeClassifier(maxDepth= 20, maxBins= 32, minInstancesPerNode= 1, minInfoGain= 0 )
dtmodel = dt.fit(train)

In [0]:
dtmodel.featureImportances

Out[40]: SparseVector(1, {0: 1.0})

In [0]:
print(dtmodel.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e77d8ab75f67, depth=7, numNodes=15, numClasses=2, numFeatures=1
  If (feature 0 <= 8.5)
   If (feature 0 <= -156.5)
    Predict: 0.0
   Else (feature 0 > -156.5)
    If (feature 0 <= -29.5)
     Predict: 0.0
    Else (feature 0 > -29.5)
     If (feature 0 <= 1.5)
      If (feature 0 <= -4.5)
       If (feature 0 <= -12.5)
        If (feature 0 <= -21.5)
         Predict: 0.0
        Else (feature 0 > -21.5)
         Predict: 1.0
       Else (feature 0 > -12.5)
        Predict: 0.0
      Else (feature 0 > -4.5)
       Predict: 1.0
     Else (feature 0 > 1.5)
      Predict: 0.0
  Else (feature 0 > 8.5)
   Predict: 1.0



Predicting on Validation data

In [0]:
dt_preds = dtmodel.transform(validation)

#### Model Evaluation

Area under ROC/PR curve

In [0]:
bceval = BinaryClassificationEvaluator()
print (bceval.getMetricName() +":" + str(bceval.evaluate(dt_preds)))


areaUnderROC:0.6854022430890164


In [0]:
bceval.setMetricName("areaUnderPR")
print (bceval.getMetricName() +":" + str(bceval.evaluate(dt_preds)))

areaUnderPR:0.6906670471256425


F1 score

In [0]:
metric_name = "f1"
metrics = MulticlassClassificationEvaluator()\
                .setLabelCol("label")\
                .setPredictionCol("prediction")
metrics.setMetricName(metric_name) 

metrics.evaluate(dt_preds)

Out[45]: 0.7141534587468523

### The above was an attempt to fit a baseline model. I will now perform cross validation to obtain a better score.

In [0]:
evaluator = MulticlassClassificationEvaluator()\
                .setLabelCol("label")\
                .setPredictionCol("prediction")

In [0]:
dt = DecisionTreeClassifier()
paramGrid = ParamGridBuilder().addGrid(dt.maxDepth, [2,3,4,5,10,15,20,25,30]).build()

cv = CrossValidator(estimator=dt, 
                    evaluator=evaluator, 
                    numFolds=5, 
                    estimatorParamMaps=paramGrid)


In [0]:
cvmodel = cv.fit(train)
dtpredicts = cvmodel.bestModel.transform(validation)


In [0]:
print("Best Max Depth : %s" % cvmodel.bestModel.getMaxDepth)

Best Max Depth : <bound method _DecisionTreeParams.getMaxDepth of DecisionTreeClassificationModel: uid=DecisionTreeClassifier_5d1fe41592f7, depth=7, numNodes=15, numClasses=2, numFeatures=1>


In [0]:
print("Accuracy : %s" % evaluator.evaluate(dtpredicts))

Accuracy : 0.7141534587468523


In [0]:
bceval = BinaryClassificationEvaluator()
print (bceval.getMetricName() +":" + str(bceval.evaluate(dtpredicts)))


areaUnderROC:0.6854022430890164


In [0]:
bceval.setMetricName("areaUnderPR")
print (bceval.getMetricName() +":" + str(bceval.evaluate(dtpredicts)))

areaUnderPR:0.6906670471256425


In [0]:
metric_name = "f1"
metrics = MulticlassClassificationEvaluator()\
                .setLabelCol("label")\
                .setPredictionCol("prediction")
metrics.setMetricName(metric_name) 

metrics.evaluate(dtpredicts)

Out[53]: 0.7141534587468523

In [0]:
df_eval.groupBy('result').count().show()