In [0]:
import pandas as pd
import math



In [0]:
from datetime import datetime

import numpy as np

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder

In [0]:
spark = SparkSession.builder.config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.2")\
                            .config("spark.network.timeout", "36000000s")\
                            .config("spark.executor.heartbeatInterval", "3600s")\
                            .getOrCreate()

## Import Data from MongoDB Atlas

In [0]:
database = 'MongoDBAtlas'
collection_pos = 'pos_evals'
collection_elo = 'elo_eval'
user_name = 'Javier'
password = 'chess'
address = 'chesscluster.ar0uw.mongodb.net'
connection_string_pos = f"mongodb+srv://{user_name}:{password}@{address}/{database}.{collection_pos}"
connection_string_elo = f"mongodb+srv://{user_name}:{password}@{address}/{database}.{collection_elo}"

In [0]:
df_pos = spark.read.format("mongo").option("uri",connection_string_pos).load()
df_elo = spark.read.format("mongo").option("uri",connection_string_elo).load()

# Data Processing and Feature Engineering

In [0]:
df_elo = df_elo.withColumn('elo_diff',df_elo['Black Elo'] - df_elo['White Elo'])

In [0]:
def calculate_fide_expected_score(x):
    return math.erfc(-x / ((2000.0/7) * math.sqrt(2))) / 2 ## Formula that FIDE(Governing body of chess) uses to calculate expected score of a game.

xScore = udf(calculate_fide_expected_score, FloatType())

In [0]:
df_elo = df_elo.select('Black Elo', 'White Elo', 'Result', 'Time Class', 'Time Control','elo_diff',xScore("elo_diff").alias("expected_score_fide"))

In [0]:
def indexStringColumns(df, cols):
    # variable newdf will be updated several times
    newdf = df
    
    for c in cols:
        # For each given colum, fits StringIndexerModel.
        si = StringIndexer(inputCol=c, outputCol=c+"-num").setHandleInvalid("keep")
        sm = si.fit(newdf)
        
        # Creates a DataFame by putting the transformed values in the new colum with suffix "-num" 
        # and then drops the original columns.
        # and drop the "-num" suffix. 
        newdf = sm.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c+"-num", c)
    return newdf

def oneHotEncodeColumns(df, cols):
    newdf = df
    for c in cols:
        # For each given colum, create OneHotEncoder. 
        # dropLast : Whether to drop the last category in the encoded vector (default: true)
        ohe = OneHotEncoder(inputCol=c, outputCol=c+"-onehot", dropLast=False)
        ohe_model = ohe.fit(newdf)
        #Creates a DataFame by putting the transformed values in the new colum with suffix "-onehot" 
        #and then drops the original columns.
        #and drop the "-onehot" suffix. 
        newdf = ohe_model.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c+"-onehot", c)
    return newdf

In [0]:
categorical_cols = ["Time Class","Time Control"]
df_elo_str = indexStringColumns(df_elo, categorical_cols)

In [0]:
df_elo_ohe = oneHotEncodeColumns(df_elo_str, categorical_cols)

In [0]:
def convert_res_to_binary(x):
    if x == '1-0':
        return 1
    elif x == '0-1':
        return 0
    elif '5' in x:
        return 2

result_conv = udf(convert_res_to_binary, IntegerType())

In [0]:
df_elo = df_elo_ohe.withColumn('result_int',result_conv('Result'))

In [0]:
def convert_to_int(x):
    try:
        return int(x)
    except ValueError:
        return None
    
int_conv = udf(convert_to_int, IntegerType())

In [0]:
df_elo = df_elo.select(int_conv('Black Elo').alias('Black Elo'), int_conv('White Elo').alias('White Elo'), 'Result', 'elo_diff', 'expected_score_fide', 'Time Class', 'Time Control', 'result_int')

In [0]:
df_elo = df_elo.where(df_elo.result_int != 2)

In [0]:
df_elo.show()

+---------+---------+------+--------+-------------------+-------------+--------------+----------+
|Black Elo|White Elo|Result|elo_diff|expected_score_fide|   Time Class|  Time Control|result_int|
+---------+---------+------+--------+-------------------+-------------+--------------+----------+
|     2350|     2500|   1-0|  -150.0|          0.2997916|(5,[2],[1.0])|(27,[4],[1.0])|         1|
|     2646|     2331|   0-1|   315.0|          0.8648778|(5,[2],[1.0])|(27,[4],[1.0])|         0|
|     2287|     2317|   0-1|   -30.0|          0.4581879|(5,[2],[1.0])|(27,[4],[1.0])|         0|
|     2440|     2406|   1-0|    34.0|          0.5473623|(5,[2],[1.0])|(27,[4],[1.0])|         1|
|     2386|     2544|   1-0|  -158.0|         0.29013172|(5,[2],[1.0])|(27,[4],[1.0])|         1|
|     2778|     2746|   1-0|    32.0|         0.54458827|(5,[0],[1.0])|(27,[0],[1.0])|         1|
|     2646|     2736|   0-1|   -90.0|         0.37638083|(5,[0],[1.0])|(27,[0],[1.0])|         0|
|     2767|     2665

In [0]:
va = VectorAssembler(outputCol="features", inputCols=["Black Elo", "White Elo", "elo_diff", "Time Class", "Time Control"])
va_df = va.transform(df_elo).select("features", "result_int").withColumnRenamed("result_int", "label")

## Random Forest

In [0]:
va_df.show(1, truncate=False)

+------------------------------------------------+-----+
|features                                        |label|
+------------------------------------------------+-----+
|(35,[0,1,2,5,12],[2350.0,2500.0,-150.0,1.0,1.0])|1    |
+------------------------------------------------+-----+
only showing top 1 row



In [0]:
splits = va_df.randomSplit([0.8, 0.2])

train = splits[0].cache()
validation = splits[1].cache()

In [0]:
rf = RandomForestClassifier(maxDepth=25)

In [0]:
rf_model = rf.fit(train)

In [0]:
rf_model.featureImportances

Out[23]: SparseVector(35, {0: 0.1827, 1: 0.137, 2: 0.6033, 3: 0.006, 4: 0.0047, 5: 0.0068, 6: 0.0023, 8: 0.0068, 9: 0.0053, 10: 0.0092, 11: 0.0054, 12: 0.0039, 13: 0.0049, 14: 0.0013, 15: 0.0065, 16: 0.0021, 17: 0.0016, 18: 0.0018, 19: 0.0023, 20: 0.001, 21: 0.0013, 23: 0.0004, 24: 0.0001, 25: 0.0007, 26: 0.0008, 27: 0.0001, 29: 0.0002, 30: 0.0007, 32: 0.0008})

In [0]:
print(rf_model.toDebugString)

RandomForestClassificationModel: uid=RandomForestClassifier_9e39c7e3a3d9, numTrees=20, numClasses=2, numFeatures=35
  Tree 0 (weight 1.0):
    If (feature 5 in {1.0})
     If (feature 2 <= 20.5)
      If (feature 2 <= -97.5)
       Predict: 1.0
      Else (feature 2 > -97.5)
       If (feature 1 <= 2474.5)
        Predict: 1.0
       Else (feature 1 > 2474.5)
        If (feature 1 <= 2596.5)
         Predict: 0.0
        Else (feature 1 > 2596.5)
         If (feature 12 in {1.0})
          Predict: 0.0
         Else (feature 12 not in {1.0})
          Predict: 1.0
     Else (feature 2 > 20.5)
      Predict: 0.0
    Else (feature 5 not in {1.0})
     If (feature 29 in {1.0})
      Predict: 1.0
     Else (feature 29 not in {1.0})
      If (feature 20 in {1.0})
       If (feature 2 <= -349.5)
        Predict: 1.0
       Else (feature 2 > -349.5)
        Predict: 0.0
      Else (feature 20 not in {1.0})
       If (feature 1 <= 2814.5)
        If (feature 2 <= -9.5)
         If (feature 0 <

In [0]:
rf_preds = rf_model.transform(validation)

In [0]:
evaluator = MulticlassClassificationEvaluator()\
                .setLabelCol("label")\
                .setPredictionCol("prediction")

In [0]:
evaluator.setMetricName('accuracy')
print("Accuracy : %s" % evaluator.evaluate(rf_preds))

Accuracy : 0.6820040899795501


In [0]:
bceval = BinaryClassificationEvaluator()
print (bceval.getMetricName() +":" + str(bceval.evaluate(rf_preds)))

areaUnderROC:0.7583279162909683


In [0]:
bceval.setMetricName("areaUnderPR")
print (bceval.getMetricName() +":" + str(bceval.evaluate(rf_preds)))

areaUnderPR:0.7728006863442387


In [0]:
metric_name = "f1"
metrics = MulticlassClassificationEvaluator()\
                .setLabelCol("label")\
                .setPredictionCol("prediction")
metrics.setMetricName(metric_name) 

metrics.evaluate(rf_preds)

Out[30]: 0.6807248394719112

## Random Forest (Black Elo, White Elo, Elo Diff, Time Class, Time Control)

In [0]:
# Use to get model accuracy
evaluator = MulticlassClassificationEvaluator()\
                .setLabelCol("label")\
                .setPredictionCol("prediction")
evaluator.setMetricName('accuracy')

In [0]:
rf = RandomForestClassifier()
paramGrid = ParamGridBuilder()\
                .addGrid(rf.numTrees, [10, 20, 30, 40, 50])\
                .addGrid(rf.minInstancesPerNode, [1, 2, 3, 4, 5, 10])\
                .addGrid(rf.maxDepth, [2,3,4,5,10,15,20,30])\
                .build()

cv = CrossValidator(estimator=rf, 
                    evaluator=evaluator, 
                    numFolds=5, 
                    estimatorParamMaps=paramGrid)

In [0]:
# Fit a cross-validated Random Forest model on training data
cvmodel = cv.fit(train)
rfpredicts = cvmodel.bestModel.transform(validation)

In [0]:
print("Best Max Depth : %s" % cvmodel.bestModel.getMaxDepth)

In [0]:
print("Best Num Trees : %s" % cvmodel.bestModel.getNumTrees)

In [0]:
print("Accuracy : %s" % evaluator.evaluate(rfpredicts))

In [0]:
bceval = BinaryClassificationEvaluator()
print (bceval.getMetricName() +":" + str(bceval.evaluate(rfpredicts)))

In [0]:
bceval.setMetricName("areaUnderPR")
print (bceval.getMetricName() +":" + str(bceval.evaluate(rfpredicts)))

In [0]:
metric_name = "f1"
metrics = MulticlassClassificationEvaluator()\
                .setLabelCol("label")\
                .setPredictionCol("prediction")
metrics.setMetricName(metric_name) 

metrics.evaluate(rfpredicts)

## Random Forest (Black Elo, White Elo)

In [0]:
va = VectorAssembler(outputCol="features", inputCols=["Black Elo","White Elo"])
va_df = va.transform(df_elo).select("features", "result_int").withColumnRenamed("result_int", "label")

In [0]:
splits = va_df.randomSplit([0.8, 0.2])

train = splits[0].cache()
validation = splits[1].cache()

In [0]:
evaluator = MulticlassClassificationEvaluator()\
                .setLabelCol("label")\
                .setPredictionCol("prediction")

In [0]:
rf = RandomForestClassifier()
paramGrid = ParamGridBuilder()\
                .addGrid(rf.numTrees, [10, 20, 30, 40, 50])\
                .addGrid(rf.minInstancesPerNode, [1, 2, 3, 4, 5, 10])\
                .addGrid(rf.maxDepth, [2,3,4,5,10,15,20,30])\
                .build()

cv = CrossValidator(estimator=rf, 
                    evaluator=evaluator, 
                    numFolds=5, 
                    estimatorParamMaps=paramGrid)

In [0]:
# Fit a cross-validated Random Forest model on training data
cvmodel = cv.fit(train)
rfpredicts = cvmodel.bestModel.transform(validation)

In [0]:
print("Best Max Depth : %s" % cvmodel.bestModel.getMaxDepth)
print("Best Num Trees : %s" % cvmodel.bestModel.getNumTrees)

Best Max Depth : <bound method _DecisionTreeParams.getMaxDepth of RandomForestClassificationModel: uid=RandomForestClassifier_1953c16dd364, numTrees=20, numClasses=2, numFeatures=2>
Best Num Trees : 20


In [0]:
print("Accuracy : %s" % evaluator.evaluate(rfpredicts))
bceval = BinaryClassificationEvaluator()
print (bceval.getMetricName() +":" + str(bceval.evaluate(rfpredicts)))
bceval.setMetricName("areaUnderPR")
print (bceval.getMetricName() +":" + str(bceval.evaluate(rfpredicts)))

Accuracy : 0.6841491306900274
areaUnderROC:0.7509996424808062
areaUnderPR:0.7938620582628833


In [0]:
metric_name = "f1"
metrics = MulticlassClassificationEvaluator()\
                .setLabelCol("label")\
                .setPredictionCol("prediction")
metrics.setMetricName(metric_name) 

metrics.evaluate(rfpredicts)

## Random Forest (Elo Diff)

In [0]:
va = VectorAssembler(outputCol="features", inputCols=["elo_diff"])
va_df = va.transform(df_elo).select("features", "result_int").withColumnRenamed("result_int", "label")

In [0]:
splits = va_df.randomSplit([0.8, 0.2])

train = splits[0].cache()
validation = splits[1].cache()

In [0]:
evaluator = MulticlassClassificationEvaluator()\
                .setLabelCol("label")\
                .setPredictionCol("prediction")

In [0]:
rf = RandomForestClassifier()
paramGrid = ParamGridBuilder()\
                .addGrid(rf.numTrees, [10, 20, 30, 40, 50])\
                .addGrid(rf.minInstancesPerNode, [1, 2, 3, 4, 5, 10])\
                .addGrid(rf.maxDepth, [2,3,4,5,10,15,20,30])\
                .build()

cv = CrossValidator(estimator=rf, 
                    evaluator=evaluator, 
                    numFolds=5, 
                    estimatorParamMaps=paramGrid)

In [0]:
# Fit a cross-validated Random Forest model on training data
cvmodel = cv.fit(train)
rfpredicts = cvmodel.bestModel.transform(validation)

In [0]:
print("Best Max Depth : %s" % cvmodel.bestModel.getMaxDepth)
print("Best Num Trees : %s" % cvmodel.bestModel.getNumTrees)

In [0]:
print("Accuracy : %s" % evaluator.evaluate(rfpredicts))
bceval = BinaryClassificationEvaluator()
print (bceval.getMetricName() +":" + str(bceval.evaluate(rfpredicts)))
bceval.setMetricName("areaUnderPR")
print (bceval.getMetricName() +":" + str(bceval.evaluate(rfpredicts)))

In [0]:
metric_name = "f1"
metrics = MulticlassClassificationEvaluator()\
                .setLabelCol("label")\
                .setPredictionCol("prediction")
metrics.setMetricName(metric_name) 

metrics.evaluate(rfpredicts)