# Task 3 - Chess ML Project - Kevin Kimmel
## Produce an ML model capable of predicting chess game outcomes better than Elo

### Import packages

In [None]:
from datetime import datetime

import numpy as np

##Pre-Processing
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler

##Models
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import LogisticRegression

##Evaluation
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder

import math

from pyspark.ml.classification import LinearSVC
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler

### Start up spark and connect to Mongo

In [None]:
# Build (or reuse) the Spark session. The MongoDB Spark connector is pulled in
# through spark.jars.packages, and the network/heartbeat timeouts are raised.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.2")
    .config("spark.network.timeout", "36000000s")
    .config("spark.executor.heartbeatInterval", "3600s")
    .getOrCreate()
)

In [None]:
import os
from urllib.parse import quote_plus

# Location of the two MongoDB collections read by this notebook.
database = 'MongoDBAtlas'
collection_pos = 'pos_evals'
collection_elo = 'elo_eval'
address = 'chesscluster.ar0uw.mongodb.net'

# NOTE(security): the fallback credentials below are committed in plain text.
# Set CHESS_MONGO_USER / CHESS_MONGO_PASSWORD in the environment to override
# them; the committed password should be rotated and the fallbacks removed.
user_name = os.environ.get('CHESS_MONGO_USER', 'Kevin')
password = os.environ.get('CHESS_MONGO_PASSWORD', 'chess1')

# Percent-encode the credentials so characters such as '@', ':' or '/' cannot
# corrupt the URI. For the fallback values the encoded text is unchanged.
_encoded_user = quote_plus(user_name)
_encoded_password = quote_plus(password)

connection_string_pos = f"mongodb+srv://{_encoded_user}:{_encoded_password}@{address}/{database}.{collection_pos}"
connection_string_elo = f"mongodb+srv://{_encoded_user}:{_encoded_password}@{address}/{database}.{collection_elo}"

In [None]:
# Load both MongoDB collections as DataFrames through the "mongo" data source.
df_pos = (
    spark.read
    .format("mongo")
    .option("uri", connection_string_pos)
    .load()
)
df_eval = (
    spark.read
    .format("mongo")
    .option("uri", connection_string_elo)
    .load()
)

## Data Processing and Feature Engineering

In [None]:
# Rating gap seen from White's side: positive when White is the higher-rated player.
df_eval = df_eval.withColumn(
    'elo_diff',
    df_eval['White Elo'] - df_eval['Black Elo'],
)

In [None]:
def calculate_fide_expected_score(x):
    """Return the expected score for a rating difference ``x``.

    This is the formula FIDE (the governing body of chess) uses to calculate
    the expected score of a game: a normal CDF with standard deviation 2000/7
    evaluated at ``x``, written via the complementary error function.

    Args:
        x: Rating difference, or None when it is missing.

    Returns:
        A float in [0, 1] (0.5 for ``x == 0``), or None when ``x`` is None so
        that a missing rating difference stays null instead of raising.
    """
    if x is None:
        # Null inputs reach a Python UDF as None; arithmetic on None would raise TypeError.
        return None
    return math.erfc(-x / ((2000.0/7) * math.sqrt(2))) / 2

# Wrap the expected-score function as a Spark UDF that returns a float column.
xScore = udf(f=calculate_fide_expected_score, returnType=FloatType())

In [None]:
# Keep only the columns used downstream and append the expected score
# derived from elo_diff.
df_eval = df_eval.select(
    'Black Elo',
    'White Elo',
    'Result',
    'Time Class',
    'Time Control',
    'elo_diff',
    xScore("elo_diff").alias("expected_score_fide"),
)

In [None]:
def indexStringColumns(df, cols):
    """Replace each listed string column with its StringIndexer encoding.

    For every name in ``cols`` a StringIndexer (handleInvalid="keep") is
    fitted on the running DataFrame, the indexed values are written to a
    temporary "<name>-num" column, the original column is dropped, and the
    temporary column is renamed back to ``<name>``.

    Args:
        df: Input DataFrame.
        cols: Names of the columns to index.

    Returns:
        The DataFrame with the listed columns replaced; ``df`` itself when
        ``cols`` is empty.
    """
    indexed = df
    for name in cols:
        numeric_name = name + "-num"
        indexer = StringIndexer(inputCol=name, outputCol=numeric_name).setHandleInvalid("keep")
        fitted = indexer.fit(indexed)
        # Swap the original column for its indexed version under the same name.
        indexed = (
            fitted.transform(indexed)
            .drop(name)
            .withColumnRenamed(numeric_name, name)
        )
    return indexed

def oneHotEncodeColumns(df, cols):
    """Replace each listed column with its one-hot encoded vector.

    For every name in ``cols`` a OneHotEncoder is fitted on the running
    DataFrame, the encoded vector is written to a temporary "<name>-onehot"
    column, the original column is dropped, and the temporary column is
    renamed back to ``<name>``.

    Args:
        df: Input DataFrame.
        cols: Names of the columns to encode.

    Returns:
        The DataFrame with the listed columns replaced; ``df`` itself when
        ``cols`` is empty.
    """
    encoded = df
    for name in cols:
        onehot_name = name + "-onehot"
        # dropLast=False: do not drop the last category from the encoded vector.
        encoder = OneHotEncoder(inputCol=name, outputCol=onehot_name, dropLast=False)
        fitted = encoder.fit(encoded)
        # Swap the original column for its encoded version under the same name.
        encoded = (
            fitted.transform(encoded)
            .drop(name)
            .withColumnRenamed(onehot_name, name)
        )
    return encoded

In [None]:
# String-valued columns that are indexed here and one-hot encoded afterwards.
categorical_cols = ["Time Class", "Time Control"]
df_eval_sti = indexStringColumns(df=df_eval, cols=categorical_cols)

In [None]:
# Turn the indexed categorical columns into one-hot vectors.
df_eval_ohe = oneHotEncodeColumns(df=df_eval_sti, cols=categorical_cols)

In [None]:
# convert the result string to a number for easier encoding

def convert_res_to_binary(x):
    """Map a game result string to an integer class label.

    Args:
        x: Result string such as '1-0', '0-1' or a draw marker; may be None.

    Returns:
        1 if White won ('1-0'), 0 if Black won ('0-1'), 2 for a draw
        (any string containing '5', e.g. '0.5-0.5', or the PGN token
        '1/2-1/2'), and None for missing or unrecognised results.
    """
    if not isinstance(x, str):
        # A null/non-string result would make the "'5' in x" test raise TypeError.
        return None
    if x == '1-0':  # White won
        return 1
    if x == '0-1':  # Black won
        return 0
    if '5' in x or x == '1/2-1/2':  # draw
        return 2
    # Unfinished or unknown result (e.g. '*').
    return None

# Wrap the result mapping as a Spark UDF that returns an integer column.
result_conv = udf(f=convert_res_to_binary, returnType=IntegerType())

In [None]:
# Add the integer-encoded game result next to the original 'Result' string.
df_eval = df_eval_ohe.withColumn(
    'result_int',
    result_conv('Result'),
)

In [None]:
# create a safe convert to int
def convert_to_int(x):
    """Convert ``x`` to an int, returning None when it cannot be converted.

    Args:
        x: Value to convert (for example a rating stored as a string); may be None.

    Returns:
        ``int(x)``, or None when ``x`` is None, is not a valid integer
        literal, or is a non-finite float.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported type; ValueError: bad literal or NaN;
        # OverflowError: infinite float.
        return None
    
# Wrap the safe integer conversion as a Spark UDF that returns an integer column.
int_conv = udf(f=convert_to_int, returnType=IntegerType())

In [None]:
# select the values we want to use in the model
# Both rating columns go through the safe integer conversion; every other
# column is carried over unchanged.
df_eval = df_eval.select(
    int_conv('Black Elo').alias('Black Elo'),
    int_conv('White Elo').alias('White Elo'),
    'Result',
    'elo_diff',
    'expected_score_fide',
    'Time Class',
    'Time Control',
    'result_int',
)

In [None]:
# get rid of draws, we are only looking at wins and losses
# Rows labelled 2 (draws) are removed so only decisive games remain.
df_eval = df_eval.filter(df_eval['result_int'] != 2)

### What does df look like after all the feature engineering?

In [None]:
# Print the first 20 rows, with long cell values truncated.
df_eval.show(n=20, truncate=True)

## Metrics from using all of the given features

In [None]:
# Experiment: LinearSVC trained on all of the engineered features.

# create vector assembler and scaler: the assembler packs the input columns into
# "va_features", the scaler rescales that vector into "features"
va = VectorAssembler(outputCol="va_features", inputCols=["Black Elo", "White Elo","elo_diff", "Time Class", "Time Control"])
scaler = StandardScaler(inputCol = "va_features", outputCol="features")

# create train/validation sets (80/20); the fixed seed makes the split
# repeatable across reruns, so metric changes are not caused by a different
# random partition of the data
splits = df_eval.randomSplit([0.8, 0.2], seed=42)
train = splits[0].cache()
validation = splits[1].cache()

# get the transformed datasets from the va; result_int becomes the "label" column
train_transformed = va.transform(train).select("va_features", "result_int").withColumnRenamed("result_int", "label")
val_transformed = va.transform(validation).select("va_features", "result_int").withColumnRenamed("result_int", "label")

# make the SVM model
svm = LinearSVC(maxIter=100, regParam=0.1)

# fit the scaler on the training data only, so validation statistics do not leak in
scaler_model = scaler.fit(train_transformed)

# scale both sets with the training-set scaler
scaled_data = scaler_model.transform(train_transformed)
scaled_val = scaler_model.transform(val_transformed)

# fit on the scaled training data and predict on the scaled validation data
model = svm.fit(scaled_data)
predictions = model.transform(scaled_val)

# calculate accuracy
evaluator = MulticlassClassificationEvaluator().setMetricName('accuracy')
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

# calculate PR
evaluator = BinaryClassificationEvaluator().setMetricName('areaUnderPR')
areaunderPR = evaluator.evaluate(predictions)
print("Area under PR:", areaunderPR)

# calculate ROC
evaluator = BinaryClassificationEvaluator().setMetricName('areaUnderROC')
areaunderROC = evaluator.evaluate(predictions)
print("Area under ROC:", areaunderROC)

## Metrics from using only "Black Elo" and "White Elo"

In [None]:
# Experiment: LinearSVC trained on the two raw ratings only.

# create vector assembler and scaler: the assembler packs the input columns into
# "va_features", the scaler rescales that vector into "features"
va = VectorAssembler(outputCol="va_features", inputCols=["Black Elo", "White Elo"])
scaler = StandardScaler(inputCol = "va_features", outputCol="features")

# create train/validation sets (80/20); the fixed seed makes the split
# repeatable across reruns, so metric changes are not caused by a different
# random partition of the data
splits = df_eval.randomSplit([0.8, 0.2], seed=42)
train = splits[0].cache()
validation = splits[1].cache()

# get the transformed datasets from the va; result_int becomes the "label" column
train_transformed = va.transform(train).select("va_features", "result_int").withColumnRenamed("result_int", "label")
val_transformed = va.transform(validation).select("va_features", "result_int").withColumnRenamed("result_int", "label")

# make the SVM model
svm = LinearSVC(maxIter=100, regParam=0.1)

# fit the scaler on the training data only, so validation statistics do not leak in
scaler_model = scaler.fit(train_transformed)

# scale both sets with the training-set scaler
scaled_data = scaler_model.transform(train_transformed)
scaled_val = scaler_model.transform(val_transformed)

# fit on the scaled training data and predict on the scaled validation data
model = svm.fit(scaled_data)
predictions = model.transform(scaled_val)

# calculate accuracy
evaluator = MulticlassClassificationEvaluator().setMetricName('accuracy')
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

# calculate PR
evaluator = BinaryClassificationEvaluator().setMetricName('areaUnderPR')
areaunderPR = evaluator.evaluate(predictions)
print("Area under PR:", areaunderPR)

# calculate ROC
evaluator = BinaryClassificationEvaluator().setMetricName('areaUnderROC')
areaunderROC = evaluator.evaluate(predictions)
print("Area under ROC:", areaunderROC)

## Metrics from using only "elo_diff"

In [None]:
# Experiment: LinearSVC trained on the rating difference only.

# create vector assembler and scaler: the assembler packs the input column into
# "va_features", the scaler rescales that vector into "features"
va = VectorAssembler(outputCol="va_features", inputCols=["elo_diff"])
scaler = StandardScaler(inputCol = "va_features", outputCol="features")

# create train/validation sets (80/20); the fixed seed makes the split
# repeatable across reruns, so metric changes are not caused by a different
# random partition of the data
splits = df_eval.randomSplit([0.8, 0.2], seed=42)
train = splits[0].cache()
validation = splits[1].cache()

# get the transformed datasets from the va; result_int becomes the "label" column
train_transformed = va.transform(train).select("va_features", "result_int").withColumnRenamed("result_int", "label")
val_transformed = va.transform(validation).select("va_features", "result_int").withColumnRenamed("result_int", "label")

# make the SVM model
svm = LinearSVC(maxIter=100, regParam=0.1)

# fit the scaler on the training data only, so validation statistics do not leak in
scaler_model = scaler.fit(train_transformed)

# scale both sets with the training-set scaler
scaled_data = scaler_model.transform(train_transformed)
scaled_val = scaler_model.transform(val_transformed)

# fit on the scaled training data and predict on the scaled validation data
model = svm.fit(scaled_data)
predictions = model.transform(scaled_val)

# calculate accuracy
evaluator = MulticlassClassificationEvaluator().setMetricName('accuracy')
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

# calculate PR
evaluator = BinaryClassificationEvaluator().setMetricName('areaUnderPR')
areaunderPR = evaluator.evaluate(predictions)
print("Area under PR:", areaunderPR)

# calculate ROC
evaluator = BinaryClassificationEvaluator().setMetricName('areaUnderROC')
areaunderROC = evaluator.evaluate(predictions)
print("Area under ROC:", areaunderROC)

## Conclusion

Training on "elo_diff" alone gives metrics very similar to training on all of the features. Since simpler models tend to be preferable, we should probably use only "elo_diff" if we choose the SVM model.