In [1]:
# findspark locates the local Spark installation and makes `pyspark`
# importable; init() is called here before pyspark is imported in the next cell.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session with a local master ("local[*]").
session_builder = SparkSession.builder.master("local[*]")
spark = session_builder.getOrCreate()

In [3]:
import os

In [4]:
# Load the games CSV with a header row and inferred column types.
# DROPMALFORMED discards rows that cannot be parsed instead of failing.
data = spark.read.csv(
    "games.csv",
    header=True,
    inferSchema=True,
    mode="DROPMALFORMED",
)

In [5]:
# Show the column names and the types inferred from the CSV.
data.printSchema()

root
 |-- id: string (nullable = true)
 |-- rated: boolean (nullable = true)
 |-- created_at: double (nullable = true)
 |-- last_move_at: double (nullable = true)
 |-- turns: integer (nullable = true)
 |-- victory_status: string (nullable = true)
 |-- winner: string (nullable = true)
 |-- increment_code: string (nullable = true)
 |-- white_id: string (nullable = true)
 |-- white_rating: integer (nullable = true)
 |-- black_id: string (nullable = true)
 |-- black_rating: integer (nullable = true)
 |-- moves: string (nullable = true)
 |-- opening_eco: string (nullable = true)
 |-- opening_name: string (nullable = true)
 |-- opening_ply: integer (nullable = true)



In [6]:
# Number of rows that were loaded.
data.count()

20058

Check for null values

In [7]:
from pyspark.sql.functions import isnan, when, count, col

# NaN is not NULL in Spark, so isNull() alone misses NaN values in
# floating-point columns. isnan() is only applied to float/double columns.
floating_columns = {name for name, dtype in data.dtypes if dtype in ("float", "double")}


def missing_condition(name):
    """Return a Column that is true where `name` is NULL (or NaN for floating-point columns)."""
    is_missing = col(name).isNull()
    if name in floating_columns:
        is_missing = is_missing | isnan(col(name))
    return is_missing


# One output column per input column, holding its count of missing values.
data.select(
    [count(when(missing_condition(c), c)).alias(c) for c in data.columns]
).show()

+---+-----+----------+------------+-----+--------------+------+--------------+--------+------------+--------+------------+-----+-----------+------------+-----------+
| id|rated|created_at|last_move_at|turns|victory_status|winner|increment_code|white_id|white_rating|black_id|black_rating|moves|opening_eco|opening_name|opening_ply|
+---+-----+----------+------------+-----+--------------+------+--------------+--------+------------+--------+------------+-----+-----------+------------+-----------+
|  0|    0|         0|           0|    0|             0|     0|             0|       0|           0|       0|           0|    0|          0|           0|          0|
+---+-----+----------+------------+-----+--------------+------+--------------+--------+------------+--------+------------+-----+-----------+------------+-----------+



Remove games that ended in a draw

In [8]:
# Register the DataFrame as the temporary view "data" so it can be queried with Spark SQL.
data.createOrReplaceTempView("data")

In [9]:
# Keep only the games whose `winner` is not 'draw', using the "data" temp view.
non_draw_query = "SELECT * FROM data WHERE winner != 'draw'"
data = spark.sql(non_draw_query)

In [10]:
# Row count after removing the drawn games.
data.count()

19108

Remove unnecessary columns (the row count is unchanged)

In [11]:
# Drop the columns that are not needed, then re-check the row count.
unused_columns = ["id", "victory_status", "moves", "rated"]
data = data.drop(*unused_columns)
data.count()

19108

Index the categorical (string) columns

In [12]:
# Schema after dropping columns; the string columns listed here are indexed in the next cell.
data.printSchema()

root
 |-- created_at: double (nullable = true)
 |-- last_move_at: double (nullable = true)
 |-- turns: integer (nullable = true)
 |-- winner: string (nullable = true)
 |-- increment_code: string (nullable = true)
 |-- white_id: string (nullable = true)
 |-- white_rating: integer (nullable = true)
 |-- black_id: string (nullable = true)
 |-- black_rating: integer (nullable = true)
 |-- opening_eco: string (nullable = true)
 |-- opening_name: string (nullable = true)
 |-- opening_ply: integer (nullable = true)



In [13]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline

# Every string-typed column is treated as categorical and label-encoded.
categoricalColumns = [name for name, dtype in data.dtypes if dtype.startswith('string')]

# One StringIndexer per categorical column; each writes "<column>_index".
stages = [
    StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + '_index')
    for categoricalCol in categoricalColumns
]

pipeline = Pipeline(stages=stages)

pipelineModel = pipeline.fit(data)
data = pipelineModel.transform(data)

# Drop exactly the columns that were indexed. Deriving the list from
# categoricalColumns keeps it in sync with the schema instead of repeating
# the column names by hand.
if categoricalColumns:
    data = data.drop(*categoricalColumns)
data.show()

+----------+------------+-----+------------+------------+-----------+------------+--------------------+--------------+--------------+-----------------+------------------+
|created_at|last_move_at|turns|white_rating|black_rating|opening_ply|winner_index|increment_code_index|white_id_index|black_id_index|opening_eco_index|opening_name_index|
+----------+------------+-----+------------+------------+-----------+------------+--------------------+--------------+--------------+-----------------+------------------+
|1.50421E12|  1.50421E12|   13|        1500|        1191|          5|         0.0|                21.0|        3068.0|        1114.0|             44.0|             226.0|
|1.50413E12|  1.50413E12|   16|        1322|        1261|          4|         1.0|                16.0|        2131.0|        7830.0|              7.0|             613.0|
|1.50413E12|  1.50413E12|   61|        1496|        1500|          3|         0.0|                16.0|        4941.0|        1114.0|            

Min-max scaling of the numeric columns

In [14]:
# Schema after indexing: the original numeric columns plus the new "*_index" columns.
data.printSchema()

root
 |-- created_at: double (nullable = true)
 |-- last_move_at: double (nullable = true)
 |-- turns: integer (nullable = true)
 |-- white_rating: integer (nullable = true)
 |-- black_rating: integer (nullable = true)
 |-- opening_ply: integer (nullable = true)
 |-- winner_index: double (nullable = false)
 |-- increment_code_index: double (nullable = false)
 |-- white_id_index: double (nullable = false)
 |-- black_id_index: double (nullable = false)
 |-- opening_eco_index: double (nullable = false)
 |-- opening_name_index: double (nullable = false)



In [15]:
from pyspark.ml.feature import MinMaxScaler, VectorAssembler

columns_to_scale = ["created_at", "last_move_at", "turns", "white_rating", "black_rating", "opening_ply"]

# MinMaxScaler operates on vector columns, so each numeric column is first
# wrapped in its own single-element vector ("<column>_vec") and then rescaled
# into "<column>_scaled" (MinMaxScaler's default output range is [0, 1]).
assemblers = [VectorAssembler(inputCols=[name], outputCol=name + "_vec") for name in columns_to_scale]
scalers = [MinMaxScaler(inputCol=name + "_vec", outputCol=name + "_scaled") for name in columns_to_scale]
pipeline = Pipeline(stages=assemblers + scalers)
scalerModel = pipeline.fit(data)
scaledData = scalerModel.transform(data)

# Keep only the scaled versions: drop the raw columns and the intermediate
# "*_vec" columns. The list is derived from columns_to_scale so it cannot
# drift from what the pipeline actually created.
intermediate_columns = columns_to_scale + [name + "_vec" for name in columns_to_scale]
data = scaledData.drop(*intermediate_columns)
data.show()

+------------+--------------------+--------------+--------------+-----------------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|winner_index|increment_code_index|white_id_index|black_id_index|opening_eco_index|opening_name_index|   created_at_scaled| last_move_at_scaled|        turns_scaled| white_rating_scaled| black_rating_scaled|  opening_ply_scaled|
+------------+--------------------+--------------+--------------+-----------------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|         0.0|                21.0|        3068.0|        1114.0|             44.0|             226.0|[0.9977831158695807]|[0.9977777724801767]|[0.03448275862068...|[0.3736951983298538]|[0.20785935884177...|[0.14814814814814...|
|         1.0|                16.0|        2131.0|        7830.0|              7.0| 

In [16]:
# Row count after scaling.
data.count()

19108

Implement Neural Network

In [17]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

In [18]:
# Layer sizes passed to MultilayerPerceptronClassifier: 11 inputs (one per
# column assembled into "features"), hidden layers of 7 and 4 units, and
# 2 outputs.
layers = [11, 7, 4, 2]

In [19]:
# Columns that are combined, in this order, into the single "features" vector.
feature_columns = [
    "increment_code_index",
    "white_id_index",
    "black_id_index",
    "opening_eco_index",
    "opening_name_index",
    "created_at_scaled",
    "last_move_at_scaled",
    "turns_scaled",
    "white_rating_scaled",
    "black_rating_scaled",
    "opening_ply_scaled",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [20]:
# Append the assembled "features" vector column to the data.
output = assembler.transform(data)

In [21]:
# Schema of the assembled DataFrame.
output.printSchema()

root
 |-- winner_index: double (nullable = false)
 |-- increment_code_index: double (nullable = false)
 |-- white_id_index: double (nullable = false)
 |-- black_id_index: double (nullable = false)
 |-- opening_eco_index: double (nullable = false)
 |-- opening_name_index: double (nullable = false)
 |-- created_at_scaled: vector (nullable = true)
 |-- last_move_at_scaled: vector (nullable = true)
 |-- turns_scaled: vector (nullable = true)
 |-- white_rating_scaled: vector (nullable = true)
 |-- black_rating_scaled: vector (nullable = true)
 |-- opening_ply_scaled: vector (nullable = true)
 |-- features: vector (nullable = true)



In [22]:
# Preview the assembled rows.
output.show()

+------------+--------------------+--------------+--------------+-----------------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|winner_index|increment_code_index|white_id_index|black_id_index|opening_eco_index|opening_name_index|   created_at_scaled| last_move_at_scaled|        turns_scaled| white_rating_scaled| black_rating_scaled|  opening_ply_scaled|            features|
+------------+--------------------+--------------+--------------+-----------------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|         0.0|                21.0|        3068.0|        1114.0|             44.0|             226.0|[0.9977831158695807]|[0.9977777724801767]|[0.03448275862068...|[0.3736951983298538]|[0.20785935884177...|[0.14814814814814...|[21.0,3068.0,1114...|


In [23]:
# Drop the individual input columns and rename the indexed winner column to
# "label", leaving only "label" and "features".
assembled_inputs = [
    "increment_code_index",
    "white_id_index",
    "black_id_index",
    "opening_eco_index",
    "opening_name_index",
    "created_at_scaled",
    "last_move_at_scaled",
    "turns_scaled",
    "white_rating_scaled",
    "black_rating_scaled",
    "opening_ply_scaled",
]
data = output.drop(*assembled_inputs).withColumnRenamed("winner_index", "label")

In [24]:
# Remove duplicate (label, features) rows.
# NOTE: dropping duplicates slows the program down a lot.
data = data.dropDuplicates()
# truncate=False prints the full feature vectors instead of cutting them off.
data.show(truncate=False)

+-----+---------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                           |
+-----+---------------------------------------------------------------------------------------------------------------------------------------------------+
|1.0  |[12.0,0.0,705.0,9.0,33.0,0.9941032345580497,0.994097904214679,0.09482758620689655,0.2635699373695198,0.35935884177869704,0.18518518518518517]      |
|0.0  |[0.0,3656.0,6589.0,19.0,127.0,0.9803232534765721,0.9803179719864322,0.27586206896551724,0.3736951983298538,0.2942088934850052,0.18518518518518517] |
|0.0  |[44.0,559.0,2715.0,57.0,1.0,0.7385472217742834,0.7385427974362834,0.08045977011494253,0.38048016701461373,0.34488107549120994,0.14814814814814814] |
|1.0  |[21.0,286.0,6735.0,103.0,281.0,0.9767999628591488,0.97679

In [25]:
# Seeded random split with weights 0.6 (train) and 0.4 (test).
split_weights = [0.6, 0.4]
train, test = data.randomSplit(split_weights, seed=1234)

In [26]:
# Multilayer perceptron configured with the layer sizes defined in `layers`
# and a fixed seed.
trainer = MultilayerPerceptronClassifier(
    featuresCol="features",
    labelCol="label",
    layers=layers,
    maxIter=100,
    blockSize=128,
    seed=1234,
)

In [27]:
# Fit the classifier on the training split.
model = trainer.fit(train)

In [None]:
# Apply the fitted model to the test split.
result = model.transform(test)

In [None]:
# List the columns of the model output.
print(result.columns)

In [None]:
# Schema of the model output.
result.printSchema()

In [None]:
# Preview the model output rows.
result.show()

In [None]:
# Keep only the two columns the evaluator reads.
predictionAndLabels = result.select("prediction", "label")

In [None]:
# Evaluator that computes accuracy from the "prediction" and "label" columns.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="accuracy",
)

In [None]:
# Compute the accuracy on the test predictions and report it.
test_accuracy = evaluator.evaluate(predictionAndLabels)
print("Test set accuracy = " + str(test_accuracy))