In [1]:
import findspark
# Initialise findspark before the first pyspark import (which happens in the next cell).
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Create the notebook-wide Spark session (or reuse one that already exists),
# running against the local master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [3]:
import os

In [4]:
# Read games.csv with a header row and inferred column types.
# The reader runs in DROPMALFORMED mode, so rows that fail to parse are dropped.
data = (
    spark.read
    .option("mode", 'DROPMALFORMED')
    .csv("games.csv", header=True, inferSchema=True)
)

In [5]:
# Inspect the column names and the types inferred by the CSV reader.
data.printSchema()

root
 |-- id: string (nullable = true)
 |-- rated: boolean (nullable = true)
 |-- created_at: double (nullable = true)
 |-- last_move_at: double (nullable = true)
 |-- turns: integer (nullable = true)
 |-- victory_status: string (nullable = true)
 |-- winner: string (nullable = true)
 |-- increment_code: string (nullable = true)
 |-- white_id: string (nullable = true)
 |-- white_rating: integer (nullable = true)
 |-- black_id: string (nullable = true)
 |-- black_rating: integer (nullable = true)
 |-- moves: string (nullable = true)
 |-- opening_eco: string (nullable = true)
 |-- opening_name: string (nullable = true)
 |-- opening_ply: integer (nullable = true)



In [6]:
# Number of rows that survived loading (the reader was set to DROPMALFORMED).
data.count()

20058

In [7]:
# Register the DataFrame as the SQL view "data".
# NOTE(review): no visible cell queries this view before it is re-registered
# further down (just ahead of the draw filter) — this call looks redundant; confirm.
data.createOrReplaceTempView("data")

In [8]:
from pyspark.sql.functions import length

# Keep only the rows whose `moves` string is at least 90 characters long.
# NOTE(review): 90 characters looks like a proxy for "long enough to extract
# the per-move columns later" — confirm the threshold is intentional.
moves_length = length("moves")
data = data.where(moves_length >= 90)

In [9]:
# Row count after discarding games with a short `moves` string.
data.count()

17957

In [10]:
from pyspark.sql.functions import split

# How many leading entries of the move list become their own column.
N_MOVES = 20

# `moves` is split on single spaces; entry i of the resulting array becomes
# the column move<i+1> (move1 ... move20).
split_col = split(data['moves'], ' ')
move_cols = [
    split_col.getItem(i).alias('move' + str(i + 1))
    for i in range(N_MOVES)
]

# Append the per-move columns after all existing columns, then drop the raw
# string, which is no longer needed.
# NOTE(review): a game with fewer than N_MOVES entries is expected to yield
# nulls here (the next cell counts nulls per column) — confirm.
data = data.select('*', *move_cols).drop('moves')
data.show(truncate=False)

+--------+-----+----------+------------+-----+--------------+------+--------------+------------------+------------+------------------+------------+-----------+---------------------------------------------------------------------+-----------+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+------+------+------+------+------+------+------+------+------+
|id      |rated|created_at|last_move_at|turns|victory_status|winner|increment_code|white_id          |white_rating|black_id          |black_rating|opening_eco|opening_name                                                         |opening_ply|move1|move2|move3|move4|move5|move6|move7|move8|move9|move10|move11|move12|move13|move14|move15|move16|move17|move18|move19|move20|
+--------+-----+----------+------------+-----+--------------+------+--------------+------------------+------------+------------------+------------+-----------+---------------------------------------------------------------------+-----------+-----+-----+-

Check for null values

In [11]:
from pyspark.sql.functions import isnan, when, count, col

# Build one aggregate per column that counts the rows where that column is null,
# then show the counts as a single-row table with the original column names.
null_counts = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in data.columns
]
data.select(null_counts).show()

+---+-----+----------+------------+-----+--------------+------+--------------+--------+------------+--------+------------+-----------+------------+-----------+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+------+------+------+------+------+------+------+------+------+
| id|rated|created_at|last_move_at|turns|victory_status|winner|increment_code|white_id|white_rating|black_id|black_rating|opening_eco|opening_name|opening_ply|move1|move2|move3|move4|move5|move6|move7|move8|move9|move10|move11|move12|move13|move14|move15|move16|move17|move18|move19|move20|
+---+-----+----------+------------+-----+--------------+------+--------------+--------+------------+--------+------------+-----------+------------+-----------+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+------+------+------+------+------+------+------+------+------+
|  0|    0|         0|           0|    0|             0|     0|             0|       0|           0|       0|           0|     

Remove games that ended in a draw

In [12]:
# Re-register the view so the SQL query in the next cell sees the current
# DataFrame (with the per-move columns) rather than the originally loaded one.
data.createOrReplaceTempView("data")

In [13]:
# Keep only rows whose winner is not 'draw'.
# Under SQL comparison semantics, rows with a NULL winner are excluded as well.
data = spark.sql("SELECT * FROM data WHERE winner != 'draw'")

In [14]:
# Row count after removing drawn games.
data.count()

17106

Remove unnecessary columns

In [15]:
# Columns that are not used as model inputs.
unused_columns = ["id", "victory_status", "rated"]
data = data.drop(*unused_columns)

# Dropping columns must not change the number of rows.
data.count()

17106

String indexing of categorical columns

In [16]:
# Show which columns are still string-typed; those are indexed in the next cell.
data.printSchema()

root
 |-- created_at: double (nullable = true)
 |-- last_move_at: double (nullable = true)
 |-- turns: integer (nullable = true)
 |-- winner: string (nullable = true)
 |-- increment_code: string (nullable = true)
 |-- white_id: string (nullable = true)
 |-- white_rating: integer (nullable = true)
 |-- black_id: string (nullable = true)
 |-- black_rating: integer (nullable = true)
 |-- opening_eco: string (nullable = true)
 |-- opening_name: string (nullable = true)
 |-- opening_ply: integer (nullable = true)
 |-- move1: string (nullable = true)
 |-- move2: string (nullable = true)
 |-- move3: string (nullable = true)
 |-- move4: string (nullable = true)
 |-- move5: string (nullable = true)
 |-- move6: string (nullable = true)
 |-- move7: string (nullable = true)
 |-- move8: string (nullable = true)
 |-- move9: string (nullable = true)
 |-- move10: string (nullable = true)
 |-- move11: string (nullable = true)
 |-- move12: string (nullable = true)
 |-- move13: string (nullable = true)
 

In [17]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
import numpy as np

# Every string-typed column is treated as categorical.
categoricalColumns = [name for name, dtype in data.dtypes if dtype.startswith('string')]

# One StringIndexer per categorical column, writing to "<column>_index".
stages = [
    StringIndexer(inputCol=name, outputCol=name + '_index')
    for name in categoricalColumns
]

# Fit all indexers on the current data and add the index columns.
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(data)
data = pipelineModel.transform(data)

# Only the numeric index columns are kept; the original strings are dropped.
data = data.drop(*categoricalColumns)
data.show()

+----------+------------+-----+------------+------------+-----------+------------+--------------------+--------------+--------------+-----------------+------------------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+
|created_at|last_move_at|turns|white_rating|black_rating|opening_ply|winner_index|increment_code_index|white_id_index|black_id_index|opening_eco_index|opening_name_index|move1_index|move2_index|move3_index|move4_index|move5_index|move6_index|move7_index|move8_index|move9_index|move10_index|move11_index|move12_index|move13_index|move14_index|move15_index|move16_index|move17_index|move18_index|move19_index|move20_index|
+----------+------------+-----+------------+------------+-----------+------------+--------------------+--------------+--------------+-----------------+-----

Minmax Scaler

In [18]:
# Confirm that the string columns have been replaced by their *_index counterparts.
data.printSchema()

root
 |-- created_at: double (nullable = true)
 |-- last_move_at: double (nullable = true)
 |-- turns: integer (nullable = true)
 |-- white_rating: integer (nullable = true)
 |-- black_rating: integer (nullable = true)
 |-- opening_ply: integer (nullable = true)
 |-- winner_index: double (nullable = false)
 |-- increment_code_index: double (nullable = false)
 |-- white_id_index: double (nullable = false)
 |-- black_id_index: double (nullable = false)
 |-- opening_eco_index: double (nullable = false)
 |-- opening_name_index: double (nullable = false)
 |-- move1_index: double (nullable = false)
 |-- move2_index: double (nullable = false)
 |-- move3_index: double (nullable = false)
 |-- move4_index: double (nullable = false)
 |-- move5_index: double (nullable = false)
 |-- move6_index: double (nullable = false)
 |-- move7_index: double (nullable = false)
 |-- move8_index: double (nullable = false)
 |-- move9_index: double (nullable = false)
 |-- move10_index: double (nullable = false)
 |-

In [19]:
from pyspark.ml.feature import MinMaxScaler, VectorAssembler

# Numeric columns to rescale with MinMaxScaler.
columns_to_scale = ["created_at", "last_move_at", "turns", "white_rating", "black_rating", "opening_ply"]

# MinMaxScaler operates on vector columns, so each numeric column is first
# wrapped in its own one-element vector ("<column>_vec") and then scaled into
# "<column>_scaled".
assemblers = [VectorAssembler(inputCols=[name], outputCol=name + "_vec") for name in columns_to_scale]
scalers = [MinMaxScaler(inputCol=name + "_vec", outputCol=name + "_scaled") for name in columns_to_scale]

# NOTE(review): the scalers are fitted on the full dataset, before the
# train/test split performed later in the notebook — confirm this is acceptable.
pipeline = Pipeline(stages=assemblers + scalers)
scalerModel = pipeline.fit(data)
scaledData = scalerModel.transform(data)

# Drop the raw columns and the intermediate vectors; only *_scaled remains.
# The list is derived from columns_to_scale so the two cannot drift apart.
intermediate_columns = [name + "_vec" for name in columns_to_scale]
data = scaledData.drop(*(columns_to_scale + intermediate_columns))
data.show()

+------------+--------------------+--------------+--------------+-----------------+------------------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|winner_index|increment_code_index|white_id_index|black_id_index|opening_eco_index|opening_name_index|move1_index|move2_index|move3_index|move4_index|move5_index|move6_index|move7_index|move8_index|move9_index|move10_index|move11_index|move12_index|move13_index|move14_index|move15_index|move16_index|move17_index|move18_index|move19_index|move20_index|   created_at_scaled| last_move_at_scaled|        turns_scaled| white_rating_scaled| black_rating_scaled|  opening_ply_scaled|
+------------+--------------------+-----

In [20]:
# Row count after indexing and scaling.
data.count()

17106

Train a linear SVM classifier (LinearSVC)

In [21]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [22]:
# Every column except the label ("winner_index") goes into the feature vector.
feature_columns = [name for name in data.columns if name != "winner_index"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [23]:
# Add the assembled "features" vector column to the data.
output = assembler.transform(data)

In [24]:
# Inspect the schema after the "features" column has been added.
output.printSchema()

root
 |-- winner_index: double (nullable = false)
 |-- increment_code_index: double (nullable = false)
 |-- white_id_index: double (nullable = false)
 |-- black_id_index: double (nullable = false)
 |-- opening_eco_index: double (nullable = false)
 |-- opening_name_index: double (nullable = false)
 |-- move1_index: double (nullable = false)
 |-- move2_index: double (nullable = false)
 |-- move3_index: double (nullable = false)
 |-- move4_index: double (nullable = false)
 |-- move5_index: double (nullable = false)
 |-- move6_index: double (nullable = false)
 |-- move7_index: double (nullable = false)
 |-- move8_index: double (nullable = false)
 |-- move9_index: double (nullable = false)
 |-- move10_index: double (nullable = false)
 |-- move11_index: double (nullable = false)
 |-- move12_index: double (nullable = false)
 |-- move13_index: double (nullable = false)
 |-- move14_index: double (nullable = false)
 |-- move15_index: double (nullable = false)
 |-- move16_index: double (nullable 

In [25]:
# Keep only the target and the assembled feature vector, in that order, and
# expose the target under the name "label".
data = output.select(output["winner_index"].alias("label"), "features")

In [26]:
# Remove rows that are identical in both label and features.
data = data.dropDuplicates() # dropping duplicates slows the program down a lot
data.show(truncate=False)

+-----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                                                                               |
+-----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1.0  |[9.0,710.0,0.0,9.0,12.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,2.0,2.0,4.0,5.0,5.0,5.0,42.0,7.0,2.0,54.0,7.0,108.0,0.9945729966490177,0.9945676541756724,0.25914634146341464,0.10010881392818281,0.2816593886462882,0.25925925925925924]

In [27]:
# Row count after removing duplicate (label, features) rows.
data.count()

16762

In [28]:
# 70/30 train/test split. The seed is fixed so that the split, and therefore
# every metric computed from it, is repeatable across runs.
train, test = data.randomSplit([0.7, 0.3], seed=42)

In [29]:
# Linear support-vector classifier, trained on the "label" column for at most
# 50 iterations.
lsvc = LinearSVC(
    labelCol="label",
    maxIter=50,
)

In [30]:
lsvc = lsvc.fit(train)

In [31]:
result = lsvc.transform(test)

In [32]:
# List the columns present in the prediction output.
print(result.columns)

['label', 'features', 'rawPrediction', 'prediction']


In [33]:
# Inspect the types of the prediction output columns.
result.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [34]:
# Preview predictions next to the true labels.
result.show()

+-----+--------------------+--------------------+----------+
|label|            features|       rawPrediction|prediction|
+-----+--------------------+--------------------+----------+
|  0.0|[0.0,4.0,599.0,1....|[-0.4237595776060...|       1.0|
|  0.0|[0.0,4.0,639.0,4....|[-0.6364696299870...|       1.0|
|  0.0|[0.0,10.0,1533.0,...|[-0.1545761433105...|       1.0|
|  0.0|[0.0,10.0,1872.0,...|[-0.5918704472065...|       1.0|
|  0.0|[0.0,12.0,1225.0,...|[-0.3293647210588...|       1.0|
|  0.0|[0.0,13.0,1362.0,...|[-0.6394320518151...|       1.0|
|  0.0|[0.0,13.0,1495.0,...|[0.62477517758835...|       0.0|
|  0.0|[0.0,13.0,1678.0,...|[2.20660771253204...|       0.0|
|  0.0|[0.0,14.0,824.0,1...|[0.01182897681639...|       0.0|
|  0.0|[0.0,18.0,2747.0,...|[0.96610359791955...|       0.0|
|  0.0|[0.0,19.0,1222.0,...|[0.23376584920398...|       0.0|
|  0.0|[0.0,19.0,1553.0,...|[-0.1453194509006...|       1.0|
|  0.0|[0.0,20.0,766.0,8...|[-1.0725172959374...|       1.0|
|  0.0|[0.0,21.0,5784.0,

In [35]:
# Project the predictions and true labels into their own DataFrame.
# NOTE(review): no later visible cell references `predictionAndLabels`; the
# evaluator below is given `result` directly — confirm whether this is still needed.
predictionAndLabels = result.select("prediction", "label")

In [36]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# metricName defaults to "f1". Accuracy is requested explicitly because the
# result is stored in `accuracy` and printed as "Test set accuracy" below.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

In [None]:
# Score the test-set predictions with the evaluator configured in the previous cell.
accuracy = evaluator.evaluate(result)

In [None]:
# Report the score computed in the previous cell.
print("Test set accuracy = " + str(accuracy))