In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [3]:
import os

In [4]:
data = spark.read.option("mode", 'DROPMALFORMED').csv("games.csv", header=True, inferSchema=True)

In [5]:
data.printSchema()

root
 |-- id: string (nullable = true)
 |-- rated: boolean (nullable = true)
 |-- created_at: double (nullable = true)
 |-- last_move_at: double (nullable = true)
 |-- turns: integer (nullable = true)
 |-- victory_status: string (nullable = true)
 |-- winner: string (nullable = true)
 |-- increment_code: string (nullable = true)
 |-- white_id: string (nullable = true)
 |-- white_rating: integer (nullable = true)
 |-- black_id: string (nullable = true)
 |-- black_rating: integer (nullable = true)
 |-- moves: string (nullable = true)
 |-- opening_eco: string (nullable = true)
 |-- opening_name: string (nullable = true)
 |-- opening_ply: integer (nullable = true)



In [6]:
data.count()

20058

Check for null values

In [7]:
from pyspark.sql.functions import isnan, when, count, col
data.select([count(when(col(c).isNull(),c)).alias(c) for c in data.columns]).show()

+---+-----+----------+------------+-----+--------------+------+--------------+--------+------------+--------+------------+-----+-----------+------------+-----------+
| id|rated|created_at|last_move_at|turns|victory_status|winner|increment_code|white_id|white_rating|black_id|black_rating|moves|opening_eco|opening_name|opening_ply|
+---+-----+----------+------------+-----+--------------+------+--------------+--------+------------+--------+------------+-----+-----------+------------+-----------+
|  0|    0|         0|           0|    0|             0|     0|             0|       0|           0|       0|           0|    0|          0|           0|          0|
+---+-----+----------+------------+-----+--------------+------+--------------+--------+------------+--------+------------+-----+-----------+------------+-----------+



Remove rows where players drawed

In [8]:
data.createOrReplaceTempView("data")

In [9]:
data = spark.sql("SELECT * FROM data WHERE winner != 'draw'")

In [10]:
data.count()

19108

Remove unnecessary columns and rows

In [11]:
data = data.drop("id","victory_status")
data.count()

19108

One Hot Encoding

In [12]:
data.printSchema()

root
 |-- rated: boolean (nullable = true)
 |-- created_at: double (nullable = true)
 |-- last_move_at: double (nullable = true)
 |-- turns: integer (nullable = true)
 |-- winner: string (nullable = true)
 |-- increment_code: string (nullable = true)
 |-- white_id: string (nullable = true)
 |-- white_rating: integer (nullable = true)
 |-- black_id: string (nullable = true)
 |-- black_rating: integer (nullable = true)
 |-- moves: string (nullable = true)
 |-- opening_eco: string (nullable = true)
 |-- opening_name: string (nullable = true)
 |-- opening_ply: integer (nullable = true)



In [13]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
categoricalColumns = [item[0] for item in data.dtypes if item[1].startswith('string') ]

stages = []

for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol = categoricalCol, outputCol = categoricalCol + '_index')
    stages += [stringIndexer]

pipeline = Pipeline(stages = stages)

pipelineModel = pipeline.fit(data)
data = pipelineModel.transform(data)

data = data.drop("winner", "increment_code", "white_id", "black_id", "moves", "opening_eco", "opening_name")
data.show()

+-----+----------+------------+-----+------------+------------+-----------+------------+--------------------+--------------+--------------+-----------+-----------------+------------------+
|rated|created_at|last_move_at|turns|white_rating|black_rating|opening_ply|winner_index|increment_code_index|white_id_index|black_id_index|moves_index|opening_eco_index|opening_name_index|
+-----+----------+------------+-----+------------+------------+-----------+------------+--------------------+--------------+--------------+-----------+-----------------+------------------+
|false|1.50421E12|  1.50421E12|   13|        1500|        1191|          5|         0.0|                21.0|        3068.0|        1114.0|     4910.0|             44.0|             226.0|
| true|1.50413E12|  1.50413E12|   16|        1322|        1261|          4|         1.0|                16.0|        2131.0|        7830.0|     2601.0|              7.0|             613.0|
| true|1.50413E12|  1.50413E12|   61|        1496|     

In [14]:
encoder = OneHotEncoder(inputCols=["increment_code_index", "white_id_index", "black_id_index", "moves_index", "opening_eco_index", "opening_name_index"],
                                 outputCols=["increment_code_encoded", "white_id_encoded", "black_id_encoded", "moves_encoded", "opening_eco_encoded", "opening_name_encoded"])
model = encoder.fit(data)
data = model.transform(data)
data = data.drop("increment_code_index", "white_id_index", "black_id_index", "moves_index", "opening_eco_index", "opening_name_index")
data.show()

+-----+----------+------------+-----+------------+------------+-----------+------------+----------------------+-------------------+-------------------+--------------------+-------------------+--------------------+
|rated|created_at|last_move_at|turns|white_rating|black_rating|opening_ply|winner_index|increment_code_encoded|   white_id_encoded|   black_id_encoded|       moves_encoded|opening_eco_encoded|opening_name_encoded|
+-----+----------+------------+-----+------------+------------+-----------+------------+----------------------+-------------------+-------------------+--------------------+-------------------+--------------------+
|false|1.50421E12|  1.50421E12|   13|        1500|        1191|          5|         0.0|      (393,[21],[1.0])|(9105,[3068],[1.0])|(8974,[1114],[1.0])|(18034,[4910],[1.0])|   (358,[44],[1.0])|  (1452,[226],[1.0])|
| true|1.50413E12|  1.50413E12|   16|        1322|        1261|          4|         1.0|      (393,[16],[1.0])|(9105,[2131],[1.0])|(8974,[7830],

Minmax Scaler

In [15]:
data.printSchema()

root
 |-- rated: boolean (nullable = true)
 |-- created_at: double (nullable = true)
 |-- last_move_at: double (nullable = true)
 |-- turns: integer (nullable = true)
 |-- white_rating: integer (nullable = true)
 |-- black_rating: integer (nullable = true)
 |-- opening_ply: integer (nullable = true)
 |-- winner_index: double (nullable = false)
 |-- increment_code_encoded: vector (nullable = true)
 |-- white_id_encoded: vector (nullable = true)
 |-- black_id_encoded: vector (nullable = true)
 |-- moves_encoded: vector (nullable = true)
 |-- opening_eco_encoded: vector (nullable = true)
 |-- opening_name_encoded: vector (nullable = true)



In [16]:
from pyspark.ml.feature import MinMaxScaler, VectorAssembler

columns_to_scale = ["rated", "created_at", "last_move_at", "turns", "white_rating", "black_rating", "opening_ply"]
assemblers = [VectorAssembler(inputCols=[col], outputCol=col+"_vec") for col in columns_to_scale]
scalers = [MinMaxScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_scale]
pipeline = Pipeline(stages=assemblers + scalers)
scalerModel = pipeline.fit(data)
scaledData = scalerModel.transform(data)

data = scaledData.drop("rated", "created_at", "last_move_at", "turns", "white_rating", "black_rating", "opening_ply", "rated_vec", "created_at_vec", "last_move_at_vec", "turns_vec", "white_rating_vec", "black_rating_vec", "opening_ply_vec")
data.show()

+------------+----------------------+-------------------+-------------------+--------------------+-------------------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|winner_index|increment_code_encoded|   white_id_encoded|   black_id_encoded|       moves_encoded|opening_eco_encoded|opening_name_encoded|rated_scaled|   created_at_scaled| last_move_at_scaled|        turns_scaled| white_rating_scaled| black_rating_scaled|  opening_ply_scaled|
+------------+----------------------+-------------------+-------------------+--------------------+-------------------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|         0.0|      (393,[21],[1.0])|(9105,[3068],[1.0])|(8974,[1114],[1.0])|(18034,[4910],[1.0])|   (358,[44],[1.0])|  (1452,[226],[1.0])|       [0.0]|[0.99778311

In [17]:
data.count()

19108

Implement Neural Network

In [18]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [19]:
assembler = VectorAssembler(inputCols=[s for s in data.columns if s != "winner_index"], outputCol="features")

In [20]:
output = assembler.transform(data)

In [21]:
output.printSchema()

root
 |-- winner_index: double (nullable = false)
 |-- increment_code_encoded: vector (nullable = true)
 |-- white_id_encoded: vector (nullable = true)
 |-- black_id_encoded: vector (nullable = true)
 |-- moves_encoded: vector (nullable = true)
 |-- opening_eco_encoded: vector (nullable = true)
 |-- opening_name_encoded: vector (nullable = true)
 |-- rated_scaled: vector (nullable = true)
 |-- created_at_scaled: vector (nullable = true)
 |-- last_move_at_scaled: vector (nullable = true)
 |-- turns_scaled: vector (nullable = true)
 |-- white_rating_scaled: vector (nullable = true)
 |-- black_rating_scaled: vector (nullable = true)
 |-- opening_ply_scaled: vector (nullable = true)
 |-- features: vector (nullable = true)



In [22]:
output.show()

+------------+----------------------+-------------------+-------------------+--------------------+-------------------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|winner_index|increment_code_encoded|   white_id_encoded|   black_id_encoded|       moves_encoded|opening_eco_encoded|opening_name_encoded|rated_scaled|   created_at_scaled| last_move_at_scaled|        turns_scaled| white_rating_scaled| black_rating_scaled|  opening_ply_scaled|            features|
+------------+----------------------+-------------------+-------------------+--------------------+-------------------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|         0.0|      (393,[21],[1.0])|(9105,[3068],[1.0])|(8974,[1114],[1.0])|(18034,[4910],[1.0])|  

In [23]:
data = output.drop("increment_code_encoded", "white_id_encoded", "black_id_encoded", "moves_encoded", "opening_eco_encoded", "opening_name_encoded", "rated_scaled", "created_at_scaled", "last_move_at_scaled", "turns_scaled", "white_rating_scaled", "black_rating_scaled", "opening_ply_scaled")
data = data.withColumnRenamed("winner_index", "label")

In [24]:
data = data.dropDuplicates() #dropping duplicates slow program down a lot
data.show(truncate=False)

+-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                                                              |
+-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1.0  |(38323,[10,625,12484,23411,36550,37000,38316,38317,38318,38319,38320,38321,38322],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9865868812408801,0.9865815775447262,0.14942528735632182,0.14196242171189977,0.18976215098241986,0.1111111111111111]) |
|0.0  |(38323,[2,652,15677,30989,36515,36902

In [25]:
data.head(1)

[Row(label=0.0, features=SparseVector(38323, {21: 1.0, 3461: 1.0, 10612: 1.0, 23382: 1.0, 36550: 1.0, 37090: 1.0, 38317: 0.9978, 38318: 0.9978, 38319: 0.0345, 38320: 0.3737, 38321: 0.2079, 38322: 0.1481}))]

In [26]:
[train, test] = data.randomSplit([0.7, 0.3], 1234)

In [27]:
lsvc = LinearSVC(labelCol="label", maxIter=50)

In [28]:
lsvc = lsvc.fit(train)

In [29]:
result = lsvc.transform(test)

In [30]:
print(result.columns)

['label', 'features', 'rawPrediction', 'prediction']


In [31]:
result.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [32]:
result.show()

+-----+--------------------+--------------------+----------+
|label|            features|       rawPrediction|prediction|
+-----+--------------------+--------------------+----------+
|  0.0|(38323,[0,393,103...|[0.31013126698247...|       0.0|
|  0.0|(38323,[0,393,105...|[1.4429388202881,...|       0.0|
|  0.0|(38323,[0,395,101...|[1.42287742265103...|       0.0|
|  0.0|(38323,[0,398,147...|[0.26456509899997...|       0.0|
|  0.0|(38323,[0,401,108...|[-0.0923372023441...|       1.0|
|  0.0|(38323,[0,401,108...|[-0.0923396989898...|       1.0|
|  0.0|(38323,[0,401,111...|[-0.3621603456916...|       1.0|
|  0.0|(38323,[0,401,111...|[-0.3621634433874...|       1.0|
|  0.0|(38323,[0,401,111...|[3.25695970120836...|       0.0|
|  0.0|(38323,[0,401,115...|[3.13602125890288...|       0.0|
|  0.0|(38323,[0,405,108...|[2.63381399687083...|       0.0|
|  0.0|(38323,[0,406,109...|[0.59306810728625...|       0.0|
|  0.0|(38323,[0,410,149...|[2.05261584038535...|       0.0|
|  0.0|(38323,[0,412,109

In [33]:
predictionAndLabels = result.select("prediction", "label")

In [34]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

In [35]:
accuracy = evaluator.evaluate(result)

In [36]:
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.6165868108835055
