In [1]:
# Locate the local Spark installation and make PySpark importable
# before any pyspark module is imported below.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session that runs locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [3]:
import os

In [4]:
# Load the games CSV with a header row and inferred column types.
# In DROPMALFORMED mode, rows that cannot be parsed are dropped silently.
csv_reader = spark.read.option("mode", 'DROPMALFORMED')
data = csv_reader.csv("games.csv", header=True, inferSchema=True)

In [5]:
# Inspect the column names and the types inferred from the CSV.
data.printSchema()

root
 |-- id: string (nullable = true)
 |-- rated: boolean (nullable = true)
 |-- created_at: double (nullable = true)
 |-- last_move_at: double (nullable = true)
 |-- turns: integer (nullable = true)
 |-- victory_status: string (nullable = true)
 |-- winner: string (nullable = true)
 |-- increment_code: string (nullable = true)
 |-- white_id: string (nullable = true)
 |-- white_rating: integer (nullable = true)
 |-- black_id: string (nullable = true)
 |-- black_rating: integer (nullable = true)
 |-- moves: string (nullable = true)
 |-- opening_eco: string (nullable = true)
 |-- opening_name: string (nullable = true)
 |-- opening_ply: integer (nullable = true)



In [6]:
# Number of rows loaded, before any filtering.
data.count()

20058

Check for null values

In [7]:
from pyspark.sql.functions import isnan, when, count, col

# Count missing values per column. A value counts as missing when it is NULL
# or, for floating-point columns, NaN (isNull() alone does not catch NaN).
# isnan() is applied only to float/double columns because it is not defined
# for every column type (e.g. boolean).
float_columns = {name for name, dtype in data.dtypes if dtype in ("float", "double")}


def _is_missing(name):
    """Return a boolean Column that is true where column `name` is NULL, or NaN for float columns."""
    missing = col(name).isNull()
    if name in float_columns:
        missing = missing | isnan(col(name))
    return missing


# when(...) without otherwise() yields NULL for non-matching rows, and count()
# skips NULLs, so each output cell is the number of missing values in that column.
data.select([count(when(_is_missing(c), c)).alias(c) for c in data.columns]).show()

+---+-----+----------+------------+-----+--------------+------+--------------+--------+------------+--------+------------+-----+-----------+------------+-----------+
| id|rated|created_at|last_move_at|turns|victory_status|winner|increment_code|white_id|white_rating|black_id|black_rating|moves|opening_eco|opening_name|opening_ply|
+---+-----+----------+------------+-----+--------------+------+--------------+--------+------------+--------+------------+-----+-----------+------------+-----------+
|  0|    0|         0|           0|    0|             0|     0|             0|       0|           0|       0|           0|    0|          0|           0|          0|
+---+-----+----------+------------+-----+--------------+------+--------------+--------+------------+--------+------------+-----+-----------+------------+-----------+



Remove rows where the game ended in a draw

In [8]:
# Register the DataFrame as a temporary view named "data" so it can be queried with Spark SQL.
data.createOrReplaceTempView("data")

In [9]:
# Keep only decisive games: every row whose winner is 'draw' is filtered out.
decisive_games_sql = "SELECT * FROM data WHERE winner != 'draw'"
data = spark.sql(decisive_games_sql)

In [10]:
# Rows remaining after the draw filter.
data.count()

19108

Remove unnecessary columns

In [11]:
# Columns that are not used as model features.
unused_columns = ["id", "victory_status", "rated"]
data = data.drop(*unused_columns)

# Dropping columns does not remove rows; display the count to confirm.
data.count()

19108

String Indexing and One-Hot Encoding

In [12]:
# Schema after dropping the unused columns; the string columns are indexed in the next cell.
data.printSchema()

root
 |-- created_at: double (nullable = true)
 |-- last_move_at: double (nullable = true)
 |-- turns: integer (nullable = true)
 |-- winner: string (nullable = true)
 |-- increment_code: string (nullable = true)
 |-- white_id: string (nullable = true)
 |-- white_rating: integer (nullable = true)
 |-- black_id: string (nullable = true)
 |-- black_rating: integer (nullable = true)
 |-- moves: string (nullable = true)
 |-- opening_eco: string (nullable = true)
 |-- opening_name: string (nullable = true)
 |-- opening_ply: integer (nullable = true)



In [13]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
import numpy as np

# Every string-typed column is treated as categorical.
categoricalColumns = [name for name, dtype in data.dtypes if dtype.startswith('string')]

# One StringIndexer per categorical column, writing to "<column>_index".
# With its default ordering, StringIndexer assigns index 0.0 to the most
# frequent label, 1.0 to the next most frequent, and so on.
stages = [
    StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + '_index')
    for categoricalCol in categoricalColumns
]

pipeline = Pipeline(stages=stages)

# NOTE(review): the indexers are fitted on the full dataset, before any
# train/test split is made.
pipelineModel = pipeline.fit(data)
data = pipelineModel.transform(data)

# Drop the original string columns now that indexed versions exist.
# DataFrame.drop accepts plain column names, so no NumPy conversion is needed.
data = data.drop(*categoricalColumns)
data.show()

+----------+------------+-----+------------+------------+-----------+------------+--------------------+--------------+--------------+-----------+-----------------+------------------+
|created_at|last_move_at|turns|white_rating|black_rating|opening_ply|winner_index|increment_code_index|white_id_index|black_id_index|moves_index|opening_eco_index|opening_name_index|
+----------+------------+-----+------------+------------+-----------+------------+--------------------+--------------+--------------+-----------+-----------------+------------------+
|1.50421E12|  1.50421E12|   13|        1500|        1191|          5|         0.0|                21.0|        3068.0|        1114.0|     4910.0|             44.0|             226.0|
|1.50413E12|  1.50413E12|   16|        1322|        1261|          4|         1.0|                16.0|        2131.0|        7830.0|     2601.0|              7.0|             613.0|
|1.50413E12|  1.50413E12|   61|        1496|        1500|          3|         0.0|   

In [14]:
# Categorical features to one-hot encode. winner_index is deliberately not
# listed: it stays a plain numeric column.
encoded_feature_names = ["increment_code", "white_id", "black_id", "moves", "opening_eco", "opening_name"]
index_columns = [name + "_index" for name in encoded_feature_names]
encoded_columns = [name + "_encoded" for name in encoded_feature_names]

# A single OneHotEncoder handles all columns: "<name>_index" -> "<name>_encoded".
encoder = OneHotEncoder(inputCols=index_columns, outputCols=encoded_columns)
model = encoder.fit(data)
data = model.transform(data)

# The index columns are no longer needed once the encoded vectors exist.
data = data.drop(*index_columns)
data.show()

+----------+------------+-----+------------+------------+-----------+------------+----------------------+-------------------+-------------------+--------------------+-------------------+--------------------+
|created_at|last_move_at|turns|white_rating|black_rating|opening_ply|winner_index|increment_code_encoded|   white_id_encoded|   black_id_encoded|       moves_encoded|opening_eco_encoded|opening_name_encoded|
+----------+------------+-----+------------+------------+-----------+------------+----------------------+-------------------+-------------------+--------------------+-------------------+--------------------+
|1.50421E12|  1.50421E12|   13|        1500|        1191|          5|         0.0|      (393,[21],[1.0])|(9105,[3068],[1.0])|(8974,[1114],[1.0])|(18034,[4910],[1.0])|   (358,[44],[1.0])|  (1452,[226],[1.0])|
|1.50413E12|  1.50413E12|   16|        1322|        1261|          4|         1.0|      (393,[16],[1.0])|(9105,[2131],[1.0])|(8974,[7830],[1.0])|(18034,[2601],[1.0])|  

Minmax Scaler

In [15]:
# Schema after one-hot encoding; the remaining numeric columns are scaled in the next cell.
data.printSchema()

root
 |-- created_at: double (nullable = true)
 |-- last_move_at: double (nullable = true)
 |-- turns: integer (nullable = true)
 |-- white_rating: integer (nullable = true)
 |-- black_rating: integer (nullable = true)
 |-- opening_ply: integer (nullable = true)
 |-- winner_index: double (nullable = false)
 |-- increment_code_encoded: vector (nullable = true)
 |-- white_id_encoded: vector (nullable = true)
 |-- black_id_encoded: vector (nullable = true)
 |-- moves_encoded: vector (nullable = true)
 |-- opening_eco_encoded: vector (nullable = true)
 |-- opening_name_encoded: vector (nullable = true)



In [16]:
from pyspark.ml.feature import MinMaxScaler, VectorAssembler

# Numeric columns to rescale (MinMaxScaler's default output range is [0, 1]).
columns_to_scale = ["created_at", "last_move_at", "turns", "white_rating", "black_rating", "opening_ply"]

# MinMaxScaler operates on vector columns, so each numeric column is first
# wrapped in a one-element vector ("<name>_vec") and then scaled into
# "<name>_scaled".
assemblers = [VectorAssembler(inputCols=[name], outputCol=name + "_vec") for name in columns_to_scale]
scalers = [MinMaxScaler(inputCol=name + "_vec", outputCol=name + "_scaled") for name in columns_to_scale]
pipeline = Pipeline(stages=assemblers + scalers)
scalerModel = pipeline.fit(data)
scaledData = scalerModel.transform(data)

# Drop the raw columns and the intermediate "<name>_vec" columns. The list is
# derived from columns_to_scale so it cannot drift out of sync with it.
intermediate_columns = [name + "_vec" for name in columns_to_scale]
data = scaledData.drop(*columns_to_scale, *intermediate_columns)
data.show()

+------------+----------------------+-------------------+-------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|winner_index|increment_code_encoded|   white_id_encoded|   black_id_encoded|       moves_encoded|opening_eco_encoded|opening_name_encoded|   created_at_scaled| last_move_at_scaled|        turns_scaled| white_rating_scaled| black_rating_scaled|  opening_ply_scaled|
+------------+----------------------+-------------------+-------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|         0.0|      (393,[21],[1.0])|(9105,[3068],[1.0])|(8974,[1114],[1.0])|(18034,[4910],[1.0])|   (358,[44],[1.0])|  (1452,[226],[1.0])|[0.9977831158695807]|[0.9977777724801767]|[0.03448275862068...|

Train a Random Forest classifier

In [17]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [18]:
# Every column except the target goes into a single "features" vector.
feature_columns = [name for name in data.columns if name != "winner_index"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [19]:
# Append the assembled "features" vector column to the DataFrame.
output = assembler.transform(data)

In [20]:
# Confirm that the "features" vector column was added.
output.printSchema()

root
 |-- winner_index: double (nullable = false)
 |-- increment_code_encoded: vector (nullable = true)
 |-- white_id_encoded: vector (nullable = true)
 |-- black_id_encoded: vector (nullable = true)
 |-- moves_encoded: vector (nullable = true)
 |-- opening_eco_encoded: vector (nullable = true)
 |-- opening_name_encoded: vector (nullable = true)
 |-- created_at_scaled: vector (nullable = true)
 |-- last_move_at_scaled: vector (nullable = true)
 |-- turns_scaled: vector (nullable = true)
 |-- white_rating_scaled: vector (nullable = true)
 |-- black_rating_scaled: vector (nullable = true)
 |-- opening_ply_scaled: vector (nullable = true)
 |-- features: vector (nullable = true)



In [21]:
# Preview the first rows, including the assembled feature vector.
output.show()

+------------+----------------------+-------------------+-------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|winner_index|increment_code_encoded|   white_id_encoded|   black_id_encoded|       moves_encoded|opening_eco_encoded|opening_name_encoded|   created_at_scaled| last_move_at_scaled|        turns_scaled| white_rating_scaled| black_rating_scaled|  opening_ply_scaled|            features|
+------------+----------------------+-------------------+-------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|         0.0|      (393,[21],[1.0])|(9105,[3068],[1.0])|(8974,[1114],[1.0])|(18034,[4910],[1.0])|   (358,[44],[1.0])|  (1452,[226],[1.0])|

In [22]:
# Keep only the target and the assembled feature vector, and rename the
# target to "label". Selecting the two wanted columns directly replaces the
# earlier approach of listing every other column and dropping it.
data = output.select("winner_index", "features").withColumnRenamed("winner_index", "label")

In [23]:
# Remove rows that are exact duplicates of another row.
data = data.dropDuplicates() # NOTE: dropping duplicates slows the program down a lot
# truncate=False prints the full contents of each cell instead of a shortened preview.
data.show(truncate=False)

+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                                                    |
+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0  |(38322,[0,7878,9867,32675,36562,37043,38316,38317,38318,38319,38320,38321],[1.0,1.0,1.0,1.0,1.0,1.0,0.9957474368461806,0.9957421006737311,0.25287356321839083,0.42797494780793316,0.4265770423991727,0.25925925925925924])  |
|1.0  |(38322,[0,5607,11908,35078,36507,36868,38316,38317,38318,38319,38320,38321],[

In [24]:
# Rows remaining after duplicate removal.
data.count()

18711

In [25]:
# 70/30 train/test split. The weights are approximate: each row is assigned
# at random, so split sizes vary slightly. A fixed seed keeps the split
# reproducible across runs.
[train, test] = data.randomSplit([0.7, 0.3], seed=42)

In [26]:
# Random forest with 10 trees. The seed is fixed so that the forest's
# randomness is repeatable between runs.
rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=10, seed=42)

In [27]:
# Train the random forest on the training split.
model = rf.fit(train)

In [28]:
# Score the held-out test split; the result carries the model's output columns alongside the inputs.
result = model.transform(test)

In [29]:
# List the columns of the scored DataFrame.
print(result.columns)

['label', 'features', 'rawPrediction', 'probability', 'prediction']


In [30]:
# Show the types of the scored DataFrame's columns.
result.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [None]:
# Preview the first scored rows.
result.show()

In [None]:
# Narrow the scored rows to the predicted and true labels.
# NOTE(review): this variable is not referenced by the cells that follow here — confirm whether it is still needed.
predictionAndLabels = result.select("prediction", "label")

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# metricName is set explicitly: the evaluator's default metric is "f1", but
# the score computed from it is stored and reported as accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

In [None]:
# Compute the evaluator's configured metric over the scored test rows.
accuracy = evaluator.evaluate(result)

In [None]:
# Report the score computed on the test split.
print(f"Test set accuracy = {accuracy}")