## Import libraries

In [1]:
# pyspark libraries
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, FloatType, ArrayType
from pyspark.sql.functions import col, udf, when
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.classification import MultilayerPerceptronClassifier, MultilayerPerceptronClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml.tuning import ParamGridBuilder

# python libraries
import numpy as np

## User Defined Functions

In [2]:
def convert_string_to_float(x):
    x_replace_minus = x.replace(u'\u2212', '-')
    if x_replace_minus == '-':
        return np.nan
    else:
        return float(x_replace_minus)

udf_convert_string_to_float = udf(lambda x: convert_string_to_float(x), FloatType())

In [3]:
udf_get_percentage_game = udf(lambda x, y: x / y, FloatType())

In [4]:
udf_create_features = udf(lambda s,t,u,v,w,x,y,z: Vectors.dense([s,t,u,v,w,x,y,z]), VectorUDT())

In [5]:
def get_date_string(date, month, year):
    return year + "/" + month + "/" + date

udf_get_date_string = udf(lambda date, month, year: get_date_string(date, month, year), StringType())

In [6]:
def win_team_1(score_team_1, score_team_2):
    if score_team_1 > score_team_2:
        return 2.0
    elif score_team_1 < score_team_2:
        return 1.0
    else:
        return 0.0
    
udf_win_team_1 = udf(lambda team_1, team_2: win_team_1(team_1, team_2), FloatType())

In [7]:
udf_diff_features = udf(lambda features_1, features_2: features_1 - features_2, VectorUDT())

In [8]:
schema = StructType([
    StructField("rankGroup_local", StringType(), True),
    StructField("rankGroup_global", StringType(), True),
    StructField("teamGroup_team", StringType(), True),
    StructField("ratingGroup_rating", StringType(), True),
    StructField("highestGroup_rank_max", StringType(), True),
    StructField("highestGroup_rating_max", StringType(), True),
    StructField("averageGroup_rank_avg", StringType(), True),
    StructField("averageGroup_rating_avg", StringType(), True),
    StructField("lowestGroup_rank_min", StringType(), True),
    StructField("lowestGroup_rating_min", StringType(), True),
    StructField("change3mGroup_rank_three_month_change", StringType(), True),
    StructField("change3mGroup_rating_three_month_change", StringType(), True),
    StructField("change6mGroup_rank_six_month_change", StringType(), True),
    StructField("change6mGroup_rating_six_month_change", StringType(), True),
    StructField("change1yGroup_rank_one_year_change", StringType(), True),
    StructField("change1yGroup_rating_one_year_change", StringType(), True),
    StructField("change2yGroup_rank_two_year_change", StringType(), True),
    StructField("change2yGroup_rating_two_year_change", StringType(), True),
    StructField("change5yGroup_rank_five_year_change", StringType(), True),
    StructField("change5yGroup_rating_five_year_change", StringType(), True),
    StructField("change10yGroup_rank_ten_year_change", StringType(), True),
    StructField("change10yGroup_rating_ten_year_change", StringType(), True),
    StructField("matchesGroup_total", StringType(), True),
    StructField("matchesGroup_home", StringType(), True),
    StructField("matchesGroup_away", StringType(), True),
    StructField("matchesGroup_neutral", StringType(), True),
    StructField("matchesGroup_wins", StringType(), True),
    StructField("matchesGroup_losses", StringType(), True),
    StructField("matchesGroup_draws", StringType(), True),
    StructField("goalsGroup_for", StringType(), True),
    StructField("goalsGroup_against", StringType(), True)
])

names_to_convert = schema.names
names_to_convert.remove("teamGroup_team")


AFC_qualifying_start = spark.read.csv("../data/AFC/2014_World_Cup_AFC_qualifying_start.tsv", sep="\t", 
                                      schema=schema, header=False)\
                                 .select([udf_convert_string_to_float(col(name)).alias(name) for name in names_to_convert] + ["teamGroup_team"])

In [9]:
AFC_qualifying_start = AFC_qualifying_start\
.withColumn("matches_home",    udf_get_percentage_game(col("matchesGroup_home"), col("matchesGroup_total")))\
.withColumn("matches_away",    udf_get_percentage_game(col("matchesGroup_away"), col("matchesGroup_total")))\
.withColumn("matches_neutral", udf_get_percentage_game(col("matchesGroup_neutral"), col("matchesGroup_total")))\
.withColumn("matches_wins",    udf_get_percentage_game(col("matchesGroup_wins"), col("matchesGroup_total")))\
.withColumn("matches_losses",  udf_get_percentage_game(col("matchesGroup_losses"), col("matchesGroup_total")))\
.withColumn("matches_draws",  udf_get_percentage_game(col("matchesGroup_draws"), col("matchesGroup_total")))\
.withColumn("matches_for",    udf_get_percentage_game(col("goalsGroup_for"), col("matchesGroup_total")))\
.withColumn("matches_against",  udf_get_percentage_game(col("goalsGroup_against"), col("matchesGroup_total")))\
.select(col("teamGroup_team").alias("team"), col("matches_home"), col("matches_away"), col("matches_neutral"), 
        col("matches_wins"), col("matches_losses"), col("matches_draws"),
        col("matches_for"), col("matches_against"))

AFC_qualifying_start.show(5)


+----+------------+------------+---------------+------------+--------------+-------------+-----------+---------------+
|team|matches_home|matches_away|matches_neutral|matches_wins|matches_losses|matches_draws|matches_for|matches_against|
+----+------------+------------+---------------+------------+--------------+-------------+-----------+---------------+
|  JP|  0.37785017|    0.252443|     0.36970684|  0.45114008|    0.32899022|    0.2198697|  1.6905538|       1.223127|
|  KR|  0.31050768|  0.23966943|      0.4498229|   0.5478158|    0.20070839|    0.2514758|   1.853601|      0.9020071|
|  AU|   0.4437086|   0.3620309|      0.1942605|   0.5121413|    0.27593818|   0.21192053|   2.039735|      1.1037527|
|  IR|        0.34|       0.278|          0.382|       0.546|         0.218|        0.236|       1.87|          0.826|
|  CN|  0.33032492|  0.33754513|     0.33212996|  0.51805055|    0.27436823|   0.20758122|  1.9801444|      1.0361011|
+----+------------+------------+---------------+

In [10]:
AFC_qualifying_start = AFC_qualifying_start\
.withColumn("features", udf_create_features(col("matches_home"), col("matches_away"), col("matches_neutral"),
                                            col("matches_wins"), col("matches_losses"), col("matches_draws"),
                                            col("matches_for"),  col("matches_against")))\
.select("team", "features")

# AFC_qualifying_start.show(5, truncate=True)

In [11]:
Vectors.dense([1,2,3,5])

DenseVector([1.0, 2.0, 3.0, 5.0])

In [12]:
schema = StructType([
    StructField("year", StringType(), True),
    StructField("month", StringType(), True),
    StructField("date", StringType(), True),
    StructField("team_1", StringType(), True),
    StructField("team_2", StringType(), True),
    StructField("score_team_1", IntegerType(), True),
    StructField("score_team_2", IntegerType(), True),
    StructField("tournament", StringType(), True),
    StructField("country_played", StringType(), True),
    StructField("rating_moved", StringType(), True),
    StructField("rating_team_1", StringType(), True),
    StructField("rating_team_2", StringType(), True),
    StructField("rank_moved_team_1", StringType(), True),
    StructField("rank_moved_team_2", StringType(), True),
    StructField("rank_team_1", StringType(), True),
    StructField("rank_team_2", StringType(), True)
])

AFC_qualifying_results = spark.read.csv("../data/AFC/2014_World_Cup_AFC_qualifying_results.tsv", sep="\t", 
                                        schema=schema, header=False)\
                              .withColumn("new_date", udf_get_date_string(col("date"), col("month"), col("year")))\
                              .drop("date").drop("month").drop("year").withColumnRenamed("new_date", "date")

names_to_convert = AFC_qualifying_results.schema.names
names_to_remove = ["date",  "team_1", "team_2", "score_team_1", "score_team_2", "tournament", "country_played"]
for name in names_to_remove: names_to_convert.remove(name)


AFC_qualifying_results = AFC_qualifying_results\
                         .select([udf_convert_string_to_float(col(name)).alias(name) for name in names_to_convert] + names_to_remove)\
                         .select("team_1", "team_2", "score_team_1", "score_team_2")\
                         .withColumn("label", udf_win_team_1(col("score_team_1"), col("score_team_2")))\
                         .select("team_1", "team_2", "label")    

In [13]:
data = AFC_qualifying_results.join(AFC_qualifying_start, AFC_qualifying_results.team_1 == AFC_qualifying_start.team)\
.withColumnRenamed("features", "features_1").drop("team")\
.join(AFC_qualifying_start, AFC_qualifying_results.team_2 == AFC_qualifying_start.team)\
.withColumnRenamed("features", "features_2").drop("team")\
.withColumn("features", udf_diff_features(col("features_1"), col("features_2")))\
.select("label", "features")

In [14]:
data.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  2.0|[-0.0924882665276...|
|  2.0|[0.02913619577884...|
|  2.0|[0.08815789222717...|
|  0.0|[-0.0671749860048...|
|  2.0|[-0.1738087832927...|
+-----+--------------------+
only showing top 5 rows



## Multi-layer Perceptron

### Define estimator

In [20]:
multi_layer = MultilayerPerceptronClassifier(featuresCol="features", labelCol="label")

### Define evaluator

In [21]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label", metricName="accuracy")

### Define grid parameter

In [26]:
grid = ParamGridBuilder()\
.addGrid(multi_layer.layers, [[8, 5, 3]])\
.build()

In [27]:
grid

[{Param(parent=u'MultilayerPerceptronClassifier_4d099aeb4e53af520785', name='layers', doc='Sizes of layers from input layer to output layer E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 neurons and output layer of 10 neurons.'): [8,
   5,
   3]}]

### Defined Cross Validator

In [28]:
cv = CrossValidator(estimator=multi_layer, estimatorParamMaps=grid, evaluator=evaluator, numFolds=4)

In [29]:
cv_model = cv.fit(data)

In [23]:
prediction = cv_model.transform(data)

In [24]:
print("Accuracy on the train dataset: {0}".format(evaluator.evaluate(prediction)))

Accuracy on the train dataset: 0.602540834846


In [25]:
prediction.printSchema()

root
 |-- label: float (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [26]:
prediction.groupBy("prediction").count().show()



+----------+-----+
|prediction|count|
+----------+-----+
|       1.0|   12|
|       2.0|  539|
+----------+-----+



## Load model

In [27]:
model = MultilayerPerceptronClassificationModel.load("../test/classification_model/multilayer_perceptron")

In [28]:
model.transform(data).show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  2.0|[-0.0924882665276...|[-0.2515240083865...|[0.22615074316625...|       2.0|
|  2.0|[0.02913619577884...|[0.04093503191387...|[0.17838547825541...|       2.0|
|  2.0|[0.08815789222717...|[0.28757311407884...|[0.24803099879128...|       2.0|
|  0.0|[-0.0671749860048...|[-0.4962876004275...|[0.08012678447024...|       2.0|
|  2.0|[-0.1738087832927...|[-0.0050817187468...|[0.31332524043115...|       2.0|
|  2.0|[0.07562661916017...|[0.16382241732012...|[0.20537887918069...|       2.0|
|  2.0|[0.00278192758560...|[-0.1408865614700...|[0.13181188240285...|       2.0|
|  2.0|[0.01073707640171...|[0.04316879032472...|[0.17837488386460...|       2.0|
|  2.0|[0.10961537808179...|[-0.3448605051066...|[0.22586817129160...|       2.0|
|  1.0|[-0.08755