In [2]:
import pandas as pd
import numpy as np

import pyspark
from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import functions
from pyspark.sql.functions import lit, desc, col, size, array_contains\
, isnan, udf, hour, array_min, array_max, countDistinct
from pyspark.sql.types import *

from pyspark.ml  import Pipeline     
from pyspark.sql.functions import mean,col,split, col, regexp_extract, when, lit

In [3]:
#Create session and allocate additional driver memory
spark = SparkSession.builder \
    .master('local[*]') \
    .config("spark.driver.memory", "16g") \
    .appName('Hazbuk') \
    .getOrCreate()

In [4]:
fm_csv = 'C:/Users/Hazbuk/Documents/University/7153CEM/datafm20.csv'

In [5]:
fm_data = spark.read.csv(fm_csv, 
                  inferSchema=True, 
                  header=True)

In [6]:
#Quick view of the data
fm_data.show(5)

+---+-----------------+--------------------+---------+--------------------+--------------------+------+------+------+---+--------------+--------+---------+--------+-------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|_c0|             Name|            Position|     Club|            Division|               Based|Nation|Height|Weight|Age|Preferred Foot|Best Pos|Best Role|   Value|   Wage| CA| PA|Wor|Vis|Thr|Tec|Tea|Tck|Str|Sta|TRO|Ref|Pun|Pos|Pen|Pas|Pac|1v1|OtB|Nat|Mar|L Th|Lon|Ldr|Kic|Jum|Hea|Han|Fre|Fla|Fir|Fin|Ecc|Dri|Det|Dec|Cro|Cor|Cnt|Cmp|Com|Cmd|Bra|Bal|Ant|Agi|Agg|Aer|Acc|
+---+-----------------+--------------------+---------+--------------------+--------------------+------+------+------+---+--------------+--------+---------+--------+-------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----

In [7]:
#count the number rows
Original = fm_data.count()
print("Original Rows: ", Original)
#count the number of potential duplicates (removing later if necessary)
dupe_count = fm_data.dropDuplicates().count()

#Calculate how many duplicate rows exist
Duplicated = Original - dupe_count
print("Duplicated Rows: ", Duplicated)

Original Rows:  144750
Duplicated Rows:  0


In [15]:
#count the columns
print("Number of columns: ", len(fm_data.columns))

Number of columns:  64


In [17]:
NA = fm_data.dropDuplicates().dropna(
    how="any")# use how="all" for missing data in the entire column
MissingVals = Original - NA.count()
print("Number of N/As: ", MissingVals)

Number of N/As:  0


In [18]:
#The number of players by Nation
fm_data.groupBy('Nation').count().sort("count",ascending=False).show()

+------+-----+
|Nation|count|
+------+-----+
|   ARG|10254|
|   BRA| 7935|
|   FRA| 5546|
|   ENG| 5050|
|   ESP| 4763|
|   POL| 4220|
|   ITA| 3833|
|   GRE| 3452|
|   GER| 3327|
|   CHN| 2979|
|   HUN| 2928|
|   URU| 2825|
|   COL| 2709|
|   AUS| 2708|
|   DEN| 2692|
|   SRB| 2550|
|   FIN| 2490|
|   CHI| 2481|
|   ISR| 2459|
|   POR| 2455|
+------+-----+
only showing top 20 rows



In [19]:
#The average value of players by Nation
fm_data.groupBy('Nation').mean('Value').sort("avg(Value)",ascending=False).show()

+------+------------------+
|Nation|        avg(Value)|
+------+------------------+
|   EGY|      2142448.4375|
|   GAB| 1721617.013888889|
|   ALG| 1687238.743169399|
|   ARM|1530733.5714285714|
|   CRC|1447689.2857142857|
|   SUR|1095454.5454545454|
|   SKN|1060571.4285714286|
|   ECU|1057350.8142857142|
|   UAE|1053695.1219512196|
|   CTA|1044086.6071428572|
|   SEN|1029317.6704545454|
|   DOM|1027591.2962962963|
|   KSA|1024111.5107913669|
|   ESP|1006478.0342221289|
|   CIV| 983811.8086560365|
|   ENG| 915278.8336633664|
|   JAM| 843449.8198198198|
|   QAT| 825354.8387096775|
|   TAN| 814333.3333333334|
|   GER| 810274.5082657048|
+------+------------------+
only showing top 20 rows



In [20]:
#The number of players by Club
fm_data.groupBy('Club').count().sort("count",ascending=False).show()

+----------+-----+
|      Club|count|
+----------+-----+
|   Unknown|20717|
|  Selangor|  137|
|    VÃ©lez|  124|
|     Perak|  122|
|Terengganu|  116|
|     Genoa|  107|
|  Sassuolo|  107|
|    Torino|  106|
|       VIT|  105|
|  Johor DT|  103|
|  Juventus|  103|
|     Inter|  102|
| Sampdoria|  102|
|  EC Bahia|  101|
|    Dinamo|  101|
|  Aldosivi|   95|
|  Sporting|   93|
|      Boca|   93|
|     Milan|   93|
|      Roma|   91|
+----------+-----+
only showing top 20 rows



In [34]:
#Remove players with "Unknown" clubs from the data set
fm_data_filtered = fm_data.filter(fm_data['Club']!="Unknown")

unknowns = Original - fm_data_filtered.count()
print("Unknowns removed: ", unknowns)

Unknowns removed:  20717


In [35]:
#The number of players by Club
fm_data_filtered.groupBy('Club').count().sort("count",ascending=False).show()

+----------+-----+
|      Club|count|
+----------+-----+
|  Selangor|  137|
|    VÃ©lez|  124|
|     Perak|  122|
|Terengganu|  116|
|     Genoa|  107|
|  Sassuolo|  107|
|    Torino|  106|
|       VIT|  105|
|  Juventus|  103|
|  Johor DT|  103|
| Sampdoria|  102|
|     Inter|  102|
|  EC Bahia|  101|
|    Dinamo|  101|
|  Aldosivi|   95|
|  Sporting|   93|
|     Milan|   93|
|      Boca|   93|
|      Roma|   91|
|  Atalanta|   90|
+----------+-----+
only showing top 20 rows



In [36]:
#The average value of players by Club
fm_data_filtered.groupBy('Club').mean('Value').sort("avg(Value)",ascending=False).show()

+-----------------+--------------------+
|             Club|          avg(Value)|
+-----------------+--------------------+
|        FC Bayern|2.4425742424242426E7|
|         Man City|1.8380380555555556E7|
|        R. Madrid|        1.78641875E7|
|        Barcelona|1.6751056603773585E7|
|        A. Madrid|1.5329563829787234E7|
|        Liverpool|1.5294201515151516E7|
|Borussia Dortmund|          1.404375E7|
|          Man Utd|1.3684573636363637E7|
|        Tottenham|1.3497091666666666E7|
|          Arsenal|1.2733009615384616E7|
|          Sevilla|1.2623111842105264E7|
|         Paris SG|     1.08052890625E7|
|       Real Betis|1.0724103448275862E7|
|          Everton|           9638800.0|
|         Bayer 04|   9244176.470588235|
|        Leicester|   9113058.035714285|
|       RB Leipzig|   8516366.633333333|
|       Villarreal|   8381359.756097561|
|           Wolves|   7559096.153846154|
|    Real Sociedad|   7543831.707317073|
+-----------------+--------------------+
only showing top

In [37]:
#Using tableau as part of the pre-processing analysis we see that the "Based" column actually contains 
#several duplications with respect to location (requiring the division for example)
#we will attempt to clean this up as best as possible using regular expressions

from pyspark.sql.functions import regexp_replace

fm_data_filtered = fm_data_filtered.withColumn('Based',regexp_replace('Based','\W\(.+',''))

In [38]:
#Checking this below we can see that the column has now been cleansed and will more accurately
#represent the country in which a player is based
fm_data_filtered.show(5)

+---+-----------------+--------------------+---------+--------------------+-------+------+------+------+---+--------------+--------+---------+--------+-------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|_c0|             Name|            Position|     Club|            Division|  Based|Nation|Height|Weight|Age|Preferred Foot|Best Pos|Best Role|   Value|   Wage| CA| PA|Wor|Vis|Thr|Tec|Tea|Tck|Str|Sta|TRO|Ref|Pun|Pos|Pen|Pas|Pac|1v1|OtB|Nat|Mar|L Th|Lon|Ldr|Kic|Jum|Hea|Han|Fre|Fla|Fir|Fin|Ecc|Dri|Det|Dec|Cro|Cor|Cnt|Cmp|Com|Cmd|Bra|Bal|Ant|Agi|Agg|Aer|Acc|
+---+-----------------+--------------------+---------+--------------------+-------+------+------+------+---+--------------+--------+---------+--------+-------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+--

In [39]:
#Position details every possible position a player can play, and contains a large variety of combinations,
#781, as shown below, this is clearly too many to properly categorise, as such "Best Pos" may be a better
#column to use. As such we will drop "Position" as it is very messy when compared to "Best Pos"
print('Combinations of Position:', fm_data_filtered.select('Position').distinct().count())

fm_data_filtered = fm_data_filtered.drop(*['Position'])

Combinations of Position: 781


In [40]:
#As a check, the below shows the distinct best role and best position categories, these are clearly better
print('Distinct Best Pos:', fm_data_filtered.select('Best Pos').distinct().count())
print('Distinct Best Role:', fm_data_filtered.select('Best Role').distinct().count())

Distinct Best Pos: 14
Distinct Best Role: 44


In [41]:
print('Distinct Clubs:', fm_data_filtered.select('Club').distinct().count())
print('Distinct Divisions:', fm_data_filtered.select('Division').distinct().count())

Distinct Clubs: 10220
Distinct Divisions: 1215


In [42]:
#Nation and the country a player is Based in would potentially have some impact on the value
#of a player, as such these will be left in the analysis
print('Distinct Nations:', fm_data_filtered.select('Nation').distinct().count())
print('Distinct "Based":', fm_data_filtered.select('Based').distinct().count())
print('-----')
#However Club and Division, while having impact, are more an indicator of the fact a division / club 
#is more affluent (and thus can afford said player), rather than a direct indicator of a 
#particular players value. As such, these will be excluded also. Particularly also as they are very 
#large in terms of unique values. One posibility would be to restrict to a particular division for
#a separate analysis and perhaps build individual models for each division.
print('Distinct Clubs:', fm_data_filtered.select('Club').distinct().count())
print('Distinct Divisions:', fm_data_filtered.select('Division').distinct().count())

fm_data_filtered = fm_data_filtered.drop(*['Club','Division'])

Distinct Nations: 212
Distinct "Based": 185
-----
Distinct Clubs: 10220
Distinct Divisions: 1215


In [44]:
#Now that general cleansing has been completed, we must encode our categorical data for use in 
#Machine Learning models. Simply because these models will only accept numerical values.

#Print the Schema to check which columns need encoding
fm_data_filtered.printSchema()
#Based, Nation, Best Pos, Best Role, Preferred Foot

root
 |-- _c0: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Based: string (nullable = true)
 |-- Nation: string (nullable = true)
 |-- Height: integer (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Preferred Foot: string (nullable = true)
 |-- Best Pos: string (nullable = true)
 |-- Best Role: string (nullable = true)
 |-- Value: integer (nullable = true)
 |-- Wage: integer (nullable = true)
 |-- CA: integer (nullable = true)
 |-- PA: integer (nullable = true)
 |-- Wor: integer (nullable = true)
 |-- Vis: integer (nullable = true)
 |-- Thr: integer (nullable = true)
 |-- Tec: integer (nullable = true)
 |-- Tea: integer (nullable = true)
 |-- Tck: integer (nullable = true)
 |-- Str: integer (nullable = true)
 |-- Sta: integer (nullable = true)
 |-- TRO: integer (nullable = true)
 |-- Ref: integer (nullable = true)
 |-- Pun: integer (nullable = true)
 |-- Pos: integer (nullable = true)
 |-- Pen: integer (nullable = 

In [45]:
#Set a new frame for the ML Data Set
fm_data_ml = fm_data_filtered

#Dropping the Name column as this will need removing for ML analysis
fm_data_ml = fm_data_ml.drop('Name')

In [46]:
#We can see which variables are "string" type and thus may need encoding above.
#Clearly name will not be encoded as it will not be a predictor in our model

from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorIndexer

#Use StringIndexer to encode the categorical variables:
#Based, Nation, Best Pos, Best Role, Preferred Foot
SI_Based = StringIndexer(inputCol='Based',outputCol='Based_Index')
SI_Nation = StringIndexer(inputCol='Nation',outputCol='Nation_Index')
SI_BestPos = StringIndexer(inputCol='Best Pos',outputCol='BestPos_Index')
SI_BestRole = StringIndexer(inputCol='Best Role',outputCol='BestRole_Index')
SI_PreferredFoot = StringIndexer(inputCol='Preferred Foot',outputCol='PreferredFoot_Index')

#Perform the indexing transformations
fm_data_ml = SI_Based.fit(fm_data_ml).transform(fm_data_ml)
fm_data_ml = SI_Nation.fit(fm_data_ml).transform(fm_data_ml)
fm_data_ml = SI_BestPos.fit(fm_data_ml).transform(fm_data_ml)
fm_data_ml = SI_BestRole.fit(fm_data_ml).transform(fm_data_ml)
fm_data_ml = SI_PreferredFoot.fit(fm_data_ml).transform(fm_data_ml)




In [47]:
#Show the resulting Indexes next to their original categories
fm_data_ml.select('Based','Based_Index',
                  'Nation','Nation_Index',
                  'Best Pos','BestPos_Index',
                  'Best Role','BestRole_Index',
                  'Preferred Foot','PreferredFoot_Index').show()

+-------+-----------+------+------------+--------+-------------+---------+--------------+--------------+-------------------+
|  Based|Based_Index|Nation|Nation_Index|Best Pos|BestPos_Index|Best Role|BestRole_Index|Preferred Foot|PreferredFoot_Index|
+-------+-----------+------+------------+--------+-------------+---------+--------------+--------------+-------------------+
|  Spain|        4.0|   ARG|         0.0|  AM (R)|          5.0|       IF|          33.0|          Left|                2.0|
|  Italy|        5.0|   POR|        13.0|  ST (C)|          1.0|       CF|          39.0|        Either|                4.0|
| France|        2.0|   FRA|         2.0|  ST (C)|          1.0|       AF|          11.0|         Right|                0.0|
|Germany|        7.0|   GER|         7.0|      GK|          3.0|       SK|           5.0|        Either|                4.0|
| France|        2.0|   BRA|         1.0|  AM (L)|          8.0|       IW|           7.0|         Right|                0.0|


In [48]:
#Next we use One Hot Encoding 
OHE = OneHotEncoderEstimator(inputCols=['Based_Index', 
                                        'Nation_Index',
                                        'BestPos_Index',
                                        'BestRole_Index',
                                        'PreferredFoot_Index'],
                             outputCols=['Based_OHE', 
                                         'Nation_OHE',
                                         'BestPos_OHE',
                                         'BestRole_OHE',
                                         'PreferredFoot_OHE'])

#Perform the OHE transformation
fm_data_ml = OHE.fit(fm_data_ml).transform(fm_data_ml)



In [49]:
#Show the resulting Indexes next to their original categories
fm_data_ml.select('Based','Based_OHE',
                  'Nation','Nation_OHE',
                  'Best Pos','BestPos_OHE',
                  'Best Role','BestRole_OHE',
                  'Preferred Foot','PreferredFoot_OHE').show()

+-------+----------------+------+----------------+--------+---------------+---------+---------------+--------------+-----------------+
|  Based|       Based_OHE|Nation|      Nation_OHE|Best Pos|    BestPos_OHE|Best Role|   BestRole_OHE|Preferred Foot|PreferredFoot_OHE|
+-------+----------------+------+----------------+--------+---------------+---------+---------------+--------------+-----------------+
|  Spain| (184,[4],[1.0])|   ARG| (211,[0],[1.0])|  AM (R)| (13,[5],[1.0])|       IF|(43,[33],[1.0])|          Left|    (4,[2],[1.0])|
|  Italy| (184,[5],[1.0])|   POR|(211,[13],[1.0])|  ST (C)| (13,[1],[1.0])|       CF|(43,[39],[1.0])|        Either|        (4,[],[])|
| France| (184,[2],[1.0])|   FRA| (211,[2],[1.0])|  ST (C)| (13,[1],[1.0])|       AF|(43,[11],[1.0])|         Right|    (4,[0],[1.0])|
|Germany| (184,[7],[1.0])|   GER| (211,[7],[1.0])|      GK| (13,[3],[1.0])|       SK| (43,[5],[1.0])|        Either|        (4,[],[])|
| France| (184,[2],[1.0])|   BRA| (211,[1],[1.0])|  AM 

In [50]:
# features_to_scale = fm_data_ml.drop(*['_c0',
#                                         'Based',
#                                         'Nation',
#                                         'Preferred Foot',
#                                         'Best Pos',
#                                         'Best Role',
#                                         'Based_Index',
#                                         'Nation_Index',
#                                         'BestPos_Index',
#                                         'BestRole_Index',
#                                         'PreferredFoot_Index',
#                                         'Based_OHE',
#                                         'BestPos_OHE',
#                                         'PreferredFoot_OHE',
#                                         'BestRole_OHE',
#                                         'Nation_OHE'])

In [51]:
#Finally we assemble the vectors for input into our ML models
from pyspark.ml.feature import VectorAssembler

#We will exclude Value and Wage from our input columns, since these are what we would like to predict
assembler = VectorAssembler(inputCols=[  'Height',
                                         'Weight',
                                         'Age',
                                         'CA',
                                         'PA',
                                         'Wor',
                                         'Vis',
                                         'Thr',
                                         'Tec',
                                         'Tea',
                                         'Tck',
                                         'Str',
                                         'Sta',
                                         'TRO',
                                         'Ref',
                                         'Pun',
                                         'Pos',
                                         'Pen',
                                         'Pas',
                                         'Pac',
                                         '1v1',
                                         'OtB',
                                         'Nat',
                                         'Mar',
                                         'L Th',
                                         'Lon',
                                         'Ldr',
                                         'Kic',
                                         'Jum',
                                         'Hea',
                                         'Han',
                                         'Fre',
                                         'Fla',
                                         'Fir',
                                         'Fin',
                                         'Ecc',
                                         'Dri',
                                         'Det',
                                         'Dec',
                                         'Cro',
                                         'Cor',
                                         'Cnt',
                                         'Cmp',
                                         'Com',
                                         'Cmd',
                                         'Bra',
                                         'Bal',
                                         'Ant',
                                         'Agi',
                                         'Agg',
                                         'Aer',
                                         'Acc',
                                         'Based_Index',
                                         'Nation_Index',
                                         'BestPos_Index',
                                         'BestRole_Index',
                                         'PreferredFoot_Index',
                                         'Based_OHE',
                                         'BestPos_OHE',
                                         'PreferredFoot_OHE',
                                         'BestRole_OHE',
                                         'Nation_OHE'],
                           outputCol='features')

#Fill any null values
fm_data_ml = fm_data_ml.fillna(0)

#Perform the transformation
final_data = assembler.transform(fm_data_ml)

# view the transformed vector
final_data.select('features').show()

+--------------------+
|            features|
+--------------------+
|(512,[0,1,2,3,4,5...|
|(512,[0,1,2,3,4,5...|
|(512,[0,1,2,3,4,5...|
|(512,[0,1,2,3,4,5...|
|(512,[0,1,2,3,4,5...|
|(512,[0,1,2,3,4,5...|
|(512,[0,1,2,3,4,5...|
|(512,[0,1,2,3,4,5...|
|(512,[0,1,2,3,4,5...|
|(512,[0,1,2,3,4,5...|
|(512,[0,1,2,3,4,5...|
|(512,[0,1,2,3,4,5...|
|(512,[0,1,2,3,4,5...|
|(512,[0,1,2,3,4,5...|
|(512,[0,1,2,3,4,5...|
|(512,[0,1,2,3,4,5...|
|(512,[0,1,2,3,4,5...|
|(512,[0,1,2,3,4,5...|
|(512,[0,1,2,3,4,5...|
|(512,[0,1,2,3,4,5...|
+--------------------+
only showing top 20 rows



In [52]:
final_ml = final_data.select(['features','Value'])

# Linear regression is bad but have it in anyway

In [53]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol = 'features', labelCol='Value', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(final_ml)

# Print the coefficients and intercept for generalized linear regression lr_model
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = lr_model.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

Coefficients: [10955.340508804866,1605.0793744915725,-20159.370321053313,25882.86913312065,7631.504076242643,-0.0,3648.1626826556735,-16006.04968668395,7044.9003681136,-4921.843847287381,10135.523742677837,933.6792194541514,14485.4723955621,12981.744908994315,5223.3490740770685,4712.2775973054195,-10122.961751763274,14903.263383445967,8169.253617469579,54399.91206855639,-4676.5499874616735,6375.884202314384,36936.4331411457,-8456.012466898746,-54469.111368731334,2302.165667151223,13078.553785059441,2955.2215453522294,0.0,-6410.734463256241,5855.012871761621,-12090.252140147877,12337.211343793797,17791.826926299582,11787.018493270174,-12697.849820711983,10987.018347604631,15238.035734752555,86475.60949746006,1193.5326182593894,-18767.214478070477,11209.650739265631,18535.348630595927,-0.0,3066.90594335547,11543.854911544175,4102.402207688948,27007.747168847865,54444.75231170215,26798.276849660964,3845.8977275986845,59087.036985382125,-2123.539422434548,1123.6987077467866,-5109.843179145

RMSE: 2286509.532704
r2: 0.214446


In [54]:
from pyspark.ml.regression import GeneralizedLinearRegression

glr = GeneralizedLinearRegression(featuresCol = 'features', labelCol='Value', family="gaussian", link="identity", maxIter=10, regParam=0.3)

glr_model = glr.fit(final_ml)

# Print the coefficients and intercept for generalized linear regression glr_model
print("Coefficients: " + str(glr_model.coefficients))
print("Intercept: " + str(glr_model.intercept))

# Summarize the glr_model over the training set and print out some metrics
summary = glr_model.summary
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("T Values: " + str(summary.tValues))
print("P Values: " + str(summary.pValues))
print("Dispersion: " + str(summary.dispersion))
print("Null Deviance: " + str(summary.nullDeviance))
print("Residual Degree Of Freedom Null: " + str(summary.residualDegreeOfFreedomNull))
print("Deviance: " + str(summary.deviance))
print("Residual Degree Of Freedom: " + str(summary.residualDegreeOfFreedom))
print("AIC: " + str(summary.aic))
print("Deviance Residuals: ")
summary.residuals().show()

Coefficients: [7176.94695139132,-153.56611011707875,-25332.124243816826,77646.12444118629,-2621.7293660953856,-14630.995283845734,-12257.966645387942,-30145.385096355745,-12909.937026674525,-21777.84853024615,5242.171311829227,-30746.571442179462,-14182.127229147485,18289.19315413066,-12522.923850594756,1894.3260726563283,-47938.473666131584,15379.432954040549,-16538.41520613714,-7751.669629286961,-23935.314267477825,-9877.369576604828,26252.432046310638,-43951.63011029955,-48436.99773285849,-7027.289599804997,5159.8971805416395,-12382.050895136059,-19075.171778794574,-32854.913408605214,-17221.860929951774,-19800.436794088517,9452.627313709383,10664.751321776537,6090.125456362984,-4757.240487449825,-7842.653879472656,12407.30832657307,16914.510137648354,1667.9278940492486,-33545.518634709275,-15083.360643615131,560.2154813199716,-24899.10280644532,-17342.437196803057,-24499.46563765626,-9830.689662380062,3065.8288087342653,-11917.703081803655,19888.6685053814,-16649.895555698196,-8882

Coefficient Standard Errors: [2080.225837031197, 1734.4659561531873, 1625.7906501736186, 1150.6211576298078, 591.2497537079116, 4122.98796974221, 3689.0888206609015, 5694.926487765469, 3979.502800957985, 4841.115855253518, 3663.618107886844, 3676.1713400132485, 3114.8037737352324, 5308.038304044322, 5894.197699175321, 4696.92837460541, 3656.8797309779834, 2919.5054071179406, 3738.0691665627633, 4911.741308509258, 5851.243039748829, 3577.5589133612684, 2227.0279853660513, 4000.851763476083, 3064.168976833439, 3514.532342554113, 1935.2382290237688, 5575.337542912432, 3653.5121238917336, 3308.146833465175, 5878.15137348723, 3772.3503355405364, 2666.679613361579, 3720.39251089622, 3781.547297414357, 4163.860123819519, 3657.115887056242, 1620.1321467802925, 3388.118274601255, 3757.0429313204804, 4118.092305655022, 3782.0953860170343, 3726.9575024461988, 5893.330477770877, 5827.362388310646, 3297.822035370246, 3967.466101080736, 3716.8880110964715, 3961.5044542617306, 2094.4623893500466, 556

Null Deviance: 8.2548139366403e+17
Residual Degree Of Freedom Null: 124032
Deviance: 6.270929388324694e+17
Residual Degree Of Freedom: 123520
AIC: 3981178.1664834376
Deviance Residuals: 
+--------------------+
|   devianceResiduals|
+--------------------+
|   6.1645949168386E7|
|1.8517328015243582E7|
| 7.832395554016127E7|
|3.9107746732837245E7|
| 8.284070009479019E7|
| 5.993915342674188E7|
|  7.66175157262437E7|
| 4.710816004203102E7|
| -1468074.7191943284|
| 6.844705442608067E7|
| 6.830982286653216E7|
| 6.689867165495475E7|
| 6.855151948214096E7|
| -1091030.5405739062|
| 6.467074035530068E7|
| 7.194385456814158E7|
| 6.688648482572856E7|
| 5.304623409169915E7|
| -2724021.3623700268|
|1.6728101094896624E7|
+--------------------+
only showing top 20 rows



# Clustering

In [55]:
#As before we assemble a Vector to use for prediction, however, we will attempt to predict some 
#categorical variables from player attributes
#Can we predict from the data what nationality a player is, what their best position is,
#or what division they belong to?

#We will start with Best Pos and exclude this from our input columns
assembler = VectorAssembler(inputCols=[  'Height',
                                         'Weight',
                                         'Age',
                                         'Wage',
                                         'Value',
                                         'CA',
                                         'PA',
                                         'Wor',
                                         'Vis',
                                         'Thr',
                                         'Tec',
                                         'Tea',
                                         'Tck',
                                         'Str',
                                         'Sta',
                                         'TRO',
                                         'Ref',
                                         'Pun',
                                         'Pos',
                                         'Pen',
                                         'Pas',
                                         'Pac',
                                         '1v1',
                                         'OtB',
                                         'Nat',
                                         'Mar',
                                         'L Th',
                                         'Lon',
                                         'Ldr',
                                         'Kic',
                                         'Jum',
                                         'Hea',
                                         'Han',
                                         'Fre',
                                         'Fla',
                                         'Fir',
                                         'Fin',
                                         'Ecc',
                                         'Dri',
                                         'Det',
                                         'Dec',
                                         'Cro',
                                         'Cor',
                                         'Cnt',
                                         'Cmp',
                                         'Com',
                                         'Cmd',
                                         'Bra',
                                         'Bal',
                                         'Ant',
                                         'Agi',
                                         'Agg',
                                         'Aer',
                                         'Acc',
                                         'Based_OHE',
                                         'PreferredFoot_OHE',
                                         'BestRole_OHE',
                                         'Nation_OHE'],
                           outputCol='features')

#Perform the transformation
final_data_cluster_bestpos = assembler.transform(fm_data_ml)

# view the transformed vector
final_data_cluster_bestpos.select('features').show()

+--------------------+
|            features|
+--------------------+
|(496,[0,1,2,3,4,5...|
|(496,[0,1,2,3,4,5...|
|(496,[0,1,2,3,4,5...|
|(496,[0,1,2,3,4,5...|
|(496,[0,1,2,3,4,5...|
|(496,[0,1,2,3,4,5...|
|(496,[0,1,2,3,4,5...|
|(496,[0,1,2,3,4,5...|
|(496,[0,1,2,3,4,5...|
|(496,[0,1,2,3,4,5...|
|(496,[0,1,2,3,4,5...|
|(496,[0,1,2,3,4,5...|
|(496,[0,1,2,3,4,5...|
|(496,[0,1,2,3,4,5...|
|(496,[0,1,2,3,4,5...|
|(496,[0,1,2,3,4,5...|
|(496,[0,1,2,3,4,5...|
|(496,[0,1,2,3,4,5...|
|(496,[0,1,2,3,4,5...|
|(496,[0,1,2,3,4,5...|
+--------------------+
only showing top 20 rows



In [56]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier

In [57]:
#Vectorise the features
vectorindexor = VectorIndexer(inputCol="features", 
                              outputCol="indexedFeatures", 
                              maxCategories=10).fit(final_data_cluster_bestpos)

dt_data = vectorindexor.transform(final_data_cluster_bestpos)

In [58]:
#Vectorise the label output
labelIndexer = StringIndexer(inputCol="Best Pos", outputCol="indexedLabel").fit(dt_data)

dt_data = labelIndexer.transform(dt_data)

In [59]:
#split the data into test & train
test, train = dt_data.randomSplit([0.8,0.2])

In [60]:
#Initialise the Decision Tree Classification model
dt = DecisionTreeClassifier(labelCol="indexedLabel", 
                            featuresCol="indexedFeatures")

#fit it to the training data
dt_model = dt.fit(train)

In [61]:
#perform predictions on the data
predictions = dt_model.transform(test)

In [62]:
#select statement to display some of the predictions next to the actuals
predictions.select("prediction", "indexedLabel", "features").show(5)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       1.0|         5.0|(496,[0,1,2,3,4,5...|
|       1.0|         1.0|(496,[0,1,2,3,4,5...|
|       3.0|         3.0|(496,[0,1,2,3,4,5...|
|       1.0|         8.0|(496,[0,1,2,3,4,5...|
|       1.0|         2.0|(496,[0,1,2,3,4,5...|
+----------+------------+--------------------+
only showing top 5 rows



In [63]:
#Display the error of the model using the Classification Evaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))


Test Error = 0.49321 


# Pipelines

Decision Tree

In [64]:
#Resetting fm_data_ml for pipeline usage
fm_data_ml = fm_data_filtered


In [65]:
#it is also possible to perform the above steps via a pipeline, as below

#Stages 1-5: Perform indexation on categorical variables that we will use
stage_1 = StringIndexer(inputCol='Based',outputCol='Based_Index')
stage_2 = StringIndexer(inputCol='Nation',outputCol='Nation_Index')
stage_3 = StringIndexer(inputCol='Best Pos',outputCol='BestPos_Index')
stage_4 = StringIndexer(inputCol='Best Role',outputCol='BestRole_Index')
stage_5 = StringIndexer(inputCol='Preferred Foot',outputCol='PreferredFoot_Index')

#Stage 6: Perform OHE on these indexed variables
stage_6 = OneHotEncoderEstimator(inputCols=[stage_1.getOutputCol(), 
                                            stage_2.getOutputCol(), 
                                            stage_3.getOutputCol(), 
                                            stage_4.getOutputCol(), 
                                            stage_5.getOutputCol()], 
                                 outputCols= ['Based_encoded', 
                                              'Nation_encoded',
                                              'BestPos_encoded',
                                              'BestRole_encoded',
                                              'PreferredFoot_encoded'])

#Stage 7: Assemble all predictors into a vector for use by the model 
stage_7 = VectorAssembler(inputCols=[  'Height',
                                         'Weight',
                                         'Age',
                                         'CA',
                                         'PA',
                                         'Wor',
                                         'Vis',
                                         'Thr',
                                         'Tec',
                                         'Tea',
                                         'Tck',
                                         'Str',
                                         'Sta',
                                         'TRO',
                                         'Ref',
                                         'Pun',
                                         'Pos',
                                         'Pen',
                                         'Pas',
                                         'Pac',
                                         '1v1',
                                         'OtB',
                                         'Nat',
                                         'Mar',
                                         'L Th',
                                         'Lon',
                                         'Ldr',
                                         'Kic',
                                         'Jum',
                                         'Hea',
                                         'Han',
                                         'Fre',
                                         'Fla',
                                         'Fir',
                                         'Fin',
                                         'Ecc',
                                         'Dri',
                                         'Det',
                                         'Dec',
                                         'Cro',
                                         'Cor',
                                         'Cnt',
                                         'Cmp',
                                         'Com',
                                         'Cmd',
                                         'Bra',
                                         'Bal',
                                         'Ant',
                                         'Agi',
                                         'Agg',
                                         'Aer',
                                         'Acc',
                                         'Based_encoded',
                                         'Nation_encoded',
                                         'BestPos_encoded',
                                         'BestRole_encoded',
                                         'PreferredFoot_encoded'],
                           outputCol='features')

#Stage 8: Index this vector for use in the model with 10 max categories
stage_8 = VectorIndexer(inputCol="features", 
                        outputCol="indexedFeatures", 
                        maxCategories=10)

#Stage 9: Index the response variable (label)
stage_9 = StringIndexer(inputCol="Best Pos", 
                        outputCol="indexedLabel")


#Stage 10: Initialise the model of our choosing                     
stage_10 = DecisionTreeClassifier(featuresCol='indexedFeatures',
                                  labelCol='indexedLabel')

#Setup the Pipeline
dt_pipeline = Pipeline(stages= [stage_1,
                                stage_2, 
                                stage_3, 
                                stage_4, 
                                stage_5,
                                stage_6,
                                stage_7,
                                stage_8,
                                stage_9,
                                stage_10])

#Fit the pipeline
model = dt_pipeline.fit(fm_data_ml)
#Transform the data
fm_data_ml = model.transform(fm_data_ml)

#View Predicted values vs the Actuals
fm_data_ml.select("prediction", "indexedLabel", "features").show(5)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       5.0|         5.0|(507,[0,1,2,3,4,5...|
|       1.0|         1.0|(507,[0,1,2,3,4,5...|
|       1.0|         1.0|(507,[0,1,2,3,4,5...|
|       3.0|         3.0|(507,[0,1,2,3,4,5...|
|       5.0|         8.0|(507,[0,1,2,3,4,5...|
+----------+------------+--------------------+
only showing top 5 rows



In [66]:
#Display the error of the model using the Classification Evaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(fm_data_ml)
print("Test Error = %g " % (1.0 - accuracy))


Test Error = 0.301065 


Logistic Regression

In [67]:
from pyspark.ml.classification import LogisticRegression
#Using Pipelines we can more easily test several models at once, as below,
#all other stages remain the same but we change the classification model used and continue

#Resetting fm_data_ml for pipeline usage
fm_data_ml = fm_data_filtered
test, train = fm_data_ml.randomSplit([0.8,0.2])



#Stage 10: Initialise the model of our choosing                         
stage_10 = LogisticRegression(featuresCol='indexedFeatures',labelCol='indexedLabel')

#Setup the pipeline
lr_pipeline = Pipeline(stages= [stage_1,
                                stage_2, 
                                stage_3, 
                                stage_4, 
                                stage_5,
                                stage_6,
                                stage_7,
                                stage_8,
                                stage_9,
                                stage_10])

#Fit the pipeline
model = lr_pipeline.fit(train)
#Transform the data
fm_data_ml = model.transform(train)

#View the predictions
fm_data_ml.select("prediction", "indexedLabel", "features").show(5)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       1.0|         1.0|(459,[0,1,2,3,4,5...|
|       8.0|         8.0|(459,[0,1,2,3,4,5...|
|       1.0|         1.0|(459,[0,1,2,3,4,5...|
|       3.0|         3.0|(459,[0,1,2,3,4,5...|
|       3.0|         3.0|(459,[0,1,2,3,4,5...|
+----------+------------+--------------------+
only showing top 5 rows



In [68]:
#Display the error of the model using the Classification Evaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(fm_data_ml)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0


Random Forest

In [69]:
from pyspark.ml.classification import RandomForestClassifier

#Using Pipelines we can more easily test several models at once, as below,
#all other stages remain the same but we change the classification model used and continue

#Resetting fm_data_ml for pipeline usage
fm_data_ml = fm_data_filtered
test, train = fm_data_ml.randomSplit([0.8,0.2])

#Stage 10: Initialise the model of our choosing                             
stage_10 = RandomForestClassifier(featuresCol='indexedFeatures',labelCol='indexedLabel')

#Setup the pipeline
rf_pipeline = Pipeline(stages= [stage_1,
                                stage_2, 
                                stage_3, 
                                stage_4, 
                                stage_5,
                                stage_6,
                                stage_7,
                                stage_8,
                                stage_9,
                                stage_10])

#Fit the pipeline
model = rf_pipeline.fit(train)
#Transform the data
fm_data_ml = model.transform(train)

#View the predictions
fm_data_ml.select("prediction", "indexedLabel", "features").show(5)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       6.0|         6.0|(461,[0,1,2,3,4,5...|
|       1.0|         1.0|(461,[0,1,2,3,4,5...|
|       8.0|         8.0|(461,[0,1,2,3,4,5...|
|       2.0|         2.0|(461,[0,1,2,3,4,5...|
|       6.0|         6.0|(461,[0,1,2,3,4,5...|
+----------+------------+--------------------+
only showing top 5 rows



In [70]:
#Display the error of the model using the Classification Evaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(fm_data_ml)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.123588


## Predicting another categorical variable

In [71]:
#Resetting fm_data_ml for pipeline usage
fm_data_ml = fm_data_filtered
test, train = fm_data_ml.randomSplit([0.8,0.2])

#Predicting Nation using Pipeline with various models
#Stages 1-5: Perform indexation on categorical variables that we will use
stage_1 = StringIndexer(inputCol='Based',outputCol='Based_Index')
stage_2 = StringIndexer(inputCol='Nation',outputCol='Nation_Index')
stage_3 = StringIndexer(inputCol='Best Pos',outputCol='BestPos_Index')
stage_4 = StringIndexer(inputCol='Best Role',outputCol='BestRole_Index')
stage_5 = StringIndexer(inputCol='Preferred Foot',outputCol='PreferredFoot_Index')

#Stage 6: Perform OHE on these indexed variables
stage_6 = OneHotEncoderEstimator(inputCols=[stage_1.getOutputCol(), 
                                            stage_2.getOutputCol(), 
                                            stage_3.getOutputCol(), 
                                            stage_4.getOutputCol(), 
                                            stage_5.getOutputCol()], 
                                 outputCols= ['Based_encoded', 
                                              'Nation_encoded',
                                              'BestPos_encoded',
                                              'BestRole_encoded',
                                              'PreferredFoot_encoded'])

#Stage 7: Assemble all predictors into a vector for use by the model 
stage_7 = VectorAssembler(inputCols=[  'Height',
                                         'Weight',
                                         'Age',
                                         'CA',
                                         'PA',
                                         'Wor',
                                         'Vis',
                                         'Thr',
                                         'Tec',
                                         'Tea',
                                         'Tck',
                                         'Str',
                                         'Sta',
                                         'TRO',
                                         'Ref',
                                         'Pun',
                                         'Pos',
                                         'Pen',
                                         'Pas',
                                         'Pac',
                                         '1v1',
                                         'OtB',
                                         'Nat',
                                         'Mar',
                                         'L Th',
                                         'Lon',
                                         'Ldr',
                                         'Kic',
                                         'Jum',
                                         'Hea',
                                         'Han',
                                         'Fre',
                                         'Fla',
                                         'Fir',
                                         'Fin',
                                         'Ecc',
                                         'Dri',
                                         'Det',
                                         'Dec',
                                         'Cro',
                                         'Cor',
                                         'Cnt',
                                         'Cmp',
                                         'Com',
                                         'Cmd',
                                         'Bra',
                                         'Bal',
                                         'Ant',
                                         'Agi',
                                         'Agg',
                                         'Aer',
                                         'Acc',
                                         'Based_encoded',
                                         'Nation_encoded',
                                         'BestPos_encoded',
                                         'BestRole_encoded',
                                         'PreferredFoot_encoded'],
                           outputCol='features')

#Stage 8: Index this vector for use in the model with 10 max categories
stage_8 = VectorIndexer(inputCol="features", 
                        outputCol="indexedFeatures", 
                        maxCategories=10)

#Stage 9: Index the response variable (label)
stage_9 = StringIndexer(inputCol="Nation", 
                        outputCol="indexedLabel")


#Stage 10: Initialise the model of our choosing                        
stage_10 = DecisionTreeClassifier(featuresCol='indexedFeatures',labelCol='indexedLabel')

#Setup the Pipeline
dt_pipeline = Pipeline(stages= [stage_1,
                                stage_2, 
                                stage_3, 
                                stage_4, 
                                stage_5,
                                stage_6,
                                stage_7,
                                stage_8,
                                stage_9,
                                stage_10])

#Fit the pipeline
model = dt_pipeline.fit(fm_data_ml)
#Transform the data
fm_data_ml = model.transform(fm_data_ml)

#View Predicted values vs the Actuals
fm_data_ml.select("prediction", "indexedLabel", "features").show(5)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       0.0|         0.0|(507,[0,1,2,3,4,5...|
|       5.0|        13.0|(507,[0,1,2,3,4,5...|
|       2.0|         2.0|(507,[0,1,2,3,4,5...|
|       5.0|         7.0|(507,[0,1,2,3,4,5...|
|       1.0|         1.0|(507,[0,1,2,3,4,5...|
+----------+------------+--------------------+
only showing top 5 rows



In [72]:
#Display the error of the model using the Classification Evaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(fm_data_ml)
print("Test Error = %g " % (1.0 - accuracy))


Test Error = 0.742109 


Logistic Regression

In [73]:
from pyspark.ml.classification import LogisticRegression

#Resetting fm_data_ml for pipeline usage
fm_data_ml = fm_data_filtered
test, train = fm_data_ml.randomSplit([0.8,0.2])



#Stage 10: Initialise the model of our choosing                          
stage_10 = LogisticRegression(featuresCol='indexedFeatures',labelCol='indexedLabel')

#Setup the pipeline
lr_pipeline = Pipeline(stages= [stage_1,
                                stage_2, 
                                stage_3, 
                                stage_4, 
                                stage_5,
                                stage_6,
                                stage_7,
                                stage_8,
                                stage_9,
                                stage_10])

#Fit the pipeline for the training data
model = lr_pipeline.fit(train)
#Transform the data
fm_data_ml = model.transform(train)

#View the predictions
fm_data_ml.select("prediction", "indexedLabel", "features").show(5)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|      28.0|        28.0|(450,[0,1,2,3,4,5...|
|      33.0|        33.0|(450,[0,1,2,3,4,5...|
|      33.0|        33.0|(450,[0,1,2,3,4,5...|
|      58.0|        58.0|(450,[0,1,2,3,4,5...|
|       6.0|         6.0|(450,[0,1,2,3,4,5...|
+----------+------------+--------------------+
only showing top 5 rows



In [74]:
#Display the error of the model using the Classification Evaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(fm_data_ml)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0


Random Forest

In [75]:
from pyspark.ml.classification import RandomForestClassifier

#Resetting fm_data_ml for pipeline usage
fm_data_ml = fm_data_filtered
test, train = fm_data_ml.randomSplit([0.8,0.2])

#Stage 10: Initialise the model of our choosing                                
stage_10 = RandomForestClassifier(featuresCol='indexedFeatures',labelCol='indexedLabel')

#Setup the pipeline
rf_pipeline = Pipeline(stages= [stage_1,
                                stage_2, 
                                stage_3, 
                                stage_4, 
                                stage_5,
                                stage_6,
                                stage_7,
                                stage_8,
                                stage_9,
                                stage_10])

#Fit the pipeline for the training data
model = rf_pipeline.fit(train)
#Transform the data
fm_data_ml = model.transform(train)

#View the predictions
fm_data_ml.select("prediction", "indexedLabel", "features").show(5)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|      11.0|        11.0|(468,[0,1,2,3,4,5...|
|       7.0|         7.0|(468,[0,1,2,3,4,5...|
|       3.0|         2.0|(468,[0,1,2,3,4,5...|
|       4.0|        91.0|(468,[0,1,2,3,4,5...|
|       5.0|         5.0|(468,[0,1,2,3,4,5...|
+----------+------------+--------------------+
only showing top 5 rows



In [76]:
#Display the error of the model using the Classification Evaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(fm_data_ml)
print("Test Error = %g " % (1.0 - accuracy))


Test Error = 0.316412 


# Finally also for Based

In [77]:
#Resetting fm_data_ml for pipeline usage
fm_data_ml = fm_data_filtered
test, train = fm_data_ml.randomSplit([0.8,0.2])

#Predicting Based using Pipeline with various models
#Stages 1-5: Perform indexation on categorical variables that we will use
stage_1 = StringIndexer(inputCol='Based',outputCol='Based_Index')
stage_2 = StringIndexer(inputCol='Nation',outputCol='Nation_Index')
stage_3 = StringIndexer(inputCol='Best Pos',outputCol='BestPos_Index')
stage_4 = StringIndexer(inputCol='Best Role',outputCol='BestRole_Index')
stage_5 = StringIndexer(inputCol='Preferred Foot',outputCol='PreferredFoot_Index')

#Stage 6: Perform OHE on these indexed variables
stage_6 = OneHotEncoderEstimator(inputCols=[stage_1.getOutputCol(), 
                                            stage_2.getOutputCol(), 
                                            stage_3.getOutputCol(), 
                                            stage_4.getOutputCol(), 
                                            stage_5.getOutputCol()], 
                                 outputCols= ['Based_encoded', 
                                              'Nation_encoded',
                                              'BestPos_encoded',
                                              'BestRole_encoded',
                                              'PreferredFoot_encoded'])

#Stage 7: Assemble all predictors into a vector for use by the model 
stage_7 = VectorAssembler(inputCols=[  'Height',
                                         'Weight',
                                         'Age',
                                         'CA',
                                         'PA',
                                         'Wor',
                                         'Vis',
                                         'Thr',
                                         'Tec',
                                         'Tea',
                                         'Tck',
                                         'Str',
                                         'Sta',
                                         'TRO',
                                         'Ref',
                                         'Pun',
                                         'Pos',
                                         'Pen',
                                         'Pas',
                                         'Pac',
                                         '1v1',
                                         'OtB',
                                         'Nat',
                                         'Mar',
                                         'L Th',
                                         'Lon',
                                         'Ldr',
                                         'Kic',
                                         'Jum',
                                         'Hea',
                                         'Han',
                                         'Fre',
                                         'Fla',
                                         'Fir',
                                         'Fin',
                                         'Ecc',
                                         'Dri',
                                         'Det',
                                         'Dec',
                                         'Cro',
                                         'Cor',
                                         'Cnt',
                                         'Cmp',
                                         'Com',
                                         'Cmd',
                                         'Bra',
                                         'Bal',
                                         'Ant',
                                         'Agi',
                                         'Agg',
                                         'Aer',
                                         'Acc',
                                         'Based_encoded',
                                         'Nation_encoded',
                                         'BestPos_encoded',
                                         'BestRole_encoded',
                                         'PreferredFoot_encoded'],
                           outputCol='features')

#Stage 8: Index this vector for use in the model with 10 max categories
stage_8 = VectorIndexer(inputCol="features", 
                        outputCol="indexedFeatures", 
                        maxCategories=10)

#Stage 9: Index the response variable (label)
stage_9 = StringIndexer(inputCol="Based", 
                        outputCol="indexedLabel")


#Stage 10: Initialise the model of our choosing                      
stage_10 = DecisionTreeClassifier(featuresCol='indexedFeatures',labelCol='indexedLabel')

#Setup the Pipeline
dt_pipeline = Pipeline(stages= [stage_1,
                                stage_2, 
                                stage_3, 
                                stage_4, 
                                stage_5,
                                stage_6,
                                stage_7,
                                stage_8,
                                stage_9,
                                stage_10])

#Fit the pipeline
model = dt_pipeline.fit(fm_data_ml)
#Transform the data
fm_data_ml = model.transform(fm_data_ml)

#View Predicted values vs the Actuals
fm_data_ml.select("prediction", "indexedLabel", "features").show()

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       4.0|         4.0|(507,[0,1,2,3,4,5...|
|       5.0|         5.0|(507,[0,1,2,3,4,5...|
|       2.0|         2.0|(507,[0,1,2,3,4,5...|
|       5.0|         7.0|(507,[0,1,2,3,4,5...|
|       2.0|         2.0|(507,[0,1,2,3,4,5...|
|       5.0|         7.0|(507,[0,1,2,3,4,5...|
|       1.0|         1.0|(507,[0,1,2,3,4,5...|
|       4.0|         4.0|(507,[0,1,2,3,4,5...|
|       5.0|         5.0|(507,[0,1,2,3,4,5...|
|       1.0|         1.0|(507,[0,1,2,3,4,5...|
|       4.0|         4.0|(507,[0,1,2,3,4,5...|
|       4.0|         4.0|(507,[0,1,2,3,4,5...|
|       1.0|         1.0|(507,[0,1,2,3,4,5...|
|       5.0|         5.0|(507,[0,1,2,3,4,5...|
|       1.0|         1.0|(507,[0,1,2,3,4,5...|
|       5.0|         7.0|(507,[0,1,2,3,4,5...|
|       1.0|         1.0|(507,[0,1,2,3,4,5...|
|       1.0|         1.0|(507,[0,1,2,3,4,5...|
|       5.0| 

In [78]:
#Display the error of the model using the Classification Evaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(fm_data_ml)
print("Test Error = %g " % (1.0 - accuracy))


Test Error = 0.744552 


Logistic Regression

In [84]:
from pyspark.ml.classification import LogisticRegression

#Resetting fm_data_ml for pipeline usage
fm_data_ml = fm_data_filtered
test, train = fm_data_ml.randomSplit([0.8,0.2])



#Stage 10: Initialise the model of our choosing                          
stage_10 = LogisticRegression(featuresCol='indexedFeatures',labelCol='indexedLabel')

#Setup the pipeline
lr_pipeline = Pipeline(stages= [stage_1,
                                stage_2, 
                                stage_3, 
                                stage_4, 
                                stage_5,
                                stage_6,
                                stage_7,
                                stage_8,
                                stage_9,
                                stage_10])

#Fit the pipeline for the training data
model = lr_pipeline.fit(train)
#Transform the data
fm_data_ml = model.transform(train)

#View the predictions
fm_data_ml.select("prediction", "indexedLabel", "features").show(5)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       5.0|         5.0|(461,[0,1,2,3,4,5...|
|      63.0|        63.0|(461,[0,1,2,3,4,5...|
|       5.0|         5.0|(461,[0,1,2,3,4,5...|
|       4.0|         4.0|(461,[0,1,2,3,4,5...|
|       7.0|         7.0|(461,[0,1,2,3,4,5...|
+----------+------------+--------------------+
only showing top 5 rows



In [85]:
#Display the error of the model using the Classification Evaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(fm_data_ml)
print("Test Error = %g " % (1.0 - accuracy))


Test Error = 0 


Random Forest

In [81]:
from pyspark.ml.classification import RandomForestClassifier

#Resetting fm_data_ml for pipeline usage
fm_data_ml = fm_data_filtered
test, train = fm_data_ml.randomSplit([0.8,0.2])

#Stage 10: Initialise the model of our choosing                           
stage_10 = RandomForestClassifier(featuresCol='indexedFeatures',labelCol='indexedLabel')

#Setup the pipeline
rf_pipeline = Pipeline(stages= [stage_1,
                                stage_2, 
                                stage_3, 
                                stage_4, 
                                stage_5,
                                stage_6,
                                stage_7,
                                stage_8,
                                stage_9,
                                stage_10])

#Fit the pipeline for the training data
model = rf_pipeline.fit(train)
#Transform the data
fm_data_ml = model.transform(train)

#View the predictions
fm_data_ml.select("prediction", "indexedLabel", "features").show(5)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       7.0|         7.0|(466,[0,1,2,3,4,5...|
|       3.0|         3.0|(466,[0,1,2,3,4,5...|
|       1.0|         1.0|(466,[0,1,2,3,4,5...|
|       1.0|         1.0|(466,[0,1,2,3,4,5...|
|       3.0|         3.0|(466,[0,1,2,3,4,5...|
+----------+------------+--------------------+
only showing top 5 rows



In [82]:
#Display the error of the model using the Classification Evaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(fm_data_ml)
print("Test Error = %g " % (1.0 - accuracy))


Test Error = 0.2523 
