### ML with PySpark
+ classify/predict winning team 
+ This notebook is more direct: the groundwork has already been done in the supervised notebook, so here I will mostly be training and testing.
### Data Source
+ https://www.kaggle.com/hikne707/big-five-european-soccer-leagues 

In [6]:
# Load our Packages
from pyspark import SparkContext
import pandas as pd

In [None]:
# Create the local SparkContext (master 'local[2]'), or reuse the one that is
# already running. Calling SparkContext(...) directly raises
# "Cannot run multiple SparkContexts at once" when this cell is executed again.
from pyspark import SparkConf

sc = SparkContext.getOrCreate(SparkConf().setMaster('local[2]'))

In [8]:
# Display the SparkContext summary (the notebook repr links to the Spark UI)
sc

In [27]:
# Start the SparkSession used by the DataFrame API (an active session is reused)
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Spark_England")
    .getOrCreate()
)

In [28]:
# Load the match data into a Spark DataFrame.
# Other data sets that can be loaded the same way:
#   France_league_V1.csv
#   Spain_league_V1.csv
#   Germany_league_V1.csv
#   Italy_league_V1.csv
#   Leagues.csv
import os

# The default is the original local path; set the LEAGUES_CSV environment
# variable to point at the file when running on another machine.
LEAGUES_CSV = os.environ.get(
    "LEAGUES_CSV",
    "D:/Senior/Capstone/data-science-enviroment/data/Leagues/Leagues_V1.csv",
)

# header=True: the first row holds the column names
# inferSchema=True: let Spark detect each column's type
df = spark.read.csv(LEAGUES_CSV, header=True, inferSchema=True)

For a detailed explanation of what is going on below and of the data specs/metrics, refer to the Spark_Leagues_Unsupervised notebook in the same folder. This preprocessing is carried over from there, so it is done in one code block.

In [29]:
import pyspark.ml
# load Ml packages
from pyspark.ml.feature import VectorAssembler, StringIndexer

In [30]:
# Feature engineering: per-half goals, goal differences, match month, and the
# list of columns that make up the model's feature vector.
# NOTE: this cell renames the HT_* columns on `df`, so running it a second time
# without reloading `df` fails because HT_Team_1 / HT_Team_2 no longer exist.
from pyspark.sql.functions import split

# Second-half goals = full-time goals minus half-time goals
df = df.withColumn("H2_Team_1", df['FT_Team_1'] - df['HT_Team_1'])
df = df.withColumn("H2_Team_2", df['FT_Team_2'] - df['HT_Team_2'])

# Rename HT_* to H1_* so first-half and second-half columns share one naming scheme
df = df.withColumnRenamed('HT_Team_1', 'H1_Team_1')
df = df.withColumnRenamed('HT_Team_2', 'H1_Team_2')

# Goal differences, Team_1 minus Team_2: positive when Team_1 (home) scored
# more, negative when Team_2 (away) scored more
df = df.withColumn('FT_GD', df['FT_Team_1'] - df['FT_Team_2'])
df = df.withColumn('H1_GD', df['H1_Team_1'] - df['H1_Team_2'])
df = df.withColumn('H2_GD', df['FT_GD'] - df['H1_GD'])

# Keep only the columns used from here on
df = df.select('Round', 'Date', 'Team_1', 'Team_2', 'Year', 'Country', 'FT_Team_1', 'FT_Team_2', 'H1_Team_1', 'H1_Team_2', 'GGD', 'Team_1_(pts)', 'Team_2_(pts)', 'H2_Team_1', 'H2_Team_2', 'FT_GD', 'H1_GD', 'H2_GD', 'Outcome')

# Date values look like "(Sat) 19 Aug 1995 (W33)"; splitting on spaces puts the
# month abbreviation at index 2
df = df.withColumn('Month', split(df['Date'], ' ').getItem(2))

# Label-encode the month string as a numeric index.
# StringIndexer orders labels by frequency by default, so Game_Month is a
# category id, not the calendar month number.
monthEncoder = StringIndexer(inputCol='Month', outputCol='Game_Month').fit(df)
df = monthEncoder.transform(df)

# Columns fed to the model. 'Outcome' is deliberately left out: the goal of this
# notebook is to predict the match result, and putting the label into the
# feature vector would hand the answer to the model. It remains a column of df.
# NOTE(review): FT_Team_1, FT_Team_2, H2_Team_1, H2_Team_2, FT_GD and H2_GD are
# only known after the final whistle and appear to determine the result -
# confirm they are really meant to be predictors.
required_features = ['Round', 'Team_1', 'Team_2', 'Year', 'Country', 'FT_Team_1', 'FT_Team_2', 'H1_Team_1', 'H1_Team_2', 'GGD', 'Team_1_(pts)', 'Team_2_(pts)', 'H2_Team_1', 'H2_Team_2', 'FT_GD', 'H1_GD', 'H2_GD', 'Game_Month']

In [32]:
# Pack every column named in required_features into one vector column, 'features'
vec_assembler = VectorAssembler(
    inputCols=required_features,
    outputCol='features',
)
# vec_df is df plus the assembled 'features' column used for modelling
vec_df = vec_assembler.transform(df)

### All caught up with the pre-processing

In [33]:
from pyspark.ml.feature import StandardScaler

# Scale the assembled vectors to unit standard deviation without centering
# (withMean=False and withStd=True are StandardScaler's defaults, spelled out here)
scale = StandardScaler(
    inputCol='features',
    outputCol='standardized',
    withMean=False,
    withStd=True,
)
# Fit on the assembled data, then append the scaled vectors as 'standardized'
data_scale = scale.fit(vec_df)
data_scale_output = data_scale.transform(vec_df)

In [35]:
# Collect the scaled DataFrame into pandas and flip it so each column name becomes a row
data_scale_output.toPandas().T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44259,44260,44261,44262,44263,44264,44265,44266,44267,44268
Round,1,1,1,1,1,1,1,1,1,1,...,25,25,25,25,26,26,26,26,26,26
Date,(Sat) 19 Aug 1995 (W33),(Sat) 19 Aug 1995 (W33),(Sat) 19 Aug 1995 (W33),(Sat) 19 Aug 1995 (W33),(Sat) 19 Aug 1995 (W33),(Sat) 19 Aug 1995 (W33),(Sat) 19 Aug 1995 (W33),(Sat) 19 Aug 1995 (W33),(Sat) 19 Aug 1995 (W33),(Sun) 20 Aug 1995 (W33),...,(Sat) 29 Feb 2020 (W9),(Sat) 29 Feb 2020 (W9),(Sun) 1 Mar 2020 (W9),(Sun) 1 Mar 2020 (W9),(Sun) 8 Mar 2020 (W10),(Sun) 8 Mar 2020 (W10),(Sun) 8 Mar 2020 (W10),(Sun) 8 Mar 2020 (W10),(Sun) 8 Mar 2020 (W10),(Mon) 9 Mar 2020 (W11)
Team_1,28,38,63,127,129,135,185,220,222,26,...,176,178,206,56,145,8,200,208,116,207
Team_2,130,150,80,184,198,65,137,123,41,131,...,40,197,29,19,175,99,110,11,86,45
Year,1995,1995,1995,1995,1995,1995,1995,1995,1995,1995,...,2019,2019,2019,2019,2019,2019,2019,2019,2019,2019
Country,0,0,0,0,0,0,0,0,0,0,...,4,4,4,4,4,4,4,4,4,4
FT_Team_1,3,1,0,1,1,3,3,1,3,1,...,2,2,2,3,0,1,2,0,2,3
FT_Team_2,1,0,0,0,1,0,4,2,2,1,...,0,1,7,4,1,2,1,0,0,0
H1_Team_1,3,1,0,0,0,1,1,1,2,1,...,2,1,2,1,0,0,0,0,0,1
H1_Team_2,0,0,0,0,1,0,3,0,2,1,...,0,0,2,2,0,2,1,0,0,0
