In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
df = pd.read_csv('../Data/train.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7443 entries, 0 to 7442
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   season       7443 non-null   int64  
 1   date         7443 non-null   object 
 2   league_id    7443 non-null   int64  
 3   league       7443 non-null   object 
 4   Team 1       7443 non-null   object 
 5   Team2        7443 non-null   object 
 6   SPI1         7443 non-null   float64
 7   SPI2         7443 non-null   float64
 8   proj_score1  7443 non-null   float64
 9   proj_score2  7443 non-null   float64
 10  importance1  2527 non-null   float64
 11  importance2  2527 non-null   float64
 12  score1       2643 non-null   float64
 13  score2       2643 non-null   float64
 14  xg1          1715 non-null   float64
 15  xg2          1715 non-null   float64
 16  nsxg1        1715 non-null   float64
 17  nsxg2        1715 non-null   float64
 18  adj_score1   1715 non-null   float64
 19  adj_sc

In [4]:
cat_cols = list(df.select_dtypes('object').columns)
cat_cols = cat_cols + ['league_id', 'season']
cat_cols.remove('date')
cat_cols

['league', 'Team 1', 'Team2', 'league_id', 'season']

In [5]:
for x in cat_cols:
    print(f"""
    {df[x].value_counts(ascending=False)}
    
    """)


    United Soccer League                        496
Major League Soccer                         459
Japanese J League                           380
Brasileiro Série A                          380
APD                                         325
English League Championship                 264
English League One                          264
English League Two                          252
Swedish Allsvenskan                         240
Chinese Super League                        240
Norwegian Tippeligaen                       240
Spanish Segunda Division                    220
UEFA Europa League                          204
French Ligue 2                              180
French Ligue 1                              180
Mexican Primera Division Torneo Clausura    171
Spanish Primera Division                    170
Italy Serie B                               170
Italy Serie A                               170
BJL                                         162
AAL                                

In [6]:
df['date'] = pd.to_datetime(df['date'])

In [7]:
def ConvertUnix(x):
    y = pd.Timestamp(x).timestamp()
    y = int(y)
    return y

df['date'] = df['date'].apply(ConvertUnix)

In [8]:
num_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
num_cols.remove('league_id')
num_cols.remove('season')
num_cols.remove('Outcome')

root_cols = num_cols.copy()
root_cols.remove('date')


In [9]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
df[num_cols] = imputer.fit_transform(df[num_cols])
df

Unnamed: 0,season,date,league_id,league,Team 1,Team2,SPI1,SPI2,proj_score1,proj_score2,...,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2,Outcome
0,2019,1.546474e+09,1979,Chinese Super League,Shandong Luneng,Guizhou Renhe,48.22,37.83,1.75,0.84,...,22.100000,1.000000,0.000000,1.390000,0.260000,2.050000,0.5400,1.05000,0.00,1
1,2019,1.546474e+09,1979,Chinese Super League,Shanghai Greenland,Shanghai SIPG,39.81,60.08,1.22,1.89,...,63.400000,0.000000,4.000000,0.570000,2.760000,0.800000,1.5000,0.00000,3.26,0
2,2019,1.546474e+09,1979,Chinese Super League,Guangzhou Evergrande,Tianjin Quanujian,65.59,39.99,2.58,0.62,...,28.800000,3.000000,0.000000,0.490000,0.450000,1.050000,0.7500,3.15000,0.00,1
3,2019,1.546474e+09,1979,Chinese Super League,Wuhan Zall,Beijing Guoan,32.25,54.82,1.10,1.79,...,58.900000,0.000000,1.000000,1.120000,0.970000,1.510000,0.9400,0.00000,1.05,0
4,2019,1.546474e+09,1979,Chinese Super League,Chongqing Lifan,Guangzhou RF,38.24,40.45,1.57,1.24,...,21.300000,2.000000,2.000000,2.770000,3.170000,1.050000,2.0800,2.10000,2.10,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7438,2021,1.639267e+09,1846,German 2. Bundesliga,Karlsruher SC,1. FC Heidenheim 1846,34.05,33.76,1.39,1.19,...,31.957776,1.482785,1.196746,1.462035,1.169376,1.360292,1.1087,1.51014,1.23,1
7439,2021,1.639267e+09,1846,German 2. Bundesliga,SC Paderborn,SV Darmstadt 98,36.73,36.11,1.67,1.44,...,31.957776,1.482785,1.196746,1.462035,1.169376,1.360292,1.1087,1.51014,1.23,1
7440,2021,1.639440e+09,1983,South African ABSA Premier League,Moroka Swallows,Orlando Pirates,12.97,23.59,1.05,1.50,...,31.957776,1.482785,1.196746,1.462035,1.169376,1.360292,1.1087,1.51014,1.23,0
7441,2021,1.639440e+09,1983,South African ABSA Premier League,Tshakhuma Tsha Madzivhandila,Black Aces,11.76,18.07,1.22,1.46,...,31.957776,1.482785,1.196746,1.462035,1.169376,1.360292,1.1087,1.51014,1.23,0


In [10]:
df[root_cols] = np.sqrt(df[root_cols])

In [None]:
df.hist(figsize=(20,20), bins=50)

# Preprocessing

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

X = df.drop(columns='Outcome')
y = df[['Outcome']]

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('std_scaler', StandardScaler())
])

full_pipeline = ColumnTransformer([
    ('roots', num_pipeline, root_cols),
    ('nums', num_pipeline, num_cols),
    ('cat', OneHotEncoder(), cat_cols)
])

In [12]:
cat_cols

['league', 'Team 1', 'Team2', 'league_id', 'season']

In [13]:
X_trans = full_pipeline.fit_transform(X)

In [14]:
y = np.array(y)

# Train Test Split

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_trans, y, test_size=0.2, random_state=42, shuffle=True)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

rf = RandomForestClassifier()

base_model = rf.fit(X_train, y_train)

In [None]:
predictions = base_model.predict_proba(X_test)

cross_val_score(base_model, X_test, y_test, cv=5, scoring='neg_log_loss').mean()

# Neural Network

In [16]:
from tensorflow import keras
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from datetime import datetime

today = datetime.today()
today = str(today.date()) + ' ' + str(today.hour) + '-' + str(today.minute)

In [17]:
today

'2021-08-21 14-59'

In [22]:
mc = ModelCheckpoint(f'../Models/soccer_model {today}.h5', 
                     monitor='val_loss', mode='min', verbose=1, save_best_only=True)

In [23]:
model = keras.Sequential([
    keras.layers.Dense(1424, activation='relu'),
    keras.layers.Dense(500, activation='relu'),
    keras.layers.Dense(600, activation='relu'),
    keras.layers.Dense(500, activation='relu'),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

In [24]:
model.compile(optimizer='adam',
             loss=keras.losses.BinaryCrossentropy(from_logits=True),
             metrics=[keras.metrics.BinaryCrossentropy(from_logits=True), 'accuracy'])

In [25]:
history = model.fit(
    X_train,
    y_train,
    batch_size=500,
    epochs=150,
    validation_data=(X_test, y_test),
    callbacks=mc)

Epoch 1/150





Epoch 00001: val_loss improved from inf to 0.10074, saving model to ../Models\soccer_model 2021-08-21 14-59.h5
Epoch 2/150

Epoch 00002: val_loss improved from 0.10074 to 0.07243, saving model to ../Models\soccer_model 2021-08-21 14-59.h5
Epoch 3/150

Epoch 00003: val_loss did not improve from 0.07243
Epoch 4/150

Epoch 00004: val_loss did not improve from 0.07243
Epoch 5/150

Epoch 00005: val_loss did not improve from 0.07243
Epoch 6/150

Epoch 00006: val_loss did not improve from 0.07243
Epoch 7/150

Epoch 00007: val_loss did not improve from 0.07243
Epoch 8/150

Epoch 00008: val_loss did not improve from 0.07243
Epoch 9/150

Epoch 00009: val_loss did not improve from 0.07243
Epoch 10/150

Epoch 00010: val_loss did not improve from 0.07243
Epoch 11/150

Epoch 00011: val_loss did not improve from 0.07243
Epoch 12/150

Epoch 00012: val_loss did not improve from 0.07243
Epoch 13/150

KeyboardInterrupt: 

In [26]:
model = load_model(f'../Models/soccer_model {today}.h5')

# Run on Test Data

In [27]:
test = pd.read_csv('../Data/test.csv')

In [28]:
test

Unnamed: 0,season,date,league_id,league,Team 1,Team2,SPI1,SPI2,proj_score1,proj_score2,importance1,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2
0,2021,14/12/21,2411,BPL,Arsenal,West Ham United,79.65,74.06,1.67,1.19,,,,,,,,,,
1,2021,14/12/21,2411,BPL,Brighton and Hove Albion,Wolverhampton,74.19,71.14,1.35,0.98,,,,,,,,,,
2,2021,14/12/21,2411,BPL,Norwich City,Aston Villa,60.67,71.45,1.18,1.49,,,,,,,,,,
3,2021,14/12/21,2411,BPL,Leicester City,Tottenham Hotspur,76.88,79.06,1.52,1.44,,,,,,,,,,
4,2021,14/12/21,2411,BPL,Brentford,Manchester United,63.53,85.58,0.95,1.92,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4003,2021,29/05/22,1871,Spanish Segunda Division,Leganes,Almeria,41.51,48.12,1.40,1.39,,,,,,,,,,
4004,2021,29/05/22,1871,Spanish Segunda Division,AD Alcorcon,Eibar,26.77,33.53,1.17,1.18,,,,,,,,,,
4005,2021,29/05/22,1871,Spanish Segunda Division,Mirandes,Fuenlabrada,26.62,28.65,1.31,1.14,,,,,,,,,,
4006,2021,29/05/22,1871,Spanish Segunda Division,Sporting Gijón,Las Palmas,33.74,30.32,1.33,0.87,,,,,,,,,,


In [29]:
test['date'] = pd.to_datetime(test['date'])

In [30]:
def ConvertUnix(x):
    y = pd.Timestamp(x).timestamp()
    y = int(y)
    return y

test['date'] = test['date'].apply(ConvertUnix)

In [31]:
test[num_cols] = imputer.transform(test[num_cols])

In [32]:
test[root_cols] = np.sqrt(test[root_cols])

In [33]:
test = full_pipeline.transform(test)

In [34]:
predictions = model.predict(test)

In [35]:
submission = pd.DataFrame(predictions, columns=['Outcome'])

In [36]:
submission.to_csv(f"../Submissions/{today} Submission.csv", index=False)