# Model creation - SVM

In [1]:
# imports
from numpy import mean
from numpy import std
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC

In [2]:
# read processed csv file
train = pd.read_csv("train_p.csv")
print(train.head())

train_ind = train.copy()
train_ind.drop(labels=["outcome", "id", "season", "home_team_goal", "away_team_goal", "date", "Unnamed: 0", "home_team_api_id", "away_team_api_id"], axis=1,inplace=True)
print(train_ind.head())

   Unnamed: 0    id     season  stage                 date  home_team_api_id  \
0           0  1729  2008/2009      1  2008-08-17 00:00:00                 1   
1           1  1730  2008/2009      1  2008-08-16 00:00:00                 2   
2           2  1731  2008/2009      1  2008-08-16 00:00:00                 3   
3           3  1732  2008/2009      1  2008-08-16 00:00:00                 4   
4           4  1733  2008/2009      1  2008-08-17 00:00:00                 5   

   away_team_api_id  home_team_goal  away_team_goal  B365H  ...  PSH   PSD  \
0                13               1               1   1.29  ...  2.2  3.67   
1                16               1               0   1.20  ...  2.2  3.67   
2                12               0               1   5.50  ...  2.2  3.67   
3                20               2               1   1.91  ...  2.2  3.67   
4                18               4               2   1.91  ...  2.2  3.67   

    PSA   WHH  WHD    WHA   VCH  VCD    VCA  outco

In [3]:
# normalizing data
def normalize(df):
    dfMod = df.copy()
    for column in dfMod.columns:
        dfMod[column] =(dfMod[column]-dfMod[column].min())/(dfMod[column].max()-dfMod[column].min())
        #dfMod = dfMod.rename(columns = {column:f"{column}Normalized"})
    return dfMod

dfMod = normalize(train_ind)


In [4]:
from sklearn.model_selection import train_test_split

X = dfMod
y = train["outcome"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [69]:
# SVM model

# fit final model
SVM_model = SVC(kernel='poly', degree=0)
SVM_model.fit(X_train, y_train)


SVC(degree=0, kernel='poly')

In [70]:
y_pred = SVM_model.predict(X_test)
print('Accuracy - classifier on test set: {:.4f}'.format(SVM_model.score(X_test, y_test)))

Accuracy - classifier on test set: 0.4375


In [71]:
# model evaluation - cross validation

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(SVM_model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')

# report performance
print(classification_report(y_test,y_pred))
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       166
           0       0.00      0.00      0.00       149
           1       0.44      1.00      0.61       245

    accuracy                           0.44       560
   macro avg       0.15      0.33      0.20       560
weighted avg       0.19      0.44      0.27       560

Accuracy: 0.459 (0.002)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
# confusion matrix

confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

TypeError: 'numpy.ndarray' object is not callable

# TEST DATASET


In [72]:
test_set = pd.read_csv("test_p.csv")
test_set.columns

Index(['Unnamed: 0', 'id', 'season', 'stage', 'date', 'home_team_api_id',
       'away_team_api_id', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA',
       'IWH', 'IWD', 'IWA', 'LBH', 'LBD', 'LBA', 'PSH', 'PSD', 'PSA', 'WHH',
       'WHD', 'WHA', 'SJH', 'SJD', 'SJA', 'VCH', 'VCD', 'VCA', 'GBH', 'GBD',
       'GBA', 'BSH', 'BSD', 'BSA', 'outcome'],
      dtype='object')

In [73]:
test_ind =test_set.copy()
test_ind.drop(labels=["id", "Unnamed: 0","outcome", "season", "date", "home_team_api_id", "away_team_api_id", "GBH", "GBD", "GBA", "BSH", "BSD", "BSD", "BSA", "SJA", "SJD", "SJH"], axis=1,inplace=True)
print(test_ind.head())

   stage  B365H  B365D  B365A   BWH   BWD    BWA   IWH  IWD   IWA  ...    LBA  \
0     15   1.29   6.00  13.00  1.26  6.25  10.50  1.22  5.5  12.0  ...  12.00   
1     15   1.36   5.25   9.50  1.38  5.00   8.00  1.35  4.8   7.6  ...   9.00   
2     15   2.10   3.60   3.75  2.10  3.30   3.40  2.00  3.3   3.6  ...   3.60   
3     15   1.44   4.50   9.00  1.44  4.33   8.00  1.40  4.4   7.3  ...   8.50   
4     15   5.75   4.33   1.62  5.50  4.00   1.62  5.40  3.7   1.6  ...   1.57   

    PSH   PSD    PSA   WHH   WHD    WHA   VCH   VCD    VCA  
0  1.28  6.14  13.53  1.29  5.00  12.00  1.29  5.75  13.00  
1  1.38  5.23   9.75  1.36  4.50   9.00  1.36  5.25  10.00  
2  2.06  3.63   3.88  2.00  3.30   3.75  2.05  3.60   3.80  
3  1.43  4.70   9.06  1.44  3.80   9.00  1.44  4.50   9.00  
4  5.75  4.35   1.62  5.50  3.75   1.62  6.00  4.20   1.62  

[5 rows x 22 columns]


In [74]:
#check which ones remain with NaN values
print("Columns in test dataset including null values:")
list_names_test = []
for i in test_ind.columns:
    if test_ind[i].isnull().values.any() == True:
        print(i)
        list_names_test.append(i)
    else:
        continue

Columns in test dataset including null values:
BWH
BWD
BWA


In [75]:
# replacing with a median

test_ind= test_ind.fillna(test_ind.median())
test_ind.head()

#dropping these variables
test_ind.drop(labels=['BWH', 'BWD', 'BWA'], axis=1, inplace=True)

test_ind.columns

Index(['stage', 'B365H', 'B365D', 'B365A', 'IWH', 'IWD', 'IWA', 'LBH', 'LBD',
       'LBA', 'PSH', 'PSD', 'PSA', 'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA'],
      dtype='object')

In [76]:
model_test3 = SVM_model.predict(test_ind)
model_test3

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
      dtype=int64)

# Submission file

In [77]:
upload_df = pd.DataFrame(test_set["id"])

upload_df["outcome"] = model_test3

upload_df = upload_df.set_index("id")

upload_df

Unnamed: 0_level_0,outcome
id,Unnamed: 1_level_1
4449,1
4450,1
4451,1
4452,1
4453,1
...,...
4704,1
4705,1
4706,1
4707,1


In [78]:
upload_df.to_csv("SVM_outcome_3_pol_0.csv")