In [2]:
import pandas as pd
import numpy as np
import pickle

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
# Read the match results from results.csv (path is relative to the working directory).
df = pd.read_csv("results.csv")
# Bare expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False
...,...,...,...,...,...,...,...,...,...
47772,2024-10-15,Palestine,Kuwait,2,2,FIFA World Cup qualification,Al Rayyan,Qatar,True
47773,2024-10-15,South Korea,Iraq,3,2,FIFA World Cup qualification,Yongin,South Korea,False
47774,2024-10-15,Japan,Australia,1,1,FIFA World Cup qualification,Saitama,Japan,False
47775,2024-10-15,China PR,Indonesia,2,1,FIFA World Cup qualification,Qingdao,China PR,False


In [4]:
# df.shape is a (rows, columns) tuple; unpack it once and report both values.
row_count, column_count = df.shape
print(f"Number of rows: {row_count}\nNumber of columns: {column_count}")

Number of rows: 47777
Number of columns: 9


In [5]:
# Number of non-null values in each column.
df.count()

date          47777
home_team     47777
away_team     47777
home_score    47777
away_score    47777
tournament    47777
city          47777
country       47777
neutral       47777
dtype: int64

In [6]:
# Number of missing values in each column.
df.isna().sum()

date          0
home_team     0
away_team     0
home_score    0
away_score    0
tournament    0
city          0
country       0
neutral       0
dtype: int64

In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,home_score,away_score
count,47777.0,47777.0
mean,1.757896,1.181636
std,1.772524,1.399827
min,0.0,0.0
25%,1.0,0.0
50%,1.0,1.0
75%,2.0,2.0
max,31.0,21.0


In [8]:
# Column dtypes, non-null counts and memory usage of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47777 entries, 0 to 47776
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   date        47777 non-null  object
 1   home_team   47777 non-null  object
 2   away_team   47777 non-null  object
 3   home_score  47777 non-null  int64 
 4   away_score  47777 non-null  int64 
 5   tournament  47777 non-null  object
 6   city        47777 non-null  object
 7   country     47777 non-null  object
 8   neutral     47777 non-null  bool  
dtypes: bool(1), int64(2), object(6)
memory usage: 3.0+ MB


In [9]:
# List the column names, one per line, under a header.
print("-- Attributes in Data --")
for column_name in df:  # iterating a DataFrame yields its column labels
    print(column_name)

-- Attributes in Data --
date
home_team
away_team
home_score
away_score
tournament
city
country
neutral


In [10]:
# Print the per-column non-null counts under a header.
print("-- Number of instances in Data --")
print(df.count())

-- Number of instances in Data --
date          47777
home_team     47777
away_team     47777
home_score    47777
away_score    47777
tournament    47777
city          47777
country       47777
neutral       47777
dtype: int64


In [11]:
# Number of distinct values in each column.
df.nunique()

date          16221
home_team       327
away_team       321
home_score       26
away_score       22
tournament      176
city           2077
country         270
neutral           2
dtype: int64

In [12]:
# Number of distinct values in the date column.
df['date'].nunique()

16221

In [13]:
# Number of distinct values in the city column.
df['city'].nunique()

2077

In [14]:
# Remove the date and city columns in a single call.
df = df.drop(columns=['date', 'city'])

In [15]:
# Re-check the schema after dropping date and city.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47777 entries, 0 to 47776
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   home_team   47777 non-null  object
 1   away_team   47777 non-null  object
 2   home_score  47777 non-null  int64 
 3   away_score  47777 non-null  int64 
 4   tournament  47777 non-null  object
 5   country     47777 non-null  object
 6   neutral     47777 non-null  bool  
dtypes: bool(1), int64(2), object(4)
memory usage: 2.2+ MB


In [16]:
# Remove the neutral column.
df = df.drop(columns='neutral')

In [17]:
# Re-check the schema after dropping neutral.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47777 entries, 0 to 47776
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   home_team   47777 non-null  object
 1   away_team   47777 non-null  object
 2   home_score  47777 non-null  int64 
 3   away_score  47777 non-null  int64 
 4   tournament  47777 non-null  object
 5   country     47777 non-null  object
dtypes: int64(2), object(4)
memory usage: 2.2+ MB


In [18]:
def dataEncoder(cols):
    """Label-encode the given columns of the global ``df`` in place.

    Every column gets its own ``LabelEncoder`` fitted on that column alone,
    so the same string may map to different integers in different columns
    (for example a team name in ``home_team`` versus ``away_team``).

    Parameters
    ----------
    cols : iterable of str
        Names of the columns of ``df`` to encode.

    Returns
    -------
    dict
        Maps each column name to its fitted ``LabelEncoder`` so the integer
        codes can be turned back into the original labels with
        ``inverse_transform``.
    """
    encoders = {}
    for col in cols:
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        # Keep the fitted encoder; without it the codes cannot be decoded.
        encoders[col] = encoder
    return encoders

columns = ['home_team','away_team','tournament','country']
# Hold on to the encoders so encoded values (and model predictions, which are
# home_team codes) can be mapped back to names later.
label_encoders = dataEncoder(columns)

In [19]:
# Re-check the schema after label encoding the string columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47777 entries, 0 to 47776
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   home_team   47777 non-null  int32
 1   away_team   47777 non-null  int32
 2   home_score  47777 non-null  int64
 3   away_score  47777 non-null  int64
 4   tournament  47777 non-null  int32
 5   country     47777 non-null  int32
dtypes: int32(4), int64(2)
memory usage: 1.5 MB


In [20]:
# Persist the encoded frame with a header row (the to_csv default) and without the index.
df.to_csv('encoded-data.csv', index=False)

In [21]:
# Hold out 20% of the rows for testing. shuffle=False keeps the file order, so
# the test split is the last rows of the frame rather than a random sample.
trainData, testData = train_test_split(df, test_size=0.2, shuffle=False)

In [22]:
# (rows, columns) of the training split.
trainData.shape

(38221, 6)

In [23]:
# (rows, columns) of the test split.
testData.shape

(9556, 6)

In [24]:
# Column 0 (home_team) is the prediction target; all remaining columns are the
# features. `iloc` is an indexer and is subscripted directly, not called.
train_x = trainData.iloc[:, 1:]
test_x  = testData.iloc[:, 1:]

train_y = trainData.iloc[:, 0]
test_y  = testData.iloc[:, 0]

In [25]:
# First two rows of the training features.
train_x.head(2)

Unnamed: 0,away_team,home_score,away_score,tournament,country
0,89,0,0,85,206
1,244,4,2,85,70


In [26]:
# First two values of the training target.
train_y.head(2)

0    250
1     88
Name: home_team, dtype: int32

In [27]:
# First two rows of the test features.
test_x.head(2)

Unnamed: 0,away_team,home_score,away_score,tournament,country
38221,47,3,0,85,185
38222,14,1,0,162,187


In [28]:
# First two values of the test target.
test_y.head(2)

38221    220
38222    222
Name: home_team, dtype: int32

In [29]:
# Train a support-vector classifier with default hyper-parameters;
# fit() returns the estimator itself, so construction and training chain.
model_svc = SVC().fit(train_x, train_y)

print(model_svc)

SVC()


In [30]:
# Serialize the trained model. The context manager guarantees the file is
# flushed and closed even if pickling raises.
with open('model_svc.pkl', 'wb') as model_file:
    pickle.dump(model_svc, model_file)

In [31]:
# Reload the model from disk, closing the file handle when done.
# NOTE: unpickling can execute arbitrary code, so only load a model_svc.pkl
# that this notebook (or another trusted source) produced.
with open('model_svc.pkl', 'rb') as model_file:
    model_svc = pickle.load(model_file)

In [32]:
# Predict the encoded home_team label for every row of the test features.
model_predictions = model_svc.predict(test_x)

In [33]:
# Fraction of test rows whose predicted label equals the true label.
model_accuracy_score = accuracy_score(test_y, model_predictions)

# Report the score rounded to three decimals on a single line.
print(f"-- Model Accuracy Score: {round(model_accuracy_score, 3)}")

-- Model Accuracy Score: 0.081


In [34]:
# Turn off pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None

# Independent copy of the test split with the predictions as an extra column.
testdata_predict = testData.copy(deep=True).assign(Prediction=model_predictions)

In [36]:
# Recompute accuracy from the combined frame: true home_team vs. Prediction.
model_accuracy_score = accuracy_score(
    y_true=testdata_predict['home_team'],
    y_pred=testdata_predict['Prediction'],
)

print(f"-- Model Accuracy Score: {round(model_accuracy_score, 3)}")

-- Model Accuracy Score: 0.081
