In [1]:
import pandas as pd

In [2]:
table = pd.read_csv("matches.csv", index_col=0)

In [3]:
table.head() # gives you the glance of the first 5 rows

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham,...,Match Report,,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,...,Match Report,,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,...,Match Report,,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,...,Match Report,,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,...,Match Report,,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City


In [4]:
table.shape
#(rows, col)

(1389, 27)

In [63]:
# to find the count of missing values
# table.isnull().sum()
table.isna().sum()

date               0
time               0
round              0
day                0
venue              0
result             0
gf                 0
ga                 0
opponent           0
xg                 0
xga                0
poss               0
attendance       696
captain            0
formation          0
referee            0
match report       0
notes           1389
sh                 0
sot                0
dist               1
fk                 0
pk                 0
pkatt              0
season             0
team               0
target             0
opp_code           0
day_code           0
venue_code         0
hour               0
dtype: int64

In [7]:
# to show you the datatypes of each column
table.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf              float64
ga              float64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
fk              float64
pk              float64
pkatt           float64
season            int64
team             object
dtype: object

In [8]:
# to print the column names
print(table.columns)

Index(['date', 'time', 'comp', 'round', 'day', 'venue', 'result', 'gf', 'ga',
       'opponent', 'xg', 'xga', 'poss', 'attendance', 'captain', 'formation',
       'referee', 'match report', 'notes', 'sh', 'sot', 'dist', 'fk', 'pk',
       'pkatt', 'season', 'team'],
      dtype='object')


### Dealing with Categorical Data

In [9]:
table["target"] = (table["result"] == "W").astype("int") # result as target with (W-1), (D,L- 0)

In [10]:
table["date"] = pd.to_datetime(table["date"]) # Converting date column
table["opp_code"] = table["opponent"].astype("category").cat.codes #opponent names to opp_code (each no. to different teams)
table["day_code"] = table["date"].dt.dayofweek #date as day_code
table["venue_code"] = table["venue"].astype("category").cat.codes #venue as venue_code

In [11]:
table.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,notes,sh,sot,dist,fk,pk,pkatt,season,team,target
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham,...,,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City,0
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,...,,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City,1
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,...,,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City,1
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,...,,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City,1
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,...,,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City,0


In [15]:
#converting time to hour
table["hour"] = table["time"].str.replace(":.+", "", regex=True).astype("int")

In [16]:
table.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,target,opp_code,day_code,venue_code,hour
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham,...,1.0,0.0,0.0,2022,Manchester City,0,18,6,0,16
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,...,1.0,0.0,0.0,2022,Manchester City,1,15,5,1,15
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,...,0.0,0.0,0.0,2022,Manchester City,1,0,5,1,12
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,...,0.0,0.0,0.0,2022,Manchester City,1,10,5,0,15
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,...,1.0,0.0,0.0,2022,Manchester City,0,17,5,1,15


In [17]:
del table["comp"]

## Training and Testing


In [18]:
# IMP columns

# some comes from hit and trial
# some from common sence
# some form thinging like your model would
predictors = ["venue_code", "opp_code", "hour", "day_code", "xg", "xga", "poss"]

### RandomForest Classifier

In [19]:
# how to train <model name> - search or do AI
from sklearn.ensemble import RandomForestClassifier

In [56]:
rf = RandomForestClassifier(n_estimators=100, min_samples_split=20, random_state=1)

In [57]:
#split the data
train = table[table["date"] < '2022-01-01']
test = table[table["date"] > '2022-01-01']

In [58]:
rf.fit(train[predictors], train["target"])
# <model>. fit(features, label)

In [59]:
preds = rf.predict(test[predictors])
# <var> = <model>. predict(test)

In [60]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(test["target"], preds)
# <var> = accuracy_score(dataset target colum, machine predicted value)
accuracy

0.75

### Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(train[predictors], train["target"])

In [26]:
accuracy = accuracy_score(test["target"], preds)
accuracy

0.7463768115942029

### Support Vector Model (SVC)

In [27]:
from sklearn.svm import SVC
clf = SVC()
clf.fit(train[predictors], train["target"])

In [28]:
accuracy = accuracy_score(test["target"], preds)
accuracy

0.7463768115942029

### XGBoost

In [29]:
import xgboost as xgb
clf = xgb.XGBClassifier(n_estimators=100)
clf.fit(train[predictors], train["target"])

In [30]:
accuracy = accuracy_score(test["target"], preds)
accuracy

0.7463768115942029