##### Pre-processing data

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Concatenate the dataframes
df1 = pd.read_csv("serie_a_results_2022_2023.csv")
df2 = pd.read_csv("serie_a_results_2023_2024.csv")
df3 = pd.read_csv("serie_a_results_2024_2025.csv")
combined_df = pd.concat([df1, df2, df3], ignore_index=True)
# Save to a new CSV file
combined_df.to_csv('combined_3seasons.csv', index=False)

In [4]:
import pandas as pd
matches = pd.read_csv("combined_3seasons.csv", index_col = False)
matches.head()

Unnamed: 0,date,time,home_team,away_team,score,match_url,Expected_goals_(xG)_home,Expected_goals_(xG)_away,Ball_possession_home,Ball_possession_away,...,Errors_leading_to_shot_home,Errors_leading_to_shot_away,Errors_leading_to_goal_home,Errors_leading_to_goal_away,xGOT_faced_home,xGOT_faced_away,Goals_prevented_home,Goals_prevented_away,Headed_goals_home,Headed_goals_away
0,30.04.2023,21:00,Cremonese,Verona,1:1,https://www.flashscore.co.uk/match/football/AP...,0.49,1.21,51%,49%,...,,,,,,,,,,
1,12.06.2023,02:45,Spezia,Verona Winner,1:3,https://www.flashscore.co.uk/match/football/Q3...,2.37,0.75,69%,31%,...,,,,,,,,,,
2,08.01.2023,19:30,Salernitana,Torino,1:1,https://www.flashscore.co.uk/match/football/pl...,,,39%,61%,...,,,,,,,,,,
3,28.01.2023,22:00,Empoli,Torino,2:2,https://www.flashscore.co.uk/match/football/tE...,,,47%,53%,...,,,,,,,,,,
4,04.01.2023,23:30,AS Roma,Bologna,1:0,https://www.flashscore.co.uk/match/football/bZ...,,,39%,61%,...,,,,,,,,,,


In [5]:
matches.shape



(1141, 76)

In [6]:
38 * 20 * 3 # 2280 matches in total
# My dataset considers each row as a single match i.e. we don't have 2 separate rows for each match from home and away teams' perspectives
2280/2
# Where we get 1 extra match from???

1140.0

In [7]:
# matches['home_team'].value_counts()
# matches['away_team'].value_counts()

### Cleaning the data

In [8]:
matches.head()

Unnamed: 0,date,time,home_team,away_team,score,match_url,Expected_goals_(xG)_home,Expected_goals_(xG)_away,Ball_possession_home,Ball_possession_away,...,Errors_leading_to_shot_home,Errors_leading_to_shot_away,Errors_leading_to_goal_home,Errors_leading_to_goal_away,xGOT_faced_home,xGOT_faced_away,Goals_prevented_home,Goals_prevented_away,Headed_goals_home,Headed_goals_away
0,30.04.2023,21:00,Cremonese,Verona,1:1,https://www.flashscore.co.uk/match/football/AP...,0.49,1.21,51%,49%,...,,,,,,,,,,
1,12.06.2023,02:45,Spezia,Verona Winner,1:3,https://www.flashscore.co.uk/match/football/Q3...,2.37,0.75,69%,31%,...,,,,,,,,,,
2,08.01.2023,19:30,Salernitana,Torino,1:1,https://www.flashscore.co.uk/match/football/pl...,,,39%,61%,...,,,,,,,,,,
3,28.01.2023,22:00,Empoli,Torino,2:2,https://www.flashscore.co.uk/match/football/tE...,,,47%,53%,...,,,,,,,,,,
4,04.01.2023,23:30,AS Roma,Bologna,1:0,https://www.flashscore.co.uk/match/football/bZ...,,,39%,61%,...,,,,,,,,,,


In [9]:
# Since home_team and away_team columns have the same domain of values, 
# we should encode them into categorical value using the same mapping
categories = sorted(matches['home_team'].unique())
matches['date'] = pd.to_datetime(matches['date'], format='%d.%m.%Y')
matches['home_team_code'] = pd.Categorical(matches['home_team'], categories = categories).codes
matches['away_team_code'] = pd.Categorical(matches['away_team'], categories = categories).codes
matches['hour'] = matches['time'].str.replace(":.+", "", regex = True).astype('int')
matches['day_code'] = matches['date'].dt.dayofweek


matches['Passes_home'] = matches['Passes_home'].str.split(',').str[0].str.replace('%','').astype('float')/100
matches['Passes_away'] = matches['Passes_away'].str.split(',').str[0].str.replace('%','').astype('float')/100
matches['Ball_possession_home'] = matches['Ball_possession_home'].str.replace('%','').astype('float')/100

matches.shape



(1141, 80)

In [10]:
matches.dtypes

date                 datetime64[ns]
time                         object
home_team                    object
away_team                    object
score                        object
                          ...      
Headed_goals_away           float64
home_team_code                 int8
away_team_code                 int8
hour                          int64
day_code                      int32
Length: 80, dtype: object

In [11]:
print(len(matches.columns))

80


In [12]:
matches['score'] = matches['score'].astype(str)
matches['result'] = matches.apply(lambda row: 
    1 if int(row['score'].split(':')[0]) > int(row['score'].split(':')[1]) else -1,
    axis = 1
)
'''matches['result'] = matches.apply(lambda row: 
    1 if int(row['score'].split(':')[0]) > int(row['score'].split(':')[1]) else 
    (-1 if int(row['score'].split(':')[0]) < int(row['score'].split(':')[1]) else 0),
    axis = 1
)'''
# 1 if home team wins, -1 if away team wins, 0 if draw
matches.shape

(1141, 81)

### Handling the missing data

In [13]:
# I wanna find what columns contain NaN values
columns_with_nan = matches.columns[matches.isna().any()].tolist()
columns_with_nan

['Expected_goals_(xG)_home',
 'Expected_goals_(xG)_away',
 'Ball_possession_home',
 'Ball_possession_away',
 'Total_shots_home',
 'Total_shots_away',
 'Shots_on_target_home',
 'Shots_on_target_away',
 'Corner_kicks_home',
 'Corner_kicks_away',
 'Passes_home',
 'Passes_away',
 'Red_cards_home',
 'Red_cards_away',
 'Shots_off_target_home',
 'Shots_off_target_away',
 'Blocked_shots_home',
 'Blocked_shots_away',
 'Offsides_home',
 'Offsides_away',
 'Fouls_home',
 'Fouls_away',
 'Goalkeeper_saves_home',
 'Goalkeeper_saves_away',
 'Yellow_cards_home',
 'Yellow_cards_away',
 'Free_kicks_home',
 'Free_kicks_away',
 'Throw_ins_home',
 'Throw_ins_away',
 'Big_chances_home',
 'Big_chances_away',
 'Shots_inside_the_box_home',
 'Shots_inside_the_box_away',
 'Shots_outside_the_box_home',
 'Shots_outside_the_box_away',
 'Hit_the_woodwork_home',
 'Hit_the_woodwork_away',
 'Touches_in_opposition_box_home',
 'Touches_in_opposition_box_away',
 'Passes_in_final_third_home',
 'Passes_in_final_third_away',


In [14]:
nan_counts = matches.isna().sum()
nan_percentage = matches.isna().mean() * 100
nan_summary = pd.DataFrame({'Nan_Count' : nan_counts, 'Nan_Percentage' : nan_percentage})
print('Nan Summary:\n', nan_summary[nan_summary['Nan_Count'] > 0])
nan_summary.dtypes

Nan Summary:
                           Nan_Count  Nan_Percentage
Expected_goals_(xG)_home        216       18.930762
Expected_goals_(xG)_away        216       18.930762
Ball_possession_home              1        0.087642
Ball_possession_away              1        0.087642
Total_shots_home                  1        0.087642
...                             ...             ...
xGOT_faced_away                1051       92.112182
Goals_prevented_home           1051       92.112182
Goals_prevented_away           1051       92.112182
Headed_goals_home              1053       92.287467
Headed_goals_away              1053       92.287467

[70 rows x 2 columns]


Nan_Count           int64
Nan_Percentage    float64
dtype: object

In [15]:
rows_with_nan = matches[matches.isna().any(axis=1)]
print(f'rows with >=1 NaN value: {len(rows_with_nan)} out of {len(matches)}')

rows with >=1 NaN value: 1134 out of 1141


In [16]:
# Let's find columns with NaN percentage < 20% 
threshold = 15
columns_to_keep = nan_summary[nan_summary['Nan_Percentage'] <= threshold].index.tolist()
if 'result' not in columns_to_keep:
    columns_to_keep.append('result')
# columns_to_keep
matches = matches[columns_to_keep]

In [17]:
columns_to_stay = ['date',
              'home_team_code', 'away_team_code', 'hour', 'day_code', 
              'Ball_possession_home', 
              'Total_shots_home', 'Total_shots_away',
              'Goalkeeper_saves_home', 'Goalkeeper_saves_away', 
              'Corner_kicks_home', 'Corner_kicks_away',
              'Passes_home', 'Passes_away',
              'Free_kicks_home', 'Free_kicks_away',
              'result']
predictors = ['date',
              'home_team_code', 'away_team_code', 'hour', 'day_code', 
              'Ball_possession_home', 
              'Total_shots_home', 'Total_shots_away',
              'Goalkeeper_saves_home', 'Goalkeeper_saves_away', 
              'Corner_kicks_home', 'Corner_kicks_away',
              'Passes_home', 'Passes_away',
              'Free_kicks_home', 'Free_kicks_away']
# target = ['result']
matches = matches[columns_to_stay]
matches.dtypes

date                     datetime64[ns]
home_team_code                     int8
away_team_code                     int8
hour                              int64
day_code                          int32
Ball_possession_home            float64
Total_shots_home                float64
Total_shots_away                float64
Goalkeeper_saves_home           float64
Goalkeeper_saves_away           float64
Corner_kicks_home               float64
Corner_kicks_away               float64
Passes_home                     float64
Passes_away                     float64
Free_kicks_home                 float64
Free_kicks_away                 float64
result                            int64
dtype: object

In [18]:
# Handle remaining NaN values for Numeric columns
numeric_cols = matches.select_dtypes(include = ['float64','int64']).columns
matches[numeric_cols] = matches[numeric_cols].fillna(matches[numeric_cols].median())

In [19]:
# Handle remaining NaN values for Categorical columns
categorical_cols = matches.select_dtypes(include = ['int8','int32']).columns
matches[categorical_cols] = matches[categorical_cols].fillna(matches[categorical_cols].mode().iloc[0])

In [20]:
# Most of the rows are still present in the table now
matches.shape

(1141, 17)

### Creating ML Model

In [21]:
from sklearn.ensemble import RandomForestClassifier
# Linear Regression is not suitable for classification problems, so we use Random Forest Classifier 
# bcs Linear Regression Model doesn't understand categorical variables well
rf = RandomForestClassifier(n_estimators = 50, min_samples_split = 10, random_state = 1)

In [22]:
# I will include only characteristics of the home team and the match itself as predictors
# since the away team characteristics are deterministic by the home team characteristics 
# so it would be redundant to include them and increase computation time
# (including both is perfectly fine RandomForests can handle it)
predictors = ['home_team_code', 'away_team_code', 'hour', 'day_code', 
              'Ball_possession_home', 
              'Total_shots_home', 'Total_shots_away',
              'Goalkeeper_saves_home', 'Goalkeeper_saves_away', 
              'Corner_kicks_home', 'Corner_kicks_away',
              'Passes_home', 'Passes_away',
              'Free_kicks_home', 'Free_kicks_away']

In [23]:
train = matches[matches['date']< '2024-10-05']
test = matches[matches['date']>= '2024-10-05']
matches.shape, train.shape, test.shape

((1141, 17), (821, 17), (320, 17))

In [24]:
train.columns

Index(['date', 'home_team_code', 'away_team_code', 'hour', 'day_code',
       'Ball_possession_home', 'Total_shots_home', 'Total_shots_away',
       'Goalkeeper_saves_home', 'Goalkeeper_saves_away', 'Corner_kicks_home',
       'Corner_kicks_away', 'Passes_home', 'Passes_away', 'Free_kicks_home',
       'Free_kicks_away', 'result'],
      dtype='object')

In [25]:
print(train[predictors].dtypes)

home_team_code              int8
away_team_code              int8
hour                       int64
day_code                   int32
Ball_possession_home     float64
Total_shots_home         float64
Total_shots_away         float64
Goalkeeper_saves_home    float64
Goalkeeper_saves_away    float64
Corner_kicks_home        float64
Corner_kicks_away        float64
Passes_home              float64
Passes_away              float64
Free_kicks_home          float64
Free_kicks_away          float64
dtype: object


In [26]:
rf.fit(train[predictors], train['result'])

In [27]:
preds = rf.predict(test[predictors])

In [28]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(test['result'], preds)
acc

0.65

In [29]:
combined = pd.DataFrame(dict(actual = test['result'], prediction = preds))
pd.crosstab(index = combined['actual'], columns = combined['prediction'])


prediction,-1,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,153,38
1,74,55


In [30]:
from sklearn.metrics import precision_score
precision_score(test['result'], preds)


0.5913978494623656

In [31]:
matches.head(1)

Unnamed: 0,date,home_team_code,away_team_code,hour,day_code,Ball_possession_home,Total_shots_home,Total_shots_away,Goalkeeper_saves_home,Goalkeeper_saves_away,Corner_kicks_home,Corner_kicks_away,Passes_home,Passes_away,Free_kicks_home,Free_kicks_away,result
0,2023-04-30,6,25,21,6,0.51,12.0,14.0,5.0,4.0,3.0,5.0,0.74,0.71,14.0,13.0,-1


In [32]:
grouped_matches = matches.groupby('home_team_code')
group = grouped_matches.get_group(0)
group

Unnamed: 0,date,home_team_code,away_team_code,hour,day_code,Ball_possession_home,Total_shots_home,Total_shots_away,Goalkeeper_saves_home,Goalkeeper_saves_away,Corner_kicks_home,Corner_kicks_away,Passes_home,Passes_away,Free_kicks_home,Free_kicks_away,result
34,2022-11-06,0,21,3,6,0.59,23.0,13.0,4.0,9.0,3.0,6.0,0.77,0.65,20.0,17.0,1
57,2023-01-29,0,20,19,6,0.61,10.0,9.0,2.0,3.0,2.0,3.0,0.78,0.74,11.0,17.0,-1
77,2023-06-05,0,25,3,0,0.79,14.0,6.0,0.0,2.0,7.0,1.0,0.88,0.59,21.0,8.0,1
79,2022-10-09,0,12,0,6,0.4,21.0,10.0,3.0,1.0,9.0,3.0,0.72,0.85,11.0,10.0,1
109,2023-01-09,0,1,3,0,0.61,12.0,12.0,1.0,3.0,3.0,4.0,0.86,0.8,14.0,19.0,-1
117,2023-05-06,0,13,21,5,0.52,10.0,8.0,0.0,1.0,6.0,4.0,0.88,0.87,9.0,11.0,1
138,2022-10-23,0,15,0,6,0.49,12.0,16.0,5.0,3.0,2.0,2.0,0.84,0.84,13.0,28.0,1
139,2023-04-08,0,7,3,5,0.7,23.0,2.0,1.0,4.0,9.0,0.0,0.89,0.77,10.0,11.0,-1
159,2022-09-19,0,16,2,0,0.51,22.0,9.0,2.0,4.0,9.0,4.0,0.82,0.83,10.0,18.0,-1
170,2022-11-14,0,8,1,0,0.49,12.0,11.0,4.0,1.0,5.0,7.0,0.79,0.78,13.0,15.0,1


### Improving precision with rolling averages(skipped for now)

### Using the model for predicting live match

In [33]:
# Statistics needed for the prediction: 
# predictors = 'home_team_code', 'away_team_code', 'hour', 'day_code', 
            #   'Ball_possession_home', 
            #   'Total_shots_home', 'Total_shots_away',
            #   'Goalkeeper_saves_home', 'Goalkeeper_saves_away', 
            #   'Corner_kicks_home', 'Corner_kicks_away',
            #   'Passes_home', 'Passes_away',
            #   'Free_kicks_home', 'Free_kicks_away'
predictors = ['home_team_code', 'away_team_code', 'hour', 'day_code', 
              'Ball_possession_home', 
              'Total_shots_home', 'Total_shots_away',
              'Goalkeeper_saves_home', 'Goalkeeper_saves_away', 
              'Corner_kicks_home', 'Corner_kicks_away',
              'Passes_home', 'Passes_away',
              'Free_kicks_home', 'Free_kicks_away'] 
           
# Example:
match_instance = pd.DataFrame({
    'home_team_code': [0],
    'away_team_code': [20],
    'hour': [20],
    'day_code': [5],  # Saturday
    'Ball_possession_home': [0.95],
    'Total_shots_home': [40],
    'Total_shots_away': [5],
    'Goalkeeper_saves_home': [30],
    'Goalkeeper_saves_away': [2],
    'Corner_kicks_home': [30],
    'Corner_kicks_away': [2],
    'Passes_home': [1],
    'Passes_away': [0.05],
    'Free_kicks_home': [30],
    'Free_kicks_away': [0]
}, columns = predictors)

# If there are any NaN values in the live match instance
match_instance[numeric_cols.intersection(predictors)] = match_instance[numeric_cols.intersection(predictors)].fillna(matches[numeric_cols.intersection(predictors)].median())
match_instance[categorical_cols.intersection(predictors)] = match_instance[categorical_cols.intersection(predictors)].fillna(matches[categorical_cols.intersection(predictors)].mode().iloc[0])

prediction = rf.predict(match_instance)
pr = rf.predict_proba(match_instance)

result_mapping = {1: "Home Win", -1: "Away Win or Draw"}
print("Predicted result:", result_mapping.get(prediction[0], "Unknown"))
print("Prediction probabilities (Home Win, Away Win or Draw):", pr[0])



Predicted result: Home Win
Prediction probabilities (Home Win, Away Win or Draw): [0.38093401 0.61906599]
