In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
data_train = pd.read_csv("train_df.csv")
data_train.head()

Unnamed: 0,Country,League,Season,Match_Date,Home_Team,Away_Team,Full_Time_Home_Team_Goals,Full_Time_Away_Team_Goals,Full_Time_Result,Home_Team_Shots,...,Average_draw_odds,Average_over_2.5_goals_odds,Average_under_2.5_goals_odds,Average_Asian_Handicap_Home_odds,Average_Asian_Handicap_Away_odds,Close_home_win_odds,Close_draw_odds,Close_away_win_odds,has_favorite,Time_kick_off_bin
0,belgium,1,1920,2019-07-26,Genk,Kortrijk,2.0,1.0,H,10.0,...,4.928333,1.52,2.506667,1.165,1.786667,1.45,4.94,6.79,1,Late Evening
1,belgium,1,1920,2019-07-27,Waasland-Beveren,Club Brugge,1.0,3.0,A,7.0,...,4.37,1.556667,2.41,1.725,1.893333,8.99,6.52,1.29,1,Late Evening
2,belgium,1,1920,2019-07-27,Cercle Brugge,Standard,0.0,2.0,A,13.0,...,3.671667,1.766667,2.053333,1.5575,1.946667,3.79,3.74,1.99,0,Evening
3,belgium,1,1920,2019-07-27,Waregem,Mechelen,0.0,2.0,A,7.0,...,3.618333,1.653333,2.21,1.3875,1.923333,2.8,3.59,2.51,0,Late Evening
4,belgium,1,1920,2019-07-27,St Truiden,Mouscron,0.0,1.0,A,10.0,...,3.57,1.833333,1.986667,1.3525,1.886667,2.16,3.55,3.46,0,Late Evening


In [None]:
dl0 = pd.read_csv("df_league_0.csv")
dl1 = pd.read_csv("df_league_1.csv")
dl2 = pd.read_csv("df_league_2.csv")
dl3 = pd.read_csv("df_league_3.csv")

In [None]:
unique_team_names = set()

unique_team_names = unique_team_names.union(set(data_train['Home_Team'].unique()))
unique_team_names = unique_team_names.union(set(data_train['Away_Team'].unique()))

In [None]:
def team_dummy_variables(df, team_names):
    home_team_columns = [f'HomeTeam_{team}' for team in team_names]
    away_team_columns = [f'AwayTeam_{team}' for team in team_names]

    home_team_dummies = pd.DataFrame({col: (df['Home_Team'] == team) for team, col in zip(team_names, home_team_columns)})
    away_team_dummies = pd.DataFrame({col: (df['Away_Team'] == team) for team, col in zip(team_names, away_team_columns)})
    
    df = pd.concat([df, home_team_dummies, away_team_dummies], axis=1)
    return df

dummy_train_df = team_dummy_variables(data_train, unique_team_names)
dummy_df_league_0 = team_dummy_variables(dl0, unique_team_names)
dummy_df_league_1 = team_dummy_variables(dl1, unique_team_names)
dummy_df_league_2 = team_dummy_variables(dl2, unique_team_names)
dummy_df_league_3 = team_dummy_variables(dl3, unique_team_names)

In [None]:
dummy_train_df = dummy_train_df.drop(['Home_Team', 'Away_Team'], axis=1)

In [None]:
dummy_df_league_0.head()

Unnamed: 0,Country,League,Season,Match_Date,Home_Team,Away_Team,Full_Time_Home_Team_Goals,Full_Time_Away_Team_Goals,Full_Time_Result,Home_Team_Shots,...,AwayTeam_Foggia,AwayTeam_Leverkusen,AwayTeam_Benevento,AwayTeam_Benfica,AwayTeam_Eupen,AwayTeam_Porto,AwayTeam_Rangers,AwayTeam_Hamburg,AwayTeam_Moreirense,AwayTeam_Standard
0,england,0,1718,2017-08-11,Arsenal,Leicester,4.0,3.0,H,27.0,...,False,False,False,False,False,False,False,False,False,False
1,england,0,1718,2017-08-12,Brighton,Man City,0.0,2.0,A,6.0,...,False,False,False,False,False,False,False,False,False,False
2,england,0,1718,2017-08-12,Crystal Palace,Huddersfield,0.0,3.0,A,14.0,...,False,False,False,False,False,False,False,False,False,False
3,england,0,1718,2017-08-12,Chelsea,Burnley,2.0,3.0,A,19.0,...,False,False,False,False,False,False,False,False,False,False
4,england,0,1718,2017-08-12,Everton,Stoke,1.0,0.0,H,9.0,...,False,False,False,False,False,False,False,False,False,False


In [None]:
dummy_df_league_0 = dummy_df_league_0.drop(['Home_Team', 'Away_Team'], axis=1)


dummies_country_l0 = pd.get_dummies(dummy_df_league_0['Country'])
dummies_time_l0 = pd.get_dummies(dummy_df_league_0['Time_kick_off_bin'])


dummy_df_league_0 = pd.concat([dummy_df_league_0, dummies_country_l0], axis=1)
dummy_df_league_0 = pd.concat([dummy_df_league_0, dummies_time_l0], axis=1)


dummy_df_league_0 = dummy_df_league_0.drop(['Country'], axis=1)
dummy_df_league_0 = dummy_df_league_0.drop(['Time_kick_off_bin'], axis=1)

In [None]:
dummy_df_league_1 = dummy_df_league_1.drop(['Home_Team', 'Away_Team'], axis=1)


dummies_country_l1 = pd.get_dummies(dummy_df_league_1['Country'])
dummies_time_l1 = pd.get_dummies(dummy_df_league_1['Time_kick_off_bin'])


dummy_df_league_1 = pd.concat([dummy_df_league_1, dummies_country_l1], axis=1)
dummy_df_league_1 = pd.concat([dummy_df_league_1, dummies_time_l1], axis=1)


dummy_df_league_1 = dummy_df_league_1.drop(['Country'], axis=1)
dummy_df_league_1 = dummy_df_league_1.drop(['Time_kick_off_bin'], axis=1)

In [None]:
dummy_df_league_2 = dummy_df_league_2.drop(['Home_Team', 'Away_Team'], axis=1)

dummies_country_l2 = pd.get_dummies(dummy_df_league_2['Country'])
dummies_time_l2 = pd.get_dummies(dummy_df_league_2['Time_kick_off_bin'])


dummy_df_league_2 = pd.concat([dummy_df_league_2, dummies_country_l2], axis=1)
dummy_df_league_2 = pd.concat([dummy_df_league_2, dummies_time_l2], axis=1)


dummy_df_league_2 = dummy_df_league_2.drop(['Country'], axis=1)
dummy_df_league_2 = dummy_df_league_2.drop(['Time_kick_off_bin'], axis=1)

In [None]:
dummy_df_league_3 = dummy_df_league_3.drop(['Home_Team', 'Away_Team'], axis=1)

dummies_country_l3 = pd.get_dummies(dummy_df_league_3['Country'])
dummies_time_l3 = pd.get_dummies(dummy_df_league_3['Time_kick_off_bin'])


dummy_df_league_3 = pd.concat([dummy_df_league_3, dummies_country_l3], axis=1)
dummy_df_league_3 = pd.concat([dummy_df_league_3, dummies_time_l3], axis=1)


dummy_df_league_3 = dummy_df_league_3.drop(['Country'], axis=1)
dummy_df_league_3 = dummy_df_league_3.drop(['Time_kick_off_bin'], axis=1)

In [None]:
dummy_df_league_0 = dummy_df_league_0.drop(['Match_Date'], axis=1)
model_rf0 = RandomForestClassifier()
X = dummy_df_league_0.drop('Full_Time_Result', axis=1)
y = dummy_df_league_0['Full_Time_Result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)
model_rf0.fit(X_train, y_train)
model_rf0_predict = model_rf0.predict(X_test)


print(f1_score(y_test, model_rf0_predict, average='weighted'))
print(f1_score(y_test, model_rf0_predict, average='macro'))
print(f1_score(y_test, model_rf0_predict, average='micro'))
print(f1_score(y_test, model_rf0_predict, average=None))

0.7753639981538148
0.7267190576402373
0.7808535178777393
[0.79456706 0.51612903 0.86946108]


In [None]:
dummy_df_league_1 = dummy_df_league_1.drop(['Match_Date'], axis=1)
model_rf1 = RandomForestClassifier()
X = dummy_df_league_1.drop('Full_Time_Result', axis=1)
y = dummy_df_league_1['Full_Time_Result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)
model_rf1.fit(X_train, y_train)
model_rf1_predict = model_rf1.predict(X_test)


print(f1_score(y_test, model_rf1_predict, average='weighted'))
print(f1_score(y_test, model_rf1_predict, average='macro'))
print(f1_score(y_test, model_rf1_predict, average='micro'))
print(f1_score(y_test, model_rf1_predict, average=None))

0.9021191207329659
0.892708985231809
0.9045879009338206
[0.92748092 0.81695063 0.93369541]


In [None]:
dummy_df_league_2 = dummy_df_league_2.drop(['Match_Date'], axis=1)
model_rf2 = RandomForestClassifier()
X = dummy_df_league_2.drop('Full_Time_Result', axis=1)
y = dummy_df_league_2['Full_Time_Result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)
model_rf2.fit(X_train, y_train)
model_rf2_predict = model_rf2.predict(X_test)


print(f1_score(y_test, model_rf2_predict, average='weighted'))
print(f1_score(y_test, model_rf2_predict, average='macro'))
print(f1_score(y_test, model_rf2_predict, average='micro'))
print(f1_score(y_test, model_rf2_predict, average=None))

0.890372929317603
0.8837884562067441
0.8913467794404685
[0.89909297 0.82290437 0.92936803]


In [None]:
dummy_df_league_3 = dummy_df_league_3.drop(['Match_Date'], axis=1)
model_rf3 = RandomForestClassifier()
X = dummy_df_league_3.drop('Full_Time_Result', axis=1)
y = dummy_df_league_3['Full_Time_Result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)
model_rf3.fit(X_train, y_train)
model_rf3_predict = model_rf3.predict(X_test)


print(f1_score(y_test, model_rf3_predict, average='weighted'))
print(f1_score(y_test, model_rf3_predict, average='macro'))
print(f1_score(y_test, model_rf3_predict, average='micro'))
print(f1_score(y_test, model_rf3_predict, average=None))

0.832899689596082
0.817619636738656
0.842211055276382
[0.86914378 0.68649886 0.89721627]


In [None]:
model_lr0 = LogisticRegression(multi_class='multinomial', solver='lbfgs') # 'lbfgs' solver supports the multinomial option
X = dummy_df_league_0.drop('Full_Time_Result', axis=1)
y = dummy_df_league_0['Full_Time_Result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)
model_lr0.fit(X_train, y_train)

model_lr0_predict = model_lr0.predict(X_test)

print(f1_score(y_test, model_lr0_predict, average='weighted'))
print(f1_score(y_test, model_lr0_predict, average='macro'))
print(f1_score(y_test, model_lr0_predict, average='micro'))
print(f1_score(y_test, model_lr0_predict, average=None))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
0.7682677600848972
0.7381781675904288
0.770472895040369
[0.79509632 0.53975904 0.87967914]


In [None]:
model_lr1 = LogisticRegression(multi_class='multinomial', solver='lbfgs') # 'lbfgs' solver supports the multinomial option
X = dummy_df_league_1.drop('Full_Time_Result', axis=1)
y = dummy_df_league_1['Full_Time_Result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)
model_lr1.fit(X_train, y_train)

model_lr1_predict = model_lr1.predict(X_test)

print(f1_score(y_test, model_lr1_predict, average='weighted'))
print(f1_score(y_test, model_lr1_predict, average='macro'))
print(f1_score(y_test, model_lr1_predict, average='micro'))
print(f1_score(y_test, model_lr1_predict, average=None))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
0.7275747723238666
0.6956366247332583
0.7427933414535121
[0.77898663 0.46146482 0.84645842]


In [None]:
model_lr2 = LogisticRegression(multi_class='multinomial', solver='lbfgs') # 'lbfgs' solver supports the multinomial option
X = dummy_df_league_2.drop('Full_Time_Result', axis=1)
y = dummy_df_league_2['Full_Time_Result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)
model_lr2.fit(X_train, y_train)

model_lr2_predict = model_lr2.predict(X_test)

print(f1_score(y_test, model_lr2_predict, average='weighted'))
print(f1_score(y_test, model_lr2_predict, average='macro'))
print(f1_score(y_test, model_lr2_predict, average='micro'))
print(f1_score(y_test, model_lr2_predict, average=None))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
0.7333482198442776
0.7158145884139819
0.7358490566037735
[0.7585034  0.55188679 0.83705357]


In [None]:
model_lr3 = LogisticRegression(multi_class='multinomial', solver='lbfgs') # 'lbfgs' solver supports the multinomial option
X = dummy_df_league_3.drop('Full_Time_Result', axis=1)
y = dummy_df_league_3['Full_Time_Result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)
model_lr3.fit(X_train, y_train)

model_lr3_predict = model_lr3.predict(X_test)

print(f1_score(y_test, model_lr3_predict, average='weighted'))
print(f1_score(y_test, model_lr3_predict, average='macro'))
print(f1_score(y_test, model_lr3_predict, average='micro'))
print(f1_score(y_test, model_lr3_predict, average=None))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
0.7878173701127366
0.7675612163531514
0.7919597989949749
[0.82701812 0.60080645 0.87485908]


In [None]:
model_SVC_0 = SVC()
X = dummy_df_league_0.drop('Full_Time_Result', axis=1)
y = dummy_df_league_0['Full_Time_Result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)
# Train the model
model_SVC_0.fit(X_train, y_train)
model_SVC_0_predict = model_SVC_0.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, model_SVC_0_predict))


Classification Report:
               precision    recall  f1-score   support

           A       0.00      0.00      0.00       286
           D       0.00      0.00      0.00       213
           H       0.42      1.00      0.60       368

    accuracy                           0.42       867
   macro avg       0.14      0.33      0.20       867
weighted avg       0.18      0.42      0.25       867

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
model_SVC_1 = SVC()
X = dummy_df_league_1.drop('Full_Time_Result', axis=1)
y = dummy_df_league_1['Full_Time_Result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)
# Train the model
model_SVC_1.fit(X_train, y_train)
model_SVC_1_predict = model_SVC_1.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, model_SVC_1_predict))

  _warn_prf(average, modifier, msg_start, len(result))

Classification Report:
               precision    recall  f1-score   support

           A       0.00      0.00      0.00      1547
           D       0.00      0.00      0.00      1250
           H       0.43      1.00      0.60      2129

    accuracy                           0.43      4926
   macro avg       0.14      0.33      0.20      4926
weighted avg       0.19      0.43      0.26      4926

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
model_SVC_2 = SVC()
X = dummy_df_league_2.drop('Full_Time_Result', axis=1)
y = dummy_df_league_2['Full_Time_Result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)
# Train the model
model_SVC_2.fit(X_train, y_train)
model_SVC_2_predict = model_SVC_2.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, model_SVC_2_predict))


Classification Report:
               precision    recall  f1-score   support

           A       0.00      0.00      0.00       900
           D       0.00      0.00      0.00       870
           H       0.42      1.00      0.60      1304

    accuracy                           0.42      3074
   macro avg       0.14      0.33      0.20      3074
weighted avg       0.18      0.42      0.25      3074

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
model_SVC_3 = SVC()
X = dummy_df_league_3.drop('Full_Time_Result', axis=1)
y = dummy_df_league_3['Full_Time_Result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)
# Train the model
model_SVC_3.fit(X_train, y_train)
model_SVC_3_predict = model_SVC_3.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, model_SVC_3_predict))


Classification Report:
               precision    recall  f1-score   support

           A       0.00      0.00      0.00       298
           D       0.00      0.00      0.00       264
           H       0.44      1.00      0.61       433

    accuracy                           0.44       995
   macro avg       0.15      0.33      0.20       995
weighted avg       0.19      0.44      0.26       995

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Create a Decision Tree classifier
model_DT_0 = DecisionTreeClassifier()

# Train the model
model_DT_0.fit(X_train, y_train)
# Predictions
model_DT_0_predict = model_DT_0.predict(X_test)

# Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, model_DT_0_predict))
print("\nClassification Report:\n", classification_report(y_test, model_DT_0_predict))

Confusion Matrix:
 [[298   0   0]
 [  0 264   0]
 [  0   0 433]]

Classification Report:
               precision    recall  f1-score   support

           A       1.00      1.00      1.00       298
           D       1.00      1.00      1.00       264
           H       1.00      1.00      1.00       433

    accuracy                           1.00       995
   macro avg       1.00      1.00      1.00       995
weighted avg       1.00      1.00      1.00       995



In [None]:
# Create a Decision Tree classifier
model_DT_1 = DecisionTreeClassifier()

# Train the model
model_DT_1.fit(X_train, y_train)
# Predictions
model_DT_1_predict = model_DT_1.predict(X_test)

# Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, model_DT_1_predict))
print("\nClassification Report:\n", classification_report(y_test, model_DT_1_predict))

Confusion Matrix:
 [[298   0   0]
 [  0 264   0]
 [  0   0 433]]

Classification Report:
               precision    recall  f1-score   support

           A       1.00      1.00      1.00       298
           D       1.00      1.00      1.00       264
           H       1.00      1.00      1.00       433

    accuracy                           1.00       995
   macro avg       1.00      1.00      1.00       995
weighted avg       1.00      1.00      1.00       995



In [None]:
# Create a Decision Tree classifier
model_DT_2 = DecisionTreeClassifier()

# Train the model
model_DT_2.fit(X_train, y_train)
# Predictions
model_DT_2_predict = model_DT_2.predict(X_test)

# Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, model_DT_2_predict))
print("\nClassification Report:\n", classification_report(y_test, model_DT_2_predict))

Confusion Matrix:
 [[298   0   0]
 [  0 264   0]
 [  0   0 433]]

Classification Report:
               precision    recall  f1-score   support

           A       1.00      1.00      1.00       298
           D       1.00      1.00      1.00       264
           H       1.00      1.00      1.00       433

    accuracy                           1.00       995
   macro avg       1.00      1.00      1.00       995
weighted avg       1.00      1.00      1.00       995



In [None]:
j# Create a Decision Tree classifier
model_DT_3 = DecisionTreeClassifier()

# Train the model
model_DT_3.fit(X_train, y_train)
# Predictions
model_DT_3_predict = model_DT_3.predict(X_test)

# Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, model_DT_3_predict))
print("\nClassification Report:\n", classification_report(y_test, model_DT_3_predict))

Confusion Matrix:
 [[298   0   0]
 [  0 264   0]
 [  0   0 433]]

Classification Report:
               precision    recall  f1-score   support

           A       1.00      1.00      1.00       298
           D       1.00      1.00      1.00       264
           H       1.00      1.00      1.00       433

    accuracy                           1.00       995
   macro avg       1.00      1.00      1.00       995
weighted avg       1.00      1.00      1.00       995



# Hlavni nadrazi model

In [None]:

dummies_country = pd.get_dummies(dummy_train_df['Country'])
dummies_time = pd.get_dummies(dummy_train_df['Time_kick_off_bin'])


dummy_train_df = pd.concat([dummy_train_df, dummies_country], axis=1)
dummy_train_df = pd.concat([dummy_train_df, dummies_time], axis=1)


dummy_train_df = dummy_train_df.drop(['Country'], axis=1)
dummy_train_df = dummy_train_df.drop(['Time_kick_off_bin'], axis=1)

In [None]:
dummy_train_df = dummy_train_df.drop(['Match_Date'], axis=1)

In [None]:
dummy_train_df.head()

Unnamed: 0,League,Season,Match_Date,Full_Time_Home_Team_Goals,Full_Time_Away_Team_Goals,Full_Time_Result,Home_Team_Shots,Away_Team_Shots,Home_Team_Shots_on_Target,Away_Team_Shots_on_Target,...,italy,netherlands,portugal,scotland,spain,turkey,Afternoon,Evening,Late Evening,Morning
0,1,1920,2019-07-26,2.0,1.0,H,10.0,8.0,2.0,4.0,...,0,0,0,0,0,0,0,0,1,0
1,1,1920,2019-07-27,1.0,3.0,A,7.0,25.0,2.0,22.0,...,0,0,0,0,0,0,0,0,1,0
2,1,1920,2019-07-27,0.0,2.0,A,13.0,14.0,5.0,9.0,...,0,0,0,0,0,0,0,1,0,0
3,1,1920,2019-07-27,0.0,2.0,A,7.0,10.0,2.0,5.0,...,0,0,0,0,0,0,0,0,1,0
4,1,1920,2019-07-27,0.0,1.0,A,10.0,10.0,4.0,6.0,...,0,0,0,0,0,0,0,0,1,0


In [None]:
model1 = RandomForestClassifier()
X = dummy_train_df.drop('Full_Time_Result', axis=1)
y = dummy_train_df['Full_Time_Result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)
model1.fit(X_train, y_train)
model1_predict = model1.predict(X_test)


In [None]:
model1.feature_importances_

array([6.41044786e-03, 8.78649080e-03, 1.69230761e-01, 1.70890699e-01,
       1.67957704e-02, 1.62907988e-02, 3.16586932e-02, 3.05231536e-02,
       1.61295947e-02, 1.46444372e-02, 1.07188428e-02, 1.08974604e-02,
       4.75927042e-03, 4.39991633e-03, 8.60908819e-03, 3.45692629e-03,
       1.45299171e-02, 1.51944073e-02, 4.01675527e-02, 2.70369670e-02,
       2.77895898e-02, 2.02133549e-02, 1.84733400e-02, 1.83564536e-02,
       2.14242150e-02, 1.76005558e-02, 2.80881939e-02, 2.04474012e-02,
       2.79995320e-02, 3.42419882e-03, 2.30025438e-04, 1.06783958e-04,
       7.32639134e-05, 1.99605816e-04, 8.78545024e-05, 1.29369532e-04,
       1.64535835e-04, 8.87399888e-05, 2.40380070e-04, 1.02244634e-04,
       9.97754601e-05, 4.53756078e-05, 1.66973970e-04, 1.45962496e-04,
       1.83575913e-04, 1.60462715e-04, 3.21208265e-04, 2.21640027e-04,
       1.60600967e-04, 2.88585856e-04, 2.36444524e-04, 4.55917370e-05,
       2.24290686e-04, 2.12455153e-04, 2.54922631e-04, 1.64719556e-04,
      

In [None]:
model1_predict

array(['A', 'H', 'A', ..., 'H', 'A', 'H'], dtype=object)

In [None]:
print(classification_report(y_test, model1_predict))

              precision    recall  f1-score   support

           A       0.93      0.96      0.94      3011
           D       0.95      0.83      0.89      2591
           H       0.94      0.98      0.96      4260

    accuracy                           0.94      9862
   macro avg       0.94      0.93      0.93      9862
weighted avg       0.94      0.94      0.94      9862



In [None]:
print(f1_score(y_test, model1_predict, average='weighted'))
print(f1_score(y_test, model1_predict, average='macro'))
print(f1_score(y_test, model1_predict, average='micro'))
print(f1_score(y_test, model1_predict, average=None))

0.9357545104694043
0.9301411350794039
0.9368282295680389
[0.94271258 0.88756423 0.96014659]


In [None]:
model2 = LogisticRegression(multi_class='multinomial', solver='lbfgs') # 'lbfgs' solver supports the multinomial option

model2.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
model2_predict = model2.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, model2_predict))


Classification Report:
               precision    recall  f1-score   support

           A       0.80      0.82      0.81      3011
           D       0.60      0.54      0.57      2591
           H       0.83      0.87      0.85      4260

    accuracy                           0.77      9862
   macro avg       0.74      0.74      0.74      9862
weighted avg       0.76      0.77      0.76      9862



In [None]:
print(f1_score(y_test, model2_predict, average='weighted'))
print(f1_score(y_test, model2_predict, average='macro'))
print(f1_score(y_test, model2_predict, average='micro'))
print(f1_score(y_test, model2_predict, average=None))

0.7628924112518185
0.7416973039445031
0.766477387953762
[0.80551452 0.56869919 0.8508782 ]


In [None]:
model3 = SVC()

# Train the model
model3.fit(X_train, y_train)

In [None]:
model3_predict = model3.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, model3_predict))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

Classification Report:
               precision    recall  f1-score   support

           A       0.00      0.00      0.00      3011
           D       0.00      0.00      0.00      2591
           H       0.43      1.00      0.60      4260

    accuracy                           0.43      9862
   macro avg       0.14      0.33      0.20      9862
weighted avg       0.19      0.43      0.26      9862

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Create a Decision Tree classifier
model4 = DecisionTreeClassifier()

# Train the model
model4.fit(X_train, y_train)


In [None]:
# Predictions
model4_predict = model4.predict(X_test)

# Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, model4_predict))
print("\nClassification Report:\n", classification_report(y_test, model4_predict))



Confusion Matrix:
 [[3010    1    0]
 [   0 2591    0]
 [   0    0 4260]]

Classification Report:
               precision    recall  f1-score   support

           A       1.00      1.00      1.00      3011
           D       1.00      1.00      1.00      2591
           H       1.00      1.00      1.00      4260

    accuracy                           1.00      9862
   macro avg       1.00      1.00      1.00      9862
weighted avg       1.00      1.00      1.00      9862



In [None]:
print(f1_score(y_test, model4_predict, average='weighted'))
print(f1_score(y_test, model4_predict, average='macro'))
print(f1_score(y_test, model4_predict, average='micro'))
print(f1_score(y_test, model4_predict, average=None))

0.9998986020509575
0.9998803253931624
0.9998986006895153
[0.99983391 0.99980706 1.        ]


Unnamed: 0,Country,League,Season,Match_Date,Home_Team,Away_Team,Full_Time_Home_Team_Goals,Full_Time_Away_Team_Goals,Full_Time_Result,Home_Team_Shots,...,AwayTeam_Foggia,AwayTeam_Leverkusen,AwayTeam_Benevento,AwayTeam_Benfica,AwayTeam_Eupen,AwayTeam_Porto,AwayTeam_Rangers,AwayTeam_Hamburg,AwayTeam_Moreirense,AwayTeam_Standard
0,england,0,1718,2017-08-11,Arsenal,Leicester,4.0,3.0,H,27.0,...,False,False,False,False,False,False,False,False,False,False
1,england,0,1718,2017-08-12,Brighton,Man City,0.0,2.0,A,6.0,...,False,False,False,False,False,False,False,False,False,False
2,england,0,1718,2017-08-12,Crystal Palace,Huddersfield,0.0,3.0,A,14.0,...,False,False,False,False,False,False,False,False,False,False
3,england,0,1718,2017-08-12,Chelsea,Burnley,2.0,3.0,A,19.0,...,False,False,False,False,False,False,False,False,False,False
4,england,0,1718,2017-08-12,Everton,Stoke,1.0,0.0,H,9.0,...,False,False,False,False,False,False,False,False,False,False


In [None]:
feature_importance = model1.feature_importances_

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=3182c57a-4879-462a-8002-e0676ac18eff' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>