In [23]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import GridSearchCV

In [24]:
data_train = pd.read_csv("train_df.csv")
data_train.head()

Unnamed: 0,Country,League,Season,Match_Date,Home_Team,Away_Team,Full_Time_Home_Team_Goals,Full_Time_Away_Team_Goals,Full_Time_Result,Home_Team_Shots,...,Average_draw_odds,Average_over_2.5_goals_odds,Average_under_2.5_goals_odds,Average_Asian_Handicap_Home_odds,Average_Asian_Handicap_Away_odds,Close_home_win_odds,Close_draw_odds,Close_away_win_odds,has_favorite,Time_kick_off_bin
0,belgium,1,1920,2019-07-26,Genk,Kortrijk,2.0,1.0,H,10.0,...,4.928333,1.52,2.506667,1.165,1.786667,1.45,4.94,6.79,1,Late Evening
1,belgium,1,1920,2019-07-27,Waasland-Beveren,Club Brugge,1.0,3.0,A,7.0,...,4.37,1.556667,2.41,1.725,1.893333,8.99,6.52,1.29,1,Late Evening
2,belgium,1,1920,2019-07-27,Cercle Brugge,Standard,0.0,2.0,A,13.0,...,3.671667,1.766667,2.053333,1.5575,1.946667,3.79,3.74,1.99,0,Evening
3,belgium,1,1920,2019-07-27,Waregem,Mechelen,0.0,2.0,A,7.0,...,3.618333,1.653333,2.21,1.3875,1.923333,2.8,3.59,2.51,0,Late Evening
4,belgium,1,1920,2019-07-27,St Truiden,Mouscron,0.0,1.0,A,10.0,...,3.57,1.833333,1.986667,1.3525,1.886667,2.16,3.55,3.46,0,Late Evening


In [25]:

data_train['Season'].unique()

array([1920, 2021, 2122, 1718, 1819])

In [26]:
dl0 = pd.read_csv("df_league_0.csv")
dl1 = pd.read_csv("df_league_1.csv")
dl2 = pd.read_csv("df_league_2.csv")
dl3 = pd.read_csv("df_league_3.csv")

In [27]:

data_train = data_train.drop(['Full_Time_Home_Team_Goals', 'Full_Time_Away_Team_Goals'], axis = 1)

dl0 = dl0.drop(['Full_Time_Home_Team_Goals', 'Full_Time_Away_Team_Goals'], axis = 1)
dl1 = dl1.drop(['Full_Time_Home_Team_Goals', 'Full_Time_Away_Team_Goals'], axis = 1)
dl2 = dl2.drop(['Full_Time_Home_Team_Goals', 'Full_Time_Away_Team_Goals'], axis = 1)
dl3 = dl3.drop(['Full_Time_Home_Team_Goals', 'Full_Time_Away_Team_Goals'], axis = 1)


data_train = pd.get_dummies(data_train, columns=['Season'])

data_train.head(10)

data_train['Full_Time_Result'] = data_train['Full_Time_Result'].astype("category")

dl0['Full_Time_Result'] = dl0['Full_Time_Result'].astype("category")
dl1['Full_Time_Result'] = dl1['Full_Time_Result'].astype("category")
dl2['Full_Time_Result'] = dl2['Full_Time_Result'].astype("category")
dl3['Full_Time_Result'] = dl3['Full_Time_Result'].astype("category")

In [28]:
unique_team_names = set()

unique_team_names = unique_team_names.union(set(data_train['Home_Team'].unique()))
unique_team_names = unique_team_names.union(set(data_train['Away_Team'].unique()))

In [29]:
def team_dummy_variables(df, team_names):
    home_team_columns = [f'HomeTeam_{team}' for team in team_names]
    away_team_columns = [f'AwayTeam_{team}' for team in team_names]

    home_team_dummies = pd.DataFrame({col: (df['Home_Team'] == team) for team, col in zip(team_names, home_team_columns)})
    away_team_dummies = pd.DataFrame({col: (df['Away_Team'] == team) for team, col in zip(team_names, away_team_columns)})
    
    df = pd.concat([df, home_team_dummies, away_team_dummies], axis=1)
    return df

dummy_train_df = team_dummy_variables(data_train, unique_team_names)
dummy_df_league_0 = team_dummy_variables(dl0, unique_team_names)
dummy_df_league_1 = team_dummy_variables(dl1, unique_team_names)
dummy_df_league_2 = team_dummy_variables(dl2, unique_team_names)
dummy_df_league_3 = team_dummy_variables(dl3, unique_team_names)

In [30]:
dummy_train_df = dummy_train_df.drop(['Home_Team', 'Away_Team'], axis=1)

In [31]:
# Convert to datetime
dummy_train_df['Match_Date'] = pd.to_datetime(dummy_train_df['Match_Date'])

# Convert datetime to integer (Unix timestamp)
dummy_train_df['date_int'] = dummy_train_df['Match_Date'].astype(int) / 10**9

In [32]:
dummies_country = pd.get_dummies(dummy_train_df['Country'])
dummies_time = pd.get_dummies(dummy_train_df['Time_kick_off_bin'])


dummy_train_df = pd.concat([dummy_train_df, dummies_country], axis=1)
dummy_train_df = pd.concat([dummy_train_df, dummies_time], axis=1)


dummy_train_df = dummy_train_df.drop(['Country'], axis=1)
dummy_train_df = dummy_train_df.drop(['Time_kick_off_bin'], axis=1)
dummy_train_df = dummy_train_df.drop(['Match_Date'], axis=1)

In [33]:
dummy_df_league_2.head()

Unnamed: 0,Country,League,Season,Match_Date,Home_Team,Away_Team,Full_Time_Result,Home_Team_Shots,Away_Team_Shots,Home_Team_Shots_on_Target,...,AwayTeam_Man United,AwayTeam_Boavista,AwayTeam_Ath Madrid,AwayTeam_Trapani,AwayTeam_Castellon,AwayTeam_Clyde,AwayTeam_Olympiakos,AwayTeam_Volos NFC,AwayTeam_Fleetwood Town,AwayTeam_Bologna
0,england,2,1718,2017-08-05,Doncaster,Gillingham,D,16.0,5.0,7.0,...,False,False,False,False,False,False,False,False,False,False
1,england,2,1718,2017-08-05,Southend,Blackburn,H,15.0,7.0,7.0,...,False,False,False,False,False,False,False,False,False,False
2,england,2,1718,2017-08-05,Bradford,Blackpool,H,15.0,9.0,4.0,...,False,False,False,False,False,False,False,False,False,False
3,england,2,1718,2017-08-05,Fleetwood Town,Rotherham,H,9.0,8.0,3.0,...,False,False,False,False,False,False,False,False,False,False
4,england,2,1718,2017-08-05,Charlton,Bristol Rvs,H,9.0,14.0,2.0,...,False,False,False,False,False,False,False,False,False,False


In [34]:
dummy_df_league_0 = dummy_df_league_0.drop(['Home_Team', 'Away_Team'], axis=1)


dummies_country_l0 = pd.get_dummies(dummy_df_league_0['Country'])
dummies_time_l0 = pd.get_dummies(dummy_df_league_0['Time_kick_off_bin'])


dummy_df_league_0 = pd.concat([dummy_df_league_0, dummies_country_l0], axis=1)
dummy_df_league_0 = pd.concat([dummy_df_league_0, dummies_time_l0], axis=1)


dummy_df_league_0 = dummy_df_league_0.drop(['Country'], axis=1)
dummy_df_league_0 = dummy_df_league_0.drop(['Time_kick_off_bin'], axis=1)

# Convert to datetime
dummy_df_league_0['Match_Date'] = pd.to_datetime(dummy_df_league_0['Match_Date'])

# Convert datetime to integer (Unix timestamp)
dummy_df_league_0['date_int'] = dummy_df_league_0['Match_Date'].astype(int) / 10**9
dummy_df_league_0 = dummy_df_league_0.drop(['Match_Date'], axis=1)

In [35]:
dummy_df_league_1 = dummy_df_league_1.drop(['Home_Team', 'Away_Team'], axis=1)


dummies_country_l1 = pd.get_dummies(dummy_df_league_1['Country'])
dummies_time_l1 = pd.get_dummies(dummy_df_league_1['Time_kick_off_bin'])


dummy_df_league_1 = pd.concat([dummy_df_league_1, dummies_country_l1], axis=1)
dummy_df_league_1 = pd.concat([dummy_df_league_1, dummies_time_l1], axis=1)


dummy_df_league_1 = dummy_df_league_1.drop(['Country'], axis=1)
dummy_df_league_1 = dummy_df_league_1.drop(['Time_kick_off_bin'], axis=1)

In [36]:
# Convert to datetime
dummy_df_league_1['Match_Date'] = pd.to_datetime(dummy_df_league_1['Match_Date'])

# Convert datetime to integer (Unix timestamp)
dummy_df_league_1['date_int'] = dummy_df_league_1['Match_Date'].astype(int) / 10**9
dummy_df_league_1 = dummy_df_league_1.drop(['Match_Date'], axis=1)

In [37]:
dummy_df_league_2 = dummy_df_league_2.drop(['Home_Team', 'Away_Team'], axis=1)

dummies_country_l2 = pd.get_dummies(dummy_df_league_2['Country'])
dummies_time_l2 = pd.get_dummies(dummy_df_league_2['Time_kick_off_bin'])


dummy_df_league_2 = pd.concat([dummy_df_league_2, dummies_country_l2], axis=1)
dummy_df_league_2 = pd.concat([dummy_df_league_2, dummies_time_l2], axis=1)


dummy_df_league_2 = dummy_df_league_2.drop(['Country'], axis=1)
dummy_df_league_2 = dummy_df_league_2.drop(['Time_kick_off_bin'], axis=1)

# Convert to datetime
dummy_df_league_2['Match_Date'] = pd.to_datetime(dummy_df_league_2['Match_Date'])

# Convert datetime to integer (Unix timestamp)
dummy_df_league_2['date_int'] = dummy_df_league_2['Match_Date'].astype(int) / 10**9
dummy_df_league_2 = dummy_df_league_2.drop(['Match_Date'], axis=1)

In [38]:
dummy_df_league_3 = dummy_df_league_3.drop(['Home_Team', 'Away_Team'], axis=1)

dummies_country_l3 = pd.get_dummies(dummy_df_league_3['Country'])
dummies_time_l3 = pd.get_dummies(dummy_df_league_3['Time_kick_off_bin'])


dummy_df_league_3 = pd.concat([dummy_df_league_3, dummies_country_l3], axis=1)
dummy_df_league_3 = pd.concat([dummy_df_league_3, dummies_time_l3], axis=1)


dummy_df_league_3 = dummy_df_league_3.drop(['Country'], axis=1)
dummy_df_league_3 = dummy_df_league_3.drop(['Time_kick_off_bin'], axis=1)

# Convert to datetime
dummy_df_league_3['Match_Date'] = pd.to_datetime(dummy_df_league_3['Match_Date'])

# Convert datetime to integer (Unix timestamp)
dummy_df_league_3['date_int'] = dummy_df_league_3['Match_Date'].astype(int) / 10**9
dummy_df_league_3 = dummy_df_league_3.drop(['Match_Date'], axis=1)

In [39]:
scaler = StandardScaler()
def pipeline1(model, df):
    X = df.drop('Full_Time_Result', axis=1)
    y = df['Full_Time_Result']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred, average='macro'))
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

In [40]:
model_rf = RandomForestClassifier()
pipeline1(model_rf, dummy_df_league_0)

Accuracy: 0.5859284890426759
F1 Score: 0.5020967729332663
[[189  15  82]
 [ 65  27 121]
 [ 59  17 292]]
              precision    recall  f1-score   support

           A       0.60      0.66      0.63       286
           D       0.46      0.13      0.20       213
           H       0.59      0.79      0.68       368

    accuracy                           0.59       867
   macro avg       0.55      0.53      0.50       867
weighted avg       0.56      0.59      0.54       867



In [41]:
pipeline1(model_rf, dummy_df_league_1)

Accuracy: 0.5998781973203411
F1 Score: 0.5522463266427106
[[ 895  240  412]
 [ 312  360  578]
 [ 232  197 1700]]
              precision    recall  f1-score   support

           A       0.62      0.58      0.60      1547
           D       0.45      0.29      0.35      1250
           H       0.63      0.80      0.71      2129

    accuracy                           0.60      4926
   macro avg       0.57      0.56      0.55      4926
weighted avg       0.58      0.60      0.58      4926



In [42]:
pipeline1(model_rf, dummy_df_league_2)

Accuracy: 0.5605074821080026
F1 Score: 0.5305388079690155
[[448 170 282]
 [177 303 390]
 [152 180 972]]
              precision    recall  f1-score   support

           A       0.58      0.50      0.53       900
           D       0.46      0.35      0.40       870
           H       0.59      0.75      0.66      1304

    accuracy                           0.56      3074
   macro avg       0.54      0.53      0.53      3074
weighted avg       0.55      0.56      0.55      3074



In [43]:
pipeline1(model_rf, dummy_df_league_3)

Accuracy: 0.5839195979899497
F1 Score: 0.5353806257253838
[[180  32  86]
 [ 72  67 125]
 [ 63  36 334]]
              precision    recall  f1-score   support

           A       0.57      0.60      0.59       298
           D       0.50      0.25      0.34       264
           H       0.61      0.77      0.68       433

    accuracy                           0.58       995
   macro avg       0.56      0.54      0.54       995
weighted avg       0.57      0.58      0.56       995



In [44]:
model_lr = LogisticRegression(multi_class='multinomial', solver='lbfgs') # 'lbfgs' solver supports the multinomial option
pipeline1(model_lr, dummy_df_league_0)

Accuracy: 0.6182237600922722
F1 Score: 0.5707420417662276
[[190  39  57]
 [ 63  61  89]
 [ 41  42 285]]
              precision    recall  f1-score   support

           A       0.65      0.66      0.66       286
           D       0.43      0.29      0.34       213
           H       0.66      0.77      0.71       368

    accuracy                           0.62       867
   macro avg       0.58      0.58      0.57       867
weighted avg       0.60      0.62      0.60       867



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [45]:
pipeline1(model_lr, dummy_df_league_1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.623020706455542
F1 Score: 0.5920701839919027
[[ 993  258  296]
 [ 318  481  451]
 [ 228  306 1595]]
              precision    recall  f1-score   support

           A       0.65      0.64      0.64      1547
           D       0.46      0.38      0.42      1250
           H       0.68      0.75      0.71      2129

    accuracy                           0.62      4926
   macro avg       0.60      0.59      0.59      4926
weighted avg       0.61      0.62      0.62      4926



In [46]:
pipeline1(model_lr, dummy_df_league_2)

Accuracy: 0.5949902407286922
F1 Score: 0.5776434444498683
[[504 203 193]
 [195 399 276]
 [133 245 926]]
              precision    recall  f1-score   support

           A       0.61      0.56      0.58       900
           D       0.47      0.46      0.46       870
           H       0.66      0.71      0.69      1304

    accuracy                           0.59      3074
   macro avg       0.58      0.58      0.58      3074
weighted avg       0.59      0.59      0.59      3074



In [47]:
pipeline1(model_lr, dummy_df_league_3)

Accuracy: 0.5979899497487438
F1 Score: 0.5755939026451132
[[179  51  68]
 [ 63 111  90]
 [ 61  67 305]]
              precision    recall  f1-score   support

           A       0.59      0.60      0.60       298
           D       0.48      0.42      0.45       264
           H       0.66      0.70      0.68       433

    accuracy                           0.60       995
   macro avg       0.58      0.58      0.58       995
weighted avg       0.59      0.60      0.59       995



In [48]:
# Create a Decision Tree classifier
model_DT = DecisionTreeClassifier()
pipeline1(model_DT, dummy_df_league_0)

Accuracy: 0.5236447520184544
F1 Score: 0.5015059393702344
[[152  66  68]
 [ 52  76  85]
 [ 66  76 226]]
              precision    recall  f1-score   support

           A       0.56      0.53      0.55       286
           D       0.35      0.36      0.35       213
           H       0.60      0.61      0.61       368

    accuracy                           0.52       867
   macro avg       0.50      0.50      0.50       867
weighted avg       0.52      0.52      0.52       867



In [49]:
pipeline1(model_DT, dummy_df_league_1)

Accuracy: 0.5152253349573691
F1 Score: 0.4885887200655666
[[ 767  376  404]
 [ 364  413  473]
 [ 320  451 1358]]
              precision    recall  f1-score   support

           A       0.53      0.50      0.51      1547
           D       0.33      0.33      0.33      1250
           H       0.61      0.64      0.62      2129

    accuracy                           0.52      4926
   macro avg       0.49      0.49      0.49      4926
weighted avg       0.51      0.52      0.51      4926



In [50]:
pipeline1(model_DT, dummy_df_league_2)

Accuracy: 0.4977228366948601
F1 Score: 0.4826231111011403
[[435 232 233]
 [225 325 320]
 [228 306 770]]
              precision    recall  f1-score   support

           A       0.49      0.48      0.49       900
           D       0.38      0.37      0.38       870
           H       0.58      0.59      0.59      1304

    accuracy                           0.50      3074
   macro avg       0.48      0.48      0.48      3074
weighted avg       0.50      0.50      0.50      3074



In [51]:
pipeline1(model_DT, dummy_df_league_3)

Accuracy: 0.4994974874371859
F1 Score: 0.4820385644269208
[[146  81  71]
 [ 68  99  97]
 [ 89  92 252]]
              precision    recall  f1-score   support

           A       0.48      0.49      0.49       298
           D       0.36      0.38      0.37       264
           H       0.60      0.58      0.59       433

    accuracy                           0.50       995
   macro avg       0.48      0.48      0.48       995
weighted avg       0.50      0.50      0.50       995



# Hlavni nadrazi model

In [52]:
dummy_train_df.head()

Unnamed: 0,League,Full_Time_Result,Home_Team_Shots,Away_Team_Shots,Home_Team_Shots_on_Target,Away_Team_Shots_on_Target,Home_Team_Corners,Away_Team_Corners,Home_Team_Yellow_Cards,Away_Team_Yellow_Cards,...,italy,netherlands,portugal,scotland,spain,turkey,Afternoon,Evening,Late Evening,Morning
0,1,H,10.0,8.0,2.0,4.0,6.0,2.0,0.0,1.0,...,0,0,0,0,0,0,0,0,1,0
1,1,A,7.0,25.0,2.0,22.0,1.0,14.0,3.0,1.0,...,0,0,0,0,0,0,0,0,1,0
2,1,A,13.0,14.0,5.0,9.0,3.0,7.0,2.0,2.0,...,0,0,0,0,0,0,0,1,0,0
3,1,A,7.0,10.0,2.0,5.0,4.0,2.0,3.0,1.0,...,0,0,0,0,0,0,0,0,1,0
4,1,A,10.0,10.0,4.0,6.0,5.0,5.0,2.0,4.0,...,0,0,0,0,0,0,0,0,1,0


In [53]:
model1 = RandomForestClassifier()
pipeline1(model1, dummy_train_df)

Accuracy: 0.6068748732508619
F1 Score: 0.5694686233023932
[[1837  423  731]
 [ 655  855 1120]
 [ 499  449 3293]]
              precision    recall  f1-score   support

           A       0.61      0.61      0.61      2991
           D       0.50      0.33      0.39      2630
           H       0.64      0.78      0.70      4241

    accuracy                           0.61      9862
   macro avg       0.58      0.57      0.57      9862
weighted avg       0.59      0.61      0.59      9862



In [54]:
model1.feature_importances_

array([1.06050377e-02, 2.60679204e-02, 2.49702359e-02, 4.55884735e-02,
       4.50883315e-02, 2.55214675e-02, 2.32271666e-02, 1.71927342e-02,
       1.76728806e-02, 6.56672574e-03, 6.00825525e-03, 1.33378576e-02,
       5.84217490e-03, 2.33878509e-02, 2.39419493e-02, 3.02417731e-02,
       4.06546351e-02, 3.93629128e-02, 3.09708641e-02, 2.86711826e-02,
       2.82275204e-02, 3.22030772e-02, 2.71236737e-02, 3.90394840e-02,
       3.07121876e-02, 3.96794813e-02, 4.58598548e-03, 4.19912203e-03,
       4.69257562e-03, 4.93755414e-03, 4.98223133e-03, 4.74670745e-03,
       2.32349782e-04, 3.67513649e-04, 2.30384749e-04, 1.07777329e-04,
       3.23534332e-04, 3.11692378e-04, 2.87955718e-04, 2.77587658e-04,
       2.37677947e-04, 2.13517304e-04, 2.71307245e-04, 3.52308582e-04,
       4.08481632e-04, 3.84147371e-04, 3.45641303e-04, 3.06139243e-04,
       2.81846338e-05, 3.87911213e-04, 2.98787222e-04, 2.24154189e-04,
       1.55156341e-04, 3.46777161e-04, 1.48904712e-04, 9.73912019e-05,
      

In [55]:
model2 = LogisticRegression(multi_class='multinomial', solver='lbfgs') # 'lbfgs' solver supports the multinomial option

pipeline1(model2, dummy_train_df)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.6158994118839992
F1 Score: 0.5902105033689334
[[1855  574  562]
 [ 668 1080  882]
 [ 485  617 3139]]
              precision    recall  f1-score   support

           A       0.62      0.62      0.62      2991
           D       0.48      0.41      0.44      2630
           H       0.68      0.74      0.71      4241

    accuracy                           0.62      9862
   macro avg       0.59      0.59      0.59      9862
weighted avg       0.61      0.62      0.61      9862



In [56]:
# Create a Decision Tree classifier
model4 = DecisionTreeClassifier()
pipeline1(model4, dummy_train_df)   


Accuracy: 0.5257554248631109
F1 Score: 0.5054774196471491
[[1565  696  730]
 [ 721  968  941]
 [ 707  882 2652]]
              precision    recall  f1-score   support

           A       0.52      0.52      0.52      2991
           D       0.38      0.37      0.37      2630
           H       0.61      0.63      0.62      4241

    accuracy                           0.53      9862
   macro avg       0.51      0.51      0.51      9862
weighted avg       0.52      0.53      0.52      9862



In [73]:
def tune_parameters(model, param_grid, df, cv):
    X = df.drop('Full_Time_Result', axis=1)
    y = df['Full_Time_Result']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    clf= GridSearchCV(estimator=model, param_grid=param_grid, scoring='f1_macro',
                        cv=cv, n_jobs=-1, verbose=2)
    clf.fit(X_train, y_train)
    print(clf.best_params_)
    print(clf.best_score_)
    print(clf.best_estimator_)
    best_parameters = grid_search.best_params_
    return best_parameters

In [74]:
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}
tune_parameters(model=model4,param_grid=param_grid, df = dummy_train_df,cv = 5)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   7.4s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   7.5s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   7.6s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=5; total time=   7.5s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   7.6s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=5; total time=   7.6s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=5; total time=   7.6s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   7.6s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=5; total time=   4.1s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=1

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

X = dummy_train_df.drop('Full_Time_Result', axis=1)
y = dummy_train_df['Full_Time_Result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

# Define the parameter grid
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=model4, param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_parameters = grid_search.best_params_


Fitting 3 folds for each of 72 candidates, totalling 216 fits
[CV] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   8.4s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=5; total time=   8.5s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   8.5s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=10; total time=   8.5s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=10; total time=   8.5s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=5; total time=   8.6s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=5; total time=   8.6s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   8.6s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=2, min_samples_split=2; total time=   4.4s
[CV] END criterion=gini, max_depth=3, min_samples_leaf

In [61]:
print("Best Parameters:", best_parameters)

Best Parameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2}


In [62]:

model4_tuned = DecisionTreeClassifier(max_depth=best_parameters['max_depth'], 
                                min_samples_split=best_parameters['min_samples_split'], 
                                min_samples_leaf=best_parameters['min_samples_leaf'], 
                                criterion=best_parameters['criterion'])
pipeline1(model4_tuned, dummy_train_df)


Accuracy: 0.5925775704725208
F1 Score: 0.5708844516839914
[[1279  856  856]
 [ 292 1432  906]
 [ 277  831 3133]]
              precision    recall  f1-score   support

           A       0.69      0.43      0.53      2991
           D       0.46      0.54      0.50      2630
           H       0.64      0.74      0.69      4241

    accuracy                           0.59      9862
   macro avg       0.60      0.57      0.57      9862
weighted avg       0.61      0.59      0.59      9862



In [None]:
# Сделать ЛОГРЕГ по странам (сделать subsetы по странам 2-3 страны и ок)
# Добавить комментарии 
# Исправить код (поменять порядок блоков там удалить лишнее хз)
# Доп задание да Betting Strategy?

In [63]:
# Define hyperparameters to tune
param_grid = {
    'C': np.logspace(-4, 4, 20),          # Regularization strength
    'penalty': ['l2'],              # Regularization type
    'solver': ['lbfgs']               
}

# Setup grid search
grid_search = GridSearchCV(model2, param_grid, cv=5, n_jobs=-1, verbose=1)

# Perform grid search on the scaled data
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validated score:", grid_search.best_score_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best parameters found: {'C': 0.615848211066026, 'penalty': 'l2', 'solver': 'lbfgs'}
Best cross-validated score: 0.6063709011730627


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
model2_tuned = LogisticRegression(multi_class='multinomial', solver='lbfgs',max_iter=500, penalty = 'l2', C=0.012742749857031334) # 'lbfgs' solver supports the multinomial option

In [None]:
model2_tuned_iter_predict = model2_tuned_iter.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, model2_tuned_iter_predict))

In [None]:
model2_tuned_penalty_predict = model2_tuned_penalty.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, model2_tuned_penalty_predict))

In [None]:
import matplotlib.pyplot as plt


# Create the histogram
plt.hist(dummy_train_df['Full_Time_Result'])
# Show the plot
plt.show()


In [None]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [None]:
undersampler = RandomUnderSampler(random_state=42)
X_train_res, y_train_res = undersampler.fit_resample(X_train, y_train)
model2_tuned_penalty = LogisticRegression(multi_class='multinomial', solver='lbfgs',max_iter=500, penalty = 'l2', C=0.012742749857031334) # 'lbfgs' solver supports the multinomial option

model2_tuned_penalty.fit(X_train_res, y_train_res)

model2_tuned_penalty_predict_resampled = model2_tuned_penalty.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, model2_tuned_penalty_predict_resampled))

In [None]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_res_SMOTE, y_train_res_SMOTE = smote.fit_resample(X_train, y_train)

model2_tuned_penalty.fit(X_train_res_SMOTE, y_train_res_SMOTE)

model2_tuned_penalty_predict_resampled = model2_tuned_penalty.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, model2_tuned_penalty_predict_resampled))

In [None]:
from imblearn.over_sampling import SMOTE
undersampler = RandomUnderSampler(random_state=42)
smote = SMOTE(random_state=42)
X_train_res, y_train_res = undersampler.fit_resample(X_train, y_train)
X_train_res_SMOTE_UNDER, y_train_res_SMOTE_UNDER = smote.fit_resample(X_train_res, y_train_res)
model2_tuned_penalty.fit(X_train_res_SMOTE_UNDER, y_train_res_SMOTE_UNDER)

model2_tuned_penalty_predict_resampled = model2_tuned_penalty.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, model2_tuned_penalty_predict_resampled))

In [None]:
feature_importance = model1.feature_importances_

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=3182c57a-4879-462a-8002-e0676ac18eff' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>