In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  # Or any other model
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [6]:
df = pd.read_csv('ML_data.csv')

In [7]:
# Encode 'HomeTeam' and 'AwayTeam'
le_home = LabelEncoder()
le_away = LabelEncoder()
df['HomeTeam_encoded'] = le_home.fit_transform(df['HomeTeam'])
df['AwayTeam_encoded'] = le_away.fit_transform(df['AwayTeam'])

le_season = LabelEncoder()
df['season_encoded'] = le_season.fit_transform(df['season'])

In [14]:
# Prepare features and target variable
features = ['season_encoded', 'HomeTeam_encoded', 'AwayTeam_encoded', 'HomeTeamStrength', 'AwayTeamStrength', 
            'avgHG', 'avgAG', 'avgHHG', 'avgHAG', 'avgHST', 'avgAST', 'avgHR', 'avgAR', 'FTR']
df_ML = df[features]
df_ML.dropna()

Unnamed: 0,season_encoded,HomeTeam_encoded,AwayTeam_encoded,HomeTeamStrength,AwayTeamStrength,avgHG,avgAG,avgHHG,avgHAG,avgHST,avgAST,avgHR,avgAR,FTR
0,5,10,17,8.0,13.0,0.666667,0.666667,0.000000,0.333333,5.666667,3.333333,0.000000,0.000000,D
1,5,18,0,1.0,1.0,1.333333,1.666667,0.666667,1.000000,4.666667,5.666667,0.000000,0.000000,D
2,5,9,1,8.0,4.0,2.000000,0.666667,1.333333,0.666667,6.333333,5.666667,0.000000,0.333333,D
3,5,5,3,1.0,13.0,1.666667,0.333333,0.666667,0.333333,7.333333,4.000000,0.000000,0.000000,H
4,5,14,19,1.0,13.0,2.000000,0.000000,1.333333,0.000000,5.000000,0.333333,0.333333,0.333333,H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1503,0,20,5,16.0,1.0,0.857143,1.785714,0.428571,0.785714,3.714286,6.428571,0.000000,0.000000,A
1504,0,17,12,13.0,12.0,1.214286,1.142857,0.428571,0.428571,3.500000,3.500000,0.071429,0.000000,A
1505,0,21,11,8.0,2.0,2.142857,1.500000,1.000000,0.500000,5.000000,3.142857,0.000000,0.142857,A
1506,0,3,19,14.0,7.0,1.142857,1.642857,0.714286,0.642857,5.285714,4.928571,0.142857,0.357143,A


In [17]:
#Split the data into training and testing sets
X= df_ML.drop('FTR', axis=1)
y = df_ML['FTR']

# Convert the target to numerical classes (0, 1, 2)
y = y.map({'H': 0, 'D': 1, 'A': 2})




##RandomForestClassifier

In [36]:


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.18, random_state=42)

#Train a RandomForestClassifier (or any other model)
model = RandomForestClassifier(n_estimators=330, class_weight={2:1, 1:3.5, 0:1}, max_depth = 300,random_state=42)
model.fit(X_train, y_train)
#Make predictions and evaluate the model
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.55      0.65      0.60       112
           1       0.32      0.17      0.22        65
           2       0.52      0.58      0.55        95

    accuracy                           0.51       272
   macro avg       0.47      0.47      0.46       272
weighted avg       0.49      0.51      0.49       272



##Logistic Regression

In [69]:
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=71)
model = LogisticRegression(penalty='l2', multi_class='multinomial', solver='lbfgs', C=0.2, max_iter=400, class_weight='balanced', random_state=71)
#(multi_class='ovr', class_weight='balanced')

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.62      0.64       221
           1       0.37      0.33      0.35       126
           2       0.56      0.66      0.60       181

    accuracy                           0.56       528
   macro avg       0.53      0.53      0.53       528
weighted avg       0.56      0.56      0.56       528





##XGBoost

In [76]:
import xgboost as xgb

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Initialize the XGBoost classifier
model = xgb.XGBClassifier(
    use_label_encoder=False,  # Suppress warnings about the label encoder
    max_depth = 1000,
    eval_metric='mlogloss',   # Evaluation metric for multi-class classification
    class_weight='balanced',  # Handle class imbalance
    random_state=71
)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

Parameters: { "class_weight", "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0       0.47      0.57      0.51       151
           1       0.40      0.22      0.28        97
           2       0.47      0.52      0.49       129

    accuracy                           0.46       377
   macro avg       0.45      0.44      0.43       377
weighted avg       0.45      0.46      0.45       377



##KNN Classifier

In [87]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_neighbors': [3, 5, 7, 10],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# Initialize the KNN classifier
knn = KNeighborsClassifier()

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
grid_search.fit(X_train, y_train)

# Make predictions on the test set
y_pred = grid_search.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END ....algorithm=auto, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END ....algorithm=auto, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END .....algorithm=auto, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END ....algorithm=auto, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END .....algorithm=auto, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END .....algorithm=auto, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END .....algorithm=auto, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END .....algorithm=auto, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END ....algorithm=auto, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END ....algorithm=auto, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END .....algorithm=auto, n_neighbors=5, weights=uniform; total time=   0.0s
[CV] END .....algorithm=auto, n_neighbors=5, we