In [None]:
import pandas as pd
# `from ... import ...` is required to pull individual names out of a module;
# the previous `import ... import ...` form was a SyntaxError.
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# load data
games = pd.read_csv('../data/processed_games_with_features.csv')

# prepare feature matrix and target variable
feature_columns = [
    'home_team_form',
    'away_team_form',
    'home_team_rating',
    'away_team_rating',
    'home_team_injuries',
    'away_team_injuries',
]
X = games[feature_columns]

# Encode the 'FTR' column as integer class labels: 'H' -> 1, 'A' -> 0, 'D' -> 2.
# NOTE(review): presumably home win / away win / draw -- confirm against the data source.
result_codes = {'H': 1, 'A': 0, 'D': 2}
y = games['FTR'].map(result_codes)

# Series.map turns every value that is missing from the mapping (including
# missing results) into NaN. Fail here with a clear message instead of letting
# NaN targets surface later as an obscure error during model fitting.
unmapped = y.isna()
if unmapped.any():
    bad_values = games.loc[unmapped, 'FTR'].unique().tolist()
    raise ValueError(
        f"Cannot encode {int(unmapped.sum())} row(s) with unexpected 'FTR' values: {bad_values}"
    )

# split data
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in the train and test sets, so a less frequent class is
# not under- or over-represented in the held-out data by chance.
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# define Random Forest model
# Candidate hyperparameters: 3 x 3 x 3 = 27 combinations in total.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
)

# Fixed seed so the forest itself is reproducible between runs.
rf_model = RandomForestClassifier(random_state=42)

# Exhaustive search over the grid, scoring each combination by
# 5-fold cross-validated accuracy on the training data only.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# evaluate model
# Integer class codes and the 'FTR' letters they were encoded from
# ('A' -> 0, 'H' -> 1, 'D' -> 2). Passing them explicitly keeps the row/column
# order fixed and keeps every class in the output even if one of them is
# absent from the test split or never predicted.
class_codes = [0, 1, 2]
class_names = ['A', 'H', 'D']

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# classification report
print(classification_report(y_test, y_pred, labels=class_codes, target_names=class_names))

# confusion matrix
# Rows are the true classes and columns the predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_codes)
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

# save model
import pickle
from pathlib import Path

model_path = Path('../models/trained_model.pkl')

# Create the output directory first: open() does not create missing parent
# directories, and failing here would throw away the result of the whole
# grid search.
model_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE: pickle files can execute arbitrary code when loaded; only load this
# file back from a trusted location.
with model_path.open('wb') as f:
    pickle.dump(best_model, f)
