In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load the dataset
data = pd.read_csv('updated_dataset1.csv')

# Model inputs and the label column. 'home_score' and 'away_score' are not
# listed here, so the column selection below discards them; no separate drop
# is needed (presumably they are excluded because the scores determine the
# match result -- confirm).
features = [
    'home_team', 'away_team', 'tournament', 'city', 'country', 'neutral',
    'year', 'month', 'day', 'Continent', 'MaximumTemp_Month',
    'MinimumTemp_Month', 'HomeTeamRank', 'AwayTeamRank',
]
target = 'MatchResult'

# Take an explicit copy: the columns of `data` are overwritten in place by the
# encoding steps that follow, and assigning into a selection of another frame
# is what triggers pandas' SettingWithCopyWarning.
data = data[features + [target]].copy()

# Label-encode the categorical columns, keeping one fitted encoder per column.
# Refitting a single shared LabelEncoder leaves only the last column's classes
# available, so the integer codes of the other columns could no longer be
# decoded or re-applied to new rows.
# Each column is fitted independently, so a team's code in 'home_team' need
# not equal its code in 'away_team'.
categorical_columns = ['tournament', 'city', 'country', 'Continent', 'home_team', 'away_team']
encoders = {}
for column in categorical_columns:
    encoders[column] = LabelEncoder()
    data[column] = encoders[column].fit_transform(data[column])

# Kept for backward compatibility: this name previously ended up holding the
# encoder whose final fit was on 'away_team'.
label_encoder = encoders['away_team']

# Convert 'neutral' column to numerical (TRUE: 1, FALSE: 0)
data['neutral'] = data['neutral'].astype(int)

# Convert month names to their calendar number (January -> 1 ... December -> 12).
# NOTE: Series.map yields NaN for any value that is not a key of the mapping,
# so a differently spelled or already-numeric month becomes missing.
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
month_to_num = {month: num for num, month in enumerate(months, start=1)}
data['month'] = data['month'].map(month_to_num)

# Separate the label from the predictors, then hold out 20% of the rows as a
# test set; the fixed random_state makes the split repeatable.
y = data[target]
X = data.drop(columns=[target])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate hyper-parameters: 3 x 3 x 3 = 27 combinations
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[100, 200, 300],
)

# Base XGBoost classifier; the grid supplies the tuned settings
model = XGBClassifier()

# Score every combination by accuracy with 5-fold cross-validation
grid_search = GridSearchCV(model, param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Keep the estimator selected by the search
best_model = grid_search.best_estimator_

# Predict the held-out rows with the selected model and report the share of
# predictions that match the true labels.
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test Accuracy:", accuracy)

# Persist the selected model to disk with joblib
import joblib

model_path = 'xgboost_model3.joblib'
joblib.dump(best_model, model_path)


Test Accuracy: 0.7020890099909174


['xgboost_model3.joblib']