In [1]:
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from skopt.space import Real, Integer, Categorical
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from skopt import BayesSearchCV
import pandas as pd
import numpy as np

ModuleNotFoundError: No module named 'xgboost'

In [None]:
def pivot_approach(df, aggfunc='sum'):
    """Spread per-position statistics into one wide row per index value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``POSITION`` column. Every other column is treated
        as a statistic to aggregate. Rows are grouped by the values of
        ``df.index``.
    aggfunc : str or callable, default 'sum'
        Aggregation forwarded unchanged to ``pd.pivot_table``.

    Returns
    -------
    pandas.DataFrame
        Wide frame whose columns are named ``"{stat}_{pos}"`` for each
        (statistic, position) pair; combinations absent from ``df`` are
        filled with 0.
    """
    # Everything except the pivot key is a value column.
    stat_columns = df.columns.drop('POSITION')

    wide = pd.pivot_table(
        df,
        values=stat_columns,
        index=df.index,
        columns='POSITION',
        aggfunc=aggfunc,
        fill_value=0,
    )

    # pivot_table yields (stat, position) column tuples; collapse each pair
    # into a single flat label.
    flat_names = []
    for stat, pos in wide.columns:
        flat_names.append(f'{stat}_{pos}')
    wide.columns = flat_names

    return wide

In [None]:
# Team-level statistics; the first CSV column becomes the index.
train_home_team_statistics_df = pd.read_csv(
    "Train_Data/train_home_team_statistics_df.csv",
    index_col=0,
)
train_away_team_statistics_df = pd.read_csv(
    "Train_Data/train_away_team_statistics_df.csv",
    index_col=0,
)

# Player-level statistics, indexed the same way.
train_home_player_statistics_df = pd.read_csv(
    "Train_Data/train_home_player_statistics_df.csv",
    index_col=0,
)
train_away_player_statistics_df = pd.read_csv(
    "Train_Data/train_away_player_statistics_df.csv",
    index_col=0,
)

# Outcome labels for the training rows.
train_scores = pd.read_csv("Y_train_1rknArQ.csv", index_col=0)

In [None]:
# Remove the text identifier columns so only POSITION and the statistic
# columns remain for pivoting.
train_home_player_statistics_df = train_home_player_statistics_df.drop(
    columns=["LEAGUE", "TEAM_NAME", "PLAYER_NAME"]
)
train_away_player_statistics_df = train_away_player_statistics_df.drop(
    columns=["LEAGUE", "TEAM_NAME", "PLAYER_NAME"]
)

In [None]:
# Aggregate player statistics per position (default aggfunc: sum), home
# side first, then away.
pivoted_df_home, pivoted_df_away = (
    pivot_approach(player_stats)
    for player_stats in (
        train_home_player_statistics_df,
        train_away_player_statistics_df,
    )
)

In [None]:
# Keep every column from the third one onward (positional slice).
# NOTE(review): presumably the two skipped columns are non-numeric
# identifiers such as LEAGUE / TEAM_NAME — confirm against the CSV header.
train_home = train_home_team_statistics_df.iloc[:,2:]
train_away = train_away_team_statistics_df.iloc[:,2:]

In [None]:
# Impute missing team statistics with 0.
train_home, train_away = (
    team_stats.fillna(0) for team_stats in (train_home, train_away)
)

In [None]:
# Impute any remaining gaps in the pivoted player statistics with 0.
pivoted_df_home = pivoted_df_home.fillna(value=0)
pivoted_df_away = pivoted_df_away.fillna(value=0)

In [None]:
# Tag each pivoted column with its side so home and away features stay
# distinguishable once the frames are concatenated.
pivoted_df_home = pivoted_df_home.add_prefix('HOME_')
pivoted_df_away = pivoted_df_away.add_prefix('AWAY_')

In [None]:
# Tag team-level columns with their side before merging.
train_home.columns = 'HOME_' + train_home.columns
train_away.columns = 'AWAY_' + train_away.columns

# Column-wise merge of team and per-position player features; the inner join
# keeps only index values present in all four frames.
train_data =  pd.concat([train_home, pivoted_df_home ,train_away, pivoted_df_away],join='inner',axis=1)
#train_data =  pd.concat([train_home, train_away],join='inner',axis=1)

# Align the labels with the rows that survived the join.
train_scores = train_scores.loc[train_data.index]

# Infinite values are turned into NaN and then imputed with 0, matching the
# zero-imputation applied to the individual frames. Without the trailing
# fillna the replacement would reintroduce NaN after the earlier imputation,
# and the scikit-learn estimators fitted later in this notebook raise on NaN
# input.
train_data = train_data.replace([np.inf, -np.inf], np.nan).fillna(0)

In [None]:
# Single-column target: only the AWAY_WINS label is modelled.
train_new_y = train_scores.loc[:, 'AWAY_WINS']

In [None]:
# First split: 80% of the rows stay for training, the rest becomes the test set.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    train_new_y,
    train_size=0.8,
    random_state=42,
)

# Second split: carve a validation set out of the remaining training rows
# (80% train / 20% validation of that subset).
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    train_size=0.8,
    random_state=42,
)

In [None]:
# Base learners for the stacked ensemble, both seeded for reproducibility.
# NOTE(review): scikit-learn documents that the "sag" solver only converges
# quickly on features of similar scale — consider scaling the inputs.
model_lr = LogisticRegressionCV(
    solver="sag",
    random_state=42,
)
model_gb = GradientBoostingClassifier(
    random_state=42,
)
#model = MLPClassifier(max_iter=350)

In [None]:
# # Define the search spaces for each model
# gb_space = {
#     'n_estimators': Integer(10, 200),
#     'learning_rate': Real(0.001, 1.0, prior='log-uniform'),
#     'max_depth': Integer(1, 10),
#     'min_samples_split': Integer(2, 10),
#     'min_samples_leaf': Integer(1, 10),
# }

In [None]:
# model = BayesSearchCV(
#     model,
#     gb_space,
#     n_iter=5,
#     cv=3,
#     n_jobs=-1,
#     random_state=42,
#     verbose=2,
#     scoring="accuracy"
# )

In [None]:
# (name, estimator) pairs in the form StackingClassifier expects.
estimators = [("lr", model_lr), ("gb", model_gb)]

In [None]:
# Stack the base learners: their predict_proba outputs become the input
# features of a gradient-boosting meta-learner.
# The meta-learner is seeded like the base models (random_state=42) so that
# repeated runs of the notebook give the same fitted model and score.
model = StackingClassifier(
    estimators=estimators,
    final_estimator=GradientBoostingClassifier(random_state=42),
    #n_jobs=-1,
    stack_method="predict_proba"
)

In [None]:
# Fit the stacked ensemble on the training split only; X_valid and X_test
# are not passed to fit.
model.fit(X_train, y_train)

In [None]:
# print("val. score: %s" % model.best_score_)
# print("test score: %s" % model.score(X_test, y_test))
# print("best params: %s" % str(model.best_params_))

In [None]:
# Ground-truth label rows for the held-out test rows.
target = train_scores.loc[X_test.index].copy()

# Naive baseline: an all-zero frame shaped like `target` whose first column
# is set to 1 for every row (presumably the HOME_WINS column — confirm
# against the label file's column order).
home_wins = target * 0
home_wins.iloc[:, 0] = 1

# Share of rows where the baseline row matches the true label row exactly.
np.round(accuracy_score(home_wins, target), 4)

In [None]:
# The model was fit on the single AWAY_WINS label, so predict_proba yields
# two columns (presumably ordered as P(no away win), P(away win), following
# model.classes_ — confirm).
predictions = pd.DataFrame(model.predict_proba(X_test))

# Expand to three outcome columns laid out like `target` (assumed to be
# HOME_WINS, DRAW, AWAY_WINS — TODO confirm):
#   column 0 <- first probability, column 2 <- second probability,
#   column 1 <- constant 0, so the middle outcome is never predicted.
predictions[2] = 0
predictions.columns = [0, 2, 1]
predictions = predictions.reindex(columns=[0, 1, 2])

# One-hot encode the most probable outcome per row. argmax always selects
# exactly one column (the first one on ties); the previous
# `rank(..., ascending=False) == 1` test produced an all-zero row whenever
# the two probabilities tied, because tied values share the average rank 1.5.
best_outcome = predictions.to_numpy().argmax(axis=1)
predictions = np.eye(3, dtype=int)[best_outcome]

# accuracy_score is called in its documented (y_true, y_pred) order.
print(f"The score is: {np.round(accuracy_score(target, predictions),4)}")

Gradient Boosting Classifier (no optimization): 0.4921

XGBoost : 0.4852

MLPClassifier (max_iter=300) : 0.4808

LogisticRegression : 0.4821

Submission

In [None]:
# test_home = pd.read_csv("Test_Data/test_home_team_statistics_df.csv", index_col=0)
# test_away = pd.read_csv("Test_Data/test_away_team_statistics_df.csv", index_col=0)

# test_home.columns = 'HOME_' + test_home.columns
# test_away.columns = 'AWAY_' + test_away.columns

# test_data =  pd.concat([test_home,test_away],join='inner',axis=1)

In [None]:
# predictions = model.predict_proba(test_data)
# predictions = pd.DataFrame(predictions)

# predictions[2] = 0
# predictions.columns = [0,2,1]
# predictions = (predictions.reindex(columns=[0,1,2]).rank(1,ascending=False)==1).astype(int)

# predictions.columns = ['HOME_WINS', 'DRAW', 'AWAY_WINS']
# predictions.index = test_data.index
# submission = predictions.reset_index()
# submission.to_csv("benchmark_submission.csv", index=False)
