In [46]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)



In [47]:
# Read the complete dataset from a CSV path relative to the working directory.
# NOTE(review): DtypeWarning is silenced in the setup cell, so a column with
# mixed types would load without any notice - consider passing dtype= here.
df = pd.read_csv(filepath_or_buffer="MScCardsDatasetNew.csv")

# Plain-text preview of the first five rows.
print(df.head(n=5))

   fixture_id     season season_start_date season_end_date country league  \
0     1320059  2014-2015        2014-08-01      2015-05-22  France  FraL2   
1     1320060  2014-2015        2014-08-01      2015-05-22  France  FraL2   
2     1320061  2014-2015        2014-08-01      2015-05-22  France  FraL2   
3     1320062  2014-2015        2014-08-01      2015-05-22  France  FraL2   
4     1320063  2014-2015        2014-08-01      2015-05-22  France  FraL2   

   competition_level     kick_off_datetime       team1_name    team2_name  \
0                  2  2014-08-01T18:00:00Z            Arles       Ajaccio   
1                  2  2014-08-01T18:00:00Z       AJ Auxerre      Le Havre   
2                  2  2014-08-04T18:30:00Z            Brest      Clermont   
3                  2  2014-08-01T18:00:00Z  Gazelec Ajaccio  Valenciennes   
4                  2  2014-08-01T18:00:00Z      Chateauroux        Troyes   

   ...  stadium_surface  stadium_runningtrack  stadium_capacity  \
0  ... 

In [48]:
# Keep only the rows whose league code equals "EngPr"; the explicit copy makes
# the result independent of the full table it was cut from.
df = df.loc[df["league"].eq("EngPr")].copy()

In [49]:
# Target variables (presumably yellow-/red-card counts for each team - confirm
# against the dataset's data dictionary).
target_vars = ["team1_yc", "team2_yc", "team1_rc", "team2_rc"]

# Feature variables: raw columns kept as model inputs.
feature_vars = [
    "season",
    "kick_off_datetime",
    "team1_name", "team2_name",
    "referee",
    "attendance_value",
    "limited_audience",
    "stadium_surface",
    "stadium_capacity",
    "stadium_altitude",
    "distance",
    "team1_stadium_dist", "team2_stadium_dist"
]

# Final selection: targets first, then features. A missing column raises
# KeyError here, so a schema change surfaces immediately.
chosen_vars = target_vars + feature_vars

# Take an explicit copy (as the league filter does) so that the column
# assignments made in later cells operate on an independent frame and cannot
# be flagged as chained assignment on a slice (SettingWithCopyWarning).
df = df[chosen_vars].copy()

# Show the resulting column order.
list(df)

['team1_yc',
 'team2_yc',
 'team1_rc',
 'team2_rc',
 'season',
 'kick_off_datetime',
 'team1_name',
 'team2_name',
 'referee',
 'attendance_value',
 'limited_audience',
 'stadium_surface',
 'stadium_capacity',
 'stadium_altitude',
 'distance',
 'team1_stadium_dist',
 'team2_stadium_dist']

In [50]:
# Expand the kick-off timestamp into numeric calendar features and drop the raw
# column afterwards.
#
# The guard makes the cell safe to re-run: once "kick_off_datetime" has been
# consumed, a second execution is a no-op instead of a KeyError.
# The derived columns are built with a non-mutating assign/drop chain rather
# than column-by-column assignment plus an in-place drop.
#
# NOTE(review): the preview rows show timestamps with a trailing "Z", so the
# extracted hour / day-of-week are presumably in UTC, not local kick-off time -
# confirm whether a timezone conversion is wanted.
if "kick_off_datetime" in df.columns:
    # Unparseable values become NaT (errors="coerce") and propagate as missing
    # values into every derived column.
    kickoff = pd.to_datetime(df["kick_off_datetime"], errors="coerce")
    df = df.assign(
        kickoff_year=kickoff.dt.year,
        kickoff_month=kickoff.dt.month,
        kickoff_day=kickoff.dt.day,
        kickoff_dayOfWeek=kickoff.dt.dayofweek,  # Monday=0 ... Sunday=6
        kickoff_hour=kickoff.dt.hour,
    ).drop(columns=["kick_off_datetime"])

# Show the resulting column order.
list(df)

['team1_yc',
 'team2_yc',
 'team1_rc',
 'team2_rc',
 'season',
 'team1_name',
 'team2_name',
 'referee',
 'attendance_value',
 'limited_audience',
 'stadium_surface',
 'stadium_capacity',
 'stadium_altitude',
 'distance',
 'team1_stadium_dist',
 'team2_stadium_dist',
 'kickoff_year',
 'kickoff_month',
 'kickoff_day',
 'kickoff_dayOfWeek',
 'kickoff_hour']

In [51]:
# Encode the limited-audience flag as 0/1 integers.
#
# The conversion is skipped when the column already has an integer dtype.
# Without that check, re-running the cell would turn the already-encoded 1s
# into the string "1", which is not "TRUE", and silently reset every flag to 0.
if "limited_audience" in df.columns and not pd.api.types.is_integer_dtype(df["limited_audience"]):
    df["limited_audience"] = (
        df["limited_audience"]
        .astype(str).str.strip().str.upper()   # e.g. True / " true " -> "TRUE"
        .map({"TRUE": 1})                      # every other value becomes NaN
        .fillna(0)                             # ...and therefore 0 (missing values included)
        .astype(int)
    )
    

In [52]:
# Split the prepared frame into targets (y) and predictors (X).
# Both are explicit copies, so later edits to one never touch df or the other.
y = df.loc[:, target_vars].copy()
X = df.loc[:, ~df.columns.isin(target_vars)].copy()

# Columns routed through the numeric branch of the preprocessing.
numeric_features = [
    # match / venue measurements
    "attendance_value", "stadium_capacity", "stadium_altitude",
    "distance", "team1_stadium_dist", "team2_stadium_dist",
    # 0/1 flag, handled as a number
    "limited_audience",
    # calendar parts derived from the kick-off timestamp
    "kickoff_year", "kickoff_month", "kickoff_day",
    "kickoff_dayOfWeek", "kickoff_hour",
]

# Columns routed through the categorical branch of the preprocessing.
categorical_features = [
    "season",
    "team1_name", "team2_name",
    "referee",
    "stadium_surface",
]

In [53]:
# Keep only the configured feature names that are actually present in X,
# preserving the configured order. Names missing from X are skipped silently.
final_numeric_features = [name for name in numeric_features if name in X.columns]
final_categorical_features = [name for name in categorical_features if name in X.columns]

# Numeric branch: fill missing values with the column median, then standardise
# each column (centre on the mean, scale to unit variance).
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ]
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" means a category not seen
# during fit does not raise at transform time.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each group of columns through its own branch. No `remainder` is given,
# so the ColumnTransformer default applies to any column not listed here.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, final_numeric_features),
        ("cat", categorical_transformer, final_categorical_features),
    ]
)