# Projet foot

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import r2_score, f1_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

## data
Load and process all the data.
### load data

In [7]:
# Forward slashes work on every OS and avoid the invalid "\m" escape sequence
# that the backslash-separated paths contained.
data = pd.read_csv("data/matchs_2013_2022.csv", sep=",")
# "dataAPredire" = data to predict; its columns are later used to pick the
# feature columns out of `data`.
dataAPredire = pd.read_csv("data/match_2023.csv", sep=",")
# Target column used by the modelling helpers below.
y = data["results"]

def remove_categorical():
    """Build the feature frame with every text column replaced by integer codes.

    Reads the module-level ``data`` and ``dataAPredire`` frames: the columns of
    ``dataAPredire`` are selected from ``data`` and the ``date`` column is
    dropped. Each remaining ``object``-dtype column is then label-encoded
    independently of the others.

    Returns:
        pandas.DataFrame: a new frame holding the selected columns, with the
        ``object`` columns overwritten by their label-encoded values.
    """
    X = data[dataAPredire.columns].drop("date", axis=1)

    # Text columns must be turned into numbers before they can feed a model.
    categorical_columns = X.select_dtypes(include=['object']).columns

    # One fresh encoder per column keeps each column's code space independent.
    # (The previous one-hot step ran on an always-empty frame and could not
    # have worked on real columns: OneHotEncoder needs 2D input, while
    # DataFrame.apply hands it one 1D Series at a time.)
    for column in categorical_columns:
        X[column] = LabelEncoder().fit_transform(X[column])

    return X

def plot_correlation_matrix(data):
    """Display an annotated heatmap of the pairwise column correlations.

    Args:
        data: pandas DataFrame to analyse. Only its numeric and boolean
            columns take part in the correlation; other columns are ignored.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # pandas >= 2.0 no longer drops non-numeric columns silently in corr();
    # it raises instead, so restrict the frame explicitly beforehand.
    numeric_data = data.select_dtypes(include=["number", "bool"])
    correlation_matrix = numeric_data.corr()
    plt.figure(figsize=(20, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
    plt.show()

def create_model(X, y, random_state=42):
    """Fit an imputer + random-forest pipeline on the training part of ``X``.

    The data is split 80/20 and only the training part is used for fitting.
    The imputation strategy depends on how much is missing in that training
    part: column means when every column has fewer than 10% missing values,
    otherwise a constant ``0``.

    Args:
        X: feature matrix (DataFrame or array-like).
        y: target labels aligned with ``X``.
        random_state: seed for the random forest, so repeated runs give the
            same model. Defaults to 42.

    Returns:
        sklearn.pipeline.Pipeline: the fitted pipeline (imputer followed by
        the forest). Returning the pipeline rather than the bare forest keeps
        the imputation step attached to the model for later ``fit`` and
        ``predict`` calls.
    """
    # Same test_size and seed as test_data(), so both functions hold out the
    # same rows; the held-out part is not needed here.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForestClassifier(random_state=random_state)

    # Count missing values on the training rows only, so the threshold and the
    # counts refer to the same data. Wrapping in a DataFrame also makes this
    # work when X is a plain array.
    max_missing = 0.1 * len(X_train)
    missing_values = pd.DataFrame(X_train).isna().sum()

    if (missing_values < max_missing).all():
        imputer = SimpleImputer(strategy='mean')
    else:
        imputer = SimpleImputer(strategy='constant', fill_value=0)

    pipeline = Pipeline([
        ('imputer', imputer),
        ('model', model)
    ])

    pipeline.fit(X_train, y_train)
    return pipeline

def get_accuracy_with_model(model, X_train, X_test, y_train, y_test):
    """Score ``model`` on a hold-out set after standardising the features.

    A ``StandardScaler`` is fitted on ``X_train`` only and then applied to
    both feature sets. ``model`` is fitted on the scaled training features
    (the passed object itself is fitted) and evaluated on the scaled test
    features.

    Returns:
        The ``accuracy_score`` of the predictions against ``y_test``.
    """
    standardizer = StandardScaler().fit(X_train)
    train_scaled = standardizer.transform(X_train)
    test_scaled = standardizer.transform(X_test)

    model.fit(train_scaled, y_train)
    predictions = model.predict(test_scaled)
    return accuracy_score(y_test, predictions)

def test_data(X, y, model=None):
    """Return the hold-out accuracy of ``model`` on an 80/20 split of ``X``/``y``.

    Args:
        X: feature matrix.
        y: target labels aligned with ``X``.
        model: estimator to evaluate. When ``None``, one is obtained from
            ``create_model(X, y)``.

    Returns:
        The value returned by ``get_accuracy_with_model`` for that split.
    """
    # train_test_split returns [X_train, X_test, y_train, y_test], which is
    # exactly the positional order get_accuracy_with_model expects.
    splits = train_test_split(X, y, test_size=0.2, random_state=42)
    estimator = create_model(X, y) if model is None else model
    return get_accuracy_with_model(estimator, *splits)

In [9]:
from fonctions_tests import add_manager_win_percentage
import sklearn.discriminant_analysis as DA

# The return value is discarded, so this is called for its effect on `data`.
# NOTE(review): presumably adds the two *_manager_win_percentage columns that
# are selected just below — confirm in fonctions_tests.
add_manager_win_percentage(data)

# Baseline feature set: attendance plus the two manager win percentages.
X = data[["attendance", "home_club_manager_win_percentage", "away_club_manager_win_percentage"]]
# Replace missing values with each column's mean. By default fit_transform
# returns an array, so X stops being a DataFrame here.
# NOTE(review): the means are computed on all rows, including the ones that
# test_data() later holds out for evaluation.
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Hold-out accuracy of a linear discriminant analysis classifier on these features.
test_data(X, y, model=DA.LinearDiscriminantAnalysis())

0.5392156862745098

### Process data
Create the different pipelines:

In [3]:
# imports

from sklearn.pipeline import Pipeline

# Placeholder pipeline: no steps have been added yet.
easy_pipeline = Pipeline(steps=[])

In [4]:
from sklearn.model_selection import train_test_split
# Create splits



## Model
### imports

### fit and train model

## predict