# 1. EXPLORATION DES DONNEES


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Load the player dataset; the first CSV column becomes the row index.
fifa = pd.read_csv("all_players.csv", index_col=0)
# Preview the first rows.
fifa.head()


In [None]:
# Summary statistics of the dataset (pandas `describe` defaults).
fifa.describe()

In [None]:
# `DataFrame.info()` writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
fifa.info()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Per-column number of missing cells and the corresponding share of rows.
missing_data = fifa.isnull().sum()
missing_percentage = missing_data / len(fifa) * 100

missing_df = pd.DataFrame({
    'Total Manquants': missing_data,
    'Pourcentage (%)': missing_percentage
})

# Keep only the columns that actually have gaps, worst offenders first.
has_missing = missing_df['Total Manquants'] > 0
missing_df = missing_df.loc[has_missing].sort_values(by='Pourcentage (%)', ascending=False)

print("R√©sum√© des valeurs manquantes :")
print(missing_df)

# Cell-level map of the gaps (one row per player, one column per variable).
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(fifa.isnull(), cbar=False, yticklabels=False, cmap='viridis', ax=ax)
ax.set_title('Carte des valeurs manquantes (Jaune = Manquant)')
plt.show()

In [None]:
from sklearn.compose import make_column_selector
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# Numeric columns of the raw frame.
numerical_selector = make_column_selector(dtype_include=['number'])
num_col = numerical_selector(fifa)

# Text / categorical columns of the raw frame.
categorical_selector = make_column_selector(dtype_include=['object', 'category'])
cat_col = categorical_selector(fifa)

for column_names in (cat_col, num_col):
    print(column_names)


In [None]:

import matplotlib.pyplot as plt
import seaborn as sns


# Bars are ordered from the most to the least frequent position.
position_order = fifa['Position'].value_counts().index

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=fifa, x='Position', order=position_order, palette='viridis', ax=ax)
ax.set_title('Distribution des modalit√©s de la variable Position')
plt.show()

# 2. ANALYSE UNIVARIEE & BIVARIEE


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# Number of distinct values per categorical column, largest first.
cardinality = fifa[cat_col].nunique().sort_values(ascending=False)

# Figure height grows with the number of categorical columns.
fig, ax = plt.subplots(figsize=(10, len(cat_col) * 0.4 + 2))

# Colour code: red for high cardinality (> 50), sky blue otherwise.
colors = ['red' if n_unique > 50 else 'skyblue' for n_unique in cardinality.values]

sns.barplot(x=cardinality.values, y=cardinality.index, palette=colors, ax=ax)

# Write the exact count at the end of each bar.
for bar_row, n_unique in enumerate(cardinality.values):
    ax.text(n_unique + 1, bar_row, str(n_unique), color='black', va='center', fontweight='bold')

ax.set_title('Nombre de modalit√©s uniques par variable cat√©gorielle', fontsize=14)
ax.set_xlabel('Nombre de modalit√©s uniques')
ax.set_ylabel('Variables')
ax.grid(axis='x', linestyle='--', alpha=0.7)

fig.tight_layout()
plt.show()

* Seules les variables Position et Preferred foot sont des variables catégorielles pertinentes pour notre analyse.

* Les variables Height et Weight sont en réalité des variables numériques (stockées sous forme de texte, à convertir).

* Vérifier que les noms ne sont pas dupliqués.



In [None]:
# Flag every occurrence of a name that appears more than once.
mask_dups = fifa["Name"].duplicated(keep=False)
dups = fifa.loc[mask_dups]
# Sort by name (descending) so that homonyms end up next to each other.
dups_sorted = dups.sort_values(by="Name", ascending=False)
display(dups_sorted.loc[:, ["Name", "Position"]])
 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import math


# Grid layout: three plots per row, as many rows as needed.
n_cols_plot = 3
n_rows_plot = math.ceil(len(num_col) / n_cols_plot)

fig, axes = plt.subplots(n_rows_plot, n_cols_plot, figsize=(15, 4 * n_rows_plot))
axes = axes.flatten()

# One histogram (with KDE overlay) per numeric column.
for panel, col in zip(axes, num_col):
    sns.histplot(fifa[col], bins=50, kde=True, ax=panel, color='teal')
    panel.set_title(f'Distribution : {col}')
    panel.set_xlabel(col)
    panel.set_ylabel('Nombre de joueurs')

# Remove the unused panels of the last row.
for spare_panel in axes[len(num_col):]:
    fig.delaxes(spare_panel)

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import math

# Same grid layout as the histograms: three plots per row.
n_cols_plot = 3
n_rows_plot = math.ceil(len(num_col) / n_cols_plot)

fig, axes = plt.subplots(n_rows_plot, n_cols_plot, figsize=(15, 4 * n_rows_plot))
axes = axes.flatten()

# One horizontal boxplot per numeric column, with the mean marked.
for panel, col in zip(axes, num_col):
    sns.boxplot(x=fifa[col], ax=panel, color='lightblue', showmeans=True)
    panel.set_title(f'Boxplot : {col}')
    panel.set_xlabel(col)

# Remove the unused panels of the last row.
for spare_panel in axes[len(num_col):]:
    fig.delaxes(spare_panel)

plt.tight_layout()
plt.show()

# 3. ENTRAINEMENT ET EVALUATION DU MODELE

## 3.1 Preprocessing des données

### 3.1.1 Traitement de "play style" et "Alternative positions"

In [None]:

# Columns whose cells hold a comma-separated list of labels.
complex_cols = ['Alternative positions', 'play style']

for col in complex_cols:
    # Guard clause: report and skip a column that is absent from the frame.
    if col not in fifa.columns:
        print(f"Attention : La colonne '{col}' n'a pas √©t√© trouv√©e dans le DataFrame.")
        continue

    print(f"--- Analyse des modalit√©s pour : {col} ---")

    # One entry per individual label: split each cell, flatten, trim whitespace.
    label_lists = fifa[col].dropna().astype(str).str.split(',')
    all_modalities = label_lists.explode().str.strip()
    counts = all_modalities.value_counts()

    print(f"Nombre de modalit√©s uniques trouv√©es : {len(counts)}")
    print("Les 10 plus fr√©quentes :")
    print(counts.head(10))
    print("\nToutes les modalit√©s :")
    print(counts.index.tolist())
    print("\n" + "="*50 + "\n")

In [None]:
def one_hot_encode_list_column(df, col_name, separator=','):
    """Append 0/1 indicator columns for a column holding delimited label lists.

    Missing cells are treated as empty lists. Every space is removed from the
    text before splitting, so a multi-word label loses its internal spaces
    (e.g. "Power Shot" becomes "PowerShot").

    Args:
        df: Source DataFrame; it is not modified.
        col_name: Name of the column containing the delimited labels.
        separator: Delimiter between labels inside a cell.

    Returns:
        A new DataFrame: `df` followed by one "<col_name>_<label>" column per
        distinct label. The original `col_name` column is kept.
    """
    as_text = df[col_name].fillna('').astype(str)
    compact = as_text.str.replace(' ', '')
    indicator_cols = compact.str.get_dummies(sep=separator).add_prefix(f"{col_name}_")
    return pd.concat([df, indicator_cols], axis=1)


# Preview of the encoding on 'play style' only; `fifa` itself is left unchanged
# because the helper returns a new frame.
ALTP = one_hot_encode_list_column(fifa, 'play style')

ALTP.head(10)


## 3.2 Création du modèle

### 3.2.1 Modèle 1.0 (KNN)

On n'utilise que les colonnes numériques pour ce modèle.

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import numpy as np
 
# Target as a 1-D Series: passing a one-column DataFrame makes scikit-learn
# emit a "column-vector y was passed" warning on every fit.
y = fifa["Position"]
# Features: columns 2..40 of the raw frame, minus the target.
# NOTE(review): this positional slice presumably covers only numeric columns,
# as the text above states - confirm against the CSV layout.
X = fifa.iloc[:, 2:41].drop(columns=["Position"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Mean cross-validated score for each candidate k. The estimator is handed to
# cross_val_score unfitted: it is fitted on each fold internally, so fitting it
# on the whole training set beforehand was wasted work.
n_neighbors_values = list(range(1, 35))
cross_values = [
    cross_val_score(KNeighborsClassifier(n_neighbors=k), X_train, y_train).mean()
    for k in n_neighbors_values
]

best_idx = int(np.argmax(cross_values))
best_n_neighbors = n_neighbors_values[best_idx]
best_score = cross_values[best_idx]

plt.figure(figsize=(10,5))
plt.plot(n_neighbors_values, cross_values)
plt.xticks(np.arange(min(n_neighbors_values), max(n_neighbors_values)+1, 1.0))
plt.xlabel("n_neighbors")
plt.ylabel("Accuracy moyenne (CV)")
print(f"Best neighbors : {best_n_neighbors}, Score : {best_score}")
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
 
# Use the k selected by the sweep in the previous cell instead of a hard-coded
# value, so this cell stays consistent when the data or the split changes.
model = KNeighborsClassifier(n_neighbors=best_n_neighbors)

# Cross-validated score on the training set (the estimator is fitted per fold).
cross_value = cross_val_score(model, X_train, y_train)
print(cross_value.mean())

# Final fit on the whole training set, then evaluation on the held-out test set.
model_fit = model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# The test accuracy was previously computed but never shown (it was not the
# last expression of the cell).
print(f"accuracy: {model.score(X_test, y_test)}")

# Draw on an explicit Axes: from_predictions creates its own figure otherwise,
# which left the 30x30 figure empty and the matrix at default size.
fig, ax = plt.subplots(figsize=(30, 30))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
plt.show()

In [None]:

# NOTE(review): this is a second definition of the helper already defined in
# section 3.1.1; it silently shadows the first one - keep a single definition.
def one_hot_encode_list_column(df, col_name, separator=','):
    """Append 0/1 indicator columns for a column holding delimited label lists.

    Missing cells are treated as empty lists. Every space is removed from the
    text before splitting, so a multi-word label loses its internal spaces
    (e.g. "Power Shot" becomes "PowerShot").

    Args:
        df: Source DataFrame; it is not modified.
        col_name: Name of the column containing the delimited labels.
        separator: Delimiter between labels inside a cell.

    Returns:
        A new DataFrame: `df` followed by one "<col_name>_<label>" column per
        distinct label. The original `col_name` column is kept.
    """
    as_text = df[col_name].fillna('').astype(str)
    compact = as_text.str.replace(' ', '')
    indicator_cols = compact.str.get_dummies(sep=separator).add_prefix(f"{col_name}_")
    return pd.concat([df, indicator_cols], axis=1)

In [None]:
import pandas as pd
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score


# Target vector.
y = fifa['Position'].copy()

# Columns removed from the features (judged not relevant or non-atomic by the
# author); 'Position' is removed because it is the target.
cols_to_drop = [
    "url",
    "GK Diving", "GK Handling", "GK Kicking", "GK Positioning", "GK Reflexes",
    'Name', 'Nation', 'League', 'Team', 'Rank',
    'Position',
]

fifa_cleaned = fifa.drop(columns=cols_to_drop)

# Height / Weight are text: keep the first run of digits and cast it to int.
for measure in ("Height", "Weight"):
    fifa_cleaned[measure] = fifa[measure].str.extract(r'(\d+)').astype(int)

# Expand the two list-valued columns into indicator columns, then drop the
# original text columns.
for list_col in ('play style', 'Alternative positions'):
    fifa_cleaned = one_hot_encode_list_column(fifa_cleaned, list_col)
fifa_cleaned = fifa_cleaned.drop(columns=['play style', 'Alternative positions'])

# Split the remaining columns by dtype for the preprocessing pipeline.
numerical_selector = make_column_selector(dtype_include=['number'])
num_features = numerical_selector(fifa_cleaned)
categorical_selector = make_column_selector(dtype_include=['object', 'category'])
cat_features = categorical_selector(fifa_cleaned)

X = fifa_cleaned[num_features + cat_features].copy()

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train


In [None]:
# Preprocessor: scale the numeric features, one-hot encode the categorical ones
# (categories unseen at fit time are ignored at predict time).
numeric_step = ('num', StandardScaler(), num_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Full pipeline: preprocessing followed by a 5-nearest-neighbours classifier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', KNeighborsClassifier(n_neighbors=5)),
])



In [None]:
# 5-fold cross-validation of the KNN pipeline on the training set; prints the
# per-fold accuracies, then their mean and standard deviation.
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
print(f"Scores Accuracy par fold: {cv_scores}")
print(f"Accuracy moyenne: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

In [None]:
from sklearn.model_selection import GridSearchCV


# Hyper-parameters explored for the 'model' step of the pipeline.
param_grid = dict(
    model__n_neighbors=[3, 5, 7, 9, 11],
    model__weights=['uniform', 'distance'],
    model__metric=['euclidean', 'manhattan'],
)

# Exhaustive search scored by 5-fold accuracy, using all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print(f"Meilleurs param√®tres: {grid_search.best_params_}")
print(f"Meilleur score Accuracy: {grid_search.best_score_:.4f}")
grid_search

In [None]:
from sklearn.linear_model import SGDClassifier

# Same preprocessing as the KNN pipeline, with a linear model trained by SGD.
pipeline2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', SGDClassifier(random_state=42, max_iter=1000))
])

# 5-fold cross-validated accuracy on the training set.
cv_scores = cross_val_score(pipeline2, X_train, y_train, cv=5, scoring='accuracy')
print(f"Scores Accuracy par fold: {cv_scores}")
print(f"Accuracy moyenne: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

In [None]:
# Hyper-parameter grid for the SGD pipeline: 3 x 4 x 3 x 3 x 2 = 216
# combinations, each evaluated with 5-fold cross-validation.
# NOTE(review): eta0 is presumably ignored when learning_rate='optimal', which
# would make part of the grid redundant - confirm against the SGDClassifier docs.
# NOTE(review): if 'hinge' wins, the best model presumably has no predict_proba,
# which the prediction helper in section 4 relies on - verify.
param_grid2 = {
    'model__loss': ['hinge', 'log_loss', 'modified_huber'],  # 'hinge' = linear SVM, 'log_loss' = logistic regression
    'model__alpha': [0.0001, 0.001, 0.01, 0.1],              # regularisation multiplier (per scikit-learn's SGDClassifier docs)
    'model__penalty': ['l1', 'l2', 'elasticnet'],
    'model__learning_rate': ['constant', 'optimal', 'adaptive'],
    'model__eta0': [0.01, 0.1]                               # initial learning rate
}
grid_search2 = GridSearchCV(pipeline2, param_grid2, cv=5, scoring='accuracy', n_jobs=-1)
grid_search2.fit(X_train, y_train)
print("--- SGD Classifier ---")
print(f"Meilleurs param√®tres: {grid_search2.best_params_}")
print(f"Meilleur score Accuracy: {grid_search2.best_score_:.4f}")

In [None]:
from sklearn.metrics import (
    confusion_matrix, 
    classification_report, 
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score
)

# Best model found by the SGD grid search
best_model = grid_search2.best_estimator_

# Predictions on the held-out test set; `y_pred` is reused by the analysis
# cells that follow.
y_pred = best_model.predict(X_test)
print(f"accuracy: {best_model.score(X_test, y_test)}")
best_model

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix. The class order is taken from the model and passed to
# confusion_matrix explicitly: without `labels=`, the matrix is built from the
# labels found in y_test / y_pred, so the tick names could be shifted (or the
# shapes could differ) whenever a class is absent from the test split.
class_names = best_model.classes_
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_names)

plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names
)
plt.title('Matrice de Confusion')
plt.ylabel('Vraie √©tiquette')
plt.xlabel('√âtiquette pr√©dite')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import classification_report
import pandas as pd

# output_dict=True returns a dictionary that converts directly to a DataFrame.
report_dict = classification_report(y_test, y_pred, output_dict=True)

# One row per label / aggregate, one column per metric, rounded for readability.
report_df = pd.DataFrame(report_dict).transpose().round(2)

print(report_df)

# Per-class rows sorted by ascending recall, so the problematic classes come
# first. The last three rows are dropped (presumably the accuracy / macro avg /
# weighted avg aggregates).
print("\n--- Tri√©es par Recall (plus faible en premier) ---")
per_class_rows = report_df.iloc[:-3]
per_class_rows.sort_values(by='recall')

# Analyse des performances par classe

## 🔴 Classes problématiques (Recall < 50%)

| Classe | Recall | Diagnostic |
|--------|--------|------------|
| LW | 42% | Plus de la moitié des ailiers gauches sont mal classés |
| RW | 47% | Même problème pour les ailiers droits |

## 🟡 Classes moyennes (Recall 70-95%)

| Classe | Recall | Note |
|--------|--------|------|
| CAM | 71% | Confondu avec d'autres milieux offensifs |
| CDM | 89% | Parfois pris pour CB |
| LB | 91% | Correcte dans l'ensemble |
| RM | 94% | Correcte dans l'ensemble |


## 🟢 Classes excellentes (Recall ≥ 95%)

| Classe | Recall | Pourquoi ? |
|--------|--------|------------|
| GK | 100% | Stats uniques (arrêts, plongeons, etc.) |
| CB | 98% | Profil défensif très marqué |
| ST | 97% | Stats d'attaque/finition distinctes |
| CM | 97% | Position centrale bien définie |
| RB | 96% | Position latérale bien définie |
| LM | 96% | Position latérale bien définie |

### Hypothèse 1 : Biais de déséquilibre des classes

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report


report_dict = classification_report(y_test, y_pred, output_dict=True)
df_metrics = pd.DataFrame(report_dict).transpose()

# Drop the last three rows (presumably the aggregate rows), then order the
# classes by support.
df_metrics = df_metrics.iloc[:-3]
df_sorted = df_metrics.sort_values(by='support')

fig, ax = plt.subplots(figsize=(12, 6))

# One curve per metric, plotted against the class support.
curve_styles = {
    'precision': dict(marker='o', linestyle='-', color='dodgerblue', label='Precision'),
    'recall': dict(marker='s', linestyle='--', color='darkorange', label='Recall'),
}
for metric, style in curve_styles.items():
    ax.plot(df_sorted['support'], df_sorted[metric], linewidth=2, **style)

ax.set_title('√âvolution de la Performance selon le Support (Taille de classe)')
ax.set_xlabel('Support (Nombre de joueurs)')
ax.set_ylabel('Score (0 √† 1)')
ax.legend()
ax.grid(True, alpha=0.3)

# Label each point with its class name, just above the recall marker.
for class_label, metrics_row in df_sorted.iterrows():
    ax.text(metrics_row['support'], metrics_row['recall'] + 0.02, class_label, fontsize=9, ha='center')

plt.show()

### Hypothèse 2 : Similitude entre certaines classes

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import numpy as np

classes = best_model.classes_
cm = confusion_matrix(y_test, y_pred, labels=classes)

# Row-wise normalisation: each row becomes the percentage split of the
# predictions made for that true class.
row_totals = cm.sum(axis=1, keepdims=True)
cm_pct = cm.astype('float') / row_totals * 100

# Labelled DataFrame for easier slicing.
df_cm = pd.DataFrame(cm_pct, index=classes, columns=classes)

# Keep only the true classes under study ...
target_rows = ['LW', 'RW', 'CAM', 'CDM', 'LM', 'RM']
df_focused = df_cm.loc[target_rows]

# ... and only the predicted classes that receive more than 1% for at least
# one of them.
cols_active = df_focused.columns[(df_focused > 1).any(axis=0)]
df_focused = df_focused.loc[:, cols_active]

# Heatmap
fig, ax = plt.subplots(figsize=(14, 5))
sns.heatmap(df_focused, annot=True, fmt='.1f', cmap='YlOrRd',
            cbar_kws={'label': 'Pourcentage de pr√©diction (%)'}, ax=ax)

ax.set_title('Analyse des Confusions : O√π le mod√®le classe-t-il vraiment ces joueurs ?')
ax.set_xlabel('Classe Pr√©dite par le mod√®le')
ax.set_ylabel('Vraie Classe du joueur')
plt.yticks(rotation=0)
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Keep the wingers only.
is_winger = fifa['Position'].isin(['LW', 'RW'])
wingers = fifa[is_winger]

# Two side-by-side panels.
fig, ax = plt.subplots(1, 2, figsize=(14, 6))
foot_ax, weak_foot_ax = ax

# Left panel: number of right- and left-footed wingers.
sns.countplot(data=wingers, x='Preferred foot', hue='Preferred foot', palette='Set1', ax=foot_ax)
foot_ax.set_title('R√©partition Droitiers/Gauchers chez les Ailiers')
foot_ax.set_ylabel('Nombre de joueurs')

# Right panel: distribution of the "Weak foot" rating for each position.
sns.boxplot(data=wingers, x='Position', y='Weak foot', palette='Set1', ax=weak_foot_ax)
weak_foot_ax.set_title('Qualit√© du Mauvais Pied (Weak Foot Rating)')

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Work on a copy of the feature matrix with the true label attached.
analysis_df = X.copy()
analysis_df['True_Position'] = y

# Wide positions under study (wingers and wide midfielders).
wide_positions = ['LW', 'RW', 'LM', 'RM']
wingers_df = analysis_df[analysis_df['True_Position'].isin(wide_positions)]

# Indicator columns produced by the one-hot encoding of 'Alternative positions'.
# NOTE: this relies on X still holding the per-position columns; once the
# fusion cell further down has merged them in place, a re-run of this cell
# gives a different picture.
alt_cols = [c for c in X.columns if 'Alternative positions_' in c]

# Share (in %) of players of each true position holding each alternative
# position; alternatives held by at most 5% of every group are hidden.
heatmap_data = wingers_df.groupby('True_Position')[alt_cols].mean() * 100
heatmap_data = heatmap_data.loc[:, (heatmap_data > 5).any(axis=0)]
heatmap_data.columns = [c.replace('Alternative positions_', '') for c in heatmap_data.columns]

# Visualisation. The title is built from the positions actually plotted: it
# used to mention LW/RW only although LM and RM rows are shown too.
plt.figure(figsize=(12, 4))
sns.heatmap(heatmap_data, annot=True, fmt='.1f', cmap='Greens', cbar_kws={'label': '% de joueurs'})
plt.title(f"Pourcentage des {'/'.join(wide_positions)} poss√©dant ces positions alternatives")
plt.show()

In [None]:
# "Back to basics" approach using the original dataset (much simpler if X has
# no named columns): restrict to the two winger positions.
df_wingers = fifa[fifa['Position'].isin(['LW', 'RW'])]

# Row-normalised crosstab (in %) of preferred foot by position. It is computed
# on the winger subset: `df_wingers` was built for this purpose but the
# crosstab was run on the whole dataset, leaving the filter unused.
ct = pd.crosstab(df_wingers['Position'], df_wingers['Preferred foot'], normalize='index') * 100

plt.figure(figsize=(8, 4))
sns.heatmap(ct, annot=True, fmt='.1f', cmap='Blues', cbar_kws={'label': '% joueurs'})
plt.title('R√©partition du Pied Pr√©f√©r√© (Source: Dataset Original)')
plt.show()

#### Test ANOVA

In [None]:
import pandas as pd
from scipy.stats import f_oneway

# Classes under study and the matching subset of players.
target_classes = ['LW', 'RW', 'LM', 'RM']
df_subset = fifa[fifa['Position'].isin(target_classes)]

# Numeric columns to test.
num_cols = df_subset.select_dtypes(include=['number']).columns.tolist()

results = []

print(f"Test ANOVA pour les classes : {target_classes}")
print("-" * 50)

# Rows of each class, selected once instead of once per tested column.
rows_by_class = {pos: df_subset[df_subset['Position'] == pos] for pos in target_classes}

for col in num_cols:
    groups = [rows_by_class[pos][col].dropna() for pos in target_classes]

    # A variable is tested only if every class has more than 10 observations.
    if any(len(g) <= 10 for g in groups):
        continue

    f_stat, p_val = f_oneway(*groups)
    results.append({'Variable': col, 'F-Score': f_stat, 'P-Value': p_val})

# Most discriminating variables first.
anova_df = pd.DataFrame(results).sort_values(by='F-Score', ascending=False)

is_significant = anova_df['P-Value'] < 0.05
significant_vars = anova_df[is_significant]

print("Top 10 des variables les plus discriminantes (F-Score √©lev√©) :")
print(significant_vars.head(10))

print("\nVariables qui ne permettent PAS de distinguer ces positions (P-Value > 0.05) :")
print(anova_df[anova_df['P-Value'] >= 0.05]['Variable'].tolist())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Variables shown: the 20 highest F-scores plus every non-significant variable.
top_vars = anova_df.head(20)
non_sig_vars = anova_df[anova_df['P-Value'] >= 0.05]
plot_data = pd.concat([top_vars, non_sig_vars]).drop_duplicates()

plt.figure(figsize=(12, 8))

# Bar colours use the same threshold as the selection above (p >= 0.05 means
# non-significant); the previous strict `>` coloured p == 0.05 as significant
# although it was selected as non-significant.
colors = ['red' if p >= 0.05 else 'dodgerblue' for p in plot_data['P-Value']]

# Barplot of the F-score
sns.barplot(x='F-Score', y='Variable', data=plot_data, palette=colors)

plt.title("ANOVA : Capacit√© des variables √† distinguer LW/RW/LM/RM")
plt.xlabel("F-Score (Pouvoir discriminant)")
plt.ylabel("Variables")

# Legend
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor='dodgerblue', label='Significatif (p < 0.05) - Distingue les groupes'),
    Patch(facecolor='red', label='Non Significatif (p >= 0.05) - Indiscernable')
]
plt.legend(handles=legend_elements)

plt.tight_layout()
plt.show()

#### Modèle avec StratifiedKFold

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline

# 12 stratified, shuffled folds with a fixed seed.
kfold = StratifiedKFold(n_splits=12, shuffle=True, random_state=42)

# Re-creates `pipeline2` with the same steps as in the earlier SGD cell.
pipeline2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', SGDClassifier(random_state=42, max_iter=1000))
])

# Cross-validation using the stratified folds
cv_scores = cross_val_score(pipeline2, X_train, y_train, cv=kfold, scoring='accuracy')

# Note: the spread printed here is two standard deviations, whereas the
# earlier CV cells print a single standard deviation.
print(f"Scores Accuracy par fold: {cv_scores}")
print(f"Accuracy moyenne: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

### Modèle avec équilibrage des classes

In [None]:
# Pipeline with class_weight='balanced' and hand-set SGD hyper-parameters.
# NOTE(review): these values presumably come from grid_search2.best_params_ -
# confirm, or read them from the search result instead of hard-coding them.
pipeline_balanced = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', SGDClassifier(
        loss='modified_huber',       
        alpha=0.001, 
        penalty='l1', 
        learning_rate='adaptive', 
        eta0=0.1,
        class_weight='balanced',    
        random_state=42, 
        max_iter=1000
    ))
])

# Training
print("Entra√Ænement avec pond√©ration des classes...")
pipeline_balanced.fit(X_train, y_train)

# Evaluation on the held-out test set
print(f"accuracy: {pipeline_balanced.score(X_test, y_test)}")
y_pred_balanced = pipeline_balanced.predict(X_test)

# `classification_report` is imported by an earlier cell.
print("\n--- RAPPORT AVEC class_weight='balanced' ---")
print(classification_report(y_test, y_pred_balanced))



### Fusion des classes latérales

In [None]:
# 1. Mapping from the four wide positions to a single 'Winger' label
position_mapping = {
    'LW': 'Winger', 'RW': 'Winger', 
    'LM': 'Winger', 'RM': 'Winger'
}

# Labels absent from the mapping are left unchanged by `replace`.
y_grouped = y.replace(position_mapping)

# Sanity checks
print("Nouvelles classes :")
print(y_grouped.unique())

print("\nDistribution :")
print(y_grouped.value_counts())
print(f"\nNombre de NaN restants : {y_grouped.isna().sum()}")

In [None]:
# Invert the mapping: new label -> list of the old positions it replaces.
groups_to_merge = {}
for old_pos, new_pos in position_mapping.items():
    if new_pos not in groups_to_merge:
        groups_to_merge[new_pos] = []
    groups_to_merge[new_pos].append(old_pos)

# 2. Merge the matching indicator columns of X (X is modified in place).
print("--- Fusion des colonnes dans X ---")
for new_group, old_positions in groups_to_merge.items():
    # Exact column names expected in X (e.g. 'Alternative positions_LW').
    target_cols = [f"Alternative positions_{pos}" for pos in old_positions]

    # Only the columns that really exist in X are merged.
    available_cols = [c for c in X.columns if c in target_cols]
    if not available_cols:
        continue

    new_col_name = f"Alternative positions_{new_group}"

    # The merged indicator is the row-wise maximum of the old indicators.
    X[new_col_name] = X[available_cols].max(axis=1)

    # Remove the old columns.
    X.drop(columns=available_cols, inplace=True)

    print(f"‚úÖ Fusionn√© : {available_cols} \n   ---> {new_col_name}")

# Final check
print("\nColonnes restantes (Alternative positions) :")
print([c for c in X.columns if 'Alternative positions_' in c])

### Modèle avec les classes latérales fusionnées

In [None]:
# Train/Test split on the merged features and merged labels. This overwrites
# X_train / X_test / y_train / y_test from the earlier split (same test_size
# and random_state).
X_train, X_test, y_train, y_test = train_test_split(X, y_grouped, test_size=0.2, random_state=42)
X_train

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline

# Column selectors are passed to the ColumnTransformer below, so the columns
# are resolved from the data at fit time rather than from the `num_features`
# list built before the fusion.
numerical_selector = make_column_selector(dtype_include=['number'])
categorical_selector = make_column_selector(dtype_include=['object', 'category'])
# 2. Rebuild the preprocessor
preprocessor_updated = ColumnTransformer(
    transformers=[
        # Standardise the numeric variables (including the merged one-hot position columns)
        ('num', StandardScaler(), numerical_selector),
        # Encode the remaining categorical variables.
        # NOTE(review): the original comment cited Nation / League as examples,
        # but both are in `cols_to_drop` - presumably only columns such as
        # 'Preferred foot' remain; verify.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_selector)
    ]
)
# 3. Pipeline built on the NEW preprocessor
pipeline3 = Pipeline(steps=[
    ('preprocessor', preprocessor_updated),
    ('model', SGDClassifier(random_state=42, max_iter=1000, class_weight='balanced')) 
])

# 4. Cross-validation (5 stratified, shuffled folds; spread printed as 2 x std)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print("Lancement de la validation crois√©e sur les donn√©es fusionn√©es...")
cv_scores = cross_val_score(pipeline3, X_train, y_train, cv=kfold, scoring='accuracy')
print(f"\nScores Accuracy par fold: {cv_scores}")
print(f"Accuracy moyenne: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

In [None]:
# Final pipeline on the merged data: updated preprocessor plus an SGD model
# using the same hyper-parameter values as `pipeline_balanced`.
pipeline_opti = Pipeline(steps=[
    ('preprocessor', preprocessor_updated),
    ('model', SGDClassifier(
        loss='modified_huber',       
        alpha=0.001, 
        penalty='l1', 
        learning_rate='adaptive', 
        eta0=0.1,
        class_weight='balanced',    
        random_state=42, 
        max_iter=1000
    ))
])

In [None]:
# Cross-validation of the final pipeline (5 stratified, shuffled folds;
# spread printed as 2 x std).
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print("Lancement de la validation crois√©e sur les donn√©es fusionn√©es...")
cv_scores = cross_val_score(pipeline_opti, X_train, y_train, cv=kfold, scoring='accuracy')
print(f"\nScores Accuracy par fold: {cv_scores}")
print(f"Accuracy moyenne: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

In [None]:
# Fit the final pipeline on the whole training set.
pipeline_opti.fit(X_train, y_train)
# Predictions on the test set; this overwrites the `y_pred` of the earlier
# SGD model and is reused by the cells that follow.
y_pred = pipeline_opti.predict(X_test)
print(f"accuracy: {pipeline_opti.score(X_test, y_test)}")
pipeline_opti

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix. The class order is taken from the fitted pipeline and
# passed to confusion_matrix explicitly: without `labels=`, the matrix is built
# from the labels found in y_test / y_pred, so the tick names could be shifted
# (or the shapes could differ) whenever a class is absent from the test split.
class_names = pipeline_opti.classes_
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_names)

plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names
)
plt.title('Matrice de Confusion')
plt.ylabel('Vraie √©tiquette')
plt.xlabel('√âtiquette pr√©dite')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import classification_report
import pandas as pd

# output_dict=True returns a dictionary that converts directly to a DataFrame.
report_dict = classification_report(y_test, y_pred, output_dict=True)

# One row per label / aggregate, one column per metric, rounded for readability.
report_df = pd.DataFrame(report_dict).transpose().round(2)

print(report_df)

# Per-class rows sorted by ascending recall, so the problematic classes come
# first. The last three rows are dropped (presumably the accuracy / macro avg /
# weighted avg aggregates).
print("\n--- Tri√©es par Recall (plus faible en premier) ---")
per_class_rows = report_df.iloc[:-3]
per_class_rows.sort_values(by='recall')

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import numpy as np

classes = pipeline_opti.classes_
cm = confusion_matrix(y_test, y_pred, labels=classes)

# Row-wise normalisation: each row becomes the percentage split of the
# predictions made for that true class.
row_totals = cm.sum(axis=1, keepdims=True)
cm_pct = cm.astype('float') / row_totals * 100

# Labelled DataFrame for easier slicing.
df_cm = pd.DataFrame(cm_pct, index=classes, columns=classes)

# Keep only the true classes under study ...
target_rows = ['CAM', 'CDM', 'CB', 'CM']
df_focused = df_cm.loc[target_rows]

# ... and only the predicted classes that receive more than 1% for at least
# one of them.
cols_active = df_focused.columns[(df_focused > 1).any(axis=0)]
df_focused = df_focused.loc[:, cols_active]

# Heatmap
fig, ax = plt.subplots(figsize=(14, 5))
sns.heatmap(df_focused, annot=True, fmt='.1f', cmap='YlOrRd',
            cbar_kws={'label': 'Pourcentage de pr√©diction (%)'}, ax=ax)

ax.set_title('Analyse des Confusions : O√π le mod√®le classe-t-il vraiment ces joueurs ?')
ax.set_xlabel('Classe Pr√©dite par le mod√®le')
ax.set_ylabel('Vraie Classe du joueur')
plt.yticks(rotation=0)
plt.show()

# 4. TEST DU MODELE

In [None]:
import pandas as pd
import numpy as np

def predict_player_position(player_data, model):
    """Print the predicted position of one player with the class probabilities.

    Args:
        player_data: Either a dict mapping feature names to values (converted
            to a one-row DataFrame) or an object `model.predict_proba` accepts
            directly. Only the first row of the prediction is reported.
        model: Fitted classifier exposing `classes_` and, ideally,
            `predict_proba`.

    Returns:
        None. The result is printed: the most probable class in upper case
        with its probability, then every other class whose probability is
        above 1%, each with a text bar of one block per 5%.

    If `model` has no `predict_proba`, an explanatory message is printed and
    nothing is predicted. Any other error is propagated: the former blanket
    `except AttributeError` also swallowed unrelated failures (for instance
    `.upper()` on non-string class labels) and reported them as a missing
    `predict_proba`.
    """
    # 1. A dict describes a single player: wrap it in a one-row DataFrame.
    if isinstance(player_data, dict):
        df_player = pd.DataFrame([player_data])
    else:
        df_player = player_data

    # 2. Probabilities are required; check for them up front.
    if not hasattr(model, "predict_proba"):
        print("‚ùå Erreur : Ce mod√®le ne supporte pas les probabilit√©s (predict_proba).")
        print("Assurez-vous d'avoir utilis√© loss='modified_huber' ou 'log_loss' dans SGDClassifier.")
        return

    probas = model.predict_proba(df_player)[0]

    # (class, probability) pairs, most probable first.
    results = sorted(zip(model.classes_, probas), key=lambda pair: pair[1], reverse=True)

    # 3. Formatted display. Labels go through str() so that non-string class
    # labels are displayed instead of raising.
    top_pred, top_score = results[0]

    print(f"POSITION PR√âDITE : {str(top_pred).upper()}")
    print(f"Confiance : {top_score:.1%}\n")

    print("--- Autres possibilit√©s ---")
    for cls, score in results[1:]:
        if score > 0.01:
            bar = '‚ñà' * int(score * 20)
            print(f"{str(cls):15} : {score:.1%}  {bar}")


In [None]:

# Coarse profile of the player to classify: six general ratings plus the
# preferred foot. The next cell spreads each rating over several detailed
# attributes.
STATS = {
    # Physical & speed
    'Pace': 88,          # speed
    'Physics': 75,       # physical
    
    # Technique
    'Shoot': 82,         # shooting
    'Passing': 70,       # passing
    'Dribble': 85,       # dribbling
    'Defense': 30,       # defending
    
    # Info
    'Foot': 'Left'       # 'Right' or 'Left'
}

In [None]:

# NOTE(review): `predict_custom_random` is not defined in any visible cell of
# this notebook; unless it comes from elsewhere, this cell raises NameError on
# a fresh kernel (Restart & Run All). `predict_player_position`, defined above,
# is never called.
# NOTE(review): the keyword names use underscores while the dataset columns
# seen elsewhere contain spaces (e.g. 'Preferred foot'); presumably the helper
# maps one to the other - verify.
print(f"üìä Analyse du profil : {STATS}")
print("..."*10)
predict_custom_random(
    model=pipeline_opti,
    base_dataset=X_train,
    
    # Each general rating is copied to several detailed FIFA attributes
    Sprint_Speed=STATS['Pace'], Acceleration=STATS['Pace'],
    
    Finishing=STATS['Shoot'], Shot_Power=STATS['Shoot'], Long_Shots=STATS['Shoot'], 
    Positioning=STATS['Shoot'], Volleys=STATS['Shoot'], Penalties=STATS['Shoot'],
    
    Short_Passing=STATS['Passing'], Long_Passing=STATS['Passing'], Vision=STATS['Passing'], 
    Crossing=STATS['Passing'], Curve=STATS['Passing'], Free_Kick_Accuracy=STATS['Passing'],
    
    Dribbling=STATS['Dribble'], Ball_Control=STATS['Dribble'], Agility=STATS['Dribble'], 
    Balance=STATS['Dribble'], Reactions=STATS['Dribble'], Composure=STATS['Dribble'],
    
    Defensive_Awareness=STATS['Defense'], Standing_Tackle=STATS['Defense'], 
    Sliding_Tackle=STATS['Defense'], Interceptions=STATS['Defense'], Heading_Accuracy=STATS['Defense'],
    
    Strength=STATS['Physics'], Stamina=STATS['Physics'], Jumping=STATS['Physics'], Aggression=STATS['Physics'],
    
    Preferred_foot=STATS['Foot']
)

In [None]:
# (key, prompt) pairs for the six 0-99 ratings, in the order they are asked.
_RATING_PROMPTS = (
    ('pace', "üèÉ Vitesse (Pace) [ex: 75] : "),
    ('shoot', "üéØ Tir (Shooting) [ex: 60] : "),
    ('passing', "ap Passe (Passing) [ex: 70] : "),
    ('dribble', "‚ö°Ô∏è Dribble [ex: 80] : "),
    ('defense', "üõ° D√©fense [ex: 40] : "),
    ('physics', "üí™ Physique [ex: 65] : "),
)


def _read_rating(prompt):
    """Ask for one rating; return it as an int, or None if the user typed 'q'.

    Raises:
        ValueError: if the answer is not an integer between 0 and 99.
    """
    answer = input(prompt).strip()
    if answer.lower() == 'q':
        return None
    rating = int(answer)
    if not 0 <= rating <= 99:
        raise ValueError(f"rating out of range: {rating}")
    return rating


def _read_player_inputs():
    """Ask for the six ratings and the foot; return (ratings, foot) or None on 'q'."""
    ratings = {}
    for key, prompt in _RATING_PROMPTS:
        rating = _read_rating(prompt)
        if rating is None:
            return None
        ratings[key] = rating

    foot = input("üëü Pied (Left/Right) [ex: Right] : ").strip()
    if foot.lower() == 'q':
        return None

    # Anything other than left/right falls back to 'Right'.
    if foot.lower() not in ['left', 'right']:
        foot = 'Right'
        print("-> Pied 'Right' s√©lectionn√© par d√©faut.")
    else:
        foot = foot.capitalize()
    return ratings, foot


def start_console_predictor(model, base_dataset):
    """Interactive console loop: read a coarse player profile and predict its position.

    Six ratings (0-99) and the preferred foot are read with `input()`; each
    rating is then copied to several detailed attributes and handed to
    `predict_custom_random` together with `model` and `base_dataset`. The loop
    repeats until the user types 'q' at any prompt or interrupts the kernel.

    Changes compared with the previous version:
      * 'q' quits at every prompt, not only at the first one;
      * each answer is validated as soon as it is typed, and the announced
        0-99 range is enforced;
      * only input errors are reported as input errors: a ValueError raised
        by the prediction itself is no longer swallowed.

    Args:
        model: Fitted model, forwarded to `predict_custom_random`.
        base_dataset: Reference dataset, forwarded to `predict_custom_random`.

    Returns:
        None.

    NOTE(review): `predict_custom_random` is not defined in the visible
    notebook; it must exist in the kernel before a prediction is requested.
    """
    print("\n‚öΩÔ∏è --- CR√âATEUR DE JOUEUR FIFA (Mode Console) --- ‚öΩÔ∏è")
    print("Entrez les notes (0-99) quand demand√©. Tapez 'q' pour quitter.")

    try:
        while True:
            print("\n------------------------------------------------")

            # 1. Input and validation
            try:
                player_inputs = _read_player_inputs()
            except ValueError:
                print("‚ùå Erreur : Veuillez entrer uniquement des nombres entiers (0-99).")
                continue
            if player_inputs is None:
                return

            ratings, foot = player_inputs
            pace = ratings['pace']
            shoot = ratings['shoot']
            passing = ratings['passing']
            dribble = ratings['dribble']
            defense = ratings['defense']
            physics = ratings['physics']

            print("\n‚è≥ Calcul de la pr√©diction...")

            # 2. Prediction: each general rating fills several detailed columns.
            predict_custom_random(
                model=model,
                base_dataset=base_dataset,

                # Speed
                Sprint_Speed=pace, Acceleration=pace,

                # Shooting
                Finishing=shoot, Shot_Power=shoot, Long_Shots=shoot, Positioning=shoot, Volleys=shoot, Penalties=shoot,

                # Passing
                Short_Passing=passing, Long_Passing=passing, Vision=passing, Crossing=passing, Curve=passing, Free_Kick_Accuracy=passing,

                # Dribbling
                Dribbling=dribble, Ball_Control=dribble, Agility=dribble, Balance=dribble, Reactions=dribble, Composure=dribble,

                # Defending
                Defensive_Awareness=defense, Standing_Tackle=defense, Sliding_Tackle=defense, Interceptions=defense, Heading_Accuracy=defense,

                # Physical
                Strength=physics, Stamina=physics, Jumping=physics, Aggression=physics,

                # Foot
                Preferred_foot=foot
            )
    except KeyboardInterrupt:
        print("\nArr√™t demand√©.")

# --- LAUNCH ---
# Starts the interactive loop with the final pipeline. This call waits for
# keyboard input, so an unattended "Run All" stops here until 'q' is typed.
start_console_predictor(pipeline_opti, X_train)