# Exploration et Machine Learning - Données Olympiques 🏆

Ce notebook présente une analyse complète des données olympiques avec :
- 📊 **Exploration des données** depuis Azure MySQL
- 🧹 **Nettoyage et préparation** des données
- 📈 **Visualisations interactives** avec Plotly
- 🤖 **Modèles de Machine Learning** avec Scikit-learn
  - Classification avec Random Forest
  - Clustering avec KMeans

## Structure des données
- **Athletes** : 75,904 athlètes olympiques
- **Hosts** : 53 pays/villes hôtes
- **Medals** : 17,011 médailles
- **Results** : 113,632 résultats de compétitions

In [2]:
# Imports des biblioth√®ques pour l'exploration et le Machine Learning
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly

# Biblioth√®ques Machine Learning
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, silhouette_score

# Base de donn√©es
import mysql.connector
from mysql.connector import Error

# Display configuration: matplotlib style sheet and seaborn colour palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Import sanity check: confirm the libraries loaded and report their versions
print("‚úÖ Biblioth√®ques import√©es avec succ√®s")
print("üìä Pandas: " + pd.__version__)
print("üî¢ NumPy: " + np.__version__)
print("üìà Plotly: " + plotly.__version__)
print("ü§ñ Scikit-learn disponible")
print("üóÉÔ∏è MySQL Connector disponible")
print("üé® Matplotlib et Seaborn configur√©s")

‚úÖ Biblioth√®ques import√©es avec succ√®s
üìä Pandas: 2.3.3
üî¢ NumPy: 2.3.4
üìà Plotly: 6.3.1
ü§ñ Scikit-learn disponible
üóÉÔ∏è MySQL Connector disponible
üé® Matplotlib et Seaborn configur√©s


## 1. Connexion à la base de données et chargement des données

In [3]:
# Configuration de connexion Azure MySQL
def create_connection():
    """Open a connection to the Azure MySQL ``olympics`` database.

    The password is no longer stored in the notebook: it is read from the
    ``OLYMPICS_DB_PASSWORD`` environment variable. Host, user and database
    can be overridden with ``OLYMPICS_DB_HOST``, ``OLYMPICS_DB_USER`` and
    ``OLYMPICS_DB_NAME``; they default to the values previously hard-coded.

    Returns:
        The open connection, or ``None`` when the password variable is not
        set or the connection attempt fails. A message is printed in both
        failure cases.
    """
    import os  # local import so the cell does not depend on the import cell

    password = os.environ.get('OLYMPICS_DB_PASSWORD')
    if not password:
        # Fail the same way a refused connection does: message + None
        print("‚ùå Erreur de connexion: variable d'environnement OLYMPICS_DB_PASSWORD manquante")
        return None

    config = {
        'host': os.environ.get('OLYMPICS_DB_HOST', 'olympics-m.mysql.database.azure.com'),
        'user': os.environ.get('OLYMPICS_DB_USER', 'azure7'),
        'password': password,
        'database': os.environ.get('OLYMPICS_DB_NAME', 'olympics'),
        'port': 3306,
        'ssl_disabled': False,
    }

    try:
        conn = mysql.connector.connect(**config)
    except mysql.connector.Error as e:
        print(f"‚ùå Erreur de connexion: {e}")
        return None

    print("‚úÖ Connexion √† Azure MySQL r√©ussie")
    return conn

# Chargement des donn√©es depuis la base
def load_olympic_data():
    """Load the four Olympic tables into pandas DataFrames.

    Opens a connection with ``create_connection()``, reads the ``athletes``,
    ``hosts``, ``medals`` and ``results`` tables in full and prints the row
    count of each.

    Returns:
        Tuple ``(athletes_df, hosts_df, medals_df, results_df)``. All four
        entries are ``None`` when the connection could not be opened or any
        read failed (the error is printed, not raised).
    """
    conn = create_connection()
    if conn is None:
        return None, None, None, None

    try:
        print("üìä Chargement des donn√©es...")

        athletes_df = pd.read_sql("SELECT * FROM athletes", conn)
        hosts_df = pd.read_sql("SELECT * FROM hosts", conn)
        medals_df = pd.read_sql("SELECT * FROM medals", conn)
        results_df = pd.read_sql("SELECT * FROM results", conn)

        print(f"‚úÖ Athletes: {len(athletes_df):,} lignes")
        print(f"‚úÖ Hosts: {len(hosts_df):,} lignes")
        print(f"‚úÖ Medals: {len(medals_df):,} lignes")
        print(f"‚úÖ Results: {len(results_df):,} lignes")

        return athletes_df, hosts_df, medals_df, results_df

    except Exception as e:
        # Deliberate best-effort boundary: the notebook continues with None frames
        print(f"‚ùå Erreur lors du chargement: {e}")
        return None, None, None, None

    finally:
        # Single cleanup point: runs on success and on failure
        conn.close()

# Load the four tables into notebook-level DataFrames; all four names are None if loading failed
athletes_df, hosts_df, medals_df, results_df = load_olympic_data()

‚úÖ Connexion √† Azure MySQL r√©ussie
üìä Chargement des donn√©es...
‚úÖ Athletes: 75,904 lignes
‚úÖ Hosts: 53 lignes
‚úÖ Medals: 17,011 lignes
‚úÖ Results: 113,632 lignes
‚úÖ Athletes: 75,904 lignes
‚úÖ Hosts: 53 lignes
‚úÖ Medals: 17,011 lignes
‚úÖ Results: 113,632 lignes


## 2. Exploration et analyse des données

In [None]:
# Exploration d√©taill√©e des donn√©es
def explore_dataframe(df, name):
    """Print an overview of ``df`` and return its per-column missing-value table.

    The report contains the shape, column list, dtypes, the columns that have
    at least one null, and ``df.describe()``.

    Args:
        df: DataFrame to inspect.
        name: label shown (upper-cased) in the report header.

    Returns:
        DataFrame with one row per column of ``df`` and the columns
        ``Colonne``, ``Manquantes`` (null count) and ``Pourcentage``
        (null count as a percentage of the row count).
    """
    # Null count per column and its share of the total row count
    null_counts = df.isnull().sum()
    null_share = (null_counts / len(df)) * 100
    missing_df = pd.DataFrame({
        'Colonne': null_counts.index,
        'Manquantes': null_counts.to_numpy(),
        'Pourcentage': null_share.to_numpy(),
    })
    has_missing = missing_df['Manquantes'] > 0

    print(f"\nüîç EXPLORATION - {name.upper()}")
    print("=" * 50)

    print(f"üìä Forme: {df.shape}")
    print(f"üìã Colonnes: {list(df.columns)}")
    print("üî¢ Types de donn√©es:")
    print(df.dtypes)

    # Only the columns that actually have nulls are displayed
    print("\n‚ùì Valeurs manquantes:")
    print(missing_df.loc[has_missing])

    print("\nüìà Statistiques descriptives:")
    print(df.describe())

    return missing_df

# Explore each table, or report that nothing was loaded
if athletes_df is None:
    print("‚ùå Donn√©es non charg√©es - v√©rifiez la connexion")
else:
    athletes_missing = explore_dataframe(athletes_df, "Athletes")
    hosts_missing = explore_dataframe(hosts_df, "Hosts")
    medals_missing = explore_dataframe(medals_df, "Medals")
    results_missing = explore_dataframe(results_df, "Results")

## 3. Nettoyage et préparation des données

In [None]:
# Nettoyage et pr√©paration des donn√©es
def clean_olympic_data(athletes_df, hosts_df, medals_df, results_df):
    """Return cleaned copies of the four Olympic tables.

    The inputs are copied first and never modified. Rules applied:

    * athletes: rows without ``name`` are dropped; missing ``age`` is filled
      with the median age of the remaining rows.
    * hosts: rows with any missing value are dropped.
    * medals: rows without ``athlete_id`` or ``medal`` are dropped, and only
      GOLD / SILVER / BRONZE medals are kept.
    * results: rows without ``athlete_id`` or ``ranking`` are dropped, and
      only rankings greater than 0 are kept.

    Returns:
        Tuple ``(athletes_clean, hosts_clean, medals_clean, results_clean)``.
    """
    print("üßπ NETTOYAGE DES DONN√âES")
    print("=" * 30)

    # Work on copies so the caller's frames stay untouched
    athletes_clean, hosts_clean, medals_clean, results_clean = (
        frame.copy() for frame in (athletes_df, hosts_df, medals_df, results_df)
    )

    # Athletes: a name is mandatory, age is imputed with the median
    print("üë®‚Äçüíº Nettoyage Athletes...")
    athletes_clean = athletes_clean.dropna(subset=['name'])
    median_age = athletes_clean['age'].median()
    athletes_clean['age'] = athletes_clean['age'].fillna(median_age)
    print(f"   ‚úÖ {len(athletes_clean):,} athl√®tes apr√®s nettoyage")

    # Hosts: keep fully populated rows only
    print("üèüÔ∏è Nettoyage Hosts...")
    hosts_clean = hosts_clean.dropna()
    print(f"   ‚úÖ {len(hosts_clean):,} h√¥tes apr√®s nettoyage")

    # Medals: need an athlete and a recognised medal type
    print("ü•á Nettoyage Medals...")
    valid_medals = ['GOLD', 'SILVER', 'BRONZE']
    medals_clean = medals_clean.dropna(subset=['athlete_id', 'medal'])
    is_valid_medal = medals_clean['medal'].isin(valid_medals)
    medals_clean = medals_clean.loc[is_valid_medal]
    print(f"   ‚úÖ {len(medals_clean):,} m√©dailles apr√®s nettoyage")

    # Results: need an athlete and a strictly positive ranking
    print("üèÜ Nettoyage Results...")
    results_clean = results_clean.dropna(subset=['athlete_id', 'ranking'])
    is_ranked = results_clean['ranking'] > 0
    results_clean = results_clean.loc[is_ranked]
    print(f"   ‚úÖ {len(results_clean):,} r√©sultats apr√®s nettoyage")

    return athletes_clean, hosts_clean, medals_clean, results_clean

# Clean the tables, then build the unified medal dataset used by every later cell
if athletes_df is not None:
    athletes_clean, hosts_clean, medals_clean, results_clean = clean_olympic_data(
        athletes_df, hosts_df, medals_df, results_df
    )
    
    # Build one medal-level dataset for the analysis
    print("\nüîó Cr√©ation du dataset unifi√©...")
    
    # Left join: every medal row is kept, athlete attributes are attached when the id matches
    medals_athletes = medals_clean.merge(
        athletes_clean[['id', 'name', 'age', 'nationality']], 
        left_on='athlete_id', 
        right_on='id', 
        how='left'
    )
    
    # Attach the host season/country by Games year
    # NOTE(review): the join key is 'year' alone - if hosts has more than one row for a
    # year, every medal of that year is duplicated; confirm against the hosts table.
    # NOTE(review): suffixes only rename columns present on both sides, so 'country_host'
    # (read by later cells) exists only if medals_athletes also has a 'country' column - confirm.
    medals_complete = medals_athletes.merge(
        hosts_clean[['year', 'season', 'country']], 
        on='year', 
        how='left',
        suffixes=('_athlete', '_host')
    )
    
    print(f"‚úÖ Dataset unifi√© cr√©√©: {len(medals_complete):,} lignes")
    print(f"üìä Colonnes: {list(medals_complete.columns)}")
    
else:
    print("‚ùå Impossible de nettoyer - donn√©es non charg√©es")

## 4. Visualisations interactives avec Plotly

In [None]:
# Visualisations avec Plotly
def create_olympic_visualizations(medals_complete, hosts_clean, athletes_clean):
    """Build and display six interactive Plotly figures about the medal data.

    Args:
        medals_complete: unified medal DataFrame; the code reads its
            ``medal``, ``country_host``, ``sport``, ``year``, ``age`` and
            ``season`` columns.
        hosts_clean: accepted for interface compatibility; not used here.
        athletes_clean: athlete DataFrame; its ``age`` column feeds the
            histogram.

    Returns:
        Tuple ``(fig1, fig2, fig3, fig4, fig5, fig6)`` in display order.

    Side effects:
        Calls ``show()`` on each figure and adds a ``medal_score`` column
        (GOLD=3, SILVER=2, BRONZE=1) to the caller's ``medals_complete``.
    """
    print("üìà CR√âATION DES VISUALISATIONS")
    print("=" * 35)

    # One colour per medal type, shared by the pie chart and the line chart
    medal_colors = {'GOLD': '#FFD700', 'SILVER': '#C0C0C0', 'BRONZE': '#CD7F32'}

    # 1. Share of each medal type
    print("ü•á Graphique 1: Distribution des m√©dailles...")
    medal_counts = medals_complete['medal'].value_counts()

    fig1 = px.pie(
        values=medal_counts.values,
        names=medal_counts.index,
        # color_discrete_map is keyed on the values given as `color`;
        # without `color` the gold/silver/bronze mapping is not applied.
        color=medal_counts.index,
        title="Distribution des m√©dailles olympiques",
        color_discrete_map=medal_colors,
    )
    fig1.show()

    # 2. Medal rows per host country (top 10)
    print("üåç Graphique 2: Top 10 pays par m√©dailles...")
    medals_by_country = (
        medals_complete.groupby('country_host')['medal']
        .count()
        .sort_values(ascending=False)
        .head(10)
    )

    fig2 = px.bar(
        x=medals_by_country.index,
        y=medals_by_country.values,
        title="Top 10 des pays h√¥tes par nombre de m√©dailles",
        labels={'x': 'Pays h√¥te', 'y': 'Nombre de m√©dailles'},
        color=medals_by_country.values,
        color_continuous_scale='viridis',
    )
    fig2.show()

    # 3. Age distribution of the athletes
    print("üë• Graphique 3: Distribution des √¢ges...")
    fig3 = px.histogram(
        athletes_clean,
        x='age',
        nbins=30,
        title="Distribution de l'√¢ge des athl√®tes olympiques",
        labels={'age': '√Çge', 'count': 'Nombre d\'athl√®tes'},
    )
    fig3.show()

    # 4. Medal rows per sport (top 15), drawn as horizontal bars
    print("üèÉ‚Äç‚ôÇÔ∏è Graphique 4: Sports avec le plus de m√©dailles...")
    medals_by_sport = medals_complete['sport'].value_counts().head(15)

    fig4 = px.bar(
        x=medals_by_sport.values,
        y=medals_by_sport.index,
        orientation='h',
        title="Top 15 des sports par nombre de m√©dailles",
        labels={'x': 'Nombre de m√©dailles', 'y': 'Sport'},
    )
    fig4.show()

    # 5. Medal counts per year, one line per medal type
    print("üìÖ Graphique 5: √âvolution temporelle...")
    medals_by_year = medals_complete.groupby(['year', 'medal']).size().reset_index(name='count')

    fig5 = px.line(
        medals_by_year,
        x='year',
        y='count',
        color='medal',
        title="√âvolution du nombre de m√©dailles par ann√©e",
        color_discrete_map=medal_colors,
    )
    fig5.show()

    # 6. Age against medal score
    print("üìä Graphique 6: √Çge vs Performance...")
    medal_score = {'GOLD': 3, 'SILVER': 2, 'BRONZE': 1}
    # Written onto the caller's frame on purpose: the feature-engineering
    # cell aggregates this 'medal_score' column.
    medals_complete['medal_score'] = medals_complete['medal'].map(medal_score)

    fig6 = px.scatter(
        medals_complete.dropna(subset=['age']),
        x='age',
        y='medal_score',
        color='season',
        title="Relation entre l'√¢ge et la performance (m√©daille)",
        labels={'age': '√Çge', 'medal_score': 'Score m√©daille (Or=3, Argent=2, Bronze=1)'},
    )
    fig6.show()

    print("‚úÖ Toutes les visualisations cr√©√©es!")

    return fig1, fig2, fig3, fig4, fig5, fig6

# Build the figures once the unified dataset exists in the notebook namespace
if 'medals_complete' not in locals():
    print("‚ùå Dataset unifi√© non disponible - ex√©cutez d'abord les cellules pr√©c√©dentes")
else:
    visualizations = create_olympic_visualizations(medals_complete, hosts_clean, athletes_clean)

## 5. Ingénierie des caractéristiques (Feature Engineering)

In [None]:
# Ing√©nierie des caract√©ristiques pour le Machine Learning
def create_ml_features(medals_complete, results_clean):
    """Build the standardized feature matrix and the encoded medal target.

    Steps: bin ``age`` into categories, aggregate a per-athlete medal score
    (sum, mean, count), label-encode the categorical columns that are
    present, then standardize the selected features.

    Args:
        medals_complete: unified medal DataFrame. Must contain ``medal``,
            ``age``, ``year`` and ``athlete_id``. A ``medal_score`` column
            is used when present and derived from ``medal`` otherwise.
        results_clean: accepted for interface compatibility; not used here.

    Returns:
        Tuple ``(X_scaled_df, y, scaler, label_encoders, ml_data)``:
        standardized features (rows with any missing feature removed), the
        encoded medal target aligned with them, the fitted ``StandardScaler``,
        a dict of fitted ``LabelEncoder`` objects keyed by source column, and
        the enriched working DataFrame.
    """
    print("‚öôÔ∏è ING√âNIERIE DES CARACT√âRISTIQUES")
    print("=" * 40)

    # Work on a copy so the caller's frame is not modified
    ml_data = medals_complete.copy()

    # The per-athlete aggregates need a numeric medal score. Derive it here
    # when it is absent rather than requiring the visualization cell (which
    # adds this column as a side effect) to have been run first.
    if 'medal_score' not in ml_data.columns:
        ml_data['medal_score'] = ml_data['medal'].map({'GOLD': 3, 'SILVER': 2, 'BRONZE': 1})

    # 1. Derived features
    print("üîß Cr√©ation de nouvelles caract√©ristiques...")

    # Age bands (pd.cut default: right-closed intervals)
    ml_data['age_category'] = pd.cut(
        ml_data['age'],
        bins=[0, 20, 25, 30, 35, 100],
        labels=['<20', '20-25', '25-30', '30-35', '35+']
    )

    # Per-athlete medal score: total, mean and number of medals
    athlete_performance = ml_data.groupby('athlete_id').agg({
        'medal_score': ['sum', 'mean', 'count']
    }).round(2)
    athlete_performance.columns = ['total_score', 'avg_score', 'medal_count']
    athlete_performance = athlete_performance.reset_index()

    ml_data = ml_data.merge(athlete_performance, on='athlete_id', how='left')

    # 2. Label-encode the categorical columns that exist in the data
    print("üè∑Ô∏è Encodage des variables cat√©gorielles...")

    label_encoders = {}
    categorical_cols = ['sport', 'season', 'country_host', 'medal']

    for col in categorical_cols:
        if col in ml_data.columns:
            le = LabelEncoder()
            # astype(str) turns missing values into the literal 'nan' class
            ml_data[f'{col}_encoded'] = le.fit_transform(ml_data[col].astype(str))
            label_encoders[col] = le

    # 3. Feature selection
    print("üî¢ S√©lection des features num√©riques...")

    # NOTE(review): medal_score, total_score and avg_score are computed from
    # the medal itself, so they leak the target into X - consider removing
    # them before trusting any classifier trained on this matrix.
    feature_columns = ['age', 'year', 'medal_score', 'total_score', 'avg_score', 'medal_count']
    # Only reference encoded columns that were actually created above
    feature_columns += [
        f'{col}_encoded'
        for col in categorical_cols
        if col != 'medal' and col in label_encoders
    ]

    # Drop rows with any missing feature or target
    ml_features = ml_data[feature_columns + ['medal_encoded']].dropna()

    # 4. Standardization
    print("üìè Normalisation des features...")

    X = ml_features.drop('medal_encoded', axis=1)
    y = ml_features['medal_encoded']

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # Keep column names and the original index so X stays aligned with y
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

    print(f"‚úÖ Dataset ML cr√©√©:")
    print(f"   üìä Forme: {X_scaled_df.shape}")
    print(f"   üéØ Classes cibles: {sorted(y.unique())}")
    print(f"   üìã Features: {list(X.columns)}")

    return X_scaled_df, y, scaler, label_encoders, ml_data

# Run the feature engineering and report the class balance of the target
if 'medals_complete' in locals():
    X_features, y_target, scaler, encoders, ml_dataset = create_ml_features(medals_complete, results_clean)

    # Dataset summary
    print(f"\nüìà STATISTIQUES DU DATASET ML")
    print("=" * 35)
    print(f"Nombre d'√©chantillons: {len(X_features):,}")
    print(f"Nombre de features: {X_features.shape[1]}")
    print(f"Distribution des classes:")
    # Decode class ids with the fitted encoder rather than a hard-coded
    # id -> name table, which only holds for exactly BRONZE/GOLD/SILVER.
    medal_classes = encoders['medal'].classes_
    for medal_code, count in y_target.value_counts().sort_index().items():
        medal_name = medal_classes[medal_code]
        print(f"   {medal_name}: {count:,} ({count/len(y_target)*100:.1f}%)")

else:
    print("‚ùå Dataset unifi√© non disponible")

## 6. Machine Learning avec Random Forest

In [None]:
# Pr√©paration des donn√©es pour Random Forest
def prepare_ml_data():
    """Load the enriched result rows used by the ML cells.

    Runs one SQL query joining ``results`` with ``athletes`` and ``hosts``,
    restricted to rows where age, height, weight and medal are all non-null.
    The function opens its own connection through ``create_connection()``
    and always closes it.

    Returns:
        DataFrame with one row per query result.

    Raises:
        ConnectionError: when no database connection can be opened.
    """
    print("Pr√©paration des donn√©es pour le Machine Learning...")

    # NOTE(review): this query reads age, medal, season, year and
    # performance_score from `results`, while earlier cells treat age as an
    # athletes column and medal as a medals column - confirm the schema.
    query = """
    SELECT 
        r.athlete_id,
        a.name as athlete_name,
        a.gender,
        r.age,
        r.age_category,
        a.height,
        a.weight,
        r.team,
        r.sport,
        r.event,
        r.medal,
        r.performance_score,
        r.season,
        r.year,
        h.country as host_country,
        h.continent as host_continent
    FROM results r
    JOIN athletes a ON r.athlete_id = a.id
    JOIN hosts h ON r.year = h.year AND r.season = h.season
    WHERE r.age IS NOT NULL 
    AND a.height IS NOT NULL 
    AND a.weight IS NOT NULL
    AND r.medal IS NOT NULL
    """

    # The connection used by load_olympic_data() is closed when that function
    # returns, and no `connection` global exists: open a dedicated one here.
    conn = create_connection()
    if conn is None:
        raise ConnectionError("Connexion MySQL indisponible - chargement ML impossible")

    try:
        return pd.read_sql(query, conn)
    finally:
        conn.close()

# Load the ML dataset and print its shape and medal counts
ml_data = prepare_ml_data()
print(f"Donn√©es ML: {ml_data.shape}")
print(f"M√©dailles par type: {ml_data['medal'].value_counts()}")

# Preview the first rows (shown by the notebook as the cell's last expression)
print("\nPremi√®res lignes des donn√©es ML:")
ml_data.head()

In [None]:
# Pr√©paration des caract√©ristiques pour Random Forest
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, r2_score
import seaborn as sns

def prepare_features_rf(data):
    """Build the Random Forest feature matrix from the ML dataset.

    Each categorical column that is present in ``data`` is label-encoded
    (after a cast to ``str``) into a new ``<name>_encoded`` column. The
    result combines five numeric columns with those encoded columns.

    Args:
        data: source DataFrame; it is copied, not modified.

    Returns:
        Tuple ``(X, encoded_features)``: the feature DataFrame and a dict
        mapping each encoded column's source name to its fitted
        ``LabelEncoder``.
    """
    df = data.copy()

    numeric_columns = ['age', 'height', 'weight', 'performance_score', 'year']
    candidate_categoricals = [
        'gender', 'team', 'sport', 'event', 'season',
        'host_country', 'host_continent', 'age_category',
    ]
    # Encode only the categorical columns the dataset actually has
    available = [name for name in candidate_categoricals if name in df.columns]

    encoded_features = {}
    for name in available:
        encoder = LabelEncoder()
        df[f'{name}_encoded'] = encoder.fit_transform(df[name].astype(str))
        encoded_features[name] = encoder

    # Numeric columns first, then the encoded ones in declaration order
    X = df[numeric_columns + [f'{name}_encoded' for name in available]]

    return X, encoded_features

# Build the feature matrix shared by the classifier and regressor cells
X, feature_encoders = prepare_features_rf(ml_data)
print(f"Features pr√©par√©es: {X.shape}")
print(f"Colonnes utilis√©es: {list(X.columns)}")

# Descriptive statistics of the features (shown as the cell's last expression)
print("\nStatistiques des features:")
X.describe()

In [None]:
# Random Forest Classifier - predict the medal type
print("=== RANDOM FOREST CLASSIFIER ===")
print("Pr√©diction du type de m√©daille bas√©e sur les caract√©ristiques de l'athl√®te")

# Target: the raw medal label from the ML dataset
# NOTE(review): X includes 'performance_score'; if that score is derived from the
# medal, it leaks the target into the features - confirm how it is computed.
y_medal = ml_data['medal']

# 80/20 split, stratified on the medal label, with a fixed seed
X_train, X_test, y_train_medal, y_test_medal = train_test_split(
    X, y_medal, test_size=0.2, random_state=42, stratify=y_medal
)

print(f"Donn√©es d'entra√Ænement: {X_train.shape}")
print(f"Donn√©es de test: {X_test.shape}")
print(f"Distribution des m√©dailles dans l'entra√Ænement:")
print(y_train_medal.value_counts())

# 100 trees with bounded depth and minimum split/leaf sizes; n_jobs=-1 uses all cores
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

print("\nEntra√Ænement du mod√®le...")
rf_classifier.fit(X_train, y_train_medal)

# Predictions on the held-out test set
y_pred_medal = rf_classifier.predict(X_test)

# Evaluation: test accuracy and the per-class report
print("\n=== R√âSULTATS DU CLASSIFICATEUR ===")
print(f"Accuracy: {rf_classifier.score(X_test, y_test_medal):.4f}")
print("\nRapport de classification:")
print(classification_report(y_test_medal, y_pred_medal))

In [None]:
# Confusion matrix and feature importances, drawn as three side-by-side panels
plt.figure(figsize=(15, 5))

# Panel 1: confusion matrix, with tick labels taken from the classifier's class order
plt.subplot(1, 3, 1)
cm = confusion_matrix(y_test_medal, y_pred_medal)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=rf_classifier.classes_, 
            yticklabels=rf_classifier.classes_)
plt.title('Matrice de Confusion\nPr√©diction des M√©dailles')
plt.xlabel('Pr√©diction')
plt.ylabel('R√©alit√©')

# Panel 2: importance of every feature, sorted from most to least important
plt.subplot(1, 3, 2)
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_classifier.feature_importances_
}).sort_values('importance', ascending=False)

plt.barh(range(len(feature_importance)), feature_importance['importance'])
plt.yticks(range(len(feature_importance)), feature_importance['feature'])
plt.title('Importance des Features\nClassification des M√©dailles')
plt.xlabel('Importance')

# Panel 3: the same chart restricted to the ten most important features
plt.subplot(1, 3, 3)
top_features = feature_importance.head(10)
plt.barh(range(len(top_features)), top_features['importance'])
plt.yticks(range(len(top_features)), top_features['feature'])
plt.title('Top 10 Features\nLes Plus Importantes')
plt.xlabel('Importance')

plt.tight_layout()
plt.show()

# feature_importance is reused by the summary and insight cells
print("\nTop 10 des features les plus importantes:")
print(feature_importance.head(10))

In [None]:
# Random Forest Regressor - predict the performance score
print("\n=== RANDOM FOREST REGRESSOR ===")
print("Pr√©diction du score de performance bas√©e sur les caract√©ristiques")

# Regression target
y_performance = ml_data['performance_score']

# X was built with 'performance_score' as one of its columns. Leaving it in
# would hand the model its own target and make R2 meaningless, so the
# regression uses every feature except the target.
X_reg = X.drop(columns=['performance_score'], errors='ignore')

# 80/20 train/test split with a fixed seed
X_train_reg, X_test_reg, y_train_perf, y_test_perf = train_test_split(
    X_reg, y_performance, test_size=0.2, random_state=42
)

# Same hyper-parameters as the classifier
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

print("Entra√Ænement du r√©gresseur...")
rf_regressor.fit(X_train_reg, y_train_perf)

# Predictions on the held-out test set
y_pred_perf = rf_regressor.predict(X_test_reg)

# Evaluation on the test set
mse = mean_squared_error(y_test_perf, y_pred_perf)
r2 = r2_score(y_test_perf, y_pred_perf)

print(f"\n=== R√âSULTATS DU R√âGRESSEUR ===")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {np.sqrt(mse):.4f}")
print(f"R¬≤ Score: {r2:.4f}")

# 5-fold cross-validation on the training split only
cv_scores = cross_val_score(rf_regressor, X_train_reg, y_train_perf, cv=5, scoring='r2')
print(f"\nValidation crois√©e R¬≤ (5-fold): {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

## 7. Clustering avec K-Means

In [None]:
# Clustering K-Means des athl√®tes
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Section banner for the K-Means part of the notebook
print("=== CLUSTERING K-MEANS ===")
print("Identification de profils d'athl√®tes similaires")

# Pr√©paration des donn√©es pour le clustering
def prepare_clustering_data():
    """Assemble the standardized matrix used for K-Means from the global ``ml_data``.

    Returns:
        Tuple ``(X_scaled, cluster_data, scaler)``: the standardized numeric
        features, the complete rows they were computed from (numeric features
        plus gender, sport, medal and athlete name), and the fitted
        ``StandardScaler``.
    """
    numeric_features = ['age', 'height', 'weight', 'performance_score', 'year']
    descriptive_columns = ['gender', 'sport', 'medal', 'athlete_name']

    # Keep only rows where every selected column is populated
    selected = ml_data[numeric_features + descriptive_columns].copy()
    cluster_data = selected.dropna()

    # Only the numeric columns are standardized and clustered
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(cluster_data[numeric_features])

    return X_scaled, cluster_data, scaler

# Standardized matrix, the rows it came from, and the fitted scaler
X_cluster, cluster_info, scaler_cluster = prepare_clustering_data()
print(f"Donn√©es pour clustering: {X_cluster.shape}")

# Collect the inertia and silhouette score for each candidate k from 2 to 10
inertias = []
silhouette_scores = []
k_range = range(2, 11)

print("\nCalcul du nombre optimal de clusters...")
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_cluster)
    inertias.append(kmeans.inertia_)
    # NOTE(review): the silhouette is computed on the full matrix for every k, which may
    # be slow on large data - silhouette_score has a sample_size option; confirm the cost.
    silhouette_scores.append(silhouette_score(X_cluster, kmeans.labels_))

# Three panels: elbow curve, silhouette curve, final clusters
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
plt.plot(k_range, inertias, 'bo-')
plt.xlabel('Nombre de Clusters (k)')
plt.ylabel('Inertie')
plt.title('M√©thode du Coude')
plt.grid(True)

plt.subplot(1, 3, 2)
plt.plot(k_range, silhouette_scores, 'ro-')
plt.xlabel('Nombre de Clusters (k)')
plt.ylabel('Score de Silhouette')
plt.title('Score de Silhouette')
plt.grid(True)

# The retained k is the one with the highest silhouette score
optimal_k = k_range[np.argmax(silhouette_scores)]
print(f"\nNombre optimal de clusters: {optimal_k}")
print(f"Meilleur score de silhouette: {max(silhouette_scores):.4f}")

# Refit with the retained k and keep the label of every row
kmeans_final = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
cluster_labels = kmeans_final.fit_predict(X_cluster)

# Columns 0 and 1 of X_cluster are the standardized age and height; red crosses are the centroids
plt.subplot(1, 3, 3)
plt.scatter(X_cluster[:, 0], X_cluster[:, 1], c=cluster_labels, cmap='viridis', alpha=0.6)
plt.scatter(kmeans_final.cluster_centers_[:, 0], kmeans_final.cluster_centers_[:, 1], 
           c='red', marker='x', s=200, linewidths=3)
plt.xlabel('√Çge (standardis√©)')
plt.ylabel('Taille (standardis√©)')
plt.title(f'Clusters K-Means (k={optimal_k})')
plt.colorbar()

plt.tight_layout()
plt.show()

In [None]:
# Attach the cluster label to each row used for clustering
cluster_info['cluster'] = cluster_labels

print("=== ANALYSE DES CLUSTERS ===")
print(f"R√©partition des athl√®tes par cluster:")
print(cluster_info['cluster'].value_counts().sort_index())

# Per-cluster mean/std of the physical and score features, and the year range
cluster_stats = cluster_info.groupby('cluster').agg({
    'age': ['mean', 'std'],
    'height': ['mean', 'std'],
    'weight': ['mean', 'std'], 
    'performance_score': ['mean', 'std'],
    'year': ['mean', 'min', 'max']
}).round(2)

print("\nCaract√©ristiques moyennes par cluster:")
print(cluster_stats)

# Composition of each cluster by gender, sport and medal
print("\n=== COMPOSITION DES CLUSTERS ===")
for cluster_id in sorted(cluster_info['cluster'].unique()):
    cluster_data = cluster_info[cluster_info['cluster'] == cluster_id]
    # len(cluster_data) counts rows of cluster_info, although the message says athletes
    print(f"\n--- Cluster {cluster_id} ({len(cluster_data)} athl√®tes) ---")
    
    # Gender breakdown
    gender_dist = cluster_data['gender'].value_counts()
    print(f"Genre: {dict(gender_dist)}")
    
    # Five most frequent sports
    top_sports = cluster_data['sport'].value_counts().head(5)
    print(f"Top 5 sports: {dict(top_sports)}")
    
    # Medal breakdown
    medal_dist = cluster_data['medal'].value_counts()
    print(f"M√©dailles: {dict(medal_dist)}")
    
    # Mean of the physical and score features
    avg_stats = cluster_data[['age', 'height', 'weight', 'performance_score']].mean()
    print(f"Moyennes: Age={avg_stats['age']:.1f}, Taille={avg_stats['height']:.1f}cm, "
          f"Poids={avg_stats['weight']:.1f}kg, Score={avg_stats['performance_score']:.2f}")

In [None]:
# Detailed cluster visualization on a 3x4 grid of subplots
plt.figure(figsize=(20, 15))

# Features compared across clusters
features_to_plot = ['age', 'height', 'weight', 'performance_score']

# Row 1 (subplots 1-4): overlaid histograms, one per cluster
for i, feature in enumerate(features_to_plot):
    plt.subplot(3, 4, i+1)
    for cluster_id in sorted(cluster_info['cluster'].unique()):
        cluster_data = cluster_info[cluster_info['cluster'] == cluster_id]
        plt.hist(cluster_data[feature], alpha=0.6, label=f'Cluster {cluster_id}', bins=20)
    plt.xlabel(feature.title())
    plt.ylabel('Fr√©quence')
    plt.title(f'Distribution de {feature.title()} par Cluster')
    plt.legend()

# Row 2 (subplots 5-8): box plots grouped by cluster
for i, feature in enumerate(features_to_plot):
    plt.subplot(3, 4, i+5)
    cluster_info.boxplot(column=feature, by='cluster', ax=plt.gca())
    plt.title(f'Box Plot: {feature.title()} par Cluster')
    # Clear the figure-level title before the next subplot is drawn
    plt.suptitle('')

# Subplot 9: correlation matrix of the features and the cluster id
# NOTE(review): the cluster id is a nominal label, so its correlation values
# depend on the arbitrary numbering of the clusters - confirm it is wanted here.
plt.subplot(3, 4, 9)
correlation_matrix = cluster_info[features_to_plot + ['cluster']].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Matrice de Corr√©lation')

# Subplot 10: medal mix per cluster; each row is divided by its total, so values are 0-1 fractions
plt.subplot(3, 4, 10)
medal_cluster = pd.crosstab(cluster_info['cluster'], cluster_info['medal'])
medal_cluster_pct = medal_cluster.div(medal_cluster.sum(axis=1), axis=0)
sns.heatmap(medal_cluster_pct, annot=True, fmt='.2f', cmap='YlOrRd')
plt.title('R√©partition des M√©dailles\npar Cluster (%)')

# Subplot 11: the ten most frequent sports; each sport's counts are divided by that
# sport's total, so a cell is the fraction of the sport's rows that fall in the cluster
plt.subplot(3, 4, 11)
top_sports_overall = cluster_info['sport'].value_counts().head(10).index
sport_cluster = cluster_info[cluster_info['sport'].isin(top_sports_overall)]
sport_counts = pd.crosstab(sport_cluster['cluster'], sport_cluster['sport'])
sport_counts_pct = sport_counts.div(sport_counts.sum(axis=0), axis=1)
sns.heatmap(sport_counts_pct.T, annot=True, fmt='.2f', cmap='Blues')
plt.title('Top 10 Sports par Cluster (%)')
plt.xticks(rotation=0)

# Subplot 12: stacked row counts per gender and cluster
plt.subplot(3, 4, 12)
gender_cluster = pd.crosstab(cluster_info['cluster'], cluster_info['gender'])
gender_cluster.plot(kind='bar', stacked=True, ax=plt.gca())
plt.title('R√©partition Hommes/Femmes\npar Cluster')
plt.xlabel('Cluster')
plt.ylabel('Nombre d\'athl√®tes')
plt.legend(title='Genre')
plt.xticks(rotation=0)

plt.tight_layout()
plt.show()

## 8. Évaluation et Comparaison des Modèles

In [None]:
# Summary of the three models' metrics
print("=== R√âSUM√â DE L'ANALYSE MACHINE LEARNING ===")

# One entry per model; the values come from the earlier training cells
models_summary = {
    'Random Forest Classifier': {
        'T√¢che': 'Classification des m√©dailles',
        'Accuracy': rf_classifier.score(X_test, y_test_medal),
        'Features importantes': feature_importance.head(3)['feature'].tolist(),
        'Commentaire': 'Pr√©diction du type de m√©daille (Or/Argent/Bronze)'
    },
    'Random Forest Regressor': {
        'T√¢che': 'Pr√©diction du score de performance', 
        'R¬≤ Score': r2,
        'RMSE': np.sqrt(mse),
        'CV Score': cv_scores.mean(),
        'Commentaire': 'Pr√©diction continue du score de performance'
    },
    'K-Means Clustering': {
        'T√¢che': 'Segmentation des athl√®tes',
        'Nombre de clusters': optimal_k,
        'Score de silhouette': max(silhouette_scores),
        'Commentaire': 'Identification de profils d\'athl√®tes similaires'
    }
}

# Floats are printed with four decimals, lists are comma-joined, anything else as-is
for model_name, metrics in models_summary.items():
    print(f"\n--- {model_name} ---")
    for metric, value in metrics.items():
        if isinstance(value, float):
            print(f"{metric}: {value:.4f}")
        elif isinstance(value, list):
            print(f"{metric}: {', '.join(value)}")
        else:
            print(f"{metric}: {value}")

# Comparison figure on a 2x3 grid
plt.figure(figsize=(15, 10))

# Panel 1: one headline score per model (accuracy, R2, silhouette - three different metrics)
plt.subplot(2, 3, 1)
models = ['RF Classifier', 'RF Regressor', 'K-Means']
scores = [
    rf_classifier.score(X_test, y_test_medal),
    r2,
    max(silhouette_scores)
]
colors = ['skyblue', 'lightgreen', 'lightcoral']
bars = plt.bar(models, scores, color=colors)
plt.title('Scores de Performance des Mod√®les')
plt.ylabel('Score')
# NOTE(review): R2 and silhouette scores can be negative; a 0-1 axis would hide such bars
plt.ylim(0, 1)
# Write each score just above its bar
for bar, score in zip(bars, scores):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01, 
             f'{score:.3f}', ha='center', va='bottom')

# Panel 2: actual vs predicted medal, as a cross-tabulation heatmap
plt.subplot(2, 3, 2)
pred_vs_real = pd.DataFrame({'R√©el': y_test_medal, 'Pr√©dit': y_pred_medal})
confusion_data = pd.crosstab(pred_vs_real['R√©el'], pred_vs_real['Pr√©dit'])
sns.heatmap(confusion_data, annot=True, fmt='d', cmap='Blues')
plt.title('Matrice de Confusion\nClassificateur')

# Panel 3: regressor predictions against actual values; the dashed red line is y = x
plt.subplot(2, 3, 3)
plt.scatter(y_test_perf, y_pred_perf, alpha=0.6)
plt.plot([y_test_perf.min(), y_test_perf.max()], [y_test_perf.min(), y_test_perf.max()], 'r--', lw=2)
plt.xlabel('Score R√©el')
plt.ylabel('Score Pr√©dit')
plt.title('Pr√©dictions vs R√©alit√©\nR√©gresseur')

# Panel 4: ten most important classifier features
plt.subplot(2, 3, 4)
top_10_features = feature_importance.head(10)
plt.barh(range(len(top_10_features)), top_10_features['importance'])
plt.yticks(range(len(top_10_features)), top_10_features['feature'])
plt.title('Top 10 Features Importantes')
plt.xlabel('Importance')

# Panel 5: share of rows in each cluster
plt.subplot(2, 3, 5)
cluster_counts = cluster_info['cluster'].value_counts().sort_index()
plt.pie(cluster_counts.values, labels=[f'Cluster {i}' for i in cluster_counts.index], 
        autopct='%1.1f%%', startangle=90)
plt.title('R√©partition des Athl√®tes\npar Cluster')

# Panel 6: number of rows per year, one line per cluster
plt.subplot(2, 3, 6)
for cluster_id in sorted(cluster_info['cluster'].unique()):
    cluster_data = cluster_info[cluster_info['cluster'] == cluster_id]
    yearly_counts = cluster_data.groupby('year').size()
    plt.plot(yearly_counts.index, yearly_counts.values, marker='o', label=f'Cluster {cluster_id}')
plt.xlabel('Ann√©e')
plt.ylabel('Nombre d\'athl√®tes')
plt.title('√âvolution Temporelle\ndes Clusters')
plt.legend()
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

## 9. Conclusions et Insights

In [None]:
# Final insights and recommendations, printed line by line
print("=== INSIGHTS CL√âS DE L'ANALYSE ===")

# Only the f-string entries are computed from the models; the other lines are fixed text
# and are not checked against the metrics (e.g. the cross-validation robustness claim).
insights = [
    "üèÖ CLASSIFICATION DES M√âDAILLES:",
    f"- Le mod√®le Random Forest atteint une accuracy de {rf_classifier.score(X_test, y_test_medal):.3f}",
    f"- Les features les plus importantes sont: {', '.join(feature_importance.head(3)['feature'].tolist())}",
    "- Cela sugg√®re que certaines caract√©ristiques sont pr√©dictives du succ√®s olympique",
    "",
    "üìä PR√âDICTION DE PERFORMANCE:",
    f"- Le r√©gresseur Random Forest obtient un R¬≤ de {r2:.3f}",
    f"- RMSE de {np.sqrt(mse):.3f} sur l'√©chelle de score de performance",
    "- Validation crois√©e confirme la robustesse du mod√®le",
    "",
    "üë• SEGMENTATION DES ATHL√àTES:",
    f"- {optimal_k} clusters identifi√©s avec un score de silhouette de {max(silhouette_scores):.3f}",
    "- Chaque cluster repr√©sente un profil d'athl√®te distinct",
    "- Permet de comprendre la diversit√© des profils olympiques",
    "",
    "üîç RECOMMANDATIONS:",
    "1. Utiliser les features importantes pour le recrutement sportif",
    "2. Adapter l'entra√Ænement selon le profil de cluster de l'athl√®te",
    "3. Pr√©dire les performances futures pour la planification olympique",
    "4. Analyser l'√©volution des profils au fil du temps",
    "",
    "üí° EXTENSIONS POSSIBLES:",
    "- Mod√®les plus complexes (XGBoost, r√©seaux de neurones)",
    "- Analyse temporelle des tendances",
    "- Pr√©diction par sport sp√©cifique",
    "- Analyse de l'impact des pays h√¥tes",
    "- Clustering hi√©rarchique pour une segmentation plus fine"
]

for insight in insights:
    print(insight)

# Collect the key metrics in one dict for later use.
# Nothing is written to disk in this cell: the summary lives only in `results_summary`.
results_summary = {
    'rf_classifier_accuracy': rf_classifier.score(X_test, y_test_medal),
    'rf_regressor_r2': r2,
    'optimal_clusters': optimal_k,
    'silhouette_score': max(silhouette_scores),
    'top_features': feature_importance.head(5)['feature'].tolist(),
    # len(ml_data) is the row count of the ML dataset (one row per query result)
    'total_athletes_analyzed': len(ml_data),
    'clusters_distribution': cluster_info['cluster'].value_counts().to_dict()
}

print(f"\n=== R√âSULTATS SAUVEGARD√âS ===")
print("Les m√©triques cl√©s ont √©t√© calcul√©es et sont disponibles pour analyse future.")
print(f"Nombre total d'athl√®tes analys√©s: {results_summary['total_athletes_analyzed']:,}")

# Final recap of the headline metrics
print(f"\n=== M√âTRIQUES FINALES ===")
print(f"üéØ Classification Accuracy: {results_summary['rf_classifier_accuracy']:.3f}")
print(f"üìà Regression R¬≤: {results_summary['rf_regressor_r2']:.3f}")
print(f"üîÑ Clustering Silhouette: {results_summary['silhouette_score']:.3f}")
print(f"üìä Nombre de clusters: {results_summary['optimal_clusters']}")

print("\n‚úÖ ANALYSE COMPL√àTE TERMIN√âE ‚úÖ")
print("Tous les mod√®les ML ont √©t√© entra√Æn√©s et √©valu√©s avec succ√®s!")