In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xml.etree.ElementTree as ET
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense

medals_data = pd.read_excel('data/olympic_medals.xlsx')
hosts_data = pd.read_xml('data/olympic_hosts.xml')


In [2]:
merged_data = pd.merge(medals_data, hosts_data, left_on='slug_game', right_on='game_slug')

summer_data = merged_data[merged_data['game_season'] == 'Summer']

summer_medals = summer_data.groupby('country_name').agg({
    'medal_type': lambda x: (x == 'GOLD').sum(),
    'slug_game': 'count'
}).rename(columns={'medal_type': 'gold_medals', 'slug_game': 'total_medals'})

summer_medals['silver_medals'] = summer_data.groupby('country_name')['medal_type'].apply(lambda x: (x == 'SILVER').sum())
summer_medals['bronze_medals'] = summer_data.groupby('country_name')['medal_type'].apply(lambda x: (x == 'BRONZE').sum())

X = summer_medals[['gold_medals', 'silver_medals', 'bronze_medals', 'total_medals']].values
y = summer_medals[['gold_medals', 'silver_medals', 'bronze_medals']].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_2024 = np.array([[10, 5, 7, 22]])
X_2024_scaled = scaler.transform(X_2024)

In [3]:
# Modèle 1: Decision Tree
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_2024)
print("Prédictions avec Decision Tree:", y_pred_tree)


Prédictions avec Decision Tree: [[3. 7. 9.]]


In [4]:
# Modèle 2: Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_2024)
print("Prédictions avec Random Forest:", y_pred_rf)

Prédictions avec Random Forest: [[5.51 6.14 8.41]]


In [5]:
# Modèle 3: Support Vector Machine (SVM)
svm_gold = SVR(kernel='linear')
svm_silver = SVR(kernel='linear')
svm_bronze = SVR(kernel='linear')

svm_gold.fit(X_train, y_train[:, 0])
svm_silver.fit(X_train, y_train[:, 1])
svm_bronze.fit(X_train, y_train[:, 2])

y_pred_svm_gold = svm_gold.predict(X_2024)
y_pred_svm_silver = svm_silver.predict(X_2024)
y_pred_svm_bronze = svm_bronze.predict(X_2024)

y_pred_svm = np.array([y_pred_svm_gold, y_pred_svm_silver, y_pred_svm_bronze]).T
print("Prédictions avec SVM:", y_pred_svm)

Prédictions avec SVM: [[9.92438798 5.00466123 7.00365977]]


In [6]:
# Modèle 4: Multilayer Perceptron (MLP)
mlp_model = Sequential()
mlp_model.add(Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)))
mlp_model.add(Dense(64, activation='relu'))
mlp_model.add(Dense(3, activation='linear'))

mlp_model.compile(optimizer='adam', loss='mean_squared_error')
mlp_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_data=(X_test_scaled, y_test))

y_pred_mlp = mlp_model.predict(X_2024_scaled)
print("Prédictions avec MLP:", y_pred_mlp)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [7]:
# Comparaison des modèles
models = {
    "Decision Tree": tree_model,
    "Random Forest": rf_model,
    "SVM (Gold)": svm_gold,
    "SVM (Silver)": svm_silver,
    "SVM (Bronze)": svm_bronze,
    "MLP": mlp_model
}

metrics = {}
for model_name, model in models.items():
    if model_name == "MLP":
        y_pred = model.predict(X_test_scaled)

        metrics[model_name] = {
            "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
            "MAE": mean_absolute_error(y_test, y_pred),
            "R²": r2_score(y_test, y_pred)
        }
    elif "SVM" in model_name:
        if "Gold" in model_name:
            y_pred = model.predict(X_test)
            true_values = y_test[:, 0]
        elif "Silver" in model_name:
            y_pred = model.predict(X_test)
            true_values = y_test[:, 1]
        elif "Bronze" in model_name:
            y_pred = model.predict(X_test)
            true_values = y_test[:, 2]
        
        metrics[model_name] = {
            "RMSE": np.sqrt(mean_squared_error(true_values, y_pred)),
            "MAE": mean_absolute_error(true_values, y_pred),
            "R²": r2_score(true_values, y_pred)
        }
    else:
        y_pred = model.predict(X_test)
        metrics[model_name] = {
            "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
            "MAE": mean_absolute_error(y_test, y_pred),
            "R²": r2_score(y_test, y_pred)
        }



In [8]:
for model_name, model_metrics in metrics.items():
    print(f"Metrics for {model_name}:")
    for metric_name, metric_value in model_metrics.items():
        print(f"  {metric_name}: {metric_value}")

Metrics for Decision Tree:
  RMSE: 6.568906882762809
  MAE: 3.021505376344086
  R²: 0.9679235473758162
Metrics for Random Forest:
  RMSE: 3.748531755578529
  MAE: 1.9780645161290324
  R²: 0.9899796559724399
Metrics for SVM (Gold):
  RMSE: 0.06546769317700203
  MAE: 0.06345999187873272
  R²: 0.9999967737794828
Metrics for SVM (Silver):
  RMSE: 0.023601827185278475
  MAE: 0.015077975206972882
  R²: 0.9999995711802342
Metrics for SVM (Bronze):
  RMSE: 0.022905158816686012
  MAE: 0.01779970282734913
  R²: 0.9999996839583136
Metrics for MLP:
  RMSE: 6.79824357832811
  MAE: 4.555787078795895
  R²: 0.967942981248486


In [9]:
top_25_countries = summer_medals.nlargest(25, 'total_medals')
X_top_25 = top_25_countries[['gold_medals', 'silver_medals', 'bronze_medals', 'total_medals']].values
X_top_25_scaled = scaler.transform(X_top_25)

y_pred_top_25_rf = rf_model.predict(X_top_25_scaled)
print("Prédictions avec Random Forest pour le Top 25 des pays participants:", y_pred_top_25_rf)

Prédictions avec Random Forest pour le Top 25 des pays participants: [[2.31 4.21 4.9 ]
 [1.02 1.7  1.97]
 [0.97 1.81 1.93]
 [0.48 1.28 1.84]
 [0.48 1.28 1.84]
 [1.06 1.05 0.69]
 [0.74 0.86 1.51]
 [0.5  0.83 1.59]
 [0.35 0.87 0.9 ]
 [0.31 0.87 0.92]
 [0.35 0.87 0.9 ]
 [0.31 0.87 0.92]
 [0.31 0.87 0.92]
 [0.   0.9  0.92]
 [0.   0.9  0.92]
 [0.31 0.87 0.92]
 [0.   0.9  0.92]
 [0.   0.9  0.92]
 [0.   0.   1.  ]
 [0.18 0.67 0.27]
 [0.18 0.67 0.27]
 [0.18 0.67 0.27]
 [0.18 0.67 0.27]
 [0.18 0.67 0.27]
 [0.18 0.67 0.27]]


In [10]:
athlete_data = pd.DataFrame({
    'athlete_id': np.arange(1, 101),
    'past_performance': np.random.rand(100),
    'gold_medals': np.random.randint(0, 3, size=100),
    'silver_medals': np.random.randint(0, 3, size=100),
    'bronze_medals': np.random.randint(0, 3, size=100)
})

X_athletes = athlete_data[['past_performance', 'gold_medals', 'silver_medals', 'bronze_medals']].values
y_athletes = athlete_data[['gold_medals', 'silver_medals', 'bronze_medals']].values

X_athletes_train, X_athletes_test, y_athletes_train, y_athletes_test = train_test_split(X_athletes, y_athletes, test_size=0.2, random_state=42)

rf_model_athletes = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model_athletes.fit(X_athletes_train, y_athletes_train)
y_pred_athletes_rf = rf_model_athletes.predict(X_athletes_test)
print("Prédictions avec Random Forest pour les athlètes:", y_pred_athletes_rf)

# Comparaison des prédictions avec les valeurs réelles pour les athlètes
athlete_metrics = {
    "RMSE": np.sqrt(mean_squared_error(y_athletes_test, y_pred_athletes_rf)),
    "MAE": mean_absolute_error(y_athletes_test, y_pred_athletes_rf),
    "R²": r2_score(y_athletes_test, y_pred_athletes_rf)
}

print("Metrics pour la prédiction des athlètes:")
for metric_name, metric_value in athlete_metrics.items():
    print(f"  {metric_name}: {metric_value}")

Prédictions avec Random Forest pour les athlètes: [[0.27 1.34 1.89]
 [0.04 0.   0.01]
 [0.97 0.03 0.16]
 [0.97 0.03 0.16]
 [2.   2.   0.  ]
 [1.94 1.   0.04]
 [2.   0.01 0.  ]
 [1.11 0.01 1.99]
 [1.98 0.11 0.91]
 [0.98 0.07 0.91]
 [1.79 0.1  1.99]
 [1.75 0.13 1.97]
 [0.88 1.07 1.05]
 [0.13 1.05 0.97]
 [1.99 1.99 0.05]
 [0.02 0.03 0.  ]
 [0.35 1.84 0.13]
 [2.   0.99 1.01]
 [2.   2.   0.01]
 [1.02 1.   0.02]]
Metrics pour la prédiction des athlètes:
  RMSE: 0.127501633976458
  MAE: 0.06966666666666667
  R²: 0.9757512499496962
