In [1]:
import pyodbc
import numpy as np
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.neural_network import MLPRegressor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
server = 'mia10.database.windows.net'
database = 'mia10_db'
username = 'user_reader'
password = '7R&o&o4#~756^z'
driver = '{ODBC Driver 18 for SQL Server}'

connection_string = f'DRIVER={driver};SERVER={server};PORT=1433;DATABASE={database};UID={username};PWD={password}'

conn = pyodbc.connect(connection_string)

query = '''
SELECT m.slug_game, m.event_title, m.medal_type, c.country_name, e.game_year
FROM medals m
JOIN events e ON LOWER(REPLACE(m.slug_game, '-', ' ')) = LOWER(e.game_name)
JOIN country c ON m.athlete_country_id = c.country_id
WHERE e.game_season = 'Summer'
'''

df = pd.read_sql(query, conn, index_col=None)

conn.close()

  df = pd.read_sql(query, conn, index_col=None)


In [3]:
import pandas as pd

country_mapping = {
    "Russian Federation": "Russia",
    "Soviet Union": "Russia",
    "Olympic Athletes from Russia": "Russia",
    "ROC": "China",
    "Federal Republic of Germany": "Germany",
    "German Democratic Republic (Germany)": "Germany",
    "United States of America": "USA",
    "Czechoslovakia": "Czech Republic",
    "Serbia and Montenegro": "Serbia",
    "Yugoslavia": "Serbia",
    "Unified Team": "Russia",
    "United Republic of Tanzania": "Tanzania",
    "North Macedonia": "Macedonia",
    "Hong Kong, China": "China",
    "West Indies Federation": "West Indies",
    "Bohemia": "Czech Republic",
    "Australasia": "Australia",
    "United Arab Republic": "Egypt",
    "People's Republic of China": "China",
}

def map_country_names(data, mapping):
    data['country_name'] = data['country_name'].replace(mapping)
    return data

df = map_country_names(df, country_mapping)

df['country_name'].unique()

array(['Sweden', 'Indonesia', 'Slovenia', 'Australia', 'Hungary',
       'Czech Republic', 'Poland', 'Norway', 'China', 'Republic of Korea',
       'Ireland', 'Russia', 'Germany', 'Switzerland', 'Malaysia',
       'Estonia', 'Ukraine', 'Serbia', 'Cuba', 'New Zealand', 'France',
       'Brazil', 'Netherlands', 'Chile', 'USA', 'Belgium', 'Canada',
       'Bulgaria', 'Greece', 'Mexico', 'Qatar', 'Latvia',
       'Chinese Taipei', 'Republic of Moldova', 'Italy', 'Romania',
       "Democratic People's Republic of Korea", 'Austria', 'South Africa',
       'Lithuania', 'Uruguay', 'Finland', 'Islamic Republic of Iran',
       'Denmark', 'Bahamas', 'Japan', 'Portugal', 'Belarus', 'India',
       'Spain', 'Slovakia', 'Croatia', 'Great Britain',
       'Independent Olympic Athletes', 'San Marino', 'Argentina', 'MIX'],
      dtype=object)

In [4]:
countries_rio_2016 = df[df['slug_game'] == 'rio-2016']['country_name'].unique()
df_filtered = df[df['country_name'].isin(countries_rio_2016)]

# grouper par pays, édition et type de médaille
grouped = df_filtered.groupby(['country_name', 'slug_game', 'medal_type']).size().unstack(fill_value=0)

grouped.columns = [f'{col}_count' for col in grouped.columns]

grouped.reset_index(inplace=True)

grouped.head(1000)

Unnamed: 0,country_name,slug_game,BRONZE_count,GOLD_count,SILVER_count
0,Argentina,athens-2004,4,0,0
1,Argentina,atlanta-1996,0,0,1
2,Argentina,barcelona-1992,2,0,0
3,Argentina,beijing-2008,2,2,0
4,Argentina,berlin-1936,2,0,0
...,...,...,...,...,...
442,Ukraine,beijing-2008,2,0,0
443,Ukraine,london-2012,2,0,0
444,Ukraine,rio-2016,2,0,0
445,Ukraine,sydney-2000,4,0,0


In [5]:
# Encodage des noms de pays
encoder = OneHotEncoder(sparse_output=False)
country_encoded = encoder.fit_transform(grouped[['country_name']])
country_encoded_df = pd.DataFrame(country_encoded, columns=encoder.get_feature_names_out(['country_name']))

grouped = pd.concat([grouped, country_encoded_df], axis=1) # Ajouter les colonnes encodées au DataFrame

# caractéristiques supplémentaires
grouped['total_medals'] = grouped['BRONZE_count'] + grouped['GOLD_count'] + grouped['SILVER_count']
grouped['gold_silver_diff'] = grouped['GOLD_count'] - grouped['SILVER_count']
grouped['gold_bronze_diff'] = grouped['GOLD_count'] - grouped['BRONZE_count']

# préparation des données pour random forest
def create_sequences(data, time_steps=3):
    Xs, ys = [], []
    for i in range(len(data) - time_steps):
        Xs.append(data.iloc[i:(i + time_steps), 2:].values)
        ys.append(data.iloc[i + time_steps, [3, 4, 2]].values) # assuming [3, 4, 2] are the target columns
    return np.array(Xs, dtype=np.float32), np.array(ys, dtype=np.float32)

# séquences
time_steps = 3
X_seq, y_seq = create_sequences(grouped, time_steps)

# Standardiser les données
scaler = StandardScaler()
X_seq_shape = X_seq.shape
X_seq = scaler.fit_transform(X_seq.reshape(-1, X_seq.shape[-1])).reshape(X_seq_shape)

X_seq = X_seq.reshape(X_seq.shape[0], -1) # Aplatir les données

X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)

def objective(trial):
    # hyperparamètres
    n_estimators = trial.suggest_int('n_estimators', 100, 300)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    max_depth = trial.suggest_int('max_depth', 10, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)
    
    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_features=max_features,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    return mse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print(f'Best Hyperparameters: {study.best_params}')

best_model = RandomForestRegressor(
    **study.best_params,
    random_state=42
)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

# évaluer
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

[I 2024-05-31 09:50:27,506] A new study created in memory with name: no-name-f9ef8e99-4df5-4b33-96e3-6217d7f5ddad
[I 2024-05-31 09:50:27,679] Trial 0 finished with value: 5.707262365160397 and parameters: {'n_estimators': 133, 'max_features': 'sqrt', 'max_depth': 25, 'min_samples_split': 9, 'min_samples_leaf': 4}. Best is trial 0 with value: 5.707262365160397.
[I 2024-05-31 09:50:27,912] Trial 1 finished with value: 5.721252689377541 and parameters: {'n_estimators': 191, 'max_features': 'sqrt', 'max_depth': 14, 'min_samples_split': 10, 'min_samples_leaf': 3}. Best is trial 0 with value: 5.707262365160397.
[I 2024-05-31 09:50:28,172] Trial 2 finished with value: 5.88327667923161 and parameters: {'n_estimators': 201, 'max_features': 'sqrt', 'max_depth': 17, 'min_samples_split': 7, 'min_samples_leaf': 2}. Best is trial 0 with value: 5.707262365160397.
[I 2024-05-31 09:50:28,500] Trial 3 finished with value: 6.300402316420132 and parameters: {'n_estimators': 181, 'max_features': 'sqrt', 'm

Best Hyperparameters: {'n_estimators': 123, 'max_features': 'log2', 'max_depth': 29, 'min_samples_split': 9, 'min_samples_leaf': 3}
Mean Squared Error: 5.606297924655817
R^2 Score: 0.1314099586184205


In [6]:
def predict_medals_next_edition(country, model, scaler, data, encoder, time_steps=3):
    # encodage du pays
    country_encoded = encoder.transform([[country]])
    country_encoded_df = pd.DataFrame(country_encoded, columns=encoder.get_feature_names_out(['country_name']))
    
    # Sélectionner les données du pays
    country_data = data[data['country_name'] == country].sort_values(by='slug_game').iloc[-time_steps:, 2:]
    country_data = pd.concat([country_data.reset_index(drop=True), country_encoded_df], axis=1)
    
    if country_data.shape[0] < time_steps:
        raise ValueError(f"Not enough data for {country} to create a prediction with {time_steps} time steps.")
    
    # Standardiser les données
    expected_features = scaler.n_features_in_
    current_features = country_data.shape[1]
    
    if current_features < expected_features:
        # ajout des 0 si nécessaire
        missing_features = expected_features - current_features
        country_data = np.hstack([country_data.values, np.zeros((country_data.shape[0], missing_features))])
    elif current_features > expected_features:
        # sup des colonnes si nécessaire
        country_data = country_data.iloc[:, :expected_features]

    country_data = scaler.transform(country_data)
    
    # aplatir les données
    country_data = country_data.reshape(1, -1)
    
    predicted_medals = model.predict(country_data)
    return pd.DataFrame(predicted_medals, columns=['GOLD_count', 'SILVER_count', 'BRONZE_count'])

In [7]:
def predict_all_countries_next_edition(model, scaler, data, encoder, time_steps=3):
    unique_countries = data['country_name'].unique()
    predictions = []

    for country in unique_countries:
        try:
            predicted_medals = predict_medals_next_edition(country, model, scaler, data, encoder, time_steps)
            predicted_medals['country_name'] = country
            predictions.append(predicted_medals)
        except ValueError as e:
            print(f"Skipping country {country}: {e}")

    predictions_df = pd.concat(predictions).reset_index(drop=True)
    return predictions_df

predictions_df = predict_all_countries_next_edition(best_model, scaler, grouped, encoder)

total_predicted_medals = predictions_df[['GOLD_count', 'SILVER_count', 'BRONZE_count']].sum().sum()

# calculer le facteur de normalisation pour ajuster les médailles à la somme cible (987)
medal_target = 987
normalization_factor = medal_target / total_predicted_medals

# appliquer aux prédictions
predictions_df[['GOLD_count', 'SILVER_count', 'BRONZE_count']] *= normalization_factor

# arrondi entier le plus proche
predictions_df[['GOLD_count', 'SILVER_count', 'BRONZE_count']] = predictions_df[['GOLD_count', 'SILVER_count', 'BRONZE_count']].round().astype(int)

predictions_df['total_medals'] = predictions_df[['GOLD_count', 'SILVER_count', 'BRONZE_count']].sum(axis=1)

# sort par nombre total de médailles si égalité prendre or
predictions_df = predictions_df.sort_values(by=['total_medals', 'GOLD_count'], ascending=False)

predictions_df.head(50)



Skipping country Lithuania: Not enough data for Lithuania to create a prediction with 3 time steps.




Unnamed: 0,GOLD_count,SILVER_count,BRONZE_count,country_name,total_medals
5,38,32,23,China,93
11,18,19,14,Germany,51
33,16,16,13,USA,45
4,20,12,12,Canada,44
1,16,13,15,Australia,44
27,15,12,11,Russia,38
12,13,13,12,Great Britain,38
26,13,11,9,Romania,33
21,10,9,13,Netherlands,32
24,8,11,12,Poland,31


In [8]:
# sauvegarde modèle
# import joblib

# joblib.dump(best_model, 'model.pkl')

## Métriques SVM

In [9]:
y_train_gold = y_train[:, 0]
y_train_silver = y_train[:, 1]
y_train_bronze = y_train[:, 2]

y_test_gold = y_test[:, 0]
y_test_silver = y_test[:, 1]
y_test_bronze = y_test[:, 2]

# modèles SVM
svr_gold = make_pipeline(StandardScaler(), SVR(C=1.0, kernel='rbf', gamma='scale'))
svr_silver = make_pipeline(StandardScaler(), SVR(C=1.0, kernel='rbf', gamma='scale'))
svr_bronze = make_pipeline(StandardScaler(), SVR(C=1.0, kernel='rbf', gamma='scale'))

# entraînement
svr_gold.fit(X_train, y_train[:, 0])
svr_silver.fit(X_train, y_train[:, 1])
svr_bronze.fit(X_train, y_train[:, 2])

# prédictions
y_pred_svr_gold = svr_gold.predict(X_test)
y_pred_svr_silver = svr_silver.predict(X_test)
y_pred_svr_bronze = svr_bronze.predict(X_test)

print(f'SVM Gold Mean Squared Error: {mean_squared_error(y_test[:, 0], y_pred_svr_gold)}')
print(f'SVM Gold R^2 Score: {r2_score(y_test[:, 0], y_pred_svr_gold)}')

print(f'SVM Silver Mean Squared Error: {mean_squared_error(y_test[:, 1], y_pred_svr_silver)}')
print(f'SVM Silver R^2 Score: {r2_score(y_test[:, 1], y_pred_svr_silver)}')

print(f'SVM Bronze Mean Squared Error: {mean_squared_error(y_test[:, 2], y_pred_svr_bronze)}')
print(f'SVM Bronze R^2 Score: {r2_score(y_test[:, 2], y_pred_svr_bronze)}')

# y_pred_svr = np.stack([y_pred_gold, y_pred_silver, y_pred_bronze], axis=1)

SVM Gold Mean Squared Error: 9.999978099435765
SVM Gold R^2 Score: -0.019195352295526158
SVM Silver Mean Squared Error: 7.085377448655731
SVM Silver R^2 Score: -0.02974706641784297
SVM Bronze Mean Squared Error: 2.9121830513777036
SVM Bronze R^2 Score: 0.0736727105865137


## Métriques MLP

In [10]:
mlp_model = MLPRegressor(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.0001, max_iter=500, random_state=42)
mlp_model.fit(X_train, y_train)

y_pred_mlp = mlp_model.predict(X_test)
mse_mlp = mean_squared_error(y_test, y_pred_mlp)
r2_mlp = r2_score(y_test, y_pred_mlp)

print(f'MLP Mean Squared Error: {mse_mlp}')
print(f'MLP R^2 Score: {r2_mlp}')

MLP Mean Squared Error: 12.000678062438965
MLP R^2 Score: -1.0694584341596853




## Prédictions SVM

In [11]:
def predict_medals_next_edition_svr(country, models, scaler, data, encoder, time_steps=3):
    country_encoded = encoder.transform([[country]])
    country_encoded_df = pd.DataFrame(country_encoded, columns=encoder.get_feature_names_out(['country_name']))
    
    country_data = data[data['country_name'] == country].sort_values(by='slug_game').iloc[-time_steps:, 2:]
    country_data = pd.concat([country_data.reset_index(drop=True), country_encoded_df], axis=1)
    
    if country_data.shape[0] < time_steps:
        raise ValueError(f"Not enough data for {country} to create a prediction with {time_steps} time steps.")
    
    expected_features = scaler.n_features_in_
    current_features = country_data.shape[1]
    
    if current_features < expected_features:
        missing_features = expected_features - current_features
        country_data = np.hstack([country_data.values, np.zeros((country_data.shape[0], missing_features))])
    elif current_features > expected_features:
        country_data = country_data.iloc[:, :expected_features]

    country_data = scaler.transform(country_data)
    country_data = country_data.reshape(1, -1)
    
    predicted_gold = models['gold'].predict(country_data)
    predicted_silver = models['silver'].predict(country_data)
    predicted_bronze = models['bronze'].predict(country_data)
    
    predicted_medals = np.stack([predicted_gold, predicted_silver, predicted_bronze], axis=1)
    return pd.DataFrame(predicted_medals, columns=['GOLD_count', 'SILVER_count', 'BRONZE_count'])

def predict_all_countries_next_edition_svr(models, scaler, data, encoder, time_steps=3):
    unique_countries = data['country_name'].unique()
    predictions = []

    for country in unique_countries:
        try:
            predicted_medals = predict_medals_next_edition_svr(country, models, scaler, data, encoder, time_steps)
            predicted_medals['country_name'] = country
            predictions.append(predicted_medals)
        except ValueError as e:
            print(f"Skipping country {country}: {e}")

    predictions_df = pd.concat(predictions).reset_index(drop=True)
    return predictions_df

svm_models = {'gold': svr_gold, 'silver': svr_silver, 'bronze': svr_bronze}

# Prédictions pour SVM
predictions_df_svr = predict_all_countries_next_edition_svr(svm_models, scaler, grouped, encoder)

total_predicted_medals_svr = predictions_df_svr[['GOLD_count', 'SILVER_count', 'BRONZE_count']].sum().sum()
normalization_factor_svr = medal_target / total_predicted_medals_svr
predictions_df_svr[['GOLD_count', 'SILVER_count', 'BRONZE_count']] *= normalization_factor_svr
predictions_df_svr[['GOLD_count', 'SILVER_count', 'BRONZE_count']] = predictions_df_svr[['GOLD_count', 'SILVER_count', 'BRONZE_count']].round().astype(int)
predictions_df_svr['total_medals'] = predictions_df_svr[['GOLD_count', 'SILVER_count', 'BRONZE_count']].sum(axis=1)
predictions_df_svr = predictions_df_svr.sort_values(by=['total_medals', 'GOLD_count'], ascending=False)
predictions_df_svr.head(50)


Skipping country Lithuania: Not enough data for Lithuania to create a prediction with 3 time steps.




Unnamed: 0,GOLD_count,SILVER_count,BRONZE_count,country_name,total_medals
27,25,15,11,Russia,51
12,15,19,17,Great Britain,51
33,17,19,13,USA,49
26,17,17,12,Romania,46
11,11,23,10,Germany,44
25,21,5,15,Republic of Korea,41
5,10,15,16,China,41
14,11,10,19,Hungary,40
24,4,17,18,Poland,39
22,13,14,6,New Zealand,33


## Prédictions MLP

In [12]:
predictions_df_mlp = predict_all_countries_next_edition(mlp_model, scaler, grouped, encoder)

# prédictions MLP
total_predicted_medals_mlp = predictions_df_mlp[['GOLD_count', 'SILVER_count', 'BRONZE_count']].sum().sum()
normalization_factor_mlp = medal_target / total_predicted_medals_mlp
predictions_df_mlp[['GOLD_count', 'SILVER_count', 'BRONZE_count']] *= normalization_factor_mlp
predictions_df_mlp[['GOLD_count', 'SILVER_count', 'BRONZE_count']] = predictions_df_mlp[['GOLD_count', 'SILVER_count', 'BRONZE_count']].round().astype(int)
predictions_df_mlp['total_medals'] = predictions_df_mlp[['GOLD_count', 'SILVER_count', 'BRONZE_count']].sum(axis=1)
predictions_df_mlp = predictions_df_mlp.sort_values(by=['total_medals', 'GOLD_count'], ascending=False)
predictions_df_mlp.head(50)

Skipping country Lithuania: Not enough data for Lithuania to create a prediction with 3 time steps.




Unnamed: 0,GOLD_count,SILVER_count,BRONZE_count,country_name,total_medals
5,75,122,-7,China,190
4,82,28,22,Canada,132
0,33,20,44,Argentina,97
26,20,25,19,Romania,64
24,12,23,19,Poland,54
2,17,10,6,Austria,33
18,4,8,20,Japan,32
14,8,11,12,Hungary,31
27,9,6,13,Russia,28
16,-3,6,25,Ireland,28


## Prédictions fusionnées

In [13]:
# prédictions Random Forest
y_pred_rf = best_model.predict(X_test)

# prédictions SVM
y_pred_svr_gold = svr_gold.predict(X_test)
y_pred_svr_silver = svr_silver.predict(X_test)
y_pred_svr_bronze = svr_bronze.predict(X_test)
y_pred_svr = np.stack([y_pred_svr_gold, y_pred_svr_silver, y_pred_svr_bronze], axis=1)

# prédictions MLP
y_pred_mlp = mlp_model.predict(X_test)

In [14]:
# moyenne simple
y_pred_ensemble = (y_pred_rf + y_pred_svr + y_pred_mlp) / 3

# moyenne pondéré
weights = {'rf': 0.5, 'svr': 0.25, 'mlp': 0.25}
y_pred_ensemble = (weights['rf'] * y_pred_rf + weights['svr'] * y_pred_svr + weights['mlp'] * y_pred_mlp)

In [15]:
mse_ensemble = mean_squared_error(y_test, y_pred_ensemble)
r2_ensemble = r2_score(y_test, y_pred_ensemble)

print(f'Ensemble Mean Squared Error: {mse_ensemble}')
print(f'Ensemble R^2 Score: {r2_ensemble}')

Ensemble Mean Squared Error: 6.0103613412327626
Ensemble R^2 Score: 0.06497216158869963


In [16]:
def predict_medals_next_edition_ensemble(country, models, scaler, data, encoder, time_steps=3):
    country_encoded = encoder.transform([[country]])
    country_encoded_df = pd.DataFrame(country_encoded, columns=encoder.get_feature_names_out(['country_name']))

    country_data = data[data['country_name'] == country].sort_values(by='slug_game').iloc[-time_steps:, 2:]
    country_data = pd.concat([country_data.reset_index(drop=True), country_encoded_df], axis=1)

    if country_data.shape[0] < time_steps:
        raise ValueError(f"Not enough data for {country} to create a prediction with {time_steps} time steps.")

    expected_features = scaler.n_features_in_
    current_features = country_data.shape[1]

    if current_features < expected_features:
        missing_features = expected_features - current_features
        country_data = np.hstack([country_data.values, np.zeros((country_data.shape[0], missing_features))])
    elif current_features > expected_features:
        country_data = country_data.iloc[:, :expected_features]

    country_data = scaler.transform(country_data)
    country_data = country_data.reshape(1, -1)

    predicted_rf = models['rf'].predict(country_data)
    predicted_svr_gold = models['svr_gold'].predict(country_data)
    predicted_svr_silver = models['svr_silver'].predict(country_data)
    predicted_svr_bronze = models['svr_bronze'].predict(country_data)
    predicted_svr = np.stack([predicted_svr_gold, predicted_svr_silver, predicted_svr_bronze], axis=1)
    predicted_mlp = models['mlp'].predict(country_data)

    predicted_ensemble = (weights['rf'] * predicted_rf + weights['svr'] * predicted_svr + weights['mlp'] * predicted_mlp)

    return pd.DataFrame(predicted_ensemble, columns=['GOLD_count', 'SILVER_count', 'BRONZE_count'])

def predict_all_countries_next_edition_ensemble(models, scaler, data, encoder, time_steps=3):
    unique_countries = data['country_name'].unique()
    predictions = []

    for country in unique_countries:
        try:
            predicted_medals = predict_medals_next_edition_ensemble(country, models, scaler, data, encoder, time_steps)
            predicted_medals['country_name'] = country
            predictions.append(predicted_medals)
        except ValueError as e:
            print(f"Skipping country {country}: {e}")

    predictions_df = pd.concat(predictions).reset_index(drop=True)
    return predictions_df

# modèles à utiliser
models_ensemble = {
    'rf': best_model,
    'svr_gold': svr_gold,
    'svr_silver': svr_silver,
    'svr_bronze': svr_bronze,
    'mlp': mlp_model
}

# prédictions
predictions_df_ensemble = predict_all_countries_next_edition_ensemble(models_ensemble, scaler, grouped, encoder)

total_predicted_medals_ensemble = predictions_df_ensemble[['GOLD_count', 'SILVER_count', 'BRONZE_count']].sum().sum()
normalization_factor_ensemble = medal_target / total_predicted_medals_ensemble
predictions_df_ensemble[['GOLD_count', 'SILVER_count', 'BRONZE_count']] *= normalization_factor_ensemble
predictions_df_ensemble[['GOLD_count', 'SILVER_count', 'BRONZE_count']] = predictions_df_ensemble[['GOLD_count', 'SILVER_count', 'BRONZE_count']].round().astype(int)
predictions_df_ensemble['total_medals'] = predictions_df_ensemble[['GOLD_count', 'SILVER_count', 'BRONZE_count']].sum(axis=1)
predictions_df_ensemble = predictions_df_ensemble.sort_values(by=['total_medals', 'GOLD_count'], ascending=False)

predictions_df_ensemble.head(50)

Skipping country Lithuania: Not enough data for Lithuania to create a prediction with 3 time steps.




Unnamed: 0,GOLD_count,SILVER_count,BRONZE_count,country_name,total_medals
5,42,51,14,China,107
4,32,16,15,Canada,63
0,14,11,22,Argentina,47
26,16,16,12,Romania,44
33,14,16,11,USA,41
11,12,18,9,Germany,39
27,16,11,11,Russia,38
24,8,15,15,Poland,38
12,10,14,13,Great Britain,37
1,10,9,15,Australia,34
