In [51]:
# Importation des bibliothèques Python
import pandas as pd
import os

In [52]:
# Charger les données
df = pd.read_csv('archive_raw/olympic_results.csv')

In [53]:
# Création de la liste des pays uniques, triés, et sauvegardée
unique_countries = pd.DataFrame(data=sorted(df['country_name'].unique()), columns=['country_name'])
unique_countries.to_csv('archive_raw/country_names.csv', index=False)

In [54]:
# Choix des slugs des jeux Olympiques (de 1948 à 2020)
game_slugs = [
    'tokyo-2020', 
    'rio-2016', 
    'london-2012',
    'beijing-2008',
    'athens-2004', 
    'sydney-2000', 
    'atlanta-1996', 
    'barcelona-1992', 
    'seoul-1988',
    'los-angeles-1984',
    'moscow-1980',
    'montreal-1976',
    'munich-1972',
    'mexico-city-1968',
    'tokyo-1964',
    'rome-1960',
    'melbourne-1956',
    'helsinki-1952',
    'london-1948'
]


In [55]:
# Filtrer les éditions d'été des JO avec les slugs
df = df[df['slug_game'].apply(lambda x: x in game_slugs)]

In [56]:
# Préparer la séparation des éditions pour les périodes 1948-1988 et 1992-2020 (JO 2020 ont eu lieu en 2021)
editions_before_1988 = [
    'seoul-1988', 'los-angeles-1984', 'moscow-1980', 'montreal-1976', 'munich-1972',
    'mexico-city-1968', 'tokyo-1964', 'rome-1960', 'melbourne-1956', 'helsinki-1952', 'london-1948'
]
editions_after_1988 = [
    'tokyo-2020', 'rio-2016', 'london-2012', 'beijing-2008', 'athens-2004', 'sydney-2000',
    'atlanta-1996', 'barcelona-1992'
]

In [57]:
# Préparer le filtrage des sports pour les périodes 1948-1988 et 1992-2020 (JO 2020 ont eu lieu en 2021)
sports_summer_before_1988 = [
    'Shooting', 'Diving', 'Canoe Sprint', 'Cycling Road', 'Football', 'Boxing', 'Basketball',
    'Cycling Track', 'Fencing', 'Water Polo', 'Wrestling', 'Artistic Gymnastics', 'Weightlifting',
    'Modern Pentathlon', 'Hockey', 'Athletics', 'Swimming', 'Sailing', 'Rowing'
]
sports_summer_after_1988 = [
    'Shooting', 'Diving', 'Canoe Sprint', 'Cycling Road', 'Football', 'Boxing', 'Basketball',
    'Cycling Track', 'Fencing', 'Table Tennis', 'Badminton', 'Water Polo', 'Wrestling',
    'Artistic Gymnastics', 'Canoe Slalom', 'Rhythmic Gymnastics', 'Weightlifting', 'Modern Pentathlon',
    'Hockey', 'Volleyball', 'Artistic Swimming', 'Athletics', 'Swimming', 'Sailing', 'Rowing',
    'Tennis', 'Equestrian', 'Archery', 'Handball', 'Judo'
]

In [58]:
# Filtrer les sports pour les périodes 1948-1988 et 1992-2020 (JO 2020 ont eu lieu en 2021)
df_summer_before_1988 = df[df['discipline_title'].apply(lambda x: x in sports_summer_before_1988)]
df_summer_before_1988 = df_summer_before_1988[df_summer_before_1988['slug_game'].apply(lambda x: x in editions_before_1988)]

df_summer_after_1988  = df[df['discipline_title'].apply(lambda x: x in sports_summer_after_1988)]
df_summer_after_1988  = df_summer_after_1988[df_summer_after_1988['slug_game'].apply(lambda x: x in editions_after_1988)]

In [59]:
# Extraire les rangs pour les positions <= n (par défaut : 10)
def extract_n_first(df, n_athletes=10):
    df['rank_position'] = pd.to_numeric(df['rank_position'], errors='coerce')
    df_cl = df.dropna(subset=['rank_position'])
    df_cl['rank_position'] = df_cl['rank_position'].astype(int)
    df_cl = df_cl[df_cl['rank_position'] <= n_athletes]
    return df_cl

# Application de la fonction avec n_athletes, groupement par pays et par édition
n_athletes = 10

filtered_df_before = extract_n_first(df_summer_before_1988, n_athletes)
df_before          = filtered_df_before.groupby(by=['country_name', 'slug_game']).count().reset_index()

filtered_df        = extract_n_first(df_summer_after_1988, n_athletes)
df_after           = filtered_df.groupby(by=['country_name', 'slug_game']).count().reset_index()

df_clean_before    = df_before[['country_name', 'rank_position', 'slug_game']].copy()
df_clean_after     = df_after [['country_name', 'rank_position', 'slug_game']].copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['rank_position'] = df_cl['rank_position'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['rank_position'] = df_cl['rank_position'].astype(int)


In [60]:
# Extraire l'année et créer un tableau croisé dynamique en une seule étape
df_clean_before['year'] = df_clean_before['slug_game'].str.extract(r'(\d{4})')
df_pivot = df_clean_before.pivot_table(index='country_name', columns='year', values='rank_position', aggfunc='first')

df_pivot.reset_index(inplace=True)

# Remplacer les valeurs manquantes par 0
df_pivot.fillna(0, inplace=True)

# Transposer et réinitialiser l'index, puis utiliser la première ligne comme noms de colonnes
df_pivot         = df_pivot.transpose()
df_pivot         = df_pivot.iloc[0:].reset_index(drop=True)
df_pivot.columns = df_pivot.iloc[0]
first_part       = df_pivot.drop(0).reset_index(drop=True)

In [61]:
# Extraire l'année et créer un tableau croisé dynamique en une seule étape
df_clean_after['year'] = df_clean_after['slug_game'].str.extract(r'(\d{4})')
df_pivot = df_clean_after.pivot_table(index='country_name', columns='year', values='rank_position', aggfunc='first')

df_pivot.reset_index(inplace=True)

# Remplacer les valeurs manquantes par 0
df_pivot.fillna(0, inplace=True)

# Transposer et réinitialiser l'index, puis utiliser la première ligne comme noms de colonnes
df_pivot         = df_pivot.transpose()
df_pivot         = df_pivot.iloc[0:].reset_index(drop=True)
df_pivot.columns = df_pivot.iloc[0]
second_part      = df_pivot.drop(0).reset_index(drop=True)



In [62]:
# Définition des années pour les fichiers finaux
years = [1948, 1952, 1956, 1960, 1964, 1968, 1972, 1976, 1980, 1984, 
         1988, 1992, 1996, 2000, 2004, 2008, 2012, 2016, 2020]

In [63]:
# Ajout des années et organisation de first_part pour 1948-1988
first_part.rename(columns={'Unnamed: 0': 'year'}, inplace=True)
first_part['year'] = years[:11]
first_part.set_index('year', inplace=True)

# Ajout des années et organisation de second_part pour 1992-2020 (JO 2020 ont eu lieu en 2021)
second_part.rename(columns={'Unnamed: 0': 'year'}, inplace=True)
second_part['year'] = years[11:]
second_part.set_index('year', inplace=True)

first_part.to_csv('first_part.csv')
second_part.to_csv('second_part.csv')


In [64]:
# Sauvegarde de la version brute de la liste des pays
country_list = second_part.columns[:].tolist()
country_list.sort()
with open('country_list_brut.txt', 'w') as f:
    for country in country_list:
        f.write(f"{country}\n")

In [65]:
# Mapping des noms de pays et nettoyage
name_mapping = {
    "Democratic People's Republic of Korea" : "North Korea",
    "Hong Kong, China" : "Hong Kong China",
    "Islamic Republic of Iran" : "Iran",
    "People's Republic of China" : "China",
    "Republic of Korea" : "South Korea",
    "Republic of Moldova" : "Moldova",
    "Russian Federation" : "Russia",
    "Saint Kitts and Nevis" : "St. Kitts and Nevis",
    "Saint Lucia" : "St. Lucia",
    "Swaziland" : "Eswatini",
    "Syrian Arab Republic" : "Syria",
    "United Arab Emirates" : "UAE",
    "United Republic of Tanzania" : "Tanzania",
    "United States of America" : "USA",
    "Congo" : "Congo Rep.",
    "Democratic Republic of the Congo" : "Congo Dem. Rep."
}

# Liste des pays à supprimer dans la liste des délégations olympiques:
to_del = [
    "Virgin Islands, British", 
    "US Virgin Islands", 
    "The Former Yugoslav Republic of Macedonia",
    "Serbia and Montenegro", 
    "Refugee Olympic Team", 
    "Refugee Olympic Athletes", 
    "ROC", 
    "Puerto Rico", 
    "Netherlands Antilles", 
    "Kosovo",
    "Independent Olympic Athletes", 
    "Guam", 
    "Czechoslovakia", 
    "Chinese Taipei", 
    "Bermuda", 
    "Aruba", 
    "American Samoa",
    "Unified Team",
    "Cayman Islands",
    "Côte d'Ivoire",
    
	"Andorra",
	"Bahrain",
	"Barbados",
	"Cape Verde",
	"Congo Rep.",
	"Federated States of Micronesia",
	"Grenada",
	"Kiribati",
	"Malta",
	"Mauritius",
	"Monaco",
	"Nauru",
	"North Macedonia",
	"Samoa",
	"San Marino",
	"Seychelles",
	"Singapore",
	"St. Kitts and Nevis",
	"St. Lucia",
	"Tonga"]

In [66]:
# Copie et nettoyage des données de second_part pour les noms de pays et les colonnes à supprimer
second_part_ = second_part.copy()
second_part_ = second_part_.rename(columns=name_mapping)
second_part_ = second_part_.drop(columns=[col for col in second_part_.columns if col in to_del])


In [67]:
# Sauvegarde de la version nettoyée de la liste des pays
country_list = second_part_.columns[:].tolist()
country_list.sort()
with open('list_post_traitement.txt', 'w') as f:
    for country in country_list:
        f.write(f"{country}\n")

In [68]:
# Sauvegarde de la version nettoyée de second_part 


## Normalisation médailles

In [69]:
# normalisation des nombre de médailles avec min max scaler
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler, MinMaxScaler
second_part_.to_csv('second_part_countries_cleaned.csv')
medals = pd.read_csv('second_part_countries_cleaned.csv')
no_nor = medals.copy()
no_nor = medals.reindex(sorted(medals.columns), axis=1)
no_nor.to_csv('second_part_countries_cleaned_no_normalized.csv', index=False)

normalized_values = normalize(medals.iloc[:, 1:], axis=1, norm='max')  # Option: norm='l1' or 'l2' for other norms

normalized_medals_df = pd.concat([medals.iloc[:, 0], pd.DataFrame(normalized_values)], axis=1)
normalized_medals_df.columns = medals.columns
# trier les pays par ordre alphabetique sur axe x

medals = normalized_medals_df.reindex(sorted(medals.columns), axis=1)
medals.to_csv('second_part_countries_cleaned_normalized.csv', index=False)

In [70]:
medals['Germany']

0    0.853982
1    0.890909
2    0.857843
3    0.739796
4    0.610000
5    0.638498
6    0.580952
7    0.550847
Name: Germany, dtype: float64

In [71]:
data_cl = pd.read_csv('second_part_countries_cleaned_normalized.csv')
data_cl

Unnamed: 0,Albania,Algeria,Angola,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahamas,Bangladesh,...,USA,Uganda,Ukraine,Uruguay,Uzbekistan,Venezuela,Vietnam,Zambia,Zimbabwe,year
0,0.004425,0.061947,0.004425,0.053097,0.0,0.345133,0.097345,0.0,0.00885,0.0,...,1.0,0.0,0.0,0.0,0.0,0.022124,0.0,0.013274,0.0,1992
1,0.0,0.05,0.004545,0.081818,0.077273,0.477273,0.077273,0.036364,0.027273,0.0,...,1.0,0.009091,0.295455,0.004545,0.072727,0.027273,0.0,0.018182,0.004545,1996
2,0.004902,0.053922,0.004902,0.098039,0.034314,0.705882,0.112745,0.088235,0.044118,0.0,...,1.0,0.0,0.382353,0.009804,0.088235,0.044118,0.0,0.004902,0.004902,2000
3,0.0,0.056122,0.005102,0.086735,0.045918,0.653061,0.091837,0.096939,0.035714,0.0,...,1.0,0.015306,0.418367,0.010204,0.081633,0.045918,0.005102,0.005102,0.020408,2004
4,0.01,0.055,0.0,0.06,0.055,0.615,0.13,0.075,0.03,0.0,...,1.0,0.015,0.31,0.0,0.08,0.06,0.01,0.0,0.03,2008
5,0.014085,0.028169,0.00939,0.075117,0.032864,0.502347,0.075117,0.126761,0.028169,0.0,...,1.0,0.00939,0.276995,0.00939,0.089202,0.065728,0.023474,0.0,0.014085,2012
6,0.009524,0.061905,0.009524,0.090476,0.061905,0.580952,0.071429,0.142857,0.02381,0.0,...,1.0,0.014286,0.228571,0.004762,0.119048,0.061905,0.02381,0.004762,0.004762,2016
7,0.008475,0.038136,0.004237,0.072034,0.038136,0.487288,0.09322,0.067797,0.016949,0.004237,...,1.0,0.029661,0.258475,0.004237,0.122881,0.029661,0.008475,0.016949,0.0,2020


In [72]:
# création de labels pour entrainement de classification 
# classe 0 -> pays peut performant
# classe 1 -> pays moyennement performant
# classe 2 -> pays dans les meilleurs classement

def classify_2_class(row):
    threshold_1 = 0.30
    return row.apply(lambda x: 0 if x < threshold_1 else 1)

def classify_3_class(row):
    threshold_1 = 0.2
    threshold_2 = 0.8
    return row.apply(lambda x: 0 if x < threshold_1 else (1 if x < threshold_2 else 2))                     

year_column = data_cl['year']

classified_data_2 = data_cl.drop(columns=['year']).apply(classify_2_class, axis=1)
classified_data_2['year'] = year_column

classified_data_3 = data_cl.drop(columns=['year']).apply(classify_3_class, axis=1)
classified_data_3['year'] = year_column

classified_data_2.to_csv('labels_class_2_class.csv', index=False)
classified_data_3.to_csv('labels_class_3_class.csv', index=False)

In [73]:
classified_data_2.to_csv('sada.csv', index=False)


## traitement du climat

In [74]:
climat = pd.read_csv('climate_raw/climate_in_continuous.csv')
climat = climat.drop(columns=['Unnamed: 0'])


In [75]:

# récupérer les pays qui sont dans les deux datasets climat inter gapminder
climat = climat[climat['country'].isin(country_list)] 
# trier climat par pays
climat = climat.sort_values(by='country')
climat.to_csv('climate_cleaned.csv', index=False)