In [18]:
from flask import jsonify
from connect_db import execute_query
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pickle
from app import app  # Importer l'application Flask

In [3]:
with app.app_context():
    query = "SELECT * FROM olympic_medals"
    data = execute_query(query)
    json_data = jsonify(data)
    
    # Extraire les données JSON de l'objet Response
    real_data = json_data.get_json()
    real_data

Connexion réussie
Connexion fermée


In [4]:
medals = pd.DataFrame(real_data)
medals.head()

Unnamed: 0,athlete_full_name,athlete_url,country_3_letter_code,country_code,country_name,discipline_title,event_gender,event_title,id,medal_type,participant_title,participant_type,slug_game
0,Stefania CONSTANTINI,https://olympics.com/en/athletes/stefania-cons...,ITA,IT,Italy,Curling,Mixed,Mixed Doubles,21698,GOLD,Italy,GameTeam,beijing-2022
1,Amos MOSANER,https://olympics.com/en/athletes/amos-mosaner,ITA,IT,Italy,Curling,Mixed,Mixed Doubles,21699,GOLD,Italy,GameTeam,beijing-2022
2,Kristin SKASLIEN,https://olympics.com/en/athletes/kristin-skaslien,NOR,NO,Norway,Curling,Mixed,Mixed Doubles,21700,SILVER,Norway,GameTeam,beijing-2022
3,Magnus NEDREGOTTEN,https://olympics.com/en/athletes/magnus-nedreg...,NOR,NO,Norway,Curling,Mixed,Mixed Doubles,21701,SILVER,Norway,GameTeam,beijing-2022
4,Almida DE VAL,https://olympics.com/en/athletes/almida-de-val,SWE,SE,Sweden,Curling,Mixed,Mixed Doubles,21702,BRONZE,Sweden,GameTeam,beijing-2022


In [5]:
# Renommer les colonnes si nécessaire pour faciliter l'accès
# medals.columns = ['','discipline_title', 'slug_game', 'event_title', 'event_gender', 'medal_type', 'participant_type', 'participant_title', 'athlete_url', 'athlete_full_name', 'country_name', 'country_code', 'country_3_letter_code']

medals.drop(['discipline_title', 'event_title', 'event_gender', 'participant_type', 'participant_title', 'athlete_url', 'athlete_full_name', 'country_code', 'country_3_letter_code'], axis=1, inplace=True)
# Créer un DataFrame avec les médailles agrégées par pays et par année
df_medals = medals.groupby(['country_name', 'slug_game', 'medal_type']).size().unstack(fill_value=0).reset_index()
print(df_medals.head(100))

medal_type country_name         slug_game  BRONZE  GOLD  SILVER
0           Afghanistan      beijing-2008       1     0       0
1           Afghanistan       london-2012       1     0       0
2               Algeria      atlanta-1996       1     2       0
3               Algeria    barcelona-1992       1     1       0
4               Algeria      beijing-2008       1     0       1
..                  ...               ...     ...   ...     ...
95              Austria  los-angeles-1984       1     1       1
96              Austria    melbourne-1956       4     0       0
97              Austria  mexico-city-1968       3     0       2
98              Austria     montreal-1976       1     0       0
99              Austria       moscow-1980       1     1       3

[100 rows x 5 columns]


In [6]:
df_medals.rename(columns= {'medal_type': '','country_name': 'country', 'slug_game': 'year', 'BRONZE': 'bronze', 'GOLD': 'gold', 'SILVER': 'silver'}, inplace=True)
df_medals

medal_type,country,year,bronze,gold,silver
0,Afghanistan,beijing-2008,1,0,0
1,Afghanistan,london-2012,1,0,0
2,Algeria,atlanta-1996,1,2,0
3,Algeria,barcelona-1992,1,1,0
4,Algeria,beijing-2008,1,0,1
...,...,...,...,...,...
1774,Zambia,atlanta-1996,0,0,1
1775,Zambia,los-angeles-1984,1,0,0
1776,Zimbabwe,athens-2004,1,1,1
1777,Zimbabwe,beijing-2008,0,1,3


In [7]:
df_medals['year'] = df_medals['year'].apply(lambda x: int(x.split('-')[-1]))
df_medals

medal_type,country,year,bronze,gold,silver
0,Afghanistan,2008,1,0,0
1,Afghanistan,2012,1,0,0
2,Algeria,1996,1,2,0
3,Algeria,1992,1,1,0
4,Algeria,2008,1,0,1
...,...,...,...,...,...
1774,Zambia,1996,0,0,1
1775,Zambia,1984,1,0,0
1776,Zimbabwe,2004,1,1,1
1777,Zimbabwe,2008,0,1,3


In [8]:
countries = df_medals['country'].unique()
i = 0
for country in countries:
    df_medals.loc[df_medals['country'] == country, 'countryId'] = i
    i += int(1)
df_medals 

medal_type,country,year,bronze,gold,silver,countryId
0,Afghanistan,2008,1,0,0,0.0
1,Afghanistan,2012,1,0,0,0.0
2,Algeria,1996,1,2,0,1.0
3,Algeria,1992,1,1,0,1.0
4,Algeria,2008,1,0,1,1.0
...,...,...,...,...,...,...
1774,Zambia,1996,0,0,1,152.0
1775,Zambia,1984,1,0,0,152.0
1776,Zimbabwe,2004,1,1,1,153.0
1777,Zimbabwe,2008,0,1,3,153.0


In [9]:
dict_country = {}
for i in range(len(df_medals)):
    id = df_medals.loc[i, 'countryId']
    if id not in dict_country:
        dict_country[id] = df_medals.loc[i, 'country']

dict_country

{0.0: 'Afghanistan',
 1.0: 'Algeria',
 2.0: 'Argentina',
 3.0: 'Armenia',
 4.0: 'Australasia',
 5.0: 'Australia',
 6.0: 'Austria',
 7.0: 'Azerbaijan',
 8.0: 'Bahamas',
 9.0: 'Bahrain',
 10.0: 'Barbados',
 11.0: 'Belarus',
 12.0: 'Belgium',
 13.0: 'Bermuda',
 14.0: 'Bohemia',
 15.0: 'Botswana',
 16.0: 'Brazil',
 17.0: 'Bulgaria',
 18.0: 'Burkina Faso',
 19.0: 'Burundi',
 20.0: 'Cameroon',
 21.0: 'Canada',
 22.0: 'Chile',
 23.0: 'Chinese Taipei',
 24.0: 'Colombia',
 25.0: 'Costa Rica',
 26.0: 'Croatia',
 27.0: 'Cuba',
 28.0: 'Cyprus',
 29.0: 'Czech Republic',
 30.0: 'Czechoslovakia',
 31.0: "Côte d'Ivoire",
 32.0: "Democratic People's Republic of Korea",
 33.0: 'Denmark',
 34.0: 'Djibouti',
 35.0: 'Dominican Republic',
 36.0: 'Ecuador',
 37.0: 'Egypt',
 38.0: 'Eritrea',
 39.0: 'Estonia',
 40.0: 'Ethiopia',
 41.0: 'Federal Republic of Germany',
 42.0: 'Fiji',
 43.0: 'Finland',
 44.0: 'France',
 45.0: 'Gabon',
 46.0: 'Georgia',
 47.0: 'German Democratic Republic (Germany)',
 48.0: 'Germany

In [10]:
import json

with open("contries.json", "w") as f:
    json.dump(dict_country, f)

In [11]:
# Séparation des caractéristiques et de la cible
mlX, mly = df_medals[['year', 'countryId']].values, df_medals[['gold', 'silver', 'bronze']].values

In [12]:
# Division des données en données d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(mlX, mly, test_size=0.30, random_state=0)

In [13]:
# Entrainement du modèle
modelml = RandomForestRegressor(max_depth=10000, random_state=0)
modelml.fit(mlX, mly)

In [17]:
modelml.predict([[2024, 1.0]])

array([[0.56, 1.3 , 1.24]])

In [15]:
# Importation des bibliothèques
import pickle
import os
# Création d'un dossier s'il n'existe pas
if not os.path.exists('./models'):
    os.mkdir('models')
    pass
# Sauvegarde du modèle
pickle.dump(modelml, open('./models/medals_y_c_model.pkl','wb'))