In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from deep_translator import GoogleTranslator
import pickle
import plotly.express as px

### Translation of Medals data

In [7]:
# Load the raw medal table from its CSV file and preview the first rows.
medals_csv_path = "data_csv/medailles.csv"
medals = pd.read_csv(medals_csv_path)
medals.head()

Unnamed: 0,Place,Pays,Or,Argent,Bronze,Total,Lieu,Annee
0,1,Etats Unis,38,41,34,113,Tokyo,2021
1,2,Chine,36,32,18,86,Tokyo,2021
2,3,Japon,27,12,16,55,Tokyo,2021
3,4,Grande Bretagne,22,21,22,65,Tokyo,2021
4,5,Russie,20,28,23,71,Tokyo,2021


In [8]:
# Replace the French column headers with English ones.
# The assignment is positional: the list must follow the column order of the loaded table.
english_headers = ['Place', 'Country', 'Gold', 'Silver', 'Bronze', 'Total', 'Location', 'Year']
medals.columns = english_headers
medals.head(15)

Unnamed: 0,Place,Country,Gold,Silver,Bronze,Total,Location,Year
0,1,Etats Unis,38,41,34,113,Tokyo,2021
1,2,Chine,36,32,18,86,Tokyo,2021
2,3,Japon,27,12,16,55,Tokyo,2021
3,4,Grande Bretagne,22,21,22,65,Tokyo,2021
4,5,Russie,20,28,23,71,Tokyo,2021
5,6,Australie,17,7,21,45,Tokyo,2021
6,7,Pays-Bas,10,12,14,36,Tokyo,2021
7,8,France,10,12,11,33,Tokyo,2021
8,9,Italie,10,10,20,40,Tokyo,2021
9,10,Allemagne,10,10,16,36,Tokyo,2021


In [9]:
# Translate every distinct country and location name from French to English.
# The result is a plain dict {original name: translated name}.
translator = GoogleTranslator(source='fr', target='en')

# Only the distinct values of each column are sent to the translator.
unique_countries = medals['Country'].unique()
unique_locations = medals['Location'].unique()

translations_dict = {}
failed_translations = {}  # original name -> error, for names that could not be translated

# Countries first, then locations (a name present in both columns keeps its last translation).
for names in (unique_countries, unique_locations):
    for name in names:
        try:
            translations_dict[name] = translator.translate(name)
        except Exception as error:
            # Best effort: keep the original word, but record the failure instead of hiding it.
            translations_dict[name] = name
            failed_translations[name] = repr(error)

# Display the translations dictionary
print(translations_dict)

# Make untranslated names visible so they can be fixed by hand.
if failed_translations:
    print(f"{len(failed_translations)} name(s) kept untranslated: {failed_translations}")

{'Etats Unis': 'UNITED STATES', 'Chine': 'China', 'Japon': 'Japan', 'Grande Bretagne': 'Britain', 'Russie': 'Russia', 'Australie': 'Australia', 'Pays-Bas': 'The Netherlands', 'France': 'France', 'Italie': 'Italy', 'Allemagne': 'Germany', 'Canada': 'Canada', 'Brésil': 'Brazil', 'Nouvelle-Zélande': 'New Zealand', 'Cuba': 'Cuba', 'Hongrie': 'Hungary', 'Corée du Sud': 'South Korea', 'Pologne': 'Poland', 'République Tchèque': 'Czech Republic', 'Kenya': 'Kenya', 'Norvège': 'Norway', 'Jamaïque': 'Jamaica', 'Espagne': 'Spain', 'Suède': 'Sweden', 'Suisse': 'Swiss', 'Danemark': 'Denmark', 'Croatie': 'Croatia', 'Iran': 'Iran', 'Serbie': 'Serbia', 'Belgique': 'Belgium', 'Slovénie': 'Slovenia', 'Ouzbékistan': 'Uzbekistan', 'Géorgie': 'Georgia', 'Taïwan': 'Taiwan', 'Turquie': 'Türkiye', 'Bulgarie': 'Bulgaria', 'Grèce': 'Greece', 'Ouganda': 'Uganda', 'Equateur': 'Ecuador', 'Irlande': 'Ireland', 'Israël': 'Israel', 'Lettonie': 'Latvia', 'Qatar': 'Qatar', 'Bahamas': 'Bahamas', 'Kosovo': 'Kosovo', 'Ukra

In [10]:
# Save the translation dictionary to disk as a pickle so it can be reloaded later.
with open('pickles/translations_dict.pickle', 'wb') as pickle_file:
    pickle_file.write(pickle.dumps(translations_dict, protocol=pickle.HIGHEST_PROTOCOL))

In [11]:
# Reload the translation dictionary from the pickle file.
# NOTE: only unpickle files from a trusted source; this file is the one written by this notebook.
with open('pickles/translations_dict.pickle', 'rb') as pickle_file:
    translations_dict = pickle.loads(pickle_file.read())

In [12]:
# Translate the Country and Location columns with the translation dictionary.
# A name that is not a key of the dictionary is left unchanged, so re-running this cell
# after the columns have already been translated no longer raises a KeyError.
def _translate_name(name):
    """Return the translation of `name`, or `name` itself when it has no dictionary entry."""
    return translations_dict.get(name, name)


medals['Country'] = medals['Country'].map(_translate_name)
medals['Location'] = medals['Location'].map(_translate_name)
medals.head(15)

Unnamed: 0,Place,Country,Gold,Silver,Bronze,Total,Location,Year
0,1,UNITED STATES,38,41,34,113,Tokyo,2021
1,2,China,36,32,18,86,Tokyo,2021
2,3,Japan,27,12,16,55,Tokyo,2021
3,4,Britain,22,21,22,65,Tokyo,2021
4,5,Russia,20,28,23,71,Tokyo,2021
5,6,Australia,17,7,21,45,Tokyo,2021
6,7,The Netherlands,10,12,14,36,Tokyo,2021
7,8,France,10,12,11,33,Tokyo,2021
8,9,Italy,10,10,20,40,Tokyo,2021
9,10,Germany,10,10,16,36,Tokyo,2021


### Data Analysis from Medals data

In [13]:
# List the distinct country names that contain "Germany"
contains_germany = medals['Country'].str.contains('Germany')
medals.loc[contains_germany, 'Country'].unique()

array(['Germany', 'East Germany', 'West Germany'], dtype=object)

In [14]:
# Merge every German variant ("East Germany", "West Germany",
# "Statement in intervention submitted by Germany", "Germany, Empire") into "Germany",
# then list the remaining names containing "Germany" to check the result.
german_variants = [
    'East Germany',
    'West Germany',
    'Statement in intervention submitted by Germany',
    'Germany, Empire',
]
medals['Country'] = medals['Country'].replace(german_variants, 'Germany')
medals[medals['Country'].str.contains('Germany')]["Country"].unique()

array(['Germany'], dtype=object)

In [15]:
# The top 10 countries that won the most medals in the history of the Olympics.
# The ranking is computed once and reused; it was previously recomputed for every argument,
# plus once in a stand-alone expression whose result was discarded.
top10_countries = (
    medals.groupby('Country')['Total']
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()  # columns: 'Country', 'Total'
)

# Bar chart: countries on the x axis, total medals on the y axis.
# Columns are referenced by name, so `labels` is keyed by column name.
fig = px.bar(top10_countries,
             x='Country',
             y='Total',
             labels={'Total': 'Total Medals'})
fig.show()


In [16]:
# Evolution of the total number of medals distributed over the years.
# The yearly totals are aggregated once and reused instead of being recomputed per argument.
medals_per_year = medals.groupby('Year')['Total'].sum().reset_index()  # columns: 'Year', 'Total'

# Columns are referenced by name, so `labels` is keyed by column name.
fig = px.line(medals_per_year,
              x='Year',
              y='Total',
              labels={'Total': 'Total Medals'})
fig.show()

In [17]:
# Scatter plot: year on the x axis, total medals of each country on the y axis.
# Point size is the total number of medals won by the country that year,
# point color is the country name.
medals_by_year_country = medals.groupby(['Year', 'Country'])['Total'].sum().reset_index()

# `labels` must be keyed by column name: the former 'x'/'y' keys matched no column,
# so the y axis was not relabelled to 'Total Medals'.
fig = px.scatter(medals_by_year_country,
                 x='Year',
                 y='Total',
                 size='Total',
                 color='Country',
                 labels={'Total': 'Total Medals'})
fig.show()

Total number of Gold, Silver, and Bronze medals for each country : 

In [25]:
# New dataframe indexed by country with the summed Gold, Silver, Bronze and Total medals.
# The medal columns are selected BEFORE aggregating so that unrelated columns
# (Place, Location, Year) are not summed and then thrown away.
medal_columns = ['Gold', 'Silver', 'Bronze', 'Total']
medals_by_country = medals.groupby('Country')[medal_columns].sum()
medals_by_country.head()

Unnamed: 0_level_0,Gold,Silver,Bronze,Total
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,0,0,3,3
Algeria,5,4,8,17
Argentina,21,26,31,78
Armenia,2,8,8,18
Australasia,3,4,5,12


In [28]:
# Grouped bar chart of the Gold / Silver / Bronze counts, one group of bars per country.
medal_types = ['Gold', 'Silver', 'Bronze']
medal_colors = {'Gold': 'gold', 'Silver': 'silver', 'Bronze': 'brown'}
axis_labels = {'value': 'Number of Medals', 'variable': 'Medal Type'}

fig = px.bar(
    medals_by_country,
    x=medals_by_country.index,
    y=medal_types,
    color_discrete_map=medal_colors,
    title='Medals Distribution by Country',
    labels=axis_labels,
    barmode='group',
)
fig.show()

Top 10 countries with the most medals :

In [31]:
# Same grouped bar chart, restricted to the 10 countries with the most medals.
# The sorted top 10 is computed once and reused for both the data and the x axis.
top10_by_total = medals_by_country.sort_values('Total', ascending=False).head(10)

fig = px.bar(top10_by_total,
             x=top10_by_total.index,
             y=['Gold', 'Silver', 'Bronze'],
             color_discrete_map={'Gold': 'gold', 'Silver': 'silver', 'Bronze': 'brown'},
             title='Top 10 Countries - Medals Distribution',
             labels={'value': 'Number of Medals', 'variable': 'Medal Type'},
             barmode='group')
fig.update_xaxes(title_text='Country')
fig.show()

Evolution of the total number of medals won by France over the years :

In [None]:
# Line chart showing the progression of France: its total medals over the years.
# The filtered yearly totals are computed once instead of once per argument.
france_medals_per_year = (
    medals[medals['Country'] == 'France']
    .groupby('Year')['Total']
    .sum()
    .reset_index()  # columns: 'Year', 'Total'
)

# Columns are referenced by name, so `labels` is keyed by column name.
fig = px.line(france_medals_per_year,
              x='Year',
              y='Total',
              labels={'Total': 'Total Medals'})
fig.show()

### A historical analysis ...

Total number of medals won by the USA over the years:

In [None]:
# Scatter plot showing the progression of the United States: its total medals over the years.
# Only the Games present in the data get a point; a scatter does not join points across missing years.
# 'UNITED STATES' is the spelling produced by the translation step.
usa_medals_per_year = (
    medals[medals['Country'] == 'UNITED STATES']
    .groupby('Year')['Total']
    .sum()
    .reset_index()  # columns: 'Year', 'Total'
)

# Columns are referenced by name, so `labels` is keyed by column name.
fig = px.scatter(usa_medals_per_year,
                 x='Year',
                 y='Total',
                 labels={'Total': 'Total Medals'})
fig.show()

We can observe the absence of the United States in the 1980 Moscow Olympics : the country boycotted the games to protest the Soviet invasion of Afghanistan. The Soviet Union retaliated by boycotting the 1984 Los Angeles Olympics.

Total number of medals won by the USSR over the years:

In [None]:
# Scatter plot of the total medals won by the U.S.S.R. at each Games present in the data.
# The filtered yearly totals are computed once instead of once per argument.
ussr_medals_per_year = (
    medals[medals['Country'] == 'U.S.S.R.']
    .groupby('Year')['Total']
    .sum()
    .reset_index()  # columns: 'Year', 'Total'
)

# Columns are referenced by name, so `labels` is keyed by column name.
fig = px.scatter(ussr_medals_per_year,
                 x='Year',
                 y='Total',
                 labels={'Total': 'Total Medals'})
fig.show()

## Environmental impact of hosting the Olympics : 

### Do the olympic games have a remarkable impact on a country's greenhouse gas emissions?

In [None]:
# Load the greenhouse-gas emissions table (one row per sector, country and year) and preview it.
gh_emissions_csv_path = 'data_csv/cp_GH_emissions_final_version.csv'
GH_emissions = pd.read_csv(gh_emissions_csv_path)
GH_emissions.head(5)

Unnamed: 0,sector,country,year,GH emissions
0,Energy Industries,Australia,1990,143172.76
1,Energy Industries,Australia,1991,146396.55
2,Energy Industries,Australia,1992,149719.8
3,Energy Industries,Australia,1993,151492.72
4,Energy Industries,Australia,1994,152307.64


In [None]:
# List the distinct sectors present in the dataframe, in order of first appearance
distinct_sectors = GH_emissions['sector'].unique()
distinct_sectors.tolist()

['Energy Industries',
 'Manufacturing industries and construction',
 'Transport',
 'Residential and other sectors',
 'Energy - Other',
 'Fugitive Emissions from Fuels',
 'CO2 from Transport and Storage']

In [None]:
# Australia hosted the Olympics in 1956 and 2000.
# Evolution of Australia's GH emissions over the years, summed over all sectors.
# The yearly totals are computed once instead of once per argument.
australia_emissions_per_year = (
    GH_emissions[GH_emissions['country'] == 'Australia']
    .groupby('year')['GH emissions']
    .sum()
    .reset_index()  # columns: 'year', 'GH emissions'
)

# Columns are referenced by name, so `labels` is keyed by column name.
fig = px.line(australia_emissions_per_year,
              x='year',
              y='GH emissions',
              labels={'year': 'Year', 'GH emissions': 'GH Emissions'})
fig.show()

### Data Analysis on the variations of greenhouse gas emissions of the host countries :

We check how the data is represented when there are missing values.

In [None]:
# Look at one entry (Australia, 2019, 'CO2 from Transport and Storage') to see how a missing value is stored.
is_sample_row = (
    (GH_emissions['country'] == 'Australia')
    & (GH_emissions['year'] == 2019)
    & (GH_emissions['sector'] == 'CO2 from Transport and Storage')
)
GH_emissions[is_sample_row]['GH emissions'].values[0]

nan

We verify that we are not dividing by zero and that the data is coherent

In [None]:
# Sanity check: show the rows whose GH emissions are zero or negative.
# An empty result means every recorded value is strictly positive.
is_non_positive = GH_emissions['GH emissions'] <= 0
GH_emissions[is_non_positive]

Unnamed: 0,sector,country,year,GH emissions


Let's create a new column in the dataset containing, for each year, the percentage increase ("% augmentation") of GH emissions compared to the previous year.

In [None]:
# Add a 'GH % augmentation' column: the percentage change of 'GH emissions' with respect to
# the previous year, for the same sector and the same country.
#
# The value is NaN when
#   - there is no row for the previous year (e.g. the first year of a series), or
#   - the current or the previous emission value is missing.
#
# The previous-year value is fetched with one keyed lookup for the whole column instead of
# filtering the full dataframe once per row, and columns are addressed by name rather than
# by position.
key_columns = ['sector', 'country', 'year']

# Emissions indexed by (sector, country, year). If a key appears more than once,
# only its first row is kept for the lookup.
emissions_by_key = GH_emissions.set_index(key_columns)['GH emissions']
emissions_by_key = emissions_by_key[~emissions_by_key.index.duplicated(keep='first')]

# For every row, the key of the same sector/country one year earlier.
previous_year_keys = pd.MultiIndex.from_arrays(
    [GH_emissions['sector'], GH_emissions['country'], GH_emissions['year'] - 1],
    names=key_columns,
)

# reindex yields NaN for keys that do not exist, so no explicit "first year" special case is needed.
previous_year_GH = emissions_by_key.reindex(previous_year_keys).to_numpy()
current_year_GH = GH_emissions['GH emissions'].to_numpy()

# NaN in either operand propagates to the result.
GH_emissions['GH % augmentation'] = (current_year_GH - previous_year_GH) / previous_year_GH * 100

GH_emissions.head(5)

Unnamed: 0,sector,country,year,GH emissions,GH % augmentation
0,Energy Industries,Australia,1990,143172.76,
1,Energy Industries,Australia,1991,146396.55,2.251678
2,Energy Industries,Australia,1992,149719.8,2.270033
3,Energy Industries,Australia,1993,151492.72,1.184159
4,Energy Industries,Australia,1994,152307.64,0.537927


In [None]:
# Plot of the evolution of the GH % augmentation of Australia over the years, by sector.
# min_count=1 keeps a group NaN when all its values are missing; a plain sum() would turn
# those groups into 0.0 and plot missing data as "0 % change".
australia_pct_by_sector = (
    GH_emissions[GH_emissions['country'] == 'Australia']
    .groupby(['year', 'sector'])['GH % augmentation']
    .sum(min_count=1)
    .reset_index()
)

# `labels` must be keyed by column name: the former 'x'/'y' keys matched no column.
fig = px.line(australia_pct_by_sector,
              x='year',
              y='GH % augmentation',
              color='sector',
              labels={'year': 'Year', 'GH % augmentation': 'GH % Augmentation'})
fig.update_layout(title_text='Evolution of GH emissions % of augmentation in Australia over the years by sector')
fig.show()

Australia hosted the Olympics in 2000. We can see a significant increase in GH emissions in 2000 in the 'Fugitive Emissions from Fuels' category, but it cannot be attributed to the Olympics: this sector's emissions fluctuate too much for us to draw a conclusion.

In [None]:
# Plot of the evolution of the GH % augmentation of the United Kingdom over the years, by sector.
# min_count=1 keeps a group NaN when all its values are missing; a plain sum() would turn
# those groups into 0.0 and plot missing data as "0 % change".
uk_pct_by_sector = (
    GH_emissions[GH_emissions['country'] == 'United Kingdom']
    .groupby(['year', 'sector'])['GH % augmentation']
    .sum(min_count=1)
    .reset_index()
)

# `labels` must be keyed by column name: the former 'x'/'y' keys matched no column.
fig = px.line(uk_pct_by_sector,
              x='year',
              y='GH % augmentation',
              color='sector',
              labels={'year': 'Year', 'GH % augmentation': 'GH % Augmentation'})
fig.update_layout(title_text='Evolution of GH emissions % of augmentation in the UK over the years by sector')
fig.show()

The same conclusion can be drawn for the other host countries. The UK hosted the Olympics in 2012; we can see a significant increase in GH emissions in the 'Energy Industries' and 'Residential and other sectors' categories, but it cannot be directly attributed to the Olympics, as the emissions of these sectors fluctuate too much.

To conclude, we can say that the Olympics have no significant impact on the GH emissions of the host country on a year-to-year basis.
We are limited by our data. We could potentially get more conclusive results by studying the GH emissions of the host city only, rather than of the whole country.
Monthly or weekly data would also be more conclusive, but we could not find such data.