##### Project based on the one from DataCamp, with Olympic medal data comprising records of all events held at the Olympic Games between 1896 and 2008.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.plotly as py
import plotly

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Initialise plotly's offline notebook mode so that figures render inside the notebook
init_notebook_mode(connected=True)

In [2]:
import cufflinks as cf

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [3]:
# Cap displayed rows at 8 so that showing a whole frame prints its first 4 and last 4 rows
# (the equivalent of .head(4) and .tail(4)) with a single command
pd.set_option('display.max_rows', 8)

In [7]:
# Load the medal records. The file's unnamed leading column is read as a regular
# 'Unnamed: 0' column, so every named column sits one position further to the right
medals = pd.read_csv('all_medalists.csv')

In [8]:
# Show the frame; with display.max_rows = 8 only the first and last four rows are rendered
medals

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
0,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold
1,Athens,1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver
2,Athens,1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze
3,Athens,1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold
...,...,...,...,...,...,...,...,...,...,...
29212,Beijing,2008,Wrestling,Wrestling Gre-R,"MIZGAITIS, Mindaugas",LTU,Men,96 - 120kg,M,Bronze
29213,Beijing,2008,Wrestling,Wrestling Gre-R,"PATRIKEEV, Yuri",ARM,Men,96 - 120kg,M,Bronze
29214,Beijing,2008,Wrestling,Wrestling Gre-R,"LOPEZ, Mijain",CUB,Men,96 - 120kg,M,Gold
29215,Beijing,2008,Wrestling,Wrestling Gre-R,"BAROEV, Khasan",RUS,Men,96 - 120kg,M,Silver


Medals by country

In [None]:
# NOC holds the country code attached to each medal record
country_names = medals.NOC

In [None]:
# Number of medal records per country code
medal_counts = country_names.value_counts()

Which countries have won the most medals?

In [None]:
# medal_counts comes from value_counts(), which sorts in descending order, so head()
# is the top five countries. A bare last expression is displayed by the notebook,
# so wrapping it in print() is unnecessary
medal_counts.head()

How many medals has Portugal (my country) won at the Olympics? (Remember that this dataset only goes up to 2008.)

In [None]:
# Boolean selection on the country code keeps the result a Series (empty if the code is absent)
medal_counts.loc[medal_counts.index == 'POR']

How many separate bronze, silver and gold medals has each country won?

In [None]:
# Count medal records per country (rows) and medal type (columns)
counted = medals.pivot_table(
    values='Athlete',
    index='NOC',
    columns='Medal',
    aggfunc='count',
)

In [None]:
# Preview the country-by-medal-type counts
counted.head()

In [None]:
# Add a Total column with the per-country sum of the three medal columns.
# Summing the named medal columns (rather than every column) keeps the cell
# idempotent: re-running it does not fold the existing Total into the new one
counted['Total'] = counted[['Gold', 'Silver', 'Bronze']].sum(axis='columns')

In [None]:
# Check that the Total column was added
counted.head()

In [None]:
# Rank the countries from most to fewest medals overall
counted = counted.sort_values('Total', ascending=False)

In [None]:
# The first rows are now the countries with the highest totals
counted.head()

In [None]:
# Put the medal columns in podium order, with Total last
counted = counted.loc[:, ['Gold', 'Silver', 'Bronze', 'Total']]

In [None]:
# Final table: Gold, Silver, Bronze and Total per country
counted.head()

Let's see how my country (Portugal) did for each medal

In [None]:
# Row of the medal table for Portugal, selected by country code
counted.loc[counted.index == 'POR']

Which countries won the most medals in the 1936 edition, held in Nazi Germany?

In [None]:
# Let's take a peek at the main DataFrame again
medals.head()

In [None]:
# Count medal records per country (rows) and Olympic edition (columns)
medals_by_edition = medals.pivot_table(
    values='Athlete',
    index='NOC',
    columns='Edition',
    aggfunc='count',
)

In [None]:
# Preview the country-by-edition counts
medals_by_edition.head()

In [None]:
# Top five countries by number of medal records in the 1936 edition
medals_by_edition[1936].sort_values(ascending=False).head()

Do you think that maybe there was home bias for the host country? Let's see how Germany did in terms of medals in the editions just before 1936. 

Medals in the 1932 edition

In [None]:
# Medal records for country code GER in the 1932 edition
medals_by_edition.loc['GER', 1932]

Medals in the 1928 edition

In [None]:
# Medal records for country code GER in the 1928 edition
medals_by_edition.loc['GER', 1928]

###### Cleaning the data

What could be the difference between the 'Event_gender' and 'Gender' columns? You should be able to evaluate your guess by looking
at the unique values of the pairs (Event_gender, Gender) in the data. In particular, you should not see something like
(Event_gender='M', Gender='Women'). However, you will see that, strangely enough, there is an observation with
(Event_gender='W', Gender='Men')

In [None]:
# Keep just the two gender-related columns so they can be compared
ev_gen = medals.loc[:, ['Event_gender', 'Gender']]

In [None]:
# Reduce to the distinct (Event_gender, Gender) pairs and show them: this small
# table is what the narrative above asks the reader to inspect
ev_gen_uniques = ev_gen.drop_duplicates()
ev_gen_uniques

In [None]:
# Group the records by every (Event_gender, Gender) combination
medals_by_gender = medals.groupby(by=['Event_gender', 'Gender'])

In [None]:
# First record of each (Event_gender, Gender) group
medals_by_gender.head(1)

In [None]:
# Per-column count of non-null values within each (Event_gender, Gender) group
medal_count_by_gender = medals_by_gender.count()

In [None]:
# Preview the counts per (Event_gender, Gender) pair
medal_count_by_gender.head()

We can see the data error: a record with Event_gender 'W' and Gender 'Men'.

We will now inspect the suspect record by locating the offending row.

In [None]:
# Boolean mask for the inconsistent combination:
# Event_gender 'W' AND Gender 'Men'
filters = medals.Event_gender.eq('W') & medals.Gender.eq('Men')

# Show the record(s) matching the mask
medals.loc[filters]

In [None]:
# Work on an independent (deep) copy so that the original data stays untouched
medals_clean = medals.copy(deep=True)

In [None]:
# Correct the Gender of the record flagged by `filters` (Event_gender 'W' but Gender 'Men').
# Select by mask and column label instead of hard-coded positions: because the frame
# carries a leading 'Unnamed: 0' column, position 6 is NOC, not Gender, so a positional
# assignment there would overwrite the country code and leave Gender unchanged
medals_clean.loc[filters, 'Gender'] = 'Women'

In [None]:
# Re-display the row selected by the mask (built from the original frame) in the cleaned copy
medals_clean.loc[filters]

Comparing USA vs USSR during the Cold War

In [None]:
# Boolean masks
# Editions from 1952 to 1988, both ends included
during_cold_war = medals.Edition.between(1952, 1988)
# Records belonging to either of the two countries
is_usa_urs = medals.NOC.isin(['USA', 'URS'])

In [None]:
# Keep the records that satisfy both masks
cold_war_medals = medals.loc[during_cold_war & is_usa_urs]

In [None]:
# Visual sanity check of the filtered records
cold_war_medals # everything seems ok

In [None]:
# One group per country code
country_grouped = cold_war_medals.groupby(by='NOC')

In [None]:
# First five records of each country group
country_grouped.head()

In [None]:
# Count of distinct sports with at least one medal record, per country, largest first
Nsports = (
    country_grouped['Sport']
    .nunique()
    .sort_values(ascending=False)
)

In [None]:
# Show the number of distinct sports per country
Nsports

Counting USA vs. USSR Cold War Olympic Medals

In [None]:
# Count medal records per edition (rows) for each of the two countries (columns)
medals_won_by_country = cold_war_medals.pivot_table(
    values='Athlete',
    index='Edition',
    columns='NOC',
    aggfunc='count',
)

In [None]:
# Append a 'Total' row holding each country's medal count summed over all editions.
# Any existing 'Total' row is dropped before summing so that re-running the cell
# does not add the previous total into the new one
medals_won_by_country.loc['Total'] = (
    medals_won_by_country
    .drop(index='Total', errors='ignore')[['USA', 'URS']]
    .sum()
)

In [None]:
medals_won_by_country # The NaN values are due to the boycotts each country made when the Olympics were hosted by the "rival"

Visualizing USA Medal Counts by Edition

In [None]:
# Keep only the records whose country code is USA: usa
usa = medals.loc[medals.NOC.eq('USA')]

In [None]:
# Count medal records for every (Edition, Medal) pair
usa_medals_by_year = (
    usa
    .groupby(by=['Edition', 'Medal'])['Athlete']
    .count()
)

In [None]:
# Counts indexed by (Edition, Medal)
usa_medals_by_year

In [None]:
# Move the Medal index level into columns: one row per edition, one column per medal type
usa_medals_by_year = usa_medals_by_year.unstack('Medal')

In [None]:
# Preview the edition-by-medal-type table
usa_medals_by_year.head()

In [None]:
# Arrange the medal columns in podium order
usa_medals_by_year = usa_medals_by_year.loc[:, ['Gold', 'Silver', 'Bronze']]

In [None]:
# Filled line chart of the USA medal counts per edition, one trace per medal type
usa_medals_by_year.iplot(
    kind='scatter',
    title='Number of medals won by the USA by edition',
    yTitle='Number of medals',
    color=['gold', 'darkgray', 'brown'],
    fill=True,
)