# Multiple files with `pandas`

In [1]:
import numpy as np
import pandas as pd
import matplotlib

In [2]:
# Demo frame A: one row per country with its population (in millions) and capital.
a_rows = [
    ('Germany', 82.8, 'Berlin'),
    ('France', 67.2, 'Paris'),
    ('Belgium', 11.4, 'Brussels'),
    ('Finland', 5.5, 'Helsinki'),
]
a_df = pd.DataFrame(a_rows, columns=['Country', 'Population (M)', 'Capital'])
a_df

Unnamed: 0,Country,Population (M),Capital
0,Germany,82.8,Berlin
1,France,67.2,Paris
2,Belgium,11.4,Brussels
3,Finland,5.5,Helsinki


In [3]:
# Demo frame B: HDI per country. Shares three countries with a_df; Canada
# appears only here and Finland only in a_df.
b_rows = [
    ('Germany', 0.936),
    ('France', 0.901),
    ('Belgium', 0.916),
    ('Canada', 0.926),
]
b_df = pd.DataFrame(b_rows, columns=['Country', 'HDI'])
b_df

Unnamed: 0,Country,HDI
0,Germany,0.936
1,France,0.901
2,Belgium,0.916
3,Canada,0.926


In [4]:
# Inner join: keep only the countries present in both frames.
pd.merge(a_df, b_df, how='inner', on='Country')

Unnamed: 0,Country,Population (M),Capital,HDI
0,Germany,82.8,Berlin,0.936
1,France,67.2,Paris,0.901
2,Belgium,11.4,Brussels,0.916


In [5]:
# Left join: keep every row of a_df; a country absent from b_df gets a missing HDI.
pd.merge(a_df, b_df, how='left', on='Country')

Unnamed: 0,Country,Population (M),Capital,HDI
0,Germany,82.8,Berlin,0.936
1,France,67.2,Paris,0.901
2,Belgium,11.4,Brussels,0.916
3,Finland,5.5,Helsinki,


In [6]:
# Right join: keep every row of b_df; a country absent from a_df gets missing
# population and capital values.
pd.merge(a_df, b_df, how='right', on='Country')

Unnamed: 0,Country,Population (M),Capital,HDI
0,Germany,82.8,Berlin,0.936
1,France,67.2,Paris,0.901
2,Belgium,11.4,Brussels,0.916
3,Canada,,,0.926


In [7]:
# Outer join: keep the countries of both frames, filling the gaps with missing values.
pd.merge(a_df, b_df, how='outer', on='Country')

Unnamed: 0,Country,Population (M),Capital,HDI
0,Germany,82.8,Berlin,0.936
1,France,67.2,Paris,0.901
2,Belgium,11.4,Brussels,0.916
3,Finland,5.5,Helsinki,
4,Canada,,,0.926


In [8]:
# Summarise the outer join: the non-null counts show which columns have gaps.
pd.merge(a_df, b_df, how='outer', on='Country').info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         5 non-null      object 
 1   Population (M)  4 non-null      float64
 2   Capital         4 non-null      object 
 3   HDI             4 non-null      float64
dtypes: float64(2), object(2)
memory usage: 200.0+ bytes


In [9]:
# Stack the two frames row-wise. Columns are aligned by name, so values a frame
# does not have are missing, and the original row labels are kept (0-3 twice).
pd.concat((a_df, b_df), axis=0, sort=False)

Unnamed: 0,Country,Population (M),Capital,HDI
0,Germany,82.8,Berlin,
1,France,67.2,Paris,
2,Belgium,11.4,Brussels,
3,Finland,5.5,Helsinki,
0,Germany,,,0.936
1,France,,,0.901
2,Belgium,,,0.916
3,Canada,,,0.926


#### Get Info

In [10]:
# Load the three source files of the Olympic dataset.
countries_df = pd.read_csv('dictionary.csv')

# The medal files name their country column "Country"; rename it to "Code" so it
# can be used as the join key against countries_df in the following cells
# (presumably it holds country codes rather than full names - confirm against the CSVs).
# The rename is chained instead of using inplace=True, so each frame is built in a
# single expression and no loaded frame is mutated afterwards.
summer_df = pd.read_csv('summer.csv').rename(columns={"Country": "Code"})
winter_df = pd.read_csv('winter.csv').rename(columns={"Country": "Code"})

In [11]:
# Attach the country information to every summer medal row (default inner join
# on the shared "Code" column) and tag the rows with their season.
summer_countries_df = (
    pd.merge(countries_df, summer_df, on='Code')
    .assign(Season='Summer')
)

In [12]:
# Attach the country information to every winter medal row (default inner join
# on the shared "Code" column) and tag the rows with their season.
winter_countries_df = (
    pd.merge(countries_df, winter_df, on='Code')
    .assign(Season='Winter')
)

In [13]:
# Stack the summer and winter medal rows into one frame covering both seasons.
all_df = pd.concat(objs=(summer_countries_df, winter_countries_df), axis="index")

### Top 10 Countries since 1984

Use boolean indexing, grouping & sorting to create a new dataframe containing the top 10 countries that have won the most medals since 1984. Save it in the `top_10_df` variable, then plot it. Go step by step!

In [14]:
# Step 1 - boolean indexing: keep only the medal rows from 1984 onwards.
since_1984_df = all_df[all_df['Year'] >= 1984]

# Step 2 - grouping: count the medal rows per country code. Only the "Country"
# column is counted (non-null values), instead of counting every column and
# discarding all but one afterwards.
medals_per_code = since_1984_df.groupby('Code')['Country'].count()

# Step 3 - sorting: the ten largest counts, as a one-column frame named "Medal"
# indexed by country code.
top_10_df = (
    medals_per_code
    .sort_values(ascending=False)
    .head(10)
    .to_frame(name='Medal')
)

# Step 4 - plot the ranking, as the exercise asks.
ax = top_10_df.plot.bar(legend=False, title='Top 10 countries by medals won since 1984')
ax.set_xlabel('Country code')
ax.set_ylabel('Medals');

### Check your code

In [15]:
# NOTE(review): imported here rather than in the first cell - presumably the
# convention of the course's checking cells; confirm before moving it.
from nbresult import ChallengeResult

# Collect the values to be checked for the 'olympic_games' challenge: the shapes
# of the summer-only and the combined frames, plus the medal counts of the first
# and tenth rows of top_10_df.
result = ChallengeResult('olympic_games',
    summer_countries_shape=summer_countries_df.shape,
    all_countries_shape=all_df.shape,
    top_country_1=top_10_df.iloc[0]['Medal'],
    top_country_10=top_10_df.iloc[9]['Medal'],
)
# Persist the collected values (presumably to tests/olympic_games.pickle, the
# file the check output asks to commit - confirm against nbresult).
result.write()

In [16]:
# Run the challenge checks and print their report (a pytest run, per the output below).
print(result.check())

platform linux -- Python 3.8.6, pytest-6.2.4, py-1.10.0, pluggy-0.13.1 -- /home/nandosoq/.pyenv/versions/lewagon/bin/python3
cachedir: .pytest_cache
rootdir: /home/nandosoq/code/Nandosoq/data-challenges/02-Data-Toolkit/01-Data-Analysis/03-Multiple-Files-With-Pandas
plugins: anyio-3.2.1, dash-1.21.0
[1mcollecting ... [0mcollected 3 items

tests/test_olympic_games.py::TestOlympicGames::test_all_countries_df_shape [32mPASSED[0m[32m [ 33%][0m
tests/test_olympic_games.py::TestOlympicGames::test_summer_countries_df_shape [32mPASSED[0m[32m [ 66%][0m
tests/test_olympic_games.py::TestOlympicGames::test_top_10_countries_medals [32mPASSED[0m[32m [100%][0m



💯 You can commit your code:

[1;32mgit[39m add tests/olympic_games.pickle

[32mgit[39m commit -m [33m'Completed olympic_games step'[39m

[32mgit[39m push origin master
