# Multiple files with `pandas`

In [1]:
import numpy as np
import pandas as pd
import matplotlib

## Merge Practice

In [2]:
a_df = pd.DataFrame({
    'Country': ['Germany', 'France', 'Belgium', 'Finland'],
    'Population (M)': [82.8, 67.2, 11.4, 5.5],
    'Capital': ['Berlin', 'Paris', 'Brussels', 'Helsinki']
})
a_df

Unnamed: 0,Country,Population (M),Capital
0,Germany,82.8,Berlin
1,France,67.2,Paris
2,Belgium,11.4,Brussels
3,Finland,5.5,Helsinki


In [3]:
b_df = pd.DataFrame({
    'Country': ['Germany', 'France', 'Belgium', 'Canada'],
    'HDI': [0.936, 0.901, 0.916, 0.926]
})
b_df

Unnamed: 0,Country,HDI
0,Germany,0.936
1,France,0.901
2,Belgium,0.916
3,Canada,0.926


### Inner Merge Practice

In [11]:
inner_merged_df = pd.merge(a_df, b_df)
inner_merged_df

Unnamed: 0,Country,Population (M),Capital,HDI
0,Germany,82.8,Berlin,0.936
1,France,67.2,Paris,0.901
2,Belgium,11.4,Brussels,0.916


#### Check your code

In [12]:
from nbresult import ChallengeResult

result = ChallengeResult('inner_merge',
    inner_merged_shape=inner_merged_df.shape,
    inner_merged_nulls=sum(inner_merged_df.isnull().sum())
)
result.write()

print(result.check())

platform darwin -- Python 3.8.12, pytest-6.2.5, py-1.11.0, pluggy-1.0.0 -- /Users/humbert/.pyenv/versions/lewagon/bin/python3
cachedir: .pytest_cache
rootdir: /Users/humbert/code/HumbertMonnot/data-challenges/02-Data-Toolkit/01-Data-Analysis/04-Multiple-Files-With-Pandas
plugins: anyio-3.4.0, dash-2.0.0
[1mcollecting ... [0mcollected 2 items

tests/test_inner_merge.py::TestInnerMerge::test_inner_merged_nulls [32mPASSED[0m[32m [ 50%][0m
tests/test_inner_merge.py::TestInnerMerge::test_inner_merged_shape [32mPASSED[0m[32m [100%][0m



💯 You can commit your code:

[1;32mgit[39m add tests/inner_merge.pickle

[32mgit[39m commit -m [33m'Completed inner_merge step'[39m

[32mgit[39m push origin master


### Left Merge Practice

In [20]:
left_merged_df = a_df.merge(b_df, on = "Country", how ='left')
left_merged_df

Unnamed: 0,Country,Population (M),Capital,HDI
0,Germany,82.8,Berlin,0.936
1,France,67.2,Paris,0.901
2,Belgium,11.4,Brussels,0.916
3,Finland,5.5,Helsinki,


#### Check your code

In [21]:
from nbresult import ChallengeResult

result = ChallengeResult('left_merge',
    left_merged_shape=left_merged_df.shape,
    left_merged_nulls=sum(left_merged_df.isnull().sum())
)
result.write()

print(result.check())

platform darwin -- Python 3.8.12, pytest-6.2.5, py-1.11.0, pluggy-1.0.0 -- /Users/humbert/.pyenv/versions/lewagon/bin/python3
cachedir: .pytest_cache
rootdir: /Users/humbert/code/HumbertMonnot/data-challenges/02-Data-Toolkit/01-Data-Analysis/04-Multiple-Files-With-Pandas
plugins: anyio-3.4.0, dash-2.0.0
[1mcollecting ... [0mcollected 2 items

tests/test_left_merge.py::TestLeftMerge::test_left_merged_df_shape [32mPASSED[0m[32m [ 50%][0m
tests/test_left_merge.py::TestLeftMerge::test_left_merged_nulls [32mPASSED[0m[32m   [100%][0m



💯 You can commit your code:

[1;32mgit[39m add tests/left_merge.pickle

[32mgit[39m commit -m [33m'Completed left_merge step'[39m

[32mgit[39m push origin master


### Right Merge Practice

In [22]:
right_merged_df = a_df.merge(b_df, on = "Country", how ='right')
right_merged_df

Unnamed: 0,Country,Population (M),Capital,HDI
0,Germany,82.8,Berlin,0.936
1,France,67.2,Paris,0.901
2,Belgium,11.4,Brussels,0.916
3,Canada,,,0.926


#### Check your code

In [23]:
from nbresult import ChallengeResult

result = ChallengeResult('right_merge',
    right_merged_shape=right_merged_df.shape,
    right_merged_nulls=sum(right_merged_df.isnull().sum())
)
result.write()

print(result.check())

platform darwin -- Python 3.8.12, pytest-6.2.5, py-1.11.0, pluggy-1.0.0 -- /Users/humbert/.pyenv/versions/lewagon/bin/python3
cachedir: .pytest_cache
rootdir: /Users/humbert/code/HumbertMonnot/data-challenges/02-Data-Toolkit/01-Data-Analysis/04-Multiple-Files-With-Pandas
plugins: anyio-3.4.0, dash-2.0.0
[1mcollecting ... [0mcollected 2 items

tests/test_right_merge.py::TestRightMerge::test_right_merged_df_shape [32mPASSED[0m[32m [ 50%][0m
tests/test_right_merge.py::TestRightMerge::test_right_merged_nulls [32mPASSED[0m[32m [100%][0m



💯 You can commit your code:

[1;32mgit[39m add tests/right_merge.pickle

[32mgit[39m commit -m [33m'Completed right_merge step'[39m

[32mgit[39m push origin master


### Outer Merge Practice

In [24]:
outer_merged_df = a_df.merge(b_df, on = "Country", how ='outer')
outer_merged_df

Unnamed: 0,Country,Population (M),Capital,HDI
0,Germany,82.8,Berlin,0.936
1,France,67.2,Paris,0.901
2,Belgium,11.4,Brussels,0.916
3,Finland,5.5,Helsinki,
4,Canada,,,0.926


#### Check your code

In [25]:
from nbresult import ChallengeResult

result = ChallengeResult('outer_merge',
    outer_merged_shape=outer_merged_df.shape,
    outer_merged_nulls=sum(outer_merged_df.isnull().sum())
)
result.write()

print(result.check())

platform darwin -- Python 3.8.12, pytest-6.2.5, py-1.11.0, pluggy-1.0.0 -- /Users/humbert/.pyenv/versions/lewagon/bin/python3
cachedir: .pytest_cache
rootdir: /Users/humbert/code/HumbertMonnot/data-challenges/02-Data-Toolkit/01-Data-Analysis/04-Multiple-Files-With-Pandas
plugins: anyio-3.4.0, dash-2.0.0
[1mcollecting ... [0mcollected 2 items

tests/test_outer_merge.py::TestOuterMerge::test_outer_merged_df_shape [32mPASSED[0m[32m [ 50%][0m
tests/test_outer_merge.py::TestOuterMerge::test_outer_merged_nulls [32mPASSED[0m[32m [100%][0m



💯 You can commit your code:

[1;32mgit[39m add tests/outer_merge.pickle

[32mgit[39m commit -m [33m'Completed outer_merge step'[39m

[32mgit[39m push origin master


## Join Practice

In [26]:
aa_df = a_df.set_index("Country")
aa_df

Unnamed: 0_level_0,Population (M),Capital
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Germany,82.8,Berlin
France,67.2,Paris
Belgium,11.4,Brussels
Finland,5.5,Helsinki


In [27]:
bb_df = b_df.set_index("Country")
bb_df

Unnamed: 0_level_0,HDI
Country,Unnamed: 1_level_1
Germany,0.936
France,0.901
Belgium,0.916
Canada,0.926


In [28]:
aa_df.join(bb_df)

Unnamed: 0_level_0,Population (M),Capital,HDI
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Germany,82.8,Berlin,0.936
France,67.2,Paris,0.901
Belgium,11.4,Brussels,0.916
Finland,5.5,Helsinki,


## Concat Practice

In [29]:
concat_df = pd.concat([a_df, b_df], axis="index", sort=False)
concat_df

Unnamed: 0,Country,Population (M),Capital,HDI
0,Germany,82.8,Berlin,
1,France,67.2,Paris,
2,Belgium,11.4,Brussels,
3,Finland,5.5,Helsinki,
0,Germany,,,0.936
1,France,,,0.901
2,Belgium,,,0.916
3,Canada,,,0.926


In [31]:
file1 = "dictionary.csv"
file2 = "summer.csv"
file3 = "winter.csv"

countries_df = pd.read_csv(file1, decimal=",")
summer_df = pd.read_csv(file2, decimal=",")
winter_df = pd.read_csv(file3, decimal=",")

In [35]:
countries_df.head(5)

Unnamed: 0,Country,Code,Population,GDP per Capita
0,Afghanistan,AFG,32526562.0,594.323081219966
1,Albania,ALB,2889167.0,3945.21758150914
2,Algeria,ALG,39666519.0,4206.03123244958
3,American Samoa*,ASA,55538.0,
4,Andorra,AND,70473.0,


In [38]:
winter_df.head(5)

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1924,Chamonix,Biathlon,Biathlon,"BERTHET, G.",FRA,Men,Military Patrol,Bronze
1,1924,Chamonix,Biathlon,Biathlon,"MANDRILLON, C.",FRA,Men,Military Patrol,Bronze
2,1924,Chamonix,Biathlon,Biathlon,"MANDRILLON, Maurice",FRA,Men,Military Patrol,Bronze
3,1924,Chamonix,Biathlon,Biathlon,"VANDELLE, André",FRA,Men,Military Patrol,Bronze
4,1924,Chamonix,Biathlon,Biathlon,"AUFDENBLATTEN, Adolf",SUI,Men,Military Patrol,Gold


In [99]:
summer_df.rename(columns={"Country":"Code"}, inplace=True)
summer_df["Code"].isnull().sum()

4

In [92]:
winter_df.rename(columns={"Country":"Code"}, inplace=True)
winter_df['Code'].unique()

array(['FRA', 'SUI', 'FIN', 'BEL', 'GBR', 'SWE', 'CAN', 'USA', 'AUT',
       'NOR', 'GER', 'TCH', 'HUN', 'ITA', 'FRG', 'NED', 'URS', 'EUA',
       'JPN', 'POL', 'PRK', 'ROU', 'GDR', 'ESP', 'LIE', 'BUL', 'YUG',
       'EUN', 'KOR', 'CHN', 'LUX', 'NZL', 'RUS', 'UKR', 'BLR', 'AUS',
       'SLO', 'KAZ', 'UZB', 'DEN', 'CZE', 'CRO', 'EST', 'LAT', 'SVK'],
      dtype=object)

### Combining The Data

In [56]:
summer_countries_df = summer_df.merge(countries_df, on = "Code", how= "inner")
summer_countries_df["Season"]="Summer"
summer_countries_df

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Code,Gender,Event,Medal,Country,Population,GDP per Capita,Season
0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold,Hungary,9844686.0,12363.5434596539,Summer
1,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,1200M Freestyle,Gold,Hungary,9844686.0,12363.5434596539,Summer
2,1896,Athens,Athletics,Athletics,"SZOKOLYI, Alajos",HUN,Men,100M,Bronze,Hungary,9844686.0,12363.5434596539,Summer
3,1896,Athens,Athletics,Athletics,"DANI, Nandor",HUN,Men,800M,Silver,Hungary,9844686.0,12363.5434596539,Summer
4,1896,Athens,Athletics,Athletics,"KELLNER, Gyula",HUN,Men,Marathon,Bronze,Hungary,9844686.0,12363.5434596539,Summer
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25737,2012,London,Athletics,Athletics,"BARRONDO, Erick",GUA,Men,20KM Walk,Silver,Guatemala,16342897.0,3903.47885604613,Summer
25738,2012,London,Athletics,Athletics,"JAMES, Kirani",GRN,Men,400M,Gold,Grenada,106825.0,9212.02035173484,Summer
25739,2012,London,Athletics,Athletics,"AMOS, Nijel",BOT,Men,800M,Silver,Botswana,2262485.0,6360.13822018837,Summer
25740,2012,London,Sailing,Sailing,"KONTIDES, Pavlos",CYP,Men,Laser,Silver,Cyprus,1165300.0,23242.8400685313,Summer


In [58]:
winter_countries_df = winter_df.merge(countries_df, on = "Code", how= "inner")
winter_countries_df["Season"]="Winter"
winter_countries_df

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Code,Gender,Event,Medal,Country,Population,GDP per Capita,Season
0,1924,Chamonix,Biathlon,Biathlon,"BERTHET, G.",FRA,Men,Military Patrol,Bronze,France,66808385.0,36205.5681017036,Winter
1,1924,Chamonix,Biathlon,Biathlon,"MANDRILLON, C.",FRA,Men,Military Patrol,Bronze,France,66808385.0,36205.5681017036,Winter
2,1924,Chamonix,Biathlon,Biathlon,"MANDRILLON, Maurice",FRA,Men,Military Patrol,Bronze,France,66808385.0,36205.5681017036,Winter
3,1924,Chamonix,Biathlon,Biathlon,"VANDELLE, André",FRA,Men,Military Patrol,Bronze,France,66808385.0,36205.5681017036,Winter
4,1924,Chamonix,Curling,Curling,"ALDEERT, H.",FRA,Men,Curling,Bronze,France,66808385.0,36205.5681017036,Winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4821,2006,Turin,Skiing,Snowboard,"ZIDEK, Radoslav",SVK,Men,Snowboard Cross,Silver,Slovakia,5424050.0,16088.2775872723,Winter
4822,2010,Vancouver,Biathlon,Biathlon,"KUZMINA, Anastazia",SVK,Women,10KM Pursuit,Silver,Slovakia,5424050.0,16088.2775872723,Winter
4823,2010,Vancouver,Biathlon,Biathlon,"HURAJT, Pavol",SVK,Men,15KM Mass Start,Bronze,Slovakia,5424050.0,16088.2775872723,Winter
4824,2010,Vancouver,Biathlon,Biathlon,"KUZMINA, Anastazia",SVK,Women,7.5KM,Gold,Slovakia,5424050.0,16088.2775872723,Winter


In [59]:
all_df = pd.concat([winter_countries_df, summer_countries_df], axis="index", sort=False)
all_df

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Code,Gender,Event,Medal,Country,Population,GDP per Capita,Season
0,1924,Chamonix,Biathlon,Biathlon,"BERTHET, G.",FRA,Men,Military Patrol,Bronze,France,66808385.0,36205.5681017036,Winter
1,1924,Chamonix,Biathlon,Biathlon,"MANDRILLON, C.",FRA,Men,Military Patrol,Bronze,France,66808385.0,36205.5681017036,Winter
2,1924,Chamonix,Biathlon,Biathlon,"MANDRILLON, Maurice",FRA,Men,Military Patrol,Bronze,France,66808385.0,36205.5681017036,Winter
3,1924,Chamonix,Biathlon,Biathlon,"VANDELLE, André",FRA,Men,Military Patrol,Bronze,France,66808385.0,36205.5681017036,Winter
4,1924,Chamonix,Curling,Curling,"ALDEERT, H.",FRA,Men,Curling,Bronze,France,66808385.0,36205.5681017036,Winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25737,2012,London,Athletics,Athletics,"BARRONDO, Erick",GUA,Men,20KM Walk,Silver,Guatemala,16342897.0,3903.47885604613,Summer
25738,2012,London,Athletics,Athletics,"JAMES, Kirani",GRN,Men,400M,Gold,Grenada,106825.0,9212.02035173484,Summer
25739,2012,London,Athletics,Athletics,"AMOS, Nijel",BOT,Men,800M,Silver,Botswana,2262485.0,6360.13822018837,Summer
25740,2012,London,Sailing,Sailing,"KONTIDES, Pavlos",CYP,Men,Laser,Silver,Cyprus,1165300.0,23242.8400685313,Summer


#### Check your code

In [60]:
from nbresult import ChallengeResult

result = ChallengeResult('all_df',
    all_df_shape=all_df.shape,
    all_df_columns=set(all_df.columns)
)
result.write()

print(result.check())

platform darwin -- Python 3.8.12, pytest-6.2.5, py-1.11.0, pluggy-1.0.0 -- /Users/humbert/.pyenv/versions/lewagon/bin/python3
cachedir: .pytest_cache
rootdir: /Users/humbert/code/HumbertMonnot/data-challenges/02-Data-Toolkit/01-Data-Analysis/04-Multiple-Files-With-Pandas
plugins: anyio-3.4.0, dash-2.0.0
[1mcollecting ... [0mcollected 2 items

tests/test_all_df.py::TestAllDf::test_all_df_columns [32mPASSED[0m[32m              [ 50%][0m
tests/test_all_df.py::TestAllDf::test_all_df_shape [32mPASSED[0m[32m                [100%][0m



💯 You can commit your code:

[1;32mgit[39m add tests/all_df.pickle

[32mgit[39m commit -m [33m'Completed all_df step'[39m

[32mgit[39m push origin master


### Top Countries Analysis

In [103]:
top_10_df['Count'] = all_df[all_df['Year']>=1984].groupby('Country').count().sort_values(by='Year', ascending=False)

top_10_df

ValueError: Columns must be same length as key

In [90]:
top_10_df.rename(columns={"Year":"Medal Count"}, inplace=True)

#### Check your code

In [91]:
from nbresult import ChallengeResult

result = ChallengeResult('olympic_games',
    top_country_1=top_10_df.iloc[0]['Medal Count'],
    top_country_10=top_10_df.iloc[9]['Medal Count']
)
result.write()

print(result.check())

platform darwin -- Python 3.8.12, pytest-6.2.5, py-1.11.0, pluggy-1.0.0 -- /Users/humbert/.pyenv/versions/lewagon/bin/python3
cachedir: .pytest_cache
rootdir: /Users/humbert/code/HumbertMonnot/data-challenges/02-Data-Toolkit/01-Data-Analysis/04-Multiple-Files-With-Pandas
plugins: anyio-3.4.0, dash-2.0.0
[1mcollecting ... [0mcollected 1 item

tests/test_olympic_games.py::TestOlympicGames::test_top_10_countries_medals [32mPASSED[0m[32m [100%][0m



💯 You can commit your code:

[1;32mgit[39m add tests/olympic_games.pickle

[32mgit[39m commit -m [33m'Completed olympic_games step'[39m

[32mgit[39m push origin master
