In [29]:
import pandas as pd
import numpy as np
from IPython.display import display

## Merge, join, concatenate and compare

pandas provides various methods for combining and comparing Series or DataFrame

## concat()

In [30]:
df1 = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    },
    index=[0, 1, 2, 3],
)
display(df1)
df2 = pd.DataFrame(
    {
        "A": ["A4", "A5", "A6", "A7"],
        "B": ["B4", "B5", "B6", "B7"],
        "C": ["C4", "C5", "C6", "C7"],
        "D": ["D4", "D5", "D6", "D7"],
    },
    index=[0, 1, 2, 3],
)
display(df2)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


Unnamed: 0,A,B,C,D
0,A4,B4,C4,D4
1,A5,B5,C5,D5
2,A6,B6,C6,D6
3,A7,B7,C7,D7


In [31]:
result = pd.concat([df1, df2], axis=0)
display(result)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
0,A4,B4,C4,D4
1,A5,B5,C5,D5
2,A6,B6,C6,D6
3,A7,B7,C7,D7


In [32]:
result = pd.concat([df1, df2], axis=1)
display(result)

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1
0,A0,B0,C0,D0,A4,B4,C4,D4
1,A1,B1,C1,D1,A5,B5,C5,D5
2,A2,B2,C2,D2,A6,B6,C6,D6
3,A3,B3,C3,D3,A7,B7,C7,D7


## Merge()

merge() performs join operations similar to relational databases like SQL

In [33]:
left = pd.DataFrame(
    {
        "key": ["K0", "K1", "K2", "K3"],
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
    }
)
right = pd.DataFrame(
    {
        "key": ["K0", "K1", "K20", "K30"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    }
)
display(left)
display(right)
result = pd.merge(left, right, on="key", how="inner")
display(result)

Unnamed: 0,key,A,B
0,K0,A0,B0
1,K1,A1,B1
2,K2,A2,B2
3,K3,A3,B3


Unnamed: 0,key,C,D
0,K0,C0,D0
1,K1,C1,D1
2,K20,C2,D2
3,K30,C3,D3


Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1


In [34]:
result = pd.merge(left, right, on="key", how="left")
display(result)

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,,
3,K3,A3,B3,,


In [35]:
result = pd.merge(left, right, on="key", how="right")
display(result)

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K20,,,C2,D2
3,K30,,,C3,D3


In [36]:
result = pd.merge(left, right, on="key", how="outer")
display(left)
display(right)
display(result)

Unnamed: 0,key,A,B
0,K0,A0,B0
1,K1,A1,B1
2,K2,A2,B2
3,K3,A3,B3


Unnamed: 0,key,C,D
0,K0,C0,D0
1,K1,C1,D1
2,K20,C2,D2
3,K30,C3,D3


Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,,
3,K20,,,C2,D2
4,K3,A3,B3,,
5,K30,,,C3,D3


## Group by: split-apply-combine

By “group by” we are referring to a process involving one or more of the following steps:

- Splitting the data into groups based on some criteria.

- Applying a function to each group independently.

- Combining the results into a data structure.

In [37]:
animals = pd.DataFrame(
    {
        "kind": ["cat", "dog", "cat", "dog"],
        "height": [9.1, 6.0, 9.5, 34.0],
        "weight": [7.9, 7.5, 9.9, 198.0],
    }
)
display(animals)

Unnamed: 0,kind,height,weight
0,cat,9.1,7.9
1,dog,6.0,7.5
2,cat,9.5,9.9
3,dog,34.0,198.0


In [38]:
# assume we want to know mean height and max weight for every kind ?
grp = animals.groupby(by=["kind"]).aggregate({"height": np.mean, "weight": np.min})
display(grp)

  grp = animals.groupby(by=["kind"]).aggregate({"height": np.mean, "weight": np.min})
  grp = animals.groupby(by=["kind"]).aggregate({"height": np.mean, "weight": np.min})


Unnamed: 0_level_0,height,weight
kind,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,9.3,7.9
dog,20.0,7.5


In [39]:
# we want to know max and min height for every kind
grp = animals.groupby(by=["kind"]).aggregate({"height": np.max, "height": np.min})
display(grp)

  grp = animals.groupby(by=["kind"]).aggregate({"height": np.max, "height": np.min})


Unnamed: 0_level_0,height
kind,Unnamed: 1_level_1
cat,9.1
dog,6.0


In [40]:
# we can user rename() function to give agg column meaninigfful names
grp = (
    animals.groupby(by=["kind"])
    .aggregate({"height": np.mean, "weight": np.min})
    .rename(columns={"height": "mean_height", "weight": "min_weight"})
)
display(grp)

  .aggregate({"height": np.mean, "weight": np.min})
  .aggregate({"height": np.mean, "weight": np.min})


Unnamed: 0_level_0,mean_height,min_weight
kind,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,9.3,7.9
dog,20.0,7.5


In [41]:
# we can use custom aggregation function
# assume we want to compute for each group the difference between max height - min height
from typing import List


def max_minus_min(group: List) -> float:
    """accepts group as list return max(list)-min(list)

    Args:
        group (List): splitted group after groupby()

    Returns:
        float: max(list)-min(list)
    """
    return max(group) - min(group)


display(animals)
grp = animals.groupby(["kind"]).aggregate({"height": max_minus_min})
display(grp)

Unnamed: 0,kind,height,weight
0,cat,9.1,7.9
1,dog,6.0,7.5
2,cat,9.5,9.9
3,dog,34.0,198.0


Unnamed: 0_level_0,height
kind,Unnamed: 1_level_1
cat,0.4
dog,28.0


In [42]:
# hands on lab
countries_df = pd.read_csv(
    "https://gist.githubusercontent.com/aakashns/28b2e504b3350afd9bdb157893f9725c/raw/994b65665757f4f8887db1c85986a897abb23d84/countries.csv"
)
countries_df

Unnamed: 0,location,continent,population,life_expectancy,hospital_beds_per_thousand,gdp_per_capita
0,Afghanistan,Asia,38928341.0,64.83,0.50,1803.987
1,Albania,Europe,2877800.0,78.57,2.89,11803.431
2,Algeria,Africa,43851043.0,76.88,1.90,13913.839
3,Andorra,Europe,77265.0,83.73,,
4,Angola,Africa,32866268.0,61.15,,5819.495
...,...,...,...,...,...,...
205,Vietnam,Asia,97338583.0,75.40,2.60,6171.884
206,Western Sahara,Africa,597330.0,70.26,,
207,Yemen,Asia,29825968.0,66.12,0.70,1479.147
208,Zambia,Africa,18383956.0,63.89,2.00,3689.251


In [43]:
# Q1 show total number of locations per conetinet , max life_expectancy , average hospital_beds_per_thousand

In [44]:
covid_data_url = "https://gist.githubusercontent.com/aakashns/b2a968a6cfd9fbbb0ff3d6bd0f26262b/raw/b115ed1dfa17f10fc88bf966236cd4d9032f1df8/covid-countries-data.csv"

covid_df = pd.read_csv(covid_data_url)
covid_df

Unnamed: 0,location,total_cases,total_deaths,total_tests
0,Afghanistan,38243.0,1409.0,
1,Albania,9728.0,296.0,
2,Algeria,45158.0,1525.0,
3,Andorra,1199.0,53.0,
4,Angola,2729.0,109.0,
...,...,...,...,...
207,Western Sahara,766.0,1.0,
208,World,26059065.0,863535.0,
209,Yemen,1976.0,571.0,
210,Zambia,12415.0,292.0,


In [45]:
# Q2 merge covid_df with countries_df how=inner

In [46]:
# Q3 find new population after total_deathes for every location

In [47]:
# Q4 for every continetn find total_death per countery , median total_cases , hint np.median