In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [4]:
#reading data
flood_aff = pd.read_csv('../data/raw/total-affected-by-floods/affected.csv')
flood_death = pd.read_csv('../data/raw/death-rate-from-floods/death.csv')

In [6]:
flood_aff

Unnamed: 0,Country name,Year,"Total number of people affected by floods per 100,000"
0,Afghanistan,1954,
1,Afghanistan,1956,0.0
2,Afghanistan,1963,0.0
3,Afghanistan,1969,
4,Afghanistan,1971,
...,...,...,...
7713,Zimbabwe,2020,
7714,Zimbabwe,2021,
7715,Zimbabwe,2022,0.0
7716,Zimbabwe,2023,


In [9]:
flood_death

Unnamed: 0,Country name,Year,Death rates from floods
0,Afghanistan,1954,
1,Afghanistan,1956,0.603243
2,Afghanistan,1963,1.114062
3,Afghanistan,1969,
4,Afghanistan,1971,
...,...,...,...
7713,Zimbabwe,2020,
7714,Zimbabwe,2021,
7715,Zimbabwe,2022,0.000000
7716,Zimbabwe,2023,


In [5]:
flood_aff.rename(
    columns={
        'Country name': 'country',
        'Year': 'year',
        'Total number of people affected by floods per 100,000': 'affected'
    },
    inplace=True
)
flood_death.rename(
    columns={
        'Country name': 'country',
        'Year': 'year',
        'Death rates from floods': 'death'
    },
    inplace=True
)


In [6]:
#checking if both tables are the same or not
same_ = flood_aff['year'] == flood_death['year']
set(same_)

{True}

In [9]:
flood_df = pd.merge(flood_aff, flood_death, on=['country', 'year'])
flood_df


Unnamed: 0,country,year,affected,death
0,Afghanistan,1954,,
1,Afghanistan,1956,0.0,0.603243
2,Afghanistan,1963,0.0,1.114062
3,Afghanistan,1969,,
4,Afghanistan,1971,,
...,...,...,...,...
7713,Zimbabwe,2020,,
7714,Zimbabwe,2021,,
7715,Zimbabwe,2022,0.0,0.000000
7716,Zimbabwe,2023,,


In [11]:
flood_df.to_csv("../data/processed/flood_data.csv")

In [17]:
flood_df.isnull().sum()

country        0
year           0
affected    3409
death       3409
dtype: int64

In [18]:
flood_df = flood_df.dropna(subset=["affected", "death"], how="all")
flood_df

Unnamed: 0,country,year,affected,death
1,Afghanistan,1956,0.000000,0.603243
2,Afghanistan,1963,0.000000,1.114062
5,Afghanistan,1972,2109.046400,1.265428
7,Afghanistan,1976,612.563960,0.390510
8,Afghanistan,1978,1995.997000,0.881611
...,...,...,...,...
7706,Zimbabwe,2013,69.217430,0.891977
7707,Zimbabwe,2014,12.331631,0.056309
7708,Zimbabwe,2015,5.208692,0.041670
7712,Zimbabwe,2019,1.964459,0.170253


In [19]:
flood_df.isnull().sum()

country     0
year        0
affected    0
death       0
dtype: int64

In [20]:
flood_df.to_csv("../data/processed/flood_data.csv")

In [22]:
flood_df["period"] = (flood_df["year"] // 5) * 5
flood_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flood_df["period"] = (flood_df["year"] // 5) * 5


Unnamed: 0,country,year,affected,death,period
1,Afghanistan,1956,0.000000,0.603243,1955
2,Afghanistan,1963,0.000000,1.114062,1960
5,Afghanistan,1972,2109.046400,1.265428,1970
7,Afghanistan,1976,612.563960,0.390510,1975
8,Afghanistan,1978,1995.997000,0.881611,1975
...,...,...,...,...,...
7706,Zimbabwe,2013,69.217430,0.891977,2010
7707,Zimbabwe,2014,12.331631,0.056309,2010
7708,Zimbabwe,2015,5.208692,0.041670,2015
7712,Zimbabwe,2019,1.964459,0.170253,2015


In [51]:
agg = flood_df.groupby(["country", "period"]).agg(
    affected=("affected", "mean"),
    death=("death", "mean"),
    events=("year", "count")
).reset_index()
agg

Unnamed: 0,country,period,affected,death,events
0,Afghanistan,1955,0.000000,0.603243,1
1,Afghanistan,1960,0.000000,1.114062,1
2,Afghanistan,1970,2109.046400,1.265428,1
3,Afghanistan,1975,1304.280480,0.636060,2
4,Afghanistan,1980,227.802320,0.000000,1
...,...,...,...,...,...
1642,Zimbabwe,2000,878.175137,0.237855,3
1643,Zimbabwe,2005,66.372705,0.105248,2
1644,Zimbabwe,2010,21.922093,0.237072,4
1645,Zimbabwe,2015,3.586576,0.105961,2


In [52]:
agg = agg[agg["events"] > 1]

In [53]:
agg

Unnamed: 0,country,period,affected,death,events
3,Afghanistan,1975,1304.280480,0.636060,2
6,Afghanistan,1990,584.557995,7.133021,2
7,Afghanistan,1995,43.192060,0.217644,5
8,Afghanistan,2000,20.515456,0.348350,3
9,Afghanistan,2005,110.719391,0.732489,5
...,...,...,...,...,...
1640,Zambia,2020,1554.099067,0.010209,3
1642,Zimbabwe,2000,878.175137,0.237855,3
1643,Zimbabwe,2005,66.372705,0.105248,2
1644,Zimbabwe,2010,21.922093,0.237072,4


In [54]:
#LOG TRANSFORMATION to reduce skewness
agg["affected"] = np.log1p(agg["affected"])
agg["death"] = np.log1p(agg["death"])
agg


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  agg["affected"] = np.log1p(agg["affected"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  agg["death"] = np.log1p(agg["death"])


Unnamed: 0,country,period,affected,death,events
3,Afghanistan,1975,7.174173,0.492291,2
6,Afghanistan,1990,6.372565,2.095932,2
7,Afghanistan,1995,3.788545,0.196918,5
8,Afghanistan,2000,3.068772,0.298882,3
9,Afghanistan,2005,4.715990,0.549559,5
...,...,...,...,...,...
1640,Zambia,2020,7.349295,0.010157,3
1642,Zimbabwe,2000,6.778984,0.213380,3
1643,Zimbabwe,2005,4.210240,0.100070,2
1644,Zimbabwe,2010,3.132101,0.212747,4


In [55]:
#standardizing the data, (so that countries are comparable)
from sklearn.preprocessing import StandardScaler

agg

scaler = StandardScaler()
agg[["affected_z", "death_z"]] = scaler.fit_transform(
    agg[["affected", "death"]]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  agg[["affected_z", "death_z"]] = scaler.fit_transform(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  agg[["affected_z", "death_z"]] = scaler.fit_transform(


In [56]:
agg

Unnamed: 0,country,period,affected,death,events,affected_z,death_z
3,Afghanistan,1975,7.174173,0.492291,2,0.980165,0.477347
6,Afghanistan,1990,6.372565,2.095932,2,0.624722,3.379254
7,Afghanistan,1995,3.788545,0.196918,5,-0.521064,-0.057153
8,Afghanistan,2000,3.068772,0.298882,3,-0.840220,0.127358
9,Afghanistan,2005,4.715990,0.549559,5,-0.109823,0.580977
...,...,...,...,...,...,...,...
1640,Zambia,2020,7.349295,0.010157,3,1.057816,-0.395110
1642,Zimbabwe,2000,6.778984,0.213380,3,0.804934,-0.027363
1643,Zimbabwe,2005,4.210240,0.100070,2,-0.334079,-0.232406
1644,Zimbabwe,2010,3.132101,0.212747,4,-0.812139,-0.028509


In [57]:
agg["flood_impact"] = agg["affected_z"] + agg["death_z"]
agg

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  agg["flood_impact"] = agg["affected_z"] + agg["death_z"]


Unnamed: 0,country,period,affected,death,events,affected_z,death_z,flood_impact
3,Afghanistan,1975,7.174173,0.492291,2,0.980165,0.477347,1.457512
6,Afghanistan,1990,6.372565,2.095932,2,0.624722,3.379254,4.003976
7,Afghanistan,1995,3.788545,0.196918,5,-0.521064,-0.057153,-0.578217
8,Afghanistan,2000,3.068772,0.298882,3,-0.840220,0.127358,-0.712863
9,Afghanistan,2005,4.715990,0.549559,5,-0.109823,0.580977,0.471154
...,...,...,...,...,...,...,...,...
1640,Zambia,2020,7.349295,0.010157,3,1.057816,-0.395110,0.662706
1642,Zimbabwe,2000,6.778984,0.213380,3,0.804934,-0.027363,0.777571
1643,Zimbabwe,2005,4.210240,0.100070,2,-0.334079,-0.232406,-0.566486
1644,Zimbabwe,2010,3.132101,0.212747,4,-0.812139,-0.028509,-0.840649


In [58]:
flood_final = agg[["country", "period", "flood_impact"]]
flood_final.to_csv("../data/processed/flood_data.csv")

162