In [33]:
import os

import folium
import geopandas as gpd
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [34]:
# Load the disasters CSV export (file 1900-2021) into df.
# The location can be overridden through the EMDAT_CSV_PATH environment variable so the
# notebook is not tied to one machine; when the variable is unset or empty, the original
# local path is used, so existing runs behave exactly as before.
DATA_PATH = os.environ.get('EMDAT_CSV_PATH') or (
    '/Users/Alfred/Documents/Trabajo/Personal/Climate/Data/1900_2021_DISASTERS.xlsx - emdat data.csv'
)
df = pd.read_csv(DATA_PATH)

In [22]:
# Structural overview of the raw frame: per-column dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16126 entries, 0 to 16125
Data columns (total 45 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Year                        16126 non-null  int64  
 1   Seq                         16126 non-null  int64  
 2   Glide                       1581 non-null   object 
 3   Disaster Group              16126 non-null  object 
 4   Disaster Subgroup           16126 non-null  object 
 5   Disaster Type               16126 non-null  object 
 6   Disaster Subtype            13016 non-null  object 
 7   Disaster Subsubtype         1077 non-null   object 
 8   Event Name                  3861 non-null   object 
 9   Country                     16126 non-null  object 
 10  ISO                         16126 non-null  object 
 11  Region                      16126 non-null  object 
 12  Continent                   16126 non-null  object 
 13  Location                    143

In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of df
df.describe()

Unnamed: 0,Year,Seq,Aid Contribution,Dis Mag Value,Start Year,Start Month,Start Day,End Year,End Month,End Day,Total Deaths,No Injured,No Affected,No Homeless,Total Affected,Insured Damages ('000 US$),Total Damages ('000 US$),CPI
count,16126.0,16126.0,677.0,4946.0,16126.0,15739.0,12498.0,16126.0,15418.0,12570.0,11413.0,3895.0,9220.0,2430.0,11617.0,1096.0,5245.0,15811.0
mean,1996.76479,714.78482,125413.6,47350.38,1996.77837,6.444374,15.233957,1996.835607,6.576728,15.77502,2842.866,2621.102,882361.2,73293.14,716508.8,798651.4,724783.5,63.215103
std,20.159065,1929.635089,2997875.0,309424.2,20.15571,3.393965,8.953821,20.14301,3.352965,8.865486,68605.95,34403.43,8573913.0,523005.8,7718598.0,3057638.0,4723131.0,26.734285
min,1900.0,1.0,1.0,-57.0,1900.0,1.0,1.0,1900.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,34.0,2.0,3.221647
25%,1989.0,93.0,175.0,7.0,1989.0,4.0,7.0,1989.0,4.0,8.0,6.0,14.0,1244.75,572.5,650.0,50000.0,8300.0,45.692897
50%,2001.0,270.0,721.0,151.5,2001.0,7.0,15.0,2001.0,7.0,16.0,20.0,50.0,10000.0,3000.0,5965.0,172500.0,60000.0,68.415379
75%,2011.0,486.0,3511.0,11296.5,2011.0,9.0,23.0,2011.0,9.0,23.0,63.0,200.0,91823.0,17500.0,58255.0,500000.0,317300.0,84.252733
max,2021.0,9881.0,78000000.0,13025870.0,2021.0,12.0,31.0,2021.0,12.0,31.0,3700000.0,1800000.0,330000000.0,15850000.0,330000000.0,60000000.0,210000000.0,100.0


In [24]:
# List every column label of the raw frame
df.columns

Index(['Year', 'Seq', 'Glide', 'Disaster Group', 'Disaster Subgroup',
       'Disaster Type', 'Disaster Subtype', 'Disaster Subsubtype',
       'Event Name', 'Country', 'ISO', 'Region', 'Continent', 'Location',
       'Origin', 'Associated Dis', 'Associated Dis2', 'OFDA Response',
       'Appeal', 'Declaration', 'Aid Contribution', 'Dis Mag Value',
       'Dis Mag Scale', 'Latitude', 'Longitude', 'Local Time', 'River Basin',
       'Start Year', 'Start Month', 'Start Day', 'End Year', 'End Month',
       'End Day', 'Total Deaths', 'No Injured', 'No Affected', 'No Homeless',
       'Total Affected', 'Insured Damages ('000 US$)',
       'Total Damages ('000 US$)', 'CPI', 'Adm Level', 'Admin1 Code',
       'Admin2 Code', 'Geo Locations'],
      dtype='object')

In [41]:
# Build df2: riverine-flood events recorded for the United States, limited to the
# year, classification, geography, magnitude and damage columns listed below.
flood_columns = [
    'Year', 'Disaster Subtype', 'Country', 'Region', 'Continent', 'Location',
    'Dis Mag Value', 'Dis Mag Scale', "Total Damages ('000 US$)",
]
is_riverine_flood = df['Disaster Subtype'] == 'Riverine flood'
is_united_states = df['Country'] == 'United States of America (the)'

# Row filter and column projection in a single .loc step
df2 = df.loc[is_riverine_flood & is_united_states, flood_columns]

# Missing values per column in the subset.
# NOTE(review): the saved output reports these counts as float64 zeros, unlike the integer
# counts shown for df.isnull().sum() — TODO confirm the selection is not empty.
df2.isna().sum()


Year                        0.0
Disaster Subtype            0.0
Country                     0.0
Region                      0.0
Continent                   0.0
Location                    0.0
Dis Mag Value               0.0
Dis Mag Scale               0.0
Total Damages ('000 US$)    0.0
dtype: float64

In [26]:
# Distinct values present in the 'Disaster Subgroup' column
df['Disaster Subgroup'].unique()



array(['Climatological', 'Geophysical', 'Meteorological', 'Hydrological',
       'Biological', 'Extra-terrestrial'], dtype=object)

In [27]:
# Keep only the three disaster classification columns in df1, then display it
classification_columns = ['Disaster Subgroup', 'Disaster Subtype', 'Disaster Subsubtype']
df1 = df.loc[:, classification_columns]
df1


Unnamed: 0,Disaster Subgroup,Disaster Subtype,Disaster Subsubtype
0,Climatological,Drought,
1,Climatological,Drought,
2,Geophysical,Ground movement,
3,Geophysical,Ash fall,
4,Geophysical,Ash fall,
...,...,...,...
16121,Hydrological,,
16122,Hydrological,,
16123,Biological,Viral disease,
16124,Hydrological,,


In [28]:
# Number of rows per 'Disaster Subtype' value, most frequent first
# (value_counts skips missing values by default)
df1['Disaster Subtype'].value_counts()


Disaster Subtype
Riverine flood                      2660
Tropical cyclone                    2420
Ground movement                     1484
Convective storm                    1118
Flash flood                          779
Drought                              769
Bacterial disease                    768
Landslide                            570
Viral disease                        543
Forest fire                          313
Cold wave                            307
Ash fall                             246
Heat wave                            219
Extra-tropical storm                 131
Land fire (Brush, Bush, Pasture)     123
Avalanche                            119
Coastal flood                         85
Severe winter conditions              77
Mudslide                              76
Locust                                62
Tsunami                               57
Parasitic disease                     49
Grasshopper                           16
Rockfall                              12

In [29]:
# Distribution of disasters by Disaster Type, Disaster Subgroup, Country, Region, Continent


Disaster Type
Flood                    5551
Storm                    4496
Earthquake               1544
Epidemic                 1501
Landslide                 776
Drought                   770
Extreme temperature       603
Wildfire                  471
Volcanic activity         265
Insect infestation         96
Mass movement (dry)        48
Glacial lake outburst       2
Fog                         1
Impact                      1
Animal accident             1
Name: count, dtype: int64

In [30]:
# Count of missing values in each column of the raw frame
df.isnull().sum()


Year                              0
Seq                               0
Glide                         14545
Disaster Group                    0
Disaster Subgroup                 0
Disaster Type                     0
Disaster Subtype               3110
Disaster Subsubtype           15049
Event Name                    12265
Country                           0
ISO                               0
Region                            0
Continent                         0
Location                       1792
Origin                        12332
Associated Dis                12778
Associated Dis2               15419
OFDA Response                 14432
Appeal                        13557
Declaration                   12870
Aid Contribution              15449
Dis Mag Value                 11180
Dis Mag Scale                  1190
Latitude                      13397
Longitude                     13394
Local Time                    15023
River Basin                   14839
Start Year                  