In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from fuzzywuzzy import process


In [2]:
df_original = pd.read_csv("..\Data_Sets\originalData\\natural-disasters.csv") 

# I'll use a processed Data Set from an earlier notebook as the stardard for Country Names
df = pd.read_csv("..\Data_Sets\processed\economicData_1960-2022_noNaN-drops.csv")
df_refNames = pd.DataFrame({
    'Standard Names': df['Country Name'].unique()
})

df_region = pd.read_csv("..\\Data_Sets\\processed\\addData_countriesSize_Regions.csv")

In [3]:
# Selecting and renaming relevant columns
df_disasters = df_original[['Entity', 'Year', 'Death rates from disasters']]

df_disasters.rename(columns={
    'Entity': 'Country Name',
    'Year': 'Index Year (Decade)'
}, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_disasters.rename(columns={


In [4]:
# Selecting relevant years
df_disasters = df_disasters[df_disasters['Index Year (Decade)'] >= 1990]
df_disasters.loc[df_disasters['Index Year (Decade)'] == 1990, 'Death rates from disasters'] /= 2 # Slicing the 90s in half, because our Data in other DataSets starts from 1995.

In [5]:
def fuzzySearchName(name, refNames=df_refNames, score_threshold = 85):
    '''
        Given a country name, returns the closest match from refNames using fuzzy search.
        If no close match is found, it returns "PLEASE FILL MANUALLY".
    '''
    
    # If the name is empty or null, return "UNKNOWN"
    if pd.isna(name) or not name.strip():
        return "UNKNOWN"

    exceptions = {
        "Kyrgyzstan": "Kyrgyz Republic",
        "Congo": "Republic of Congo",
        "Czechia": "Czech Republic",
        "Slovakia": "Slovak Republic",
        "Macao": "Macau",
        'Democratic Republic Of The Congo': 'Democratic Republic of Congo',
        'Republic Of The Congo': 'Republic of Congo'
    }
    
    if name in exceptions:
        return exceptions[name]
    
    match, score, _ = process.extractOne(name, refNames['Standard Names'])
    
    # Hardcoding exceptions, due to sharing common words (South, North)
    dubiousFuzzyNames = [
        'North Korea', 'South Korea', 'South Africa',
        'North Macedonia', 'Saint Vincent and the Grenadines'
    ]

    if match in dubiousFuzzyNames:
        score_threshold = max(95, score_threshold)
        

    # If a close match is found, return the match
    if score > score_threshold:
        return match
    
    # If no close match is found, ask for manual input
    return "PLEASE FILL MANUALLY"

In [6]:
#Selecting relevant countries
removeList = [
    'British Indian Ocean Territory', 'Africa', 'Europe', 'North America', 'South America', 'Asia', 'Oceania',
    'Sint Maarten (Dutch part)', 'Saint Martin (French Part)', 'Antigua and Barbuda',
    'Netherlands Antilles', 'Northern Mariana Islands', 'Saint Barthelemy',
    'Saint Helena, Ascension and Tristan da Cunha', 'Saint Kitts and Nevis',
    'Serbia-Montenegro', "Turks and Caicos Islands", 'United States Virgin Islands',
    'Wallis and Futuna', 'Yugoslavia', 'Cocos (Keeling) Islands', 'Isle Of Man', 'South Georgia And South Sandwich Islands',
    'South Sudan','United States Minor Outlying Islands','Saint Pierre And Miquelon', 'American Samoa',
    ]

mask = ~df_disasters['Country Name'].isin(removeList)
df_disasters = df_disasters[mask]

df_disasters['New Country Name'] = df_disasters['Country Name'].apply(fuzzySearchName)

In [7]:
# Excluding "PLEASE FILL MANUALLY"
df_disasters = df_disasters[~df_disasters['New Country Name'].isin(["PLEASE FILL MANUALLY"])]

# Checking for duplicated countries (top freq should be 3)
display(df_disasters['New Country Name'].describe(include=['object']))

count             519
unique            178
top       Afghanistan
freq                3
Name: New Country Name, dtype: object

In [8]:
df_disasters['New Country Name'].nunique()

178

In [9]:


# renaming and excluding Country Name
df_disasters = df_disasters[['New Country Name', 'Index Year (Decade)', 'Death rates from disasters']]

df_disasters.rename(columns={
    'New Country Name': 'Country Name'
}, inplace=True)

In [10]:
# Filling missingCountries with the average of subRegion
set_A = set(df_disasters['Country Name'].unique())
set_B = set(df_refNames['Standard Names'].unique())

missingCountries = set_B - set_A

# Getting subregion information 
df_disasters = df_disasters.merge(df_region, on='Country Name', how='left')
mean_disasters = df_disasters.groupby(['SubRegion', 'Index Year (Decade)'])['Death rates from disasters'].mean().reset_index()

# Storing temp data
countries = []
years = []
death_rates = []
subregions = []

for country in missingCountries:
    subregion = df_region[df_region['Country Name'] == country]['SubRegion'].values[0]
    
    for year in [1990, 2000, 2010]:
        death_rate = mean_disasters[(mean_disasters['SubRegion'] == subregion)
                                    & (mean_disasters['Index Year (Decade)'] == year)]['Death rates from disasters'].values[0]
        
        # Append data to the lists
        countries.append(country)
        years.append(year)
        death_rates.append(death_rate)
        subregions.append(subregion)


# Concatenate with the original df_disasters
missing_df = pd.DataFrame({
    'Country Name': countries,
    'Index Year (Decade)': years,
    'Death rates from disasters': death_rates,
    'SubRegion': subregions
})
df_disasters = pd.concat([df_disasters, missing_df], ignore_index=True)


In [11]:
# Selecting only the columns we actually need to export 
df_disasters = df_disasters[['Country Name', 'Index Year (Decade)', 'Death rates from disasters']]
df_disasters.head()

Unnamed: 0,Country Name,Index Year (Decade),Death rates from disasters
0,Afghanistan,1990,2.915111
1,Afghanistan,2000,1.796376
2,Afghanistan,2010,0.793282
3,Albania,1990,0.023369
4,Albania,2000,0.048252


In [13]:
df_disasters.to_csv(f'..\Data_Sets\processed\\addData_disasterData.csv', index=False)