In [1]:
#Importdependancy.
import pandas as pd
import numpy as np
import os
import csv
import warnings
warnings.filterwarnings('ignore')
from io import StringIO
import zipfile
from pathlib import Path
import sqlalchemy as db
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine, func
from sqlalchemy import select

# Population Data Cleaning.

In [2]:
#Read dataframe.
population_df=pd.read_csv("Resources/population.csv",header=0)
population_df

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Country of asylum,Country of asylum (ISO),Refugees under UNHCR's mandate
0,2000,Iraq,IRQ,Albania,ALB,9
1,2000,Serbia and Kosovo: S/RES/1244 (1999),SRB,Albania,ALB,507
2,2000,TÃ¼rkiye,TUR,Albania,ALB,5
3,2000,Chad,TCD,Algeria,DZA,20
4,2000,Cameroon,CMR,Algeria,DZA,5
...,...,...,...,...,...,...
81598,2022,Rwanda,RWA,Zimbabwe,ZWE,661
81599,2022,Somalia,SOM,Zimbabwe,ZWE,13
81600,2022,Sudan,SDN,Zimbabwe,ZWE,6
81601,2022,South Sudan,SSD,Zimbabwe,ZWE,8


In [3]:
#Check columns names.
population_df.columns

Index(['Year', 'Country of origin', 'Country of origin (ISO)',
       'Country of asylum', 'Country of asylum (ISO)',
       'Refugees under UNHCR's mandate'],
      dtype='object')

In [4]:
#Drop Refugees under UNHCR's mandate column.
population_df_dropped= population_df.drop(["Refugees under UNHCR's mandate"], axis="columns")
population_df_dropped

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Country of asylum,Country of asylum (ISO)
0,2000,Iraq,IRQ,Albania,ALB
1,2000,Serbia and Kosovo: S/RES/1244 (1999),SRB,Albania,ALB
2,2000,TÃ¼rkiye,TUR,Albania,ALB
3,2000,Chad,TCD,Algeria,DZA
4,2000,Cameroon,CMR,Algeria,DZA
...,...,...,...,...,...
81598,2022,Rwanda,RWA,Zimbabwe,ZWE
81599,2022,Somalia,SOM,Zimbabwe,ZWE
81600,2022,Sudan,SDN,Zimbabwe,ZWE
81601,2022,South Sudan,SSD,Zimbabwe,ZWE


In [5]:
population_df_dropped.columns

Index(['Year', 'Country of origin', 'Country of origin (ISO)',
       'Country of asylum', 'Country of asylum (ISO)'],
      dtype='object')

In [6]:
#Renaming Columns.
population_df_dropped=population_df_dropped.rename(columns={'Year':'year','Country of origin':'country_origin', 'Country of origin (ISO)': 'country_origin(ISO)',\
                                                        'Country of asylum':'country_asylum','Country of asylum (ISO)':'country_asylum(ISO)'})
population_df_dropped

Unnamed: 0,year,country_origin,country_origin(ISO),country_asylum,country_asylum(ISO)
0,2000,Iraq,IRQ,Albania,ALB
1,2000,Serbia and Kosovo: S/RES/1244 (1999),SRB,Albania,ALB
2,2000,TÃ¼rkiye,TUR,Albania,ALB
3,2000,Chad,TCD,Algeria,DZA
4,2000,Cameroon,CMR,Algeria,DZA
...,...,...,...,...,...
81598,2022,Rwanda,RWA,Zimbabwe,ZWE
81599,2022,Somalia,SOM,Zimbabwe,ZWE
81600,2022,Sudan,SDN,Zimbabwe,ZWE
81601,2022,South Sudan,SSD,Zimbabwe,ZWE


In [7]:
#Drop year 2000 and 2022 from the rows using Index.
data_index=population_df_dropped.set_index("year")
data_index=data_index.drop([2000,2022], axis=0)
data_index

Unnamed: 0_level_0,country_origin,country_origin(ISO),country_asylum,country_asylum(ISO)
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001,Iran (Islamic Rep. of),IRN,Afghanistan,AFG
2001,Iraq,IRQ,Afghanistan,AFG
2001,Iraq,IRQ,Albania,ALB
2001,North Macedonia,MKD,Albania,ALB
2001,Serbia and Kosovo: S/RES/1244 (1999),SRB,Albania,ALB
...,...,...,...,...
2021,Rwanda,RWA,Zimbabwe,ZWE
2021,Somalia,SOM,Zimbabwe,ZWE
2021,Sudan,SDN,Zimbabwe,ZWE
2021,South Sudan,SSD,Zimbabwe,ZWE


In [8]:
#Resetting Year as a column.
population_df_dropped_rows=data_index.reset_index()
population_df_dropped_rows

Unnamed: 0,year,country_origin,country_origin(ISO),country_asylum,country_asylum(ISO)
0,2001,Iran (Islamic Rep. of),IRN,Afghanistan,AFG
1,2001,Iraq,IRQ,Afghanistan,AFG
2,2001,Iraq,IRQ,Albania,ALB
3,2001,North Macedonia,MKD,Albania,ALB
4,2001,Serbia and Kosovo: S/RES/1244 (1999),SRB,Albania,ALB
...,...,...,...,...,...
74568,2021,Rwanda,RWA,Zimbabwe,ZWE
74569,2021,Somalia,SOM,Zimbabwe,ZWE
74570,2021,Sudan,SDN,Zimbabwe,ZWE
74571,2021,South Sudan,SSD,Zimbabwe,ZWE


In [9]:
#Replace "TÃ¼rkiye" with "Turkey".
pop_rep=population_df_dropped_rows.replace(to_replace="TÃ¼rkiye",
           value="Turkey")
pop_rep

Unnamed: 0,year,country_origin,country_origin(ISO),country_asylum,country_asylum(ISO)
0,2001,Iran (Islamic Rep. of),IRN,Afghanistan,AFG
1,2001,Iraq,IRQ,Afghanistan,AFG
2,2001,Iraq,IRQ,Albania,ALB
3,2001,North Macedonia,MKD,Albania,ALB
4,2001,Serbia and Kosovo: S/RES/1244 (1999),SRB,Albania,ALB
...,...,...,...,...,...
74568,2021,Rwanda,RWA,Zimbabwe,ZWE
74569,2021,Somalia,SOM,Zimbabwe,ZWE
74570,2021,Sudan,SDN,Zimbabwe,ZWE
74571,2021,South Sudan,SSD,Zimbabwe,ZWE


In [10]:
#Examine data type of each columns.
pop_rep.dtypes

year                    int64
country_origin         object
country_origin(ISO)    object
country_asylum         object
country_asylum(ISO)    object
dtype: object

In [11]:
#Check for duplicates.
pop_rep.duplicated().sum()

0

In [12]:
#Check for null values.
pop_rep.isnull().sum()

year                     0
country_origin           0
country_origin(ISO)    793
country_asylum           0
country_asylum(ISO)      0
dtype: int64

In [13]:
#Check dropping null values.
cleaned_population=pop_rep.dropna()
cleaned_population

Unnamed: 0,year,country_origin,country_origin(ISO),country_asylum,country_asylum(ISO)
0,2001,Iran (Islamic Rep. of),IRN,Afghanistan,AFG
1,2001,Iraq,IRQ,Afghanistan,AFG
2,2001,Iraq,IRQ,Albania,ALB
3,2001,North Macedonia,MKD,Albania,ALB
4,2001,Serbia and Kosovo: S/RES/1244 (1999),SRB,Albania,ALB
...,...,...,...,...,...
74568,2021,Rwanda,RWA,Zimbabwe,ZWE
74569,2021,Somalia,SOM,Zimbabwe,ZWE
74570,2021,Sudan,SDN,Zimbabwe,ZWE
74571,2021,South Sudan,SSD,Zimbabwe,ZWE


In [14]:
#Examine if null values are dropped.
cleaned_population.isnull().sum()

year                   0
country_origin         0
country_origin(ISO)    0
country_asylum         0
country_asylum(ISO)    0
dtype: int64

In [15]:
#Save the cleaned DataFrame as a new CSV file for analysis.
cleaned_population.to_csv('Resources/cleaned population.csv', index=False)

# Disasters Data Cleaning.

In [16]:
#Read dataframe.
disasters_df=pd.read_csv("Resources/disasters.csv",header=0)
disasters_df

Unnamed: 0,Dis No,Year,Seq,Glide,Disaster Group,Disaster Subgroup,Disaster Type,Disaster Subtype,Disaster Subsubtype,Event Name,...,Total Affected,Insured Damages,"Insured Damages, Adjusted",Total Damages,"Total Damages, Adjusted",CPI,Adm Level,Admin1 Code,Admin2 Code,Geo Locations
0,2000-0372-CHN,2000,372,,Natural,Hydrological,Flood,Riverine flood,,,...,31010.0,,,19000.0,32291.0,58.840648,1;2,914,13006;13009;13010,"Jiangxi Sheng (Adm1). Quanzhou, Xiamen, Zhangz..."
1,2000-9186-AFG,2000,9186,,Natural,Climatological,Drought,Drought,,,...,2580000.0,,,50.0,85.0,58.840648,1,272;273;274;277;279;281;282;285;287;289;293;29...,,"Badakhshan, Badghis, Baghlan, Farah, Ghazni, H..."
2,2000-0373-BGD,2000,373,,Natural,Hydrological,Flood,Flash flood,,,...,200050.0,,,,,58.840648,2,,5770;5772,"Chittagong, Cox's Bazar (Adm2)."
3,2000-0905-BOL,2000,905,,Natural,Hydrological,Flood,,,,...,10.0,,,,,58.840648,1,40444;40445;40446;40447;40448;40450,,"Chuquisaca, Cochabamba, La Paz, Oruro, Potosi,..."
4,2000-0019-BRA,2000,19,,Natural,Hydrological,Flood,Riverine flood,,,...,70000.0,,,,,58.840648,2,,8467;8555;9902;9956;9961;9985;11089;11543,"Barra Mansa, Campos Do Jordao, Pirangucu, Rese..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15064,2023-0017-UGA,2023,17,,Technological,Technological,Transport accident,Road,,,...,,,,,,,,,,
15065,2023-0018-UGA,2023,18,,Technological,Technological,Miscellaneous accident,Other,,Stampede,...,,,,,,,,,,
15066,2023-0013-UKR,2023,13,,Technological,Technological,Transport accident,Air,,Helicopter,...,22.0,,,,,,,,,
15067,2023-0032-USA,2023,32,,Natural,Meteorological,Storm,Convective storm,Tornado,,...,49.0,,,,,,1,3214;3224,,"Alabama, Georgia (Adm1)."


In [17]:
#Check columns names.
disasters_df.columns

Index(['Dis No', 'Year', 'Seq', 'Glide', 'Disaster Group', 'Disaster Subgroup',
       'Disaster Type', 'Disaster Subtype', 'Disaster Subsubtype',
       'Event Name', 'Country', 'ISO', 'Region', 'Continent', 'Location',
       'Origin', 'Associated Dis', 'Associated Dis2', 'OFDA Response',
       'Appeal', 'Declaration', 'AID Contribution ', 'Dis Mag Value',
       'Dis Mag Scale', 'Latitude', 'Longitude', 'Local Time', 'River Basin',
       'Start Year', 'Start Month', 'Start Day', 'End Year', 'End Month',
       'End Day', 'Total Deaths', 'No Injured', 'No Affected', 'No Homeless',
       'Total Affected', 'Insured Damages', 'Insured Damages, Adjusted ',
       'Total Damages ', 'Total Damages, Adjusted ', 'CPI', 'Adm Level',
       'Admin1 Code', 'Admin2 Code', 'Geo Locations'],
      dtype='object')

In [18]:
#Dropping columns that would not be used in analysis.
disasters_df_col= disasters_df.drop(['Dis No','Seq', 'Glide','Disaster Subsubtype','Event Name','Origin','Associated Dis', 'Associated Dis2', 'OFDA Response',\
                                    'Appeal', 'Declaration', 'AID Contribution ','Dis Mag Value','Dis Mag Scale','Latitude', 'Longitude','Local Time', 'River Basin',\
                                    'Start Year', 'Start Month', 'Start Day', 'End Year', 'End Month','End Day','No Injured', 'No Affected', 'No Homeless',\
                                   'Insured Damages', 'Insured Damages, Adjusted ','Total Damages, Adjusted ','Total Damages ','CPI','Adm Level', 'Admin1 Code','Admin2 Code'], axis="columns")
disasters_df_col

Unnamed: 0,Year,Disaster Group,Disaster Subgroup,Disaster Type,Disaster Subtype,Country,ISO,Region,Continent,Location,Total Deaths,Total Affected,Geo Locations
0,2000,Natural,Hydrological,Flood,Riverine flood,China,CHN,Eastern Asia,Asia,"Quanzhou, Zhangzhou, Xiamen districts (Fujian ...",43.0,31010.0,"Jiangxi Sheng (Adm1). Quanzhou, Xiamen, Zhangz..."
1,2000,Natural,Climatological,Drought,Drought,Afghanistan,AFG,Southern Asia,Asia,"Kandahar, Hilmand, Nimroz, Zabul, Uruzgan prov...",37.0,2580000.0,"Badakhshan, Badghis, Baghlan, Farah, Ghazni, H..."
2,2000,Natural,Hydrological,Flood,Flash flood,Bangladesh,BGD,Southern Asia,Asia,"Bakalia, Kotwali, Chandgaon, Pahartali, Hathaz...",11.0,200050.0,"Chittagong, Cox's Bazar (Adm2)."
3,2000,Natural,Hydrological,Flood,,Bolivia (Plurinational State of),BOL,South America,Americas,"Chuquisaca, Cochabamba, La Paz, Oruro, Potosi,...",30.0,10.0,"Chuquisaca, Cochabamba, La Paz, Oruro, Potosi,..."
4,2000,Natural,Hydrological,Flood,Riverine flood,Brazil,BRA,South America,Americas,"Rio de Janeiro city (Rio de Janeiro district, ...",26.0,70000.0,"Barra Mansa, Campos Do Jordao, Pirangucu, Rese..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15064,2023,Technological,Technological,Transport accident,Road,Uganda,UGA,Eastern Africa,Africa,Near Gulu,16.0,,
15065,2023,Technological,Technological,Miscellaneous accident,Other,Uganda,UGA,Eastern Africa,Africa,Kampala,10.0,,
15066,2023,Technological,Technological,Transport accident,Air,Ukraine,UKR,Eastern Europe,Europe,"Brovary, (near Kiev)",16.0,22.0,
15067,2023,Natural,Meteorological,Storm,Convective storm,United States of America,USA,Northern America,Americas,Alabama and Georgia,11.0,49.0,"Alabama, Georgia (Adm1)."


In [19]:
#Examine column names.
disasters_df_col.columns

Index(['Year', 'Disaster Group', 'Disaster Subgroup', 'Disaster Type',
       'Disaster Subtype', 'Country', 'ISO', 'Region', 'Continent', 'Location',
       'Total Deaths', 'Total Affected', 'Geo Locations'],
      dtype='object')

In [20]:
#Renaming Columns.
disasters_df_col_renamed=disasters_df_col.rename(columns={'Year':'year','Disaster Group':'disaster_group','Disaster Subgroup':'disaster_subgroup','Disaster Type':'disaster_type',\
                                                           'Disaster Subtype': 'disaster_subtype','Country':'country_origin','ISO':'country_origin(ISO)',\
                                                          'Region':'region','Continent':'continent','Location':'location','Total Deaths':'total_deaths','Total Affected':'total_affected','Geo Locations':'geo_locations'})
disasters_df_col_renamed

Unnamed: 0,year,disaster_group,disaster_subgroup,disaster_type,disaster_subtype,country_origin,country_origin(ISO),region,continent,location,total_deaths,total_affected,geo_locations
0,2000,Natural,Hydrological,Flood,Riverine flood,China,CHN,Eastern Asia,Asia,"Quanzhou, Zhangzhou, Xiamen districts (Fujian ...",43.0,31010.0,"Jiangxi Sheng (Adm1). Quanzhou, Xiamen, Zhangz..."
1,2000,Natural,Climatological,Drought,Drought,Afghanistan,AFG,Southern Asia,Asia,"Kandahar, Hilmand, Nimroz, Zabul, Uruzgan prov...",37.0,2580000.0,"Badakhshan, Badghis, Baghlan, Farah, Ghazni, H..."
2,2000,Natural,Hydrological,Flood,Flash flood,Bangladesh,BGD,Southern Asia,Asia,"Bakalia, Kotwali, Chandgaon, Pahartali, Hathaz...",11.0,200050.0,"Chittagong, Cox's Bazar (Adm2)."
3,2000,Natural,Hydrological,Flood,,Bolivia (Plurinational State of),BOL,South America,Americas,"Chuquisaca, Cochabamba, La Paz, Oruro, Potosi,...",30.0,10.0,"Chuquisaca, Cochabamba, La Paz, Oruro, Potosi,..."
4,2000,Natural,Hydrological,Flood,Riverine flood,Brazil,BRA,South America,Americas,"Rio de Janeiro city (Rio de Janeiro district, ...",26.0,70000.0,"Barra Mansa, Campos Do Jordao, Pirangucu, Rese..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15064,2023,Technological,Technological,Transport accident,Road,Uganda,UGA,Eastern Africa,Africa,Near Gulu,16.0,,
15065,2023,Technological,Technological,Miscellaneous accident,Other,Uganda,UGA,Eastern Africa,Africa,Kampala,10.0,,
15066,2023,Technological,Technological,Transport accident,Air,Ukraine,UKR,Eastern Europe,Europe,"Brovary, (near Kiev)",16.0,22.0,
15067,2023,Natural,Meteorological,Storm,Convective storm,United States of America,USA,Northern America,Americas,Alabama and Georgia,11.0,49.0,"Alabama, Georgia (Adm1)."


In [21]:
#Drop year 2000,2022 and 2023 from the rows using Index.
data_index_1=disasters_df_col_renamed.set_index("year")
data_index_1=data_index_1.drop([2000,2022,2023], axis=0)
data_index_1

Unnamed: 0_level_0,disaster_group,disaster_subgroup,disaster_type,disaster_subtype,country_origin,country_origin(ISO),region,continent,location,total_deaths,total_affected,geo_locations
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2001,Natural,Geophysical,Earthquake,Ground movement,Afghanistan,AFG,Southern Asia,Asia,Fayzabad district (Badakhshan province),,,Fayzabad (Adm2).
2001,Natural,Meteorological,Extreme temperature,Cold wave,Afghanistan,AFG,Southern Asia,Asia,"Hirat, Faryab, Jawzjan, Balkh, Samangan, Sar-e...",150.0,100000.0,"Baghlan, Balkh, Faryab, Hirat, Jawzjan, Kunduz..."
2001,Technological,Technological,Transport accident,Air,Angola,AGO,Middle Africa,Africa,Near Lubango,16.0,,
2001,Natural,Hydrological,Flood,,Angola,AGO,Middle Africa,Africa,Kuito district (Bie province),9.0,5.0,Kuito (Adm2).
2001,Natural,Meteorological,Storm,,Argentina,ARG,South America,Americas,Buenos Aires province,4.0,650.0,Buenos Aires (Adm1).
...,...,...,...,...,...,...,...,...,...,...,...,...
2020,Natural,Biological,Epidemic,Parasitic disease,Sudan,SDN,Northern Africa,Africa,,38.0,2137.0,
2021,Natural,Climatological,Drought,Drought,"Tanzania, United Republic of",TZA,Eastern Africa,Africa,"Handeni, Longido, Mkinga, and Monduli",,497000.0,
2021,Technological,Technological,Miscellaneous accident,Fire,Iraq,IRQ,Western Asia,Asia,Baghdad,82.0,110.0,
2021,Technological,Technological,Miscellaneous accident,Other,Israel,ISR,Western Asia,Asia,Mont Meron,45.0,,


In [22]:
#Resetting Year as a column.
disasters_df_dropped_rows=data_index_1.reset_index()
disasters_df_dropped_rows

Unnamed: 0,year,disaster_group,disaster_subgroup,disaster_type,disaster_subtype,country_origin,country_origin(ISO),region,continent,location,total_deaths,total_affected,geo_locations
0,2001,Natural,Geophysical,Earthquake,Ground movement,Afghanistan,AFG,Southern Asia,Asia,Fayzabad district (Badakhshan province),,,Fayzabad (Adm2).
1,2001,Natural,Meteorological,Extreme temperature,Cold wave,Afghanistan,AFG,Southern Asia,Asia,"Hirat, Faryab, Jawzjan, Balkh, Samangan, Sar-e...",150.0,100000.0,"Baghlan, Balkh, Faryab, Hirat, Jawzjan, Kunduz..."
2,2001,Technological,Technological,Transport accident,Air,Angola,AGO,Middle Africa,Africa,Near Lubango,16.0,,
3,2001,Natural,Hydrological,Flood,,Angola,AGO,Middle Africa,Africa,Kuito district (Bie province),9.0,5.0,Kuito (Adm2).
4,2001,Natural,Meteorological,Storm,,Argentina,ARG,South America,Americas,Buenos Aires province,4.0,650.0,Buenos Aires (Adm1).
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13569,2020,Natural,Biological,Epidemic,Parasitic disease,Sudan,SDN,Northern Africa,Africa,,38.0,2137.0,
13570,2021,Natural,Climatological,Drought,Drought,"Tanzania, United Republic of",TZA,Eastern Africa,Africa,"Handeni, Longido, Mkinga, and Monduli",,497000.0,
13571,2021,Technological,Technological,Miscellaneous accident,Fire,Iraq,IRQ,Western Asia,Asia,Baghdad,82.0,110.0,
13572,2021,Technological,Technological,Miscellaneous accident,Other,Israel,ISR,Western Asia,Asia,Mont Meron,45.0,,


In [23]:
#Examine data type of each columns.
disasters_df_dropped_rows.dtypes

year                     int64
disaster_group          object
disaster_subgroup       object
disaster_type           object
disaster_subtype        object
country_origin          object
country_origin(ISO)     object
region                  object
continent               object
location                object
total_deaths           float64
total_affected         float64
geo_locations           object
dtype: object

In [24]:
#Check for duplicates.
disasters_df_dropped_rows.duplicated().sum()

8

In [25]:
#Removes duplicates.
disasters_dup=disasters_df_dropped_rows.drop_duplicates()
disasters_dup

Unnamed: 0,year,disaster_group,disaster_subgroup,disaster_type,disaster_subtype,country_origin,country_origin(ISO),region,continent,location,total_deaths,total_affected,geo_locations
0,2001,Natural,Geophysical,Earthquake,Ground movement,Afghanistan,AFG,Southern Asia,Asia,Fayzabad district (Badakhshan province),,,Fayzabad (Adm2).
1,2001,Natural,Meteorological,Extreme temperature,Cold wave,Afghanistan,AFG,Southern Asia,Asia,"Hirat, Faryab, Jawzjan, Balkh, Samangan, Sar-e...",150.0,100000.0,"Baghlan, Balkh, Faryab, Hirat, Jawzjan, Kunduz..."
2,2001,Technological,Technological,Transport accident,Air,Angola,AGO,Middle Africa,Africa,Near Lubango,16.0,,
3,2001,Natural,Hydrological,Flood,,Angola,AGO,Middle Africa,Africa,Kuito district (Bie province),9.0,5.0,Kuito (Adm2).
4,2001,Natural,Meteorological,Storm,,Argentina,ARG,South America,Americas,Buenos Aires province,4.0,650.0,Buenos Aires (Adm1).
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13569,2020,Natural,Biological,Epidemic,Parasitic disease,Sudan,SDN,Northern Africa,Africa,,38.0,2137.0,
13570,2021,Natural,Climatological,Drought,Drought,"Tanzania, United Republic of",TZA,Eastern Africa,Africa,"Handeni, Longido, Mkinga, and Monduli",,497000.0,
13571,2021,Technological,Technological,Miscellaneous accident,Fire,Iraq,IRQ,Western Asia,Asia,Baghdad,82.0,110.0,
13572,2021,Technological,Technological,Miscellaneous accident,Other,Israel,ISR,Western Asia,Asia,Mont Meron,45.0,,


In [26]:
#Examine for duplicates.
disasters_dup.duplicated().sum()

0

In [27]:
#Check for null values.
disasters_dup.isnull().sum()

year                      0
disaster_group            0
disaster_subgroup         0
disaster_type             0
disaster_subtype       1221
country_origin            0
country_origin(ISO)       0
region                    0
continent                 0
location                480
total_deaths           2649
total_affected         3537
geo_locations          5845
dtype: int64

In [28]:
#Check dropping null values.
cleaned_disasters=disasters_dup.dropna()
cleaned_disasters

Unnamed: 0,year,disaster_group,disaster_subgroup,disaster_type,disaster_subtype,country_origin,country_origin(ISO),region,continent,location,total_deaths,total_affected,geo_locations
1,2001,Natural,Meteorological,Extreme temperature,Cold wave,Afghanistan,AFG,Southern Asia,Asia,"Hirat, Faryab, Jawzjan, Balkh, Samangan, Sar-e...",150.0,100000.0,"Baghlan, Balkh, Faryab, Hirat, Jawzjan, Kunduz..."
11,2001,Natural,Hydrological,Flood,Riverine flood,Bolivia (Plurinational State of),BOL,South America,Americas,"Beni, Chuquisaca, Cochabamba, La Paz, Oruro, P...",41.0,357250.0,"Beni, Chuquisaca, Cochabamba, La Paz, Oruro, P..."
16,2001,Natural,Meteorological,Storm,Convective storm,China,CHN,Eastern Asia,Asia,"Xilin Gol, Xing'an, Hulunbuir, Ulaan Chab, Chi...",49.0,2574871.0,"Chifeng, Hulunbuir, Tongliao, Ulaan Chab, Xili..."
17,2001,Natural,Geophysical,Earthquake,Ground movement,China,CHN,Eastern Asia,Asia,"Yajiang Xian, Kangding Xian areas (Garze Tibet...",3.0,300109.0,Garzê Tibetan (Adm2).
19,2001,Natural,Hydrological,Flood,Flash flood,Indonesia,IDN,South-Eastern Asia,Asia,"Bogor district (Jawa Barat province), Bojonego...",130.0,80000.0,"Dki Jakarta (Adm1). Bogor, Bojonegoro, Jember,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13524,2021,Natural,Climatological,Wildfire,Forest fire,United States of America,USA,Northern America,Americas,Plumas County (California),3.0,1261.0,Plumas (Adm2).
13539,2021,Natural,Hydrological,Landslide,Mudslide,Uzbekistan,UZB,Central Asia,Asia,"Ferghana valley, Namangan Region",8.0,6.0,"Fergana, Namangan (Adm1)."
13544,2021,Natural,Hydrological,Flood,Flash flood,Viet Nam,VNM,South-Eastern Asia,Asia,"Minh Luong Commune, Viet Tien (Van Ban Distric...",3.0,180.0,"Ha Giang (Adm1). Tran Yen, Van Ban (Adm2)."
13548,2021,Natural,Hydrological,Flood,Flash flood,Yemen,YEM,Western Asia,Asia,"Sanaa, Ibb, Shabwa, Hodeida, Aden, Abyan, Al D...",13.0,22380.0,"Abyan, Aden, Al Dhale'e, Al Hudaydah, Hadramau..."


In [29]:
#Examine if null values are dropped.
cleaned_disasters.isnull().sum()

year                   0
disaster_group         0
disaster_subgroup      0
disaster_type          0
disaster_subtype       0
country_origin         0
country_origin(ISO)    0
region                 0
continent              0
location               0
total_deaths           0
total_affected         0
geo_locations          0
dtype: int64

In [30]:
#Save the cleaned DataFrame as a new CSV file for analysis.
cleaned_disasters.to_csv('Resources/cleaned disasters.csv', index=False)

# Demographics Data Cleaning.

In [31]:
#Read dataframe.
demographics_df=pd.read_csv("Resources/demographics.csv",header=0)
demographics_df

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Country of asylum,Country of asylum (ISO),Female 0 - 4,Female 5 - 11,Female 12 - 17,Female 18 - 59,Female 60,Female other,Female total,Male 0 - 4,Male 5 - 11,Male 12 - 17,Male 18 - 59,Male 60,Male other,Male total,Total
0,2001,Iran (Islamic Rep. of),IRN,Afghanistan,AFG,0,0,0,0,0,0,0,0,0,0,5,0,0,5,5
1,2001,Iraq,IRQ,Afghanistan,AFG,0,5,0,0,0,0,5,0,0,0,0,0,0,0,5
2,2001,Iraq,IRQ,Albania,ALB,0,0,0,0,0,0,0,0,0,0,7,0,0,7,7
3,2001,North Macedonia,MKD,Albania,ALB,0,0,0,6,0,0,6,0,0,0,0,0,0,0,6
4,2001,Serbia and Kosovo: S/RES/1244 (1999),SRB,Albania,ALB,14,20,18,89,8,0,149,14,18,16,45,18,0,111,278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74568,2021,Rwanda,RWA,Zimbabwe,ZWE,29,48,49,179,14,0,319,25,54,51,188,23,0,341,660
74569,2021,Somalia,SOM,Zimbabwe,ZWE,0,0,0,0,0,0,0,0,0,0,13,0,0,13,13
74570,2021,Sudan,SDN,Zimbabwe,ZWE,0,0,0,0,0,0,0,0,0,0,8,0,0,8,8
74571,2021,South Sudan,SSD,Zimbabwe,ZWE,0,0,0,0,0,0,0,0,0,0,8,0,0,8,8


In [32]:
#Check columns names.
demographics_df.columns

Index(['Year', 'Country of origin', 'Country of origin (ISO)',
       'Country of asylum', 'Country of asylum (ISO)', 'Female 0 - 4',
       'Female 5 - 11', 'Female 12 - 17', 'Female 18 - 59', 'Female 60',
       'Female other', 'Female total', 'Male 0 - 4', 'Male 5 - 11',
       'Male 12 - 17', 'Male 18 - 59', 'Male 60', 'Male other', 'Male total',
       'Total'],
      dtype='object')

In [33]:
#Drop Total column.
demographics_df_dropped= demographics_df.drop(['Total'], axis="columns")
demographics_df_dropped

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Country of asylum,Country of asylum (ISO),Female 0 - 4,Female 5 - 11,Female 12 - 17,Female 18 - 59,Female 60,Female other,Female total,Male 0 - 4,Male 5 - 11,Male 12 - 17,Male 18 - 59,Male 60,Male other,Male total
0,2001,Iran (Islamic Rep. of),IRN,Afghanistan,AFG,0,0,0,0,0,0,0,0,0,0,5,0,0,5
1,2001,Iraq,IRQ,Afghanistan,AFG,0,5,0,0,0,0,5,0,0,0,0,0,0,0
2,2001,Iraq,IRQ,Albania,ALB,0,0,0,0,0,0,0,0,0,0,7,0,0,7
3,2001,North Macedonia,MKD,Albania,ALB,0,0,0,6,0,0,6,0,0,0,0,0,0,0
4,2001,Serbia and Kosovo: S/RES/1244 (1999),SRB,Albania,ALB,14,20,18,89,8,0,149,14,18,16,45,18,0,111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74568,2021,Rwanda,RWA,Zimbabwe,ZWE,29,48,49,179,14,0,319,25,54,51,188,23,0,341
74569,2021,Somalia,SOM,Zimbabwe,ZWE,0,0,0,0,0,0,0,0,0,0,13,0,0,13
74570,2021,Sudan,SDN,Zimbabwe,ZWE,0,0,0,0,0,0,0,0,0,0,8,0,0,8
74571,2021,South Sudan,SSD,Zimbabwe,ZWE,0,0,0,0,0,0,0,0,0,0,8,0,0,8


In [34]:
#Replace "TÃ¼rkiye" with "Turkey".
demo_rep=demographics_df_dropped.replace(to_replace="TÃ¼rkiye",
           value="Turkey")
demo_rep

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Country of asylum,Country of asylum (ISO),Female 0 - 4,Female 5 - 11,Female 12 - 17,Female 18 - 59,Female 60,Female other,Female total,Male 0 - 4,Male 5 - 11,Male 12 - 17,Male 18 - 59,Male 60,Male other,Male total
0,2001,Iran (Islamic Rep. of),IRN,Afghanistan,AFG,0,0,0,0,0,0,0,0,0,0,5,0,0,5
1,2001,Iraq,IRQ,Afghanistan,AFG,0,5,0,0,0,0,5,0,0,0,0,0,0,0
2,2001,Iraq,IRQ,Albania,ALB,0,0,0,0,0,0,0,0,0,0,7,0,0,7
3,2001,North Macedonia,MKD,Albania,ALB,0,0,0,6,0,0,6,0,0,0,0,0,0,0
4,2001,Serbia and Kosovo: S/RES/1244 (1999),SRB,Albania,ALB,14,20,18,89,8,0,149,14,18,16,45,18,0,111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74568,2021,Rwanda,RWA,Zimbabwe,ZWE,29,48,49,179,14,0,319,25,54,51,188,23,0,341
74569,2021,Somalia,SOM,Zimbabwe,ZWE,0,0,0,0,0,0,0,0,0,0,13,0,0,13
74570,2021,Sudan,SDN,Zimbabwe,ZWE,0,0,0,0,0,0,0,0,0,0,8,0,0,8
74571,2021,South Sudan,SSD,Zimbabwe,ZWE,0,0,0,0,0,0,0,0,0,0,8,0,0,8


In [35]:
#Examine data type of each columns.
demo_rep.dtypes

Year                        int64
Country of origin          object
Country of origin (ISO)    object
Country of asylum          object
Country of asylum (ISO)    object
Female 0 - 4                int64
Female 5 - 11               int64
Female 12 - 17              int64
Female 18 - 59              int64
Female 60                   int64
Female other                int64
Female total                int64
Male 0 - 4                  int64
Male 5 - 11                 int64
Male 12 - 17                int64
Male 18 - 59                int64
Male 60                     int64
Male other                  int64
Male total                  int64
dtype: object

In [36]:
#Check for duplicates.
demo_rep.duplicated().sum()

0

In [37]:
#Check for null values.
demo_rep.isnull().sum()

Year                         0
Country of origin            0
Country of origin (ISO)    793
Country of asylum            0
Country of asylum (ISO)      0
Female 0 - 4                 0
Female 5 - 11                0
Female 12 - 17               0
Female 18 - 59               0
Female 60                    0
Female other                 0
Female total                 0
Male 0 - 4                   0
Male 5 - 11                  0
Male 12 - 17                 0
Male 18 - 59                 0
Male 60                      0
Male other                   0
Male total                   0
dtype: int64

In [38]:
#Check dropping null values.
cleaned_demographics =demo_rep.dropna()
cleaned_demographics 

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Country of asylum,Country of asylum (ISO),Female 0 - 4,Female 5 - 11,Female 12 - 17,Female 18 - 59,Female 60,Female other,Female total,Male 0 - 4,Male 5 - 11,Male 12 - 17,Male 18 - 59,Male 60,Male other,Male total
0,2001,Iran (Islamic Rep. of),IRN,Afghanistan,AFG,0,0,0,0,0,0,0,0,0,0,5,0,0,5
1,2001,Iraq,IRQ,Afghanistan,AFG,0,5,0,0,0,0,5,0,0,0,0,0,0,0
2,2001,Iraq,IRQ,Albania,ALB,0,0,0,0,0,0,0,0,0,0,7,0,0,7
3,2001,North Macedonia,MKD,Albania,ALB,0,0,0,6,0,0,6,0,0,0,0,0,0,0
4,2001,Serbia and Kosovo: S/RES/1244 (1999),SRB,Albania,ALB,14,20,18,89,8,0,149,14,18,16,45,18,0,111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74568,2021,Rwanda,RWA,Zimbabwe,ZWE,29,48,49,179,14,0,319,25,54,51,188,23,0,341
74569,2021,Somalia,SOM,Zimbabwe,ZWE,0,0,0,0,0,0,0,0,0,0,13,0,0,13
74570,2021,Sudan,SDN,Zimbabwe,ZWE,0,0,0,0,0,0,0,0,0,0,8,0,0,8
74571,2021,South Sudan,SSD,Zimbabwe,ZWE,0,0,0,0,0,0,0,0,0,0,8,0,0,8


In [39]:
#Examine if null values are dropped.
cleaned_demographics.isnull().sum()

Year                       0
Country of origin          0
Country of origin (ISO)    0
Country of asylum          0
Country of asylum (ISO)    0
Female 0 - 4               0
Female 5 - 11              0
Female 12 - 17             0
Female 18 - 59             0
Female 60                  0
Female other               0
Female total               0
Male 0 - 4                 0
Male 5 - 11                0
Male 12 - 17               0
Male 18 - 59               0
Male 60                    0
Male other                 0
Male total                 0
dtype: int64

In [40]:
cleaned_demographics['Total gender'] =cleaned_demographics ['Female total']+ cleaned_demographics['Male total']
cleaned_demographics

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Country of asylum,Country of asylum (ISO),Female 0 - 4,Female 5 - 11,Female 12 - 17,Female 18 - 59,Female 60,Female other,Female total,Male 0 - 4,Male 5 - 11,Male 12 - 17,Male 18 - 59,Male 60,Male other,Male total,Total gender
0,2001,Iran (Islamic Rep. of),IRN,Afghanistan,AFG,0,0,0,0,0,0,0,0,0,0,5,0,0,5,5
1,2001,Iraq,IRQ,Afghanistan,AFG,0,5,0,0,0,0,5,0,0,0,0,0,0,0,5
2,2001,Iraq,IRQ,Albania,ALB,0,0,0,0,0,0,0,0,0,0,7,0,0,7,7
3,2001,North Macedonia,MKD,Albania,ALB,0,0,0,6,0,0,6,0,0,0,0,0,0,0,6
4,2001,Serbia and Kosovo: S/RES/1244 (1999),SRB,Albania,ALB,14,20,18,89,8,0,149,14,18,16,45,18,0,111,260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74568,2021,Rwanda,RWA,Zimbabwe,ZWE,29,48,49,179,14,0,319,25,54,51,188,23,0,341,660
74569,2021,Somalia,SOM,Zimbabwe,ZWE,0,0,0,0,0,0,0,0,0,0,13,0,0,13,13
74570,2021,Sudan,SDN,Zimbabwe,ZWE,0,0,0,0,0,0,0,0,0,0,8,0,0,8,8
74571,2021,South Sudan,SSD,Zimbabwe,ZWE,0,0,0,0,0,0,0,0,0,0,8,0,0,8,8


In [41]:
cleaned_demographics.columns

Index(['Year', 'Country of origin', 'Country of origin (ISO)',
       'Country of asylum', 'Country of asylum (ISO)', 'Female 0 - 4',
       'Female 5 - 11', 'Female 12 - 17', 'Female 18 - 59', 'Female 60',
       'Female other', 'Female total', 'Male 0 - 4', 'Male 5 - 11',
       'Male 12 - 17', 'Male 18 - 59', 'Male 60', 'Male other', 'Male total',
       'Total gender'],
      dtype='object')

In [42]:
#Renaming Columns.
cleaned_demographics=cleaned_demographics.rename(columns={'Year':'year','Country of origin':'country_origin','Country of origin (ISO)':'country_origin(ISO)','Country of asylum':'country_asylum',\
                                                         'Country of asylum (ISO)':'country_asylum(ISO)','Female 0 - 4':'female_0to4', 'Female 5 - 11': 'female_5to11','Female 12 - 17':'female_12to17',\
                                                         'Female 18 - 59':'female_18to59','Female 60':'female_60','Female other':'female_other','Female total':'female_total','Male_0to4':'male_0to4',\
                                                         'Male 5 - 11':'male_5to11','Male 12 - 17':'male_12to17','Male 18 - 59':'male_18to59','Male 60':'male_60','Male other':'male_other',\
                                                         'Male total':'male_total','Total gender':'total_gender'})
cleaned_demographics

Unnamed: 0,year,country_origin,country_origin(ISO),country_asylum,country_asylum(ISO),female_0to4,female_5to11,female_12to17,female_18to59,female_60,female_other,female_total,Male 0 - 4,male_5to11,male_12to17,male_18to59,male_60,male_other,male_total,total_gender
0,2001,Iran (Islamic Rep. of),IRN,Afghanistan,AFG,0,0,0,0,0,0,0,0,0,0,5,0,0,5,5
1,2001,Iraq,IRQ,Afghanistan,AFG,0,5,0,0,0,0,5,0,0,0,0,0,0,0,5
2,2001,Iraq,IRQ,Albania,ALB,0,0,0,0,0,0,0,0,0,0,7,0,0,7,7
3,2001,North Macedonia,MKD,Albania,ALB,0,0,0,6,0,0,6,0,0,0,0,0,0,0,6
4,2001,Serbia and Kosovo: S/RES/1244 (1999),SRB,Albania,ALB,14,20,18,89,8,0,149,14,18,16,45,18,0,111,260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74568,2021,Rwanda,RWA,Zimbabwe,ZWE,29,48,49,179,14,0,319,25,54,51,188,23,0,341,660
74569,2021,Somalia,SOM,Zimbabwe,ZWE,0,0,0,0,0,0,0,0,0,0,13,0,0,13,13
74570,2021,Sudan,SDN,Zimbabwe,ZWE,0,0,0,0,0,0,0,0,0,0,8,0,0,8,8
74571,2021,South Sudan,SSD,Zimbabwe,ZWE,0,0,0,0,0,0,0,0,0,0,8,0,0,8,8


In [43]:
#Save the cleaned DataFrame as a new CSV file for analysis.
cleaned_demographics.to_csv('Resources/cleaned demographics.csv', index=False)

# Conflict Data Cleaning.

In [44]:
from zipfile import ZipFile
with ZipFile("Resources/conflict.zip") as myzip:
    conflict_data = myzip.open("conflict.csv")

#Now, we can read in the data
conflict_df = pd.read_csv(conflict_data)

conflict_df.head()

Unnamed: 0,id,relid,year,active_year,code_status,type_of_violence,conflict_dset_id,conflict_new_id,conflict_name,dyad_dset_id,...,date_end,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best,high,low,gwnoa,gwnob
0,244657,IRQ-2017-1-524-322,2017,1,Clear,1,259,259,Iraq: Government,524,...,00:00.0,0,4,0,2,6,6,6,645,
1,412700,IRQ-2021-1-524-145,2021,1,Clear,1,259,259,Iraq: Government,524,...,00:00.0,13,1,141,28,183,184,171,645,
2,413023,IRQ-2021-1-524-143,2021,1,Clear,1,259,259,Iraq: Government,524,...,00:00.0,0,2,0,0,2,3,0,645,
3,412909,IRQ-2021-1-524-144,2021,1,Clear,1,259,259,Iraq: Government,524,...,00:00.0,0,0,10,0,10,10,9,645,
4,132140,AFG-1989-1-411-2,1989,1,Clear,1,333,333,Afghanistan: Government,724,...,00:00.0,6,0,0,0,6,6,6,700,


In [45]:
#Check columns names.
conflict_df.columns

Index(['id', 'relid', 'year', 'active_year', 'code_status', 'type_of_violence',
       'conflict_dset_id', 'conflict_new_id', 'conflict_name', 'dyad_dset_id',
       'dyad_new_id', 'dyad_name', 'side_a_dset_id', 'side_a_new_id', 'side_a',
       'side_b_dset_id', 'side_b_new_id', 'side_b', 'number_of_sources',
       'source_article', 'source_office', 'source_date', 'source_headline',
       'source_original', 'where_prec', 'where_coordinates',
       'where_description', 'adm_1', 'adm_2', 'latitude', 'longitude',
       'geom_wkt', 'priogrid_gid', 'country', 'country_id', 'region',
       'event_clarity', 'date_prec', 'date_start', 'date_end', 'deaths_a',
       'deaths_b', 'deaths_civilians', 'deaths_unknown', 'best', 'high', 'low',
       'gwnoa', 'gwnob'],
      dtype='object')

In [46]:
#Dropping columns that would not be used in analysis.
conflict_df_col= conflict_df.drop(['id', 'relid','active_year', 'code_status','type_of_violence','conflict_dset_id', 'conflict_new_id',\
                                  'dyad_dset_id','dyad_new_id','side_a_dset_id', 'side_a_new_id', 'side_a','side_b_dset_id',\
                                   'side_b_new_id', 'side_b', 'number_of_sources','source_article', 'source_office', 'source_date',\
                                   'source_headline','source_original', 'where_prec','adm_1', 'adm_2','geom_wkt', 'priogrid_gid', \
                                  'country_id','event_clarity', 'date_prec', 'date_start', 'date_end','deaths_a',\
                                  'deaths_b', 'deaths_unknown','deaths_civilians','high', 'low','gwnoa', 'gwnob'], axis="columns")
conflict_df_col

Unnamed: 0,year,conflict_name,dyad_name,where_coordinates,where_description,latitude,longitude,country,region,best
0,2017,Iraq: Government,Government of Iraq - IS,Kabul city,Iraqi embassy in Kabul,34.531094,69.162796,Afghanistan,Asia,6
1,2021,Iraq: Government,Government of Iraq - IS,Kabul international airport,Kabul airport (Abbey gate entrance),34.564444,69.217222,Afghanistan,Asia,183
2,2021,Iraq: Government,Government of Iraq - IS,Jalalabad town,Police District 7 of Jalalabad city,34.428844,70.455750,Afghanistan,Asia,2
3,2021,Iraq: Government,Government of Iraq - IS,Kabul city,Kabul city (district 15),34.531094,69.162796,Afghanistan,Asia,10
4,1989,Afghanistan: Government,Government of Afghanistan - Jam'iyyat-i Islami...,Nangarhar province,Nangarhar province,34.333330,70.416670,Afghanistan,Asia,6
...,...,...,...,...,...,...,...,...,...,...
293629,1989,Renamo - Civilians,Renamo - Civilians,Chipinge district,Chipinge district,-20.500000,32.500000,Zimbabwe (Rhodesia),Africa,9
293630,1989,Renamo - Civilians,Renamo - Civilians,"Eastern Zimbabwe, near the Mozambique border",Zimbabwe eastern (Near the eastern border with...,-19.000000,32.500000,Zimbabwe (Rhodesia),Africa,9
293631,1990,Renamo - Civilians,Renamo - Civilians,Nyamaropa village,Nyamaropa village,-18.000000,32.833333,Zimbabwe (Rhodesia),Africa,7
293632,1990,Renamo - Civilians,Renamo - Civilians,"Eastern Zimbabwe, near the Mozambique border",Zimbabwe eastern (Pungwe communal lands settle...,-19.000000,32.500000,Zimbabwe (Rhodesia),Africa,1


In [47]:
#Examine column names.
conflict_df_col.columns

Index(['year', 'conflict_name', 'dyad_name', 'where_coordinates',
       'where_description', 'latitude', 'longitude', 'country', 'region',
       'best'],
      dtype='object')

In [48]:
#Renaming Columns.
conflict_df_col_renamed=conflict_df_col.rename(columns={'year':'year','best':'deaths_civilians'})
conflict_df_col_renamed

Unnamed: 0,year,conflict_name,dyad_name,where_coordinates,where_description,latitude,longitude,country,region,deaths_civilians
0,2017,Iraq: Government,Government of Iraq - IS,Kabul city,Iraqi embassy in Kabul,34.531094,69.162796,Afghanistan,Asia,6
1,2021,Iraq: Government,Government of Iraq - IS,Kabul international airport,Kabul airport (Abbey gate entrance),34.564444,69.217222,Afghanistan,Asia,183
2,2021,Iraq: Government,Government of Iraq - IS,Jalalabad town,Police District 7 of Jalalabad city,34.428844,70.455750,Afghanistan,Asia,2
3,2021,Iraq: Government,Government of Iraq - IS,Kabul city,Kabul city (district 15),34.531094,69.162796,Afghanistan,Asia,10
4,1989,Afghanistan: Government,Government of Afghanistan - Jam'iyyat-i Islami...,Nangarhar province,Nangarhar province,34.333330,70.416670,Afghanistan,Asia,6
...,...,...,...,...,...,...,...,...,...,...
293629,1989,Renamo - Civilians,Renamo - Civilians,Chipinge district,Chipinge district,-20.500000,32.500000,Zimbabwe (Rhodesia),Africa,9
293630,1989,Renamo - Civilians,Renamo - Civilians,"Eastern Zimbabwe, near the Mozambique border",Zimbabwe eastern (Near the eastern border with...,-19.000000,32.500000,Zimbabwe (Rhodesia),Africa,9
293631,1990,Renamo - Civilians,Renamo - Civilians,Nyamaropa village,Nyamaropa village,-18.000000,32.833333,Zimbabwe (Rhodesia),Africa,7
293632,1990,Renamo - Civilians,Renamo - Civilians,"Eastern Zimbabwe, near the Mozambique border",Zimbabwe eastern (Pungwe communal lands settle...,-19.000000,32.500000,Zimbabwe (Rhodesia),Africa,1


In [49]:
#Drop years that would not be used in the analysis.
data_by_index=conflict_df_col_renamed.set_index("year")
data_by_index=data_by_index.drop([1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000], axis=0)
data_by_index

Unnamed: 0_level_0,conflict_name,dyad_name,where_coordinates,where_description,latitude,longitude,country,region,deaths_civilians
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017,Iraq: Government,Government of Iraq - IS,Kabul city,Iraqi embassy in Kabul,34.531094,69.162796,Afghanistan,Asia,6
2021,Iraq: Government,Government of Iraq - IS,Kabul international airport,Kabul airport (Abbey gate entrance),34.564444,69.217222,Afghanistan,Asia,183
2021,Iraq: Government,Government of Iraq - IS,Jalalabad town,Police District 7 of Jalalabad city,34.428844,70.455750,Afghanistan,Asia,2
2021,Iraq: Government,Government of Iraq - IS,Kabul city,Kabul city (district 15),34.531094,69.162796,Afghanistan,Asia,10
2002,Afghanistan: Government,Government of Afghanistan - Hizb-i Islami-yi A...,Kabul city,Kabul city,34.531094,69.162796,Afghanistan,Asia,30
...,...,...,...,...,...,...,...,...,...
2019,Government of Zimbabwe (Rhodesia) - Civilians,Government of Zimbabwe (Rhodesia) - Civilians,Harare town,"Warren park, Harare city",-17.817777,31.044722,Zimbabwe (Rhodesia),Africa,1
2019,Government of Zimbabwe (Rhodesia) - Civilians,Government of Zimbabwe (Rhodesia) - Civilians,Mutare town,Mutare,-18.975973,32.650092,Zimbabwe (Rhodesia),Africa,1
2019,Government of Zimbabwe (Rhodesia) - Civilians,Government of Zimbabwe (Rhodesia) - Civilians,Harare town,Harare Central Business District,-17.817777,31.044722,Zimbabwe (Rhodesia),Africa,1
2019,Government of Zimbabwe (Rhodesia) - Civilians,Government of Zimbabwe (Rhodesia) - Civilians,Mwenezi district,Mwenezi,-21.358380,30.706680,Zimbabwe (Rhodesia),Africa,2


In [50]:
#Resetting Year as a column.
conflict_df_dropped_rows=data_by_index.reset_index()
conflict_df_dropped_rows

Unnamed: 0,year,conflict_name,dyad_name,where_coordinates,where_description,latitude,longitude,country,region,deaths_civilians
0,2017,Iraq: Government,Government of Iraq - IS,Kabul city,Iraqi embassy in Kabul,34.531094,69.162796,Afghanistan,Asia,6
1,2021,Iraq: Government,Government of Iraq - IS,Kabul international airport,Kabul airport (Abbey gate entrance),34.564444,69.217222,Afghanistan,Asia,183
2,2021,Iraq: Government,Government of Iraq - IS,Jalalabad town,Police District 7 of Jalalabad city,34.428844,70.455750,Afghanistan,Asia,2
3,2021,Iraq: Government,Government of Iraq - IS,Kabul city,Kabul city (district 15),34.531094,69.162796,Afghanistan,Asia,10
4,2002,Afghanistan: Government,Government of Afghanistan - Hizb-i Islami-yi A...,Kabul city,Kabul city,34.531094,69.162796,Afghanistan,Asia,30
...,...,...,...,...,...,...,...,...,...,...
242066,2019,Government of Zimbabwe (Rhodesia) - Civilians,Government of Zimbabwe (Rhodesia) - Civilians,Harare town,"Warren park, Harare city",-17.817777,31.044722,Zimbabwe (Rhodesia),Africa,1
242067,2019,Government of Zimbabwe (Rhodesia) - Civilians,Government of Zimbabwe (Rhodesia) - Civilians,Mutare town,Mutare,-18.975973,32.650092,Zimbabwe (Rhodesia),Africa,1
242068,2019,Government of Zimbabwe (Rhodesia) - Civilians,Government of Zimbabwe (Rhodesia) - Civilians,Harare town,Harare Central Business District,-17.817777,31.044722,Zimbabwe (Rhodesia),Africa,1
242069,2019,Government of Zimbabwe (Rhodesia) - Civilians,Government of Zimbabwe (Rhodesia) - Civilians,Mwenezi district,Mwenezi,-21.358380,30.706680,Zimbabwe (Rhodesia),Africa,2


In [51]:
#Examine data type of each columns.
conflict_df_dropped_rows.dtypes

year                   int64
conflict_name         object
dyad_name             object
where_coordinates     object
where_description     object
latitude             float64
longitude            float64
country               object
region                object
deaths_civilians       int64
dtype: object

In [52]:
#Check for duplicates.
conflict_df_dropped_rows.duplicated().sum()

41037

In [53]:
#Removes duplicates.
conflict_dup=conflict_df_dropped_rows.drop_duplicates()
conflict_dup

Unnamed: 0,year,conflict_name,dyad_name,where_coordinates,where_description,latitude,longitude,country,region,deaths_civilians
0,2017,Iraq: Government,Government of Iraq - IS,Kabul city,Iraqi embassy in Kabul,34.531094,69.162796,Afghanistan,Asia,6
1,2021,Iraq: Government,Government of Iraq - IS,Kabul international airport,Kabul airport (Abbey gate entrance),34.564444,69.217222,Afghanistan,Asia,183
2,2021,Iraq: Government,Government of Iraq - IS,Jalalabad town,Police District 7 of Jalalabad city,34.428844,70.455750,Afghanistan,Asia,2
3,2021,Iraq: Government,Government of Iraq - IS,Kabul city,Kabul city (district 15),34.531094,69.162796,Afghanistan,Asia,10
4,2002,Afghanistan: Government,Government of Afghanistan - Hizb-i Islami-yi A...,Kabul city,Kabul city,34.531094,69.162796,Afghanistan,Asia,30
...,...,...,...,...,...,...,...,...,...,...
242066,2019,Government of Zimbabwe (Rhodesia) - Civilians,Government of Zimbabwe (Rhodesia) - Civilians,Harare town,"Warren park, Harare city",-17.817777,31.044722,Zimbabwe (Rhodesia),Africa,1
242067,2019,Government of Zimbabwe (Rhodesia) - Civilians,Government of Zimbabwe (Rhodesia) - Civilians,Mutare town,Mutare,-18.975973,32.650092,Zimbabwe (Rhodesia),Africa,1
242068,2019,Government of Zimbabwe (Rhodesia) - Civilians,Government of Zimbabwe (Rhodesia) - Civilians,Harare town,Harare Central Business District,-17.817777,31.044722,Zimbabwe (Rhodesia),Africa,1
242069,2019,Government of Zimbabwe (Rhodesia) - Civilians,Government of Zimbabwe (Rhodesia) - Civilians,Mwenezi district,Mwenezi,-21.358380,30.706680,Zimbabwe (Rhodesia),Africa,2


In [54]:
#Examine for duplicates.
conflict_dup.duplicated().sum()

0

In [55]:
#Check for null values.
conflict_dup.isnull().sum()

year                    0
conflict_name           0
dyad_name               0
where_coordinates       0
where_description    3701
latitude                0
longitude               0
country                 0
region                  0
deaths_civilians        0
dtype: int64

In [56]:
#Check dropping null values.
cleaned_conflict =conflict_dup.dropna()
cleaned_conflict

Unnamed: 0,year,conflict_name,dyad_name,where_coordinates,where_description,latitude,longitude,country,region,deaths_civilians
0,2017,Iraq: Government,Government of Iraq - IS,Kabul city,Iraqi embassy in Kabul,34.531094,69.162796,Afghanistan,Asia,6
1,2021,Iraq: Government,Government of Iraq - IS,Kabul international airport,Kabul airport (Abbey gate entrance),34.564444,69.217222,Afghanistan,Asia,183
2,2021,Iraq: Government,Government of Iraq - IS,Jalalabad town,Police District 7 of Jalalabad city,34.428844,70.455750,Afghanistan,Asia,2
3,2021,Iraq: Government,Government of Iraq - IS,Kabul city,Kabul city (district 15),34.531094,69.162796,Afghanistan,Asia,10
4,2002,Afghanistan: Government,Government of Afghanistan - Hizb-i Islami-yi A...,Kabul city,Kabul city,34.531094,69.162796,Afghanistan,Asia,30
...,...,...,...,...,...,...,...,...,...,...
242066,2019,Government of Zimbabwe (Rhodesia) - Civilians,Government of Zimbabwe (Rhodesia) - Civilians,Harare town,"Warren park, Harare city",-17.817777,31.044722,Zimbabwe (Rhodesia),Africa,1
242067,2019,Government of Zimbabwe (Rhodesia) - Civilians,Government of Zimbabwe (Rhodesia) - Civilians,Mutare town,Mutare,-18.975973,32.650092,Zimbabwe (Rhodesia),Africa,1
242068,2019,Government of Zimbabwe (Rhodesia) - Civilians,Government of Zimbabwe (Rhodesia) - Civilians,Harare town,Harare Central Business District,-17.817777,31.044722,Zimbabwe (Rhodesia),Africa,1
242069,2019,Government of Zimbabwe (Rhodesia) - Civilians,Government of Zimbabwe (Rhodesia) - Civilians,Mwenezi district,Mwenezi,-21.358380,30.706680,Zimbabwe (Rhodesia),Africa,2


In [57]:
#Examine if null values are dropped.
cleaned_conflict.isnull().sum()

year                 0
conflict_name        0
dyad_name            0
where_coordinates    0
where_description    0
latitude             0
longitude            0
country              0
region               0
deaths_civilians     0
dtype: int64

In [58]:
#Save the cleaned DataFrame as a new CSV file for analysis.
cleaned_conflict.to_csv('Resources/cleaned conflict.csv', index=False)

# Asylum-decisions Data Cleaning.

In [59]:
#Read dataframe.
asylum_df=pd.read_csv("Resources/asylum-decisions.csv",header=0)
asylum_df

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Country of asylum,Country of asylum (ISO),Authority,Stage of procedure,Cases / Persons,Recognized decisions,Complementary protection,Rejected decisions,Otherwise closed,Total decisions
0,2000,Afghanistan,AFG,Australia,AUS,G,AR,C,38,0,25,0,63
1,2000,Albania,ALB,Australia,AUS,G,AR,C,5,0,24,0,29
2,2000,Algeria,DZA,Australia,AUS,G,AR,C,5,0,17,0,22
3,2000,Egypt,EGY,Australia,AUS,G,AR,C,21,0,50,5,76
4,2000,Armenia,ARM,Australia,AUS,G,AR,C,0,0,5,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95801,2022,Egypt,EGY,Jordan,JOR,U,RA,P,0,0,5,0,5
95802,2022,Sudan,SDN,Jordan,JOR,U,RA,P,0,0,5,0,5
95803,2022,Yemen,YEM,Jordan,JOR,U,RA,P,10,0,0,0,10
95804,2022,Afghanistan,AFG,Syrian Arab Rep.,SYR,U,RA,P,16,0,0,0,16


In [60]:
#Check columns names.
asylum_df.columns

Index(['Year', 'Country of origin', 'Country of origin (ISO)',
       'Country of asylum', 'Country of asylum (ISO)', 'Authority',
       'Stage of procedure', 'Cases / Persons', 'Recognized decisions',
       'Complementary protection', 'Rejected decisions', 'Otherwise closed',
       'Total decisions'],
      dtype='object')

In [61]:
#Dropping columns that would not be used in analysis.
asylum_df_col= asylum_df.drop(['Authority','Stage of procedure', 'Cases / Persons'], axis="columns")
asylum_df_col

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Country of asylum,Country of asylum (ISO),Recognized decisions,Complementary protection,Rejected decisions,Otherwise closed,Total decisions
0,2000,Afghanistan,AFG,Australia,AUS,38,0,25,0,63
1,2000,Albania,ALB,Australia,AUS,5,0,24,0,29
2,2000,Algeria,DZA,Australia,AUS,5,0,17,0,22
3,2000,Egypt,EGY,Australia,AUS,21,0,50,5,76
4,2000,Armenia,ARM,Australia,AUS,0,0,5,0,5
...,...,...,...,...,...,...,...,...,...,...
95801,2022,Egypt,EGY,Jordan,JOR,0,0,5,0,5
95802,2022,Sudan,SDN,Jordan,JOR,0,0,5,0,5
95803,2022,Yemen,YEM,Jordan,JOR,10,0,0,0,10
95804,2022,Afghanistan,AFG,Syrian Arab Rep.,SYR,16,0,0,0,16


In [62]:
#Examine column names.
asylum_df_col.columns

Index(['Year', 'Country of origin', 'Country of origin (ISO)',
       'Country of asylum', 'Country of asylum (ISO)', 'Recognized decisions',
       'Complementary protection', 'Rejected decisions', 'Otherwise closed',
       'Total decisions'],
      dtype='object')

In [63]:
#Drop years that would not be used in the analysis.
data_by_index_1=asylum_df_col.set_index("Year")
data_by_index_1=data_by_index_1.drop([2000,2022], axis=0)
data_by_index_1

Unnamed: 0_level_0,Country of origin,Country of origin (ISO),Country of asylum,Country of asylum (ISO),Recognized decisions,Complementary protection,Rejected decisions,Otherwise closed,Total decisions
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2001,Afghanistan,AFG,Australia,AUS,247,0,93,5,345
2001,Albania,ALB,Australia,AUS,5,0,13,0,18
2001,Algeria,DZA,Australia,AUS,5,0,13,0,18
2001,Egypt,EGY,Australia,AUS,12,0,29,5,46
2001,Burundi,BDI,Australia,AUS,5,0,0,0,5
...,...,...,...,...,...,...,...,...,...
2021,Cambodia,KHM,Thailand,THA,5,0,0,0,5
2021,Dem. Rep. of the Congo,COD,Thailand,THA,5,0,0,0,5
2021,Iran (Islamic Rep. of),IRN,Thailand,THA,5,0,0,0,5
2021,Pakistan,PAK,Thailand,THA,15,0,10,0,25


In [64]:
#Resetting Year as a column.
asylum_dropped_rows=data_by_index_1.reset_index()
asylum_dropped_rows

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Country of asylum,Country of asylum (ISO),Recognized decisions,Complementary protection,Rejected decisions,Otherwise closed,Total decisions
0,2001,Afghanistan,AFG,Australia,AUS,247,0,93,5,345
1,2001,Albania,ALB,Australia,AUS,5,0,13,0,18
2,2001,Algeria,DZA,Australia,AUS,5,0,13,0,18
3,2001,Egypt,EGY,Australia,AUS,12,0,29,5,46
4,2001,Burundi,BDI,Australia,AUS,5,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...
88826,2021,Cambodia,KHM,Thailand,THA,5,0,0,0,5
88827,2021,Dem. Rep. of the Congo,COD,Thailand,THA,5,0,0,0,5
88828,2021,Iran (Islamic Rep. of),IRN,Thailand,THA,5,0,0,0,5
88829,2021,Pakistan,PAK,Thailand,THA,15,0,10,0,25


In [65]:
#Replace "TÃ¼rkiye" with "Turkey".
asylum_rep=asylum_dropped_rows.replace(to_replace="TÃ¼rkiye",
           value="Turkey")
asylum_rep

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Country of asylum,Country of asylum (ISO),Recognized decisions,Complementary protection,Rejected decisions,Otherwise closed,Total decisions
0,2001,Afghanistan,AFG,Australia,AUS,247,0,93,5,345
1,2001,Albania,ALB,Australia,AUS,5,0,13,0,18
2,2001,Algeria,DZA,Australia,AUS,5,0,13,0,18
3,2001,Egypt,EGY,Australia,AUS,12,0,29,5,46
4,2001,Burundi,BDI,Australia,AUS,5,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...
88826,2021,Cambodia,KHM,Thailand,THA,5,0,0,0,5
88827,2021,Dem. Rep. of the Congo,COD,Thailand,THA,5,0,0,0,5
88828,2021,Iran (Islamic Rep. of),IRN,Thailand,THA,5,0,0,0,5
88829,2021,Pakistan,PAK,Thailand,THA,15,0,10,0,25


In [66]:
#Renaming Columns.
asylum_rep=asylum_rep.rename(columns={'Year':'year','Country of origin':'country_origin','Country of origin (ISO)':'country_origin(ISO)','Country of asylum':'country_asylum',\
                                                         'Country of asylum (ISO)':'country_asylum(ISO)','Recognized decisions':'recognized_decisions','Complementary protection':'complementary_protection',\
                                     'Rejected decisions':'rejected_decisions','Otherwise closed':'otherwise_closed','Total decisions':'total_decisions'})
asylum_rep

Unnamed: 0,year,country_origin,country_origin(ISO),country_asylum,country_asylum(ISO),recognized_decisions,complementary_protection,rejected_decisions,otherwise_closed,total_decisions
0,2001,Afghanistan,AFG,Australia,AUS,247,0,93,5,345
1,2001,Albania,ALB,Australia,AUS,5,0,13,0,18
2,2001,Algeria,DZA,Australia,AUS,5,0,13,0,18
3,2001,Egypt,EGY,Australia,AUS,12,0,29,5,46
4,2001,Burundi,BDI,Australia,AUS,5,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...
88826,2021,Cambodia,KHM,Thailand,THA,5,0,0,0,5
88827,2021,Dem. Rep. of the Congo,COD,Thailand,THA,5,0,0,0,5
88828,2021,Iran (Islamic Rep. of),IRN,Thailand,THA,5,0,0,0,5
88829,2021,Pakistan,PAK,Thailand,THA,15,0,10,0,25


In [67]:
#Examine data type of each columns.
asylum_rep.dtypes

year                         int64
country_origin              object
country_origin(ISO)         object
country_asylum              object
country_asylum(ISO)         object
recognized_decisions         int64
complementary_protection     int64
rejected_decisions           int64
otherwise_closed             int64
total_decisions              int64
dtype: object

In [68]:
#Check for duplicates.
asylum_rep.duplicated().sum()

1266

In [69]:
#Removes duplicates.
asylum_dup=asylum_rep.drop_duplicates()
asylum_dup

Unnamed: 0,year,country_origin,country_origin(ISO),country_asylum,country_asylum(ISO),recognized_decisions,complementary_protection,rejected_decisions,otherwise_closed,total_decisions
0,2001,Afghanistan,AFG,Australia,AUS,247,0,93,5,345
1,2001,Albania,ALB,Australia,AUS,5,0,13,0,18
2,2001,Algeria,DZA,Australia,AUS,5,0,13,0,18
3,2001,Egypt,EGY,Australia,AUS,12,0,29,5,46
4,2001,Burundi,BDI,Australia,AUS,5,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...
88826,2021,Cambodia,KHM,Thailand,THA,5,0,0,0,5
88827,2021,Dem. Rep. of the Congo,COD,Thailand,THA,5,0,0,0,5
88828,2021,Iran (Islamic Rep. of),IRN,Thailand,THA,5,0,0,0,5
88829,2021,Pakistan,PAK,Thailand,THA,15,0,10,0,25


In [70]:
#Examine for duplicates.
asylum_dup.duplicated().sum()

0

In [71]:
#Check for null values.
asylum_dup.isnull().sum()

year                          0
country_origin                0
country_origin(ISO)         902
country_asylum                0
country_asylum(ISO)           0
recognized_decisions          0
complementary_protection      0
rejected_decisions            0
otherwise_closed              0
total_decisions               0
dtype: int64

In [72]:
#Check dropping null values.
cleaned_asylum =asylum_dup.dropna()
cleaned_asylum

Unnamed: 0,year,country_origin,country_origin(ISO),country_asylum,country_asylum(ISO),recognized_decisions,complementary_protection,rejected_decisions,otherwise_closed,total_decisions
0,2001,Afghanistan,AFG,Australia,AUS,247,0,93,5,345
1,2001,Albania,ALB,Australia,AUS,5,0,13,0,18
2,2001,Algeria,DZA,Australia,AUS,5,0,13,0,18
3,2001,Egypt,EGY,Australia,AUS,12,0,29,5,46
4,2001,Burundi,BDI,Australia,AUS,5,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...
88826,2021,Cambodia,KHM,Thailand,THA,5,0,0,0,5
88827,2021,Dem. Rep. of the Congo,COD,Thailand,THA,5,0,0,0,5
88828,2021,Iran (Islamic Rep. of),IRN,Thailand,THA,5,0,0,0,5
88829,2021,Pakistan,PAK,Thailand,THA,15,0,10,0,25


In [73]:
#Examine if null values are dropped.
cleaned_asylum.isnull().sum()

year                        0
country_origin              0
country_origin(ISO)         0
country_asylum              0
country_asylum(ISO)         0
recognized_decisions        0
complementary_protection    0
rejected_decisions          0
otherwise_closed            0
total_decisions             0
dtype: int64

In [74]:
#Save the cleaned DataFrame as a new CSV file for analysis.
cleaned_asylum.to_csv('Resources/cleaned_asylum.csv', index=False)