# Content

### 00. Import Libraries
### 01. Import Data Sets
### 02. Merge Clean Datasets
### 03. Export Final Data Set


## 00. Import Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import pycountry
import os

## 01. Import Data Sets

In [2]:
# Create folder path
path = r"C:\Users\Juliana\Documents\DATA ANALYTICS COURSE\IMMERSION\6"

In [3]:
# Import natural disasters data
nat_events= pd.read_csv(os.path.join(path,'02 Data','Cleaned Data', 'nat_disasters_clean.csv'))

In [4]:
# Import temperature changes data
temp_change = pd.read_csv(os.path.join(path,'02 Data','Cleaned Data', 'temp_changes_clean.csv'))

In [42]:
# Import migration data
net_migration = pd.read_csv(os.path.join(path,'02 Data','Cleaned Data', 'net_migration_clean.csv'))

In [6]:
# Import population data
population = pd.read_csv(os.path.join(path,'02 Data','Cleaned Data', 'population_clean.csv'))

In [7]:
# Import CO2 emissions data
co2_emissions = pd.read_csv(os.path.join(path,'02 Data','Cleaned Data', 'co2_emissions_filtered_clean.csv'))

In [8]:
# Import energy consumtion data
energy_consumption = pd.read_csv(os.path.join(path,'02 Data','Cleaned Data', 'energy_consumption_filtered_clean.csv'))

## 02. Merge Clean Datasets

In [9]:
nat_events.head()

Unnamed: 0.1,Unnamed: 0,Year,Event_type,Event_Category,Disaster Subtype,Country,Country_Code,Region,Continent,Total_deaths,No_Affected
0,353,1965,Geophysical,Earthquake,Ground movement,Indonesia,IDN,South-Eastern Asia,Asia,71.0,15000.0
1,354,1965,Hydrological,Landslide,Landslide,Argentina,ARG,South America,Americas,45.0,
2,355,1965,Meteorological,Storm,Tropical cyclone,Bangladesh,BGD,Southern Asia,Asia,36000.0,10000000.0
3,356,1965,Meteorological,Storm,Tropical cyclone,Bangladesh,BGD,Southern Asia,Asia,12047.0,
4,357,1965,Meteorological,Storm,Tropical cyclone,Bangladesh,BGD,Southern Asia,Asia,874.0,60000.0


In [10]:
nat_events.shape

(14724, 11)

In [11]:
# Calculate Number_of_Events and add it as a new column
nat_events['Number_of_Events'] = nat_events.groupby(['Year', 'Country'])['Event_type'].transform('count')

In [12]:
nat_events.head()

Unnamed: 0.1,Unnamed: 0,Year,Event_type,Event_Category,Disaster Subtype,Country,Country_Code,Region,Continent,Total_deaths,No_Affected,Number_of_Events
0,353,1965,Geophysical,Earthquake,Ground movement,Indonesia,IDN,South-Eastern Asia,Asia,71.0,15000.0,1
1,354,1965,Hydrological,Landslide,Landslide,Argentina,ARG,South America,Americas,45.0,,1
2,355,1965,Meteorological,Storm,Tropical cyclone,Bangladesh,BGD,Southern Asia,Asia,36000.0,10000000.0,3
3,356,1965,Meteorological,Storm,Tropical cyclone,Bangladesh,BGD,Southern Asia,Asia,12047.0,,3
4,357,1965,Meteorological,Storm,Tropical cyclone,Bangladesh,BGD,Southern Asia,Asia,874.0,60000.0,3


In [13]:
nat_events.describe()

Unnamed: 0.1,Unnamed: 0,Year,Total_deaths,No_Affected,Number_of_Events
count,14724.0,14724.0,10423.0,9061.0,14724.0
mean,8477.911098,2000.788509,510.0138,886723.8,7.254007
std,4393.425172,13.567642,16037.67,8642699.0,7.917835
min,353.0,1965.0,1.0,1.0,1.0
25%,4719.75,1992.0,5.0,1200.0,2.0
50%,8526.5,2003.0,17.0,10000.0,4.0
75%,12242.25,2011.0,52.0,90000.0,9.0
max,16125.0,2021.0,1500000.0,330000000.0,43.0


In [14]:
nat_events.columns

Index(['Unnamed: 0', 'Year', 'Event_type', 'Event_Category',
       'Disaster Subtype', 'Country', 'Country_Code', 'Region', 'Continent',
       'Total_deaths', 'No_Affected', 'Number_of_Events'],
      dtype='object')

In [15]:
nat_events.isnull().sum()

Unnamed: 0             0
Year                   0
Event_type             0
Event_Category         0
Disaster Subtype    2780
Country                0
Country_Code           0
Region                 0
Continent              0
Total_deaths        4301
No_Affected         5663
Number_of_Events       0
dtype: int64

In [16]:
# Remove unnecessary or irrelevant columns
nat_events=nat_events.drop(columns = ['Unnamed: 0','Disaster Subtype', 'Total_deaths', 'No_Affected'])

In [17]:
# Verify the updated DataFrame
nat_events.head()

Unnamed: 0,Year,Event_type,Event_Category,Country,Country_Code,Region,Continent,Number_of_Events
0,1965,Geophysical,Earthquake,Indonesia,IDN,South-Eastern Asia,Asia,1
1,1965,Hydrological,Landslide,Argentina,ARG,South America,Americas,1
2,1965,Meteorological,Storm,Bangladesh,BGD,Southern Asia,Asia,3
3,1965,Meteorological,Storm,Bangladesh,BGD,Southern Asia,Asia,3
4,1965,Meteorological,Storm,Bangladesh,BGD,Southern Asia,Asia,3


In [18]:
temp_change.head()

Unnamed: 0.1,Unnamed: 0,Country,Months,Year,Temperature_Change,Country_Code,Month
0,4,Afghanistan,January,1965,1.834,AFG,1.0
1,5,Afghanistan,January,1966,3.78,AFG,1.0
2,6,Afghanistan,January,1967,-1.398,AFG,1.0
3,7,Afghanistan,January,1968,0.349,AFG,1.0
4,8,Afghanistan,January,1969,-2.325,AFG,1.0


Since the analysis aims to understand the effects of temperature changes, using the mean of absolute values is more useful than the mean of the original values. This approach gives equal weight to positive and negative deviations from the norm

In [19]:
# Group the data by 'Year' and 'Country_Code', calculate the mean of the absolute values
temp_change= temp_change.groupby(['Year', 'Country_Code']).agg({'Temperature_Change': 'mean'}).reset_index()

In [20]:
temp_change.head()

Unnamed: 0,Year,Country_Code,Temperature_Change
0,1965,ABW,-0.108182
1,1965,AFG,-0.011667
2,1965,AGO,-0.156083
3,1965,AIA,-0.27025
4,1965,ALB,-0.344


In [21]:
# Merge nat_events and temp_change 
events_temp_merged= pd.merge(nat_events, temp_change, on=['Year', 'Country_Code'], how='outer')

In [22]:
events_temp_merged.shape

(22224, 9)

In [23]:
events_temp_merged.columns

Index(['Year', 'Event_type', 'Event_Category', 'Country', 'Country_Code',
       'Region', 'Continent', 'Number_of_Events', 'Temperature_Change'],
      dtype='object')

In [24]:
#Reviewing the updated merged dataset
events_temp_merged.head()

Unnamed: 0,Year,Event_type,Event_Category,Country,Country_Code,Region,Continent,Number_of_Events,Temperature_Change
0,1965,Geophysical,Earthquake,Indonesia,IDN,South-Eastern Asia,Asia,1.0,-0.1905
1,1965,Hydrological,Landslide,Argentina,ARG,South America,Americas,1.0,0.045583
2,1965,Meteorological,Storm,Bangladesh,BGD,Southern Asia,Asia,3.0,-0.199833
3,1965,Meteorological,Storm,Bangladesh,BGD,Southern Asia,Asia,3.0,-0.199833
4,1965,Meteorological,Storm,Bangladesh,BGD,Southern Asia,Asia,3.0,-0.199833


In [25]:
# Find missing values in events_temp_merged
events_temp_merged.isnull().sum()

Year                     0
Event_type            7500
Event_Category        7500
Country               7500
Country_Code             0
Region                7500
Continent             7500
Number_of_Events      7500
Temperature_Change     693
dtype: int64

All missing values will be adressed at the end.

In [26]:
#Export merged data set
events_temp_merged.to_csv(os.path.join(path,'02 Data','Merged Data', 'events_temp_merged.csv'))

In [43]:
net_migration.head()

Unnamed: 0.1,Unnamed: 0,Country_Code,Country,Year,Net_Migration
0,5,ABW,Aruba,1965,-726.0
1,6,ABW,Aruba,1966,-761.0
2,7,ABW,Aruba,1967,-1175.0
3,8,ABW,Aruba,1968,-1219.0
4,9,ABW,Aruba,1969,-1256.0


In [44]:
population.head()

Unnamed: 0.1,Unnamed: 0,Country,Year,Population,Country_Code
0,1255,Arab World,1965,105736400.0,ARB
1,1256,Caribbean small states,1965,4580374.0,CSS
2,1257,Central Europe and the Baltics,1965,95440990.0,CEB
3,1258,Early-demographic dividend,1965,1103955000.0,EAR
4,1259,East Asia & Pacific,1965,1136691000.0,EAS


In [45]:
population.isnull().sum()

Unnamed: 0        0
Country           0
Year              0
Population      147
Country_Code      0
dtype: int64

In [46]:
# Drop rows where 'Country_Code' is NaN
population = population.dropna(subset=['Country_Code'])

In [47]:
# Merge net_migration and population 
migration_population_merged= pd.merge(net_migration, population, on=['Year', 'Country_Code'],how='outer')

In [48]:
migration_population_merged.shape

(15159, 8)

In [49]:
migration_population_merged.head()

Unnamed: 0,Unnamed: 0_x,Country_Code,Country_x,Year,Net_Migration,Unnamed: 0_y,Country_y,Population
0,5.0,ABW,Aruba,1965,-726.0,1311.0,Aruba,57360.0
1,6.0,ABW,Aruba,1966,-761.0,1562.0,Aruba,57715.0
2,7.0,ABW,Aruba,1967,-1175.0,1813.0,Aruba,58055.0
3,8.0,ABW,Aruba,1968,-1219.0,2064.0,Aruba,58386.0
4,9.0,ABW,Aruba,1969,-1256.0,2315.0,Aruba,58726.0


In [50]:
migration_population_merged.isnull().sum()

Unnamed: 0_x       54
Country_Code        0
Country_x          54
Year                0
Net_Migration      54
Unnamed: 0_y     1011
Country_y        1011
Population       1158
dtype: int64

In [51]:
# Remove unnecessary or irrelevant columns
migration_population_merged=migration_population_merged.drop(columns = ['Unnamed: 0_x','Unnamed: 0_y','Country_y' ])

In [52]:
# Format column names
migration_population_merged.rename(columns={'Country_x' : 'Country'}, inplace = True)

In [53]:
migration_population_merged.head()

Unnamed: 0,Country_Code,Country,Year,Net_Migration,Population
0,ABW,Aruba,1965,-726.0,57360.0
1,ABW,Aruba,1966,-761.0,57715.0
2,ABW,Aruba,1967,-1175.0,58055.0
3,ABW,Aruba,1968,-1219.0,58386.0
4,ABW,Aruba,1969,-1256.0,58726.0


In [55]:
#Export merge data set
migration_population_merged.to_csv(os.path.join(path,'02 Data','Merged Data','migration_population_merged.csv'))

In [56]:
co2_emissions.head()

Unnamed: 0.1,Unnamed: 0,Country_Code,Country,Year,Carbon_Emissions
0,5,ABW,Aruba,1965,10623.299
1,6,ABW,Aruba,1966,9933.903
2,7,ABW,Aruba,1967,12236.779
3,8,ABW,Aruba,1968,11378.701
4,9,ABW,Aruba,1969,14891.687


In [57]:
energy_consumption.head()

Unnamed: 0.1,Unnamed: 0,Country,Country_Code,Year,Energy_Consumption
0,0,Afghanistan,AFG,1980,7.790772
1,1,Afghanistan,AFG,1981,8.77732
2,2,Afghanistan,AFG,1982,9.348327
3,3,Afghanistan,AFG,1983,11.436162
4,4,Afghanistan,AFG,1984,11.489043


In [58]:
energy_consumption.isnull().sum()

Unnamed: 0               0
Country                  0
Country_Code          2537
Year                     0
Energy_Consumption       0
dtype: int64

In [59]:
# Drop rows where 'Country_Code' is NaN
energy_consumption = energy_consumption.dropna(subset=['Country_Code'])

In [60]:
# Merge co2_emissions and energy_consumption
co2_energy_merged= pd.merge(co2_emissions, energy_consumption, on=['Year', 'Country_Code'],how='outer')

In [61]:
co2_energy_merged.columns

Index(['Unnamed: 0_x', 'Country_Code', 'Country_x', 'Year', 'Carbon_Emissions',
       'Unnamed: 0_y', 'Country_y', 'Energy_Consumption'],
      dtype='object')

In [62]:
# Remove unnecessary or irrelevant columns
co2_energy_merged=co2_energy_merged.drop(columns = ['Unnamed: 0_x','Country_y','Unnamed: 0_y'])

In [63]:
# Format column names
co2_energy_merged.rename(columns={'Country_x' : 'Country'}, inplace = True)

In [64]:
co2_energy_merged.head()

Unnamed: 0,Country_Code,Country,Year,Carbon_Emissions,Energy_Consumption
0,ABW,Aruba,1965,10623.299,
1,ABW,Aruba,1966,9933.903,
2,ABW,Aruba,1967,12236.779,
3,ABW,Aruba,1968,11378.701,
4,ABW,Aruba,1969,14891.687,


In [65]:
co2_energy_merged.shape

(14544, 5)

In [66]:
# Merge all of the data sets part 1
merged_data_1= pd.merge(events_temp_merged, migration_population_merged, on=['Year', 'Country_Code'],how='outer')

In [67]:
merged_data_1.head()

Unnamed: 0,Year,Event_type,Event_Category,Country_x,Country_Code,Region,Continent,Number_of_Events,Temperature_Change,Country_y,Net_Migration,Population
0,1965,Geophysical,Earthquake,Indonesia,IDN,South-Eastern Asia,Asia,1.0,-0.1905,Indonesia,-61146.0,100267062.0
1,1965,Hydrological,Landslide,Argentina,ARG,South America,Americas,1.0,0.045583,Argentina,33973.0,22159650.0
2,1965,Meteorological,Storm,Bangladesh,BGD,Southern Asia,Asia,3.0,-0.199833,Bangladesh,47618.0,55385112.0
3,1965,Meteorological,Storm,Bangladesh,BGD,Southern Asia,Asia,3.0,-0.199833,Bangladesh,47618.0,55385112.0
4,1965,Meteorological,Storm,Bangladesh,BGD,Southern Asia,Asia,3.0,-0.199833,Bangladesh,47618.0,55385112.0


In [68]:
merged_data_1.isnull().sum()

Year                      0
Event_type            11619
Event_Category        11619
Country_x             11619
Country_Code              0
Region                11619
Continent             11619
Number_of_Events      11619
Temperature_Change     4812
Country_y              1876
Net_Migration          1876
Population             3755
dtype: int64

In [69]:
merged_data_1.columns

Index(['Year', 'Event_type', 'Event_Category', 'Country_x', 'Country_Code',
       'Region', 'Continent', 'Number_of_Events', 'Temperature_Change',
       'Country_y', 'Net_Migration', 'Population'],
      dtype='object')

In [70]:
# Remove unnecessary or irrelevant columns
merged_data_1=merged_data_1.drop(columns = ['Country_y'])

In [71]:
# Format column names
merged_data_1.rename(columns={'Country_x' : 'Country'}, inplace = True)

In [72]:
# #Reviewing the updated dataFrame
merged_data_1.head()

Unnamed: 0,Year,Event_type,Event_Category,Country,Country_Code,Region,Continent,Number_of_Events,Temperature_Change,Net_Migration,Population
0,1965,Geophysical,Earthquake,Indonesia,IDN,South-Eastern Asia,Asia,1.0,-0.1905,-61146.0,100267062.0
1,1965,Hydrological,Landslide,Argentina,ARG,South America,Americas,1.0,0.045583,33973.0,22159650.0
2,1965,Meteorological,Storm,Bangladesh,BGD,Southern Asia,Asia,3.0,-0.199833,47618.0,55385112.0
3,1965,Meteorological,Storm,Bangladesh,BGD,Southern Asia,Asia,3.0,-0.199833,47618.0,55385112.0
4,1965,Meteorological,Storm,Bangladesh,BGD,Southern Asia,Asia,3.0,-0.199833,47618.0,55385112.0


In [73]:
merged_data_1.isnull().sum()

Year                      0
Event_type            11619
Event_Category        11619
Country               11619
Country_Code              0
Region                11619
Continent             11619
Number_of_Events      11619
Temperature_Change     4812
Net_Migration          1876
Population             3755
dtype: int64

In [74]:
#Export merged data set
merged_data_1.to_csv(os.path.join(path,'02 Data','Merged Data', 'merged_data_1.csv'))

In [105]:
# Merge all of the data sets
merged_data_2= pd.merge(merged_data_1, co2_energy_merged, on=['Year', 'Country_Code'],how='outer')

In [106]:
merged_data_2.head()

Unnamed: 0,Year,Event_type,Event_Category,Country_x,Country_Code,Region,Continent,Number_of_Events,Temperature_Change,Net_Migration,Population,Country_y,Carbon_Emissions,Energy_Consumption
0,1965,Geophysical,Earthquake,Indonesia,IDN,South-Eastern Asia,Asia,1.0,-0.1905,-61146.0,100267062.0,Indonesia,24689.911,84.51602
1,1965,Hydrological,Landslide,Argentina,ARG,South America,Americas,1.0,0.045583,33973.0,22159650.0,Argentina,58866.351,328.52762
2,1965,Meteorological,Storm,Bangladesh,BGD,Southern Asia,Asia,3.0,-0.199833,47618.0,55385112.0,Bangladesh,19974.149,
3,1965,Meteorological,Storm,Bangladesh,BGD,Southern Asia,Asia,3.0,-0.199833,47618.0,55385112.0,Bangladesh,19974.149,
4,1965,Meteorological,Storm,Bangladesh,BGD,Southern Asia,Asia,3.0,-0.199833,47618.0,55385112.0,Bangladesh,19974.149,


In [107]:
merged_data_2.shape

(26500, 14)

In [108]:
merged_data_2.columns

Index(['Year', 'Event_type', 'Event_Category', 'Country_x', 'Country_Code',
       'Region', 'Continent', 'Number_of_Events', 'Temperature_Change',
       'Net_Migration', 'Population', 'Country_y', 'Carbon_Emissions',
       'Energy_Consumption'],
      dtype='object')

In [109]:
# Remove unnecessary or irrelevant columns
merged_data_2=merged_data_2.drop(columns = ['Country_y'])

In [110]:
# Format column names
merged_data_2.rename(columns={'Country_x' : 'Country'}, inplace = True)

In [111]:
merged_data_2.shape

(26500, 13)

In [112]:
# Find duplicates for merged_data_2
merged_data_2_dups= merged_data_2[merged_data_2.duplicated]
#Print the duplicates
merged_data_2_dups

Unnamed: 0,Year,Event_type,Event_Category,Country,Country_Code,Region,Continent,Number_of_Events,Temperature_Change,Net_Migration,Population,Carbon_Emissions,Energy_Consumption
3,1965,Meteorological,Storm,Bangladesh,BGD,Southern Asia,Asia,3.0,-0.199833,47618.0,55385112.0,19974.149,
4,1965,Meteorological,Storm,Bangladesh,BGD,Southern Asia,Asia,3.0,-0.199833,47618.0,55385112.0,19974.149,
9,1965,Hydrological,Flood,Brazil,BRA,South America,Americas,2.0,0.001833,10232.0,83373530.0,56398.460,273.869600
25,1965,Geophysical,Earthquake,Greece,GRC,Southern Europe,Europe,2.0,-0.297500,-87226.0,8550333.0,17025.881,79.146180
34,1966,Geophysical,Volcanic activity,Indonesia,IDN,South-Eastern Asia,Asia,4.0,-0.011917,-77641.0,103025426.0,23395.460,83.896090
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14703,2021,Hydrological,Flood,Uganda,UGA,Eastern Africa,Africa,3.0,1.511333,43465.0,,,35.162792
14705,2021,Hydrological,Flood,Venezuela (Bolivarian Republic of),VEN,South America,Americas,2.0,0.723250,-525116.0,,,583.354000
14710,2021,Meteorological,Storm,Viet Nam,VNM,South-Eastern Asia,Asia,4.0,1.125333,-992.0,,,1206.255700
14712,2021,Meteorological,Storm,Viet Nam,VNM,South-Eastern Asia,Asia,4.0,1.125333,-992.0,,,1206.255700


In [113]:
#Deleting duplicates
merged_data_2=merged_data_2.drop_duplicates()

In [114]:
merged_data_2.shape

(20901, 13)

In [115]:
# Check for missing values
merged_data_2.isnull().sum()

Year                      0
Event_type            11776
Event_Category        11776
Country               11776
Country_Code              0
Region                11776
Continent             11776
Number_of_Events      11776
Temperature_Change     4831
Net_Migration          1973
Population             3347
Carbon_Emissions       4316
Energy_Consumption     7039
dtype: int64

In [116]:
# Since it is reasonable to assume that not every country experiences a natural disaster every year, the missing values in the 'Number_of_Events' column have been replaced with 0, and the missing values in 'Event_type' and 'Event_Category' with "No Event".
merged_data_2['Number_of_Events'] = merged_data_2['Number_of_Events'].fillna(0)
merged_data_2['Event_type'] = merged_data_2['Event_type'].fillna('No Event')
merged_data_2['Event_Category'] = merged_data_2['Event_Category'].fillna('No Event')

In [117]:
# Verify data input
merged_data_2.isnull().sum()

Year                      0
Event_type                0
Event_Category            0
Country               11776
Country_Code              0
Region                11776
Continent             11776
Number_of_Events          0
Temperature_Change     4831
Net_Migration          1973
Population             3347
Carbon_Emissions       4316
Energy_Consumption     7039
dtype: int64

In [125]:
#Input missing data for 'Country,' 'Region,' and 'Continent' using a country database
column_names= ['Country_Code', 'Country', 'Region', 'Continent']

# Read the excel file with the country data
country_data = pd.read_excel(os.path.join(path, '02 Data', 'Cleaned Data', 'Merged Data Sub', 'countries_data.xlsx'), names=column_names, header=None)

# Create a dictionary mapping Country_Code to Country, Region, and Continent
country_info_dict = dict(zip(country_data['Country_Code'], country_data[['Country', 'Region', 'Continent']].values.tolist()))

In [126]:
country_data.head()

Unnamed: 0,Country_Code,Country,Region,Continent
0,Country_Code,Country,Region,Continent
1,SUN,Soviet Union,Russian Federation,Europe
2,MAR,Morocco,Northern Africa,Africa
3,DOM,Dominican Republic (the),Caribbean,Americas
4,AGO,Angola,Middle Africa,Africa


In [127]:
# Define a function to fill missing values in columns 'Country', 'Region', and 'Continent'
def fill_missing_info(row):
    country_code = row['Country_Code']
    if pd.isna(row['Country']):
        row['Country'] = country_info_dict.get(country_code, [None, None, None])[0]
    if pd.isna(row['Region']):
        row['Region'] = country_info_dict.get(country_code, [None, None, None])[1]
    if pd.isna(row['Continent']):
        row['Continent'] = country_info_dict.get(country_code, [None, None, None])[2]
    return row

In [128]:
# Apply the function to fill missing values in the dataset
merged_data_2 = merged_data_2.apply(fill_missing_info, axis=1)

In [129]:
merged_data_2.isnull().sum()

Year                     0
Event_type               0
Event_Category           0
Country                 67
Country_Code             0
Region                  67
Continent               67
Number_of_Events         0
Temperature_Change    4831
Net_Migration         1973
Population            3347
Carbon_Emissions      4316
Energy_Consumption    7039
dtype: int64

In [130]:
merged_data_2.shape

(20901, 13)

#### Address Missing Values

In [131]:
# Create a boolean mask for missing values in specified columns
missing_values_mask = merged_data_2[['Temperature_Change', 'Net_Migration', 'Population', 'Carbon_Emissions', 'Energy_Consumption']].isnull()

# Group by 'Country' and 'Country_Code' and count the number of missing values in each column for each country
missing_values_per_country = missing_values_mask.groupby([merged_data_2['Country'], merged_data_2['Country_Code']])[['Temperature_Change', 'Net_Migration', 'Population', 'Carbon_Emissions', 'Energy_Consumption']].sum()

# Create a new column with the aggregated value of missing values across the specified columns
missing_values_per_country['Total_Missing_Values'] = missing_values_per_country.sum(axis=1)

# Filter for countries with total missing values above 34
filtered_countries = missing_values_per_country[missing_values_per_country['Total_Missing_Values'] >= 34]

# Print the resulting DataFrame showing countries with total missing values above 34
filtered_countries.shape


(148, 6)

In [132]:
# Remove rows for countries with more than 34 missing values (representing 10% of total observations per country)
# Get the list of countries to be deleted
countries_to_delete = filtered_countries.index.get_level_values('Country_Code').tolist()

# Delete rows where 'Country_Code' is in the list of countries to be deleted
merged_data_2 = merged_data_2[~merged_data_2['Country_Code'].isin(countries_to_delete)]

In [133]:
# #Reviewing the updated merged_data_2
merged_data_2.head()

Unnamed: 0,Year,Event_type,Event_Category,Country,Country_Code,Region,Continent,Number_of_Events,Temperature_Change,Net_Migration,Population,Carbon_Emissions,Energy_Consumption
0,1965,Geophysical,Earthquake,Indonesia,IDN,South-Eastern Asia,Asia,1.0,-0.1905,-61146.0,100267062.0,24689.911,84.51602
1,1965,Hydrological,Landslide,Argentina,ARG,South America,Americas,1.0,0.045583,33973.0,22159650.0,58866.351,328.52762
2,1965,Meteorological,Storm,Bangladesh,BGD,Southern Asia,Asia,3.0,-0.199833,47618.0,55385112.0,19974.149,
5,1965,Hydrological,Landslide,Bulgaria,BGR,Eastern Europe,Europe,1.0,-0.382333,1838.0,8204168.0,46317.877,119.92565
6,1965,Meteorological,Storm,Bahamas (the),BHS,Caribbean,Americas,1.0,-0.014917,2493.0,140059.0,1316.453,


In [134]:
merged_data_2.shape

(12559, 13)

In [135]:
merged_data_2.isnull().sum()

Year                     0
Event_type               0
Event_Category           0
Country                 53
Country_Code             0
Region                  53
Continent               53
Number_of_Events         0
Temperature_Change      38
Net_Migration          226
Population             943
Carbon_Emissions       711
Energy_Consumption    1646
dtype: int64

In [141]:
# Remove rows corresponding to the following country codes as they are not relevant to the analysis.
not_countries = ['AFE', 'AFW', 'ATF', 'CEB', 'EAP', 'EAR', 'EAS', 'ECA', 'ECS', 'EMU', 'EUU', 'FCS', 'HIC', 'HPC', 'IBD', 'IBT', 'IDA', 'IDB', 'IDX', 'INX', 'LAC', 'LCN', 'LDC', 'LIC', 'LMC', 'LMY', 'LTE', 'MEA', 'MIC', 'MNA', 'NAC', 'OED', 'OSS', 'OWID_USS', 'OWID_WRL', 'PRE', 'PSS', 'PST', 'SAS', 'SSA', 'SSF', 'SST', 'TEA', 'TEC', 'TLA', 'TMN', 'TSA', 'TSS', 'UMC', 'WLD','WLF']

merged_data_2 = merged_data_2[~merged_data_2['Country_Code'].isin(not_countries)]

In [142]:
merged_data_2.isnull().sum()

Year                     0
Event_type               0
Event_Category           0
Country                  0
Country_Code             0
Region                   0
Continent                0
Number_of_Events         0
Temperature_Change      38
Net_Migration          168
Population             885
Carbon_Emissions       653
Energy_Consumption    1588
dtype: int64

In [143]:
# Find duplicates for data_merged_2
merged_data_2_dups= merged_data_2[merged_data_2.duplicated]
#Print the duplicates
merged_data_2_dups

Unnamed: 0,Year,Event_type,Event_Category,Country,Country_Code,Region,Continent,Number_of_Events,Temperature_Change,Net_Migration,Population,Carbon_Emissions,Energy_Consumption


In [144]:
merged_data_2.shape

(12501, 13)

## 03. Export Final Data Set

In [140]:
merged_data_2.to_csv(os.path.join(path,'02 Data', 'Merged_Data' 'merged_data.csv'))