In [1]:
# Mithil Patel
# DSC 540 - Milestone 1
# 10/18/2022

# Importing libraries
import pandas as pd

# Load the air-pollution deaths CSV into a dataframe and display it
file_df = pd.read_csv(filepath_or_buffer="air-pollution-deaths-country.csv")
file_df

Unnamed: 0,Entity,Code,Year,Deaths - Cause: All causes - Risk: Air pollution - Sex: Both - Age: All Ages (Number)
0,Afghanistan,AFG,1990,37231
1,Afghanistan,AFG,1991,38315
2,Afghanistan,AFG,1992,41172
3,Afghanistan,AFG,1993,44488
4,Afghanistan,AFG,1994,46634
...,...,...,...,...
6835,Zimbabwe,ZWE,2015,13246
6836,Zimbabwe,ZWE,2016,13131
6837,Zimbabwe,ZWE,2017,12926
6838,Zimbabwe,ZWE,2018,12745


In [2]:
# Transformation 1: Filter the dataset to keep only the rows for the years 1990 and 2017
# Reason: We would like to see whether the number of deaths has increased over approximately three
#         decades; therefore, we keep rows for two different points in time for each country.
# Note: .copy() makes the filtered result an independent dataframe instead of a selection of the
#       loaded data, so later in-place edits do not trigger pandas' SettingWithCopyWarning.

file_df = file_df[file_df['Year'].isin([1990, 2017])].copy()
file_df.head()

Unnamed: 0,Entity,Code,Year,Deaths - Cause: All causes - Risk: Air pollution - Sex: Both - Age: All Ages (Number)
0,Afghanistan,AFG,1990,37231
27,Afghanistan,AFG,2017,37535
30,African Region (WHO),,1990,868866
57,African Region (WHO),,2017,907307
60,Albania,ALB,1990,2965


In [3]:
# Transformation 2: Drop every row where the column 'Code' is NaN
# Reason: The 'Code' column shows the abbreviation of each country, so rows with a NaN code are
#         aggregates such as a continent or region (e.g. "African Region (WHO)"), which are
#         irrelevant for our analysis.

import warnings  # NOTE(review): no longer used by this cell; kept in case another cell relies on it

# Reassign instead of calling dropna(inplace=True): mutating a filtered selection in place is what
# raised the warning that used to be hidden by a blanket warnings.filterwarnings("ignore").
# The blanket filter is removed so that genuine warnings from later cells stay visible, and the
# explicit .copy() gives later cells an independent dataframe to work on.
file_df = file_df.dropna(subset=['Code']).copy()
file_df.head()

Unnamed: 0,Entity,Code,Year,Deaths - Cause: All causes - Risk: Air pollution - Sex: Both - Age: All Ages (Number)
0,Afghanistan,AFG,1990,37231
27,Afghanistan,AFG,2017,37535
60,Albania,ALB,1990,2965
87,Albania,ALB,2017,2260
90,Algeria,DZA,1990,14223


In [4]:
# Transformation 3: Rename column titles
# Reason: Renaming the headers for readability, and to have common headers for merging with other
#         dataframes.
# Note: rename() returns a new dataframe that is assigned back to file_df, rather than mutating
#       the filtered frame in place.

file_df = file_df.rename(columns={
    'Entity': 'country',
    'Year': 'year',
    'Code': 'code',
    'Deaths - Cause: All causes - Risk: Air pollution - Sex: Both - Age: All Ages (Number)': 'total_death',
})
file_df.head()

Unnamed: 0,country,code,year,total_death
0,Afghanistan,AFG,1990,37231
27,Afghanistan,AFG,2017,37535
60,Albania,ALB,1990,2965
87,Albania,ALB,2017,2260
90,Algeria,DZA,1990,14223


In [5]:
# Transformation 4: Build a pivot table, leaving out the 'code' column
# Reason: The pivot turns the rows holding each year into one column per year, which is used
#         later for the calculation. The pivot table is then converted back into a regular
#         dataframe to prepare for that calculation.
# Note: The 'code' column is omitted because it serves no purpose in our analysis.
# NOTE(review): pivot_table applies its default aggregation (mean); this assumes a single row
#               per country/year pair -- confirm.

# One row per country, one column per year, cells hold total_death
pivot_df = file_df.pivot_table(values='total_death', index=['country'], columns='year')

# Move 'country' from the index back into an ordinary column
file_df = pivot_df.reset_index()
file_df.head()

year,country,1990,2017
0,Afghanistan,37231,37535
1,Albania,2965,2260
2,Algeria,14223,20756
3,American Samoa,15,14
4,Andorra,10,13


In [6]:
# Transformation 5: Rename columns again
# Reason: The year columns created by the pivot have integer headers; they are converted to
#         strings to avoid confusion when selecting them later.

# Relabel the integer year headers as strings
file_df.rename(columns={1990: '1990', 2017: '2017'}, inplace=True)
file_df.head()

year,country,1990,2017
0,Afghanistan,37231,37535
1,Albania,2965,2260
2,Algeria,14223,20756
3,American Samoa,15,14
4,Andorra,10,13


In [7]:
# Transformation 6: Add a new column holding the percent difference in deaths
# Reason: To show by how much the number of deaths has changed between 1990 and 2017 for each
#         country, relative to the 1990 value.

# (2017 - 1990) / 1990 * 100, evaluated row by row
file_df['death_percent_diff'] = (
    file_df['2017']
    .sub(file_df['1990'])
    .div(file_df['1990'])
    .mul(100)
)
file_df.head()

year,country,1990,2017,death_percent_diff
0,Afghanistan,37231,37535,0.816524
1,Albania,2965,2260,-23.777403
2,Algeria,14223,20756,45.932644
3,American Samoa,15,14,-6.666667
4,Andorra,10,13,30.0


In [8]:
# Show any rows that are exact duplicates of an earlier row (an empty result means none exist)
file_df.loc[file_df.duplicated()]

year,country,1990,2017,death_percent_diff


There are no duplicate rows in our dataframe. Also, we will not remove outliers, because the data shows the number of deaths due to air pollution in each country, and it is plausible that more people die in highly populated countries.

In [10]:
# Persist file_df with IPython's %store magic so it can be restored later with `%store -r file_df`
%store file_df

Stored 'file_df' (DataFrame)
