Importing Libraries

In [1]:
import pandas as pd
import pycountry
from countryinfo import CountryInfo

Loading Dataset

In [2]:
# Source CSV, given as a path relative to the working directory
file_path = 'dths_total.csv'

# Parse the file as UTF-8 text into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path, encoding='utf-8')

# Preview the first five rows
df.head(n=5)

Unnamed: 0,iso3,year,ghecause,causename,dths1,dths_low1,dths_up1,dths2,dths_low2,dths_up2
0,AFG,2000,0,All Causes,128293.5,104363.1,155448.8,119126.9,95349.63,146933.5
1,AFG,2000,10,"Communicable, maternal, perinatal and nutritio...",73623.32,30248.0,150103.7,74370.29,30905.55,146269.1
2,AFG,2000,20,Infectious and parasitic diseases,36323.68,11730.16,85540.53,34021.09,10151.07,78650.99
3,AFG,2000,30,Tuberculosis,6370.347,0.0,27167.83,7700.545,0.0,32156.3
4,AFG,2000,40,STDs excluding HIV,46.57511,13.08252,114.0615,43.35635,11.66702,108.5951


Dropping Unwanted Columns

In [3]:
# Discard the cause code, the uncertainty bounds and the second estimate series,
# leaving iso3, year, causename and dths1
columns_to_drop = ['ghecause', 'dths_low1', 'dths_up1', 'dths2', 'dths_low2', 'dths_up2']
df = df.drop(columns=columns_to_drop)

# Preview the frame after the drop
df.head()

Unnamed: 0,iso3,year,causename,dths1
0,AFG,2000,All Causes,128293.5
1,AFG,2000,"Communicable, maternal, perinatal and nutritio...",73623.32
2,AFG,2000,Infectious and parasitic diseases,36323.68
3,AFG,2000,Tuberculosis,6370.347
4,AFG,2000,STDs excluding HIV,46.57511


Renaming Columns

In [4]:
# Give the death-count column a readable name
column_names = {'dths1': 'Deaths'}
df = df.rename(columns=column_names)

df.head()

Unnamed: 0,iso3,year,causename,Deaths
0,AFG,2000,All Causes,128293.5
1,AFG,2000,"Communicable, maternal, perinatal and nutritio...",73623.32
2,AFG,2000,Infectious and parasitic diseases,36323.68
3,AFG,2000,Tuberculosis,6370.347
4,AFG,2000,STDs excluding HIV,46.57511


In [5]:
# Copy Deaths into a dedicated column on Tuberculosis rows only;
# every other row receives NaN
is_tuberculosis = df['causename'] == 'Tuberculosis'
df['Tuberculosis_Deaths'] = df['Deaths'].where(is_tuberculosis)

# The generic cause/value columns are no longer needed
df = df.drop(['causename', 'Deaths'], axis=1)

Checking Null Values

In [6]:
# Count missing values per column
df.isna().sum()

iso3                        0
year                        0
Tuberculosis_Deaths    812520
dtype: int64

Dropping Null rows

In [7]:
# Remove every row that has a missing value in any column
df = df.dropna(axis=0, how='any')

Creating a new Country column from the iso3 (ISO 3166-1 alpha-3 code) column

In [8]:
def iso3_to_country_name(iso3):
    """Translate an ISO 3166-1 alpha-3 code into a country name via pycountry.

    Parameters
    ----------
    iso3 : str
        Three-letter country code, e.g. ``'AFG'``.

    Returns
    -------
    str or None
        The ``name`` attribute of the record pycountry returns for the code,
        or ``None`` when the lookup raises ``AttributeError`` (for example
        because no record was returned for the code).
    """
    try:
        record = pycountry.countries.get(alpha_3=iso3)
        country_name = record.name
    except AttributeError:
        # No usable record for this code
        country_name = None
    return country_name

# Derive the 'Country' column by translating each ISO3 code element-wise
df['Country'] = df['iso3'].map(iso3_to_country_name)

df.head()

Unnamed: 0,iso3,year,Tuberculosis_Deaths,Country
3,AFG,2000,6370.347,Afghanistan
226,AFG,2001,6116.266,Afghanistan
449,AFG,2002,5808.648,Afghanistan
672,AFG,2003,6188.514,Afghanistan
895,AFG,2004,5735.92,Afghanistan


Normalizing specific country names from their official long forms to common short names

In [9]:
# Map long-form country names to the short names used from here on.
# The keys are literal, complete names, so exact whole-value matching is used
# (no regex=True): treating them as regular expressions would allow partial
# substring substitution and give special meaning to any regex metacharacters.
# NOTE(review): the CountryInfo lookup output further down still reports names
# such as 'Tanzania, United Republic of' as not found; extend this mapping if
# those countries should be retained.
country_name_overrides = {
    'Bolivia, Plurinational State of': 'Bolivia',
    'Congo, The Democratic Republic of the': 'Democratic Republic of the Congo',
    'Iran, Islamic Republic of': 'Iran',
    'Micronesia, Federated States of': 'Micronesia',
    'Korea, Republic of': 'South Korea',
    "Korea, Democratic People's Republic of": 'North Korea',
    'Türkiye': 'Turkey',
    'Venezuela, Bolivarian Republic of': 'Venezuela',
    'Viet Nam': 'Vietnam',
    'Moldova, Republic of': 'Moldova',
}
df['Country'] = df['Country'].replace(country_name_overrides)

Creating another dataset to identify unique Countries

In [10]:
# One row per distinct (Country, iso3) pair
country_columns = ['Country', 'iso3']
unique_countries = df.loc[:, country_columns].drop_duplicates()

# Show the distinct countries alongside their ISO3 codes
unique_countries

Unnamed: 0,Country,iso3
3,Afghanistan,AFG
4463,Angola,AGO
8923,Albania,ALB
13383,United Arab Emirates,ARE
17843,Argentina,ARG
...,...,...
793883,Samoa,WSM
798343,Yemen,YEM
802803,South Africa,ZAF
807263,Zambia,ZMB


Creating two new columns in the new dataset: region and subregion

In [11]:
def get_country_info(country_name):
    """Look up the region and subregion of a country through CountryInfo.

    Parameters
    ----------
    country_name : str
        Country name passed to ``CountryInfo``.

    Returns
    -------
    tuple
        ``(region, subregion)`` as returned by ``CountryInfo``. If either
        lookup raises ``KeyError``, a message is printed and
        ``('Unknown', 'Unknown')`` is returned instead.
    """
    info = CountryInfo(country_name)
    try:
        result = (info.region(), info.subregion())
    except KeyError:
        # Name not recognised: report it and fall back to placeholders
        print(f"Country '{country_name}' not found in CountryInfo.")
        result = ('Unknown', 'Unknown')
    return result

# Resolve (region, subregion) for each country, then split the pairs into two columns
country_lookups = unique_countries['Country'].apply(get_country_info)
regions, subregions = zip(*country_lookups)
unique_countries['region'] = regions
unique_countries['subregion'] = subregions

# Write the lookup table to CSV without the row index
unique_countries.to_csv('unique_countries_iso3.csv', index=False)

Country 'Bahamas' not found in CountryInfo.
Country 'Brunei Darussalam' not found in CountryInfo.
Country 'Côte d'Ivoire' not found in CountryInfo.
Country 'Congo' not found in CountryInfo.
Country 'Cabo Verde' not found in CountryInfo.
Country 'Czechia' not found in CountryInfo.
Country 'Micronesia' not found in CountryInfo.
Country 'Gambia' not found in CountryInfo.
Country 'North Macedonia' not found in CountryInfo.
Country 'Myanmar' not found in CountryInfo.
Country 'Montenegro' not found in CountryInfo.
Country 'Sao Tome and Principe' not found in CountryInfo.
Country 'Eswatini' not found in CountryInfo.
Country 'Timor-Leste' not found in CountryInfo.
Country 'Tanzania, United Republic of' not found in CountryInfo.


Merging the two datasets

In [12]:
# Attach region/subregion to every row through the shared iso3 key;
# the left join keeps all rows of df
region_lookup = unique_countries[['iso3', 'region', 'subregion']]
merged_df = df.merge(region_lookup, how='left', on='iso3')

# Discard rows whose region or subregion is the 'Unknown' placeholder
unresolved = merged_df['region'].isin(['Unknown']) | merged_df['subregion'].isin(['Unknown'])
merged_df = merged_df[~unresolved]

merged_df


Unnamed: 0,iso3,year,Tuberculosis_Deaths,Country,region,subregion
0,AFG,2000,6370.3470,Afghanistan,Asia,Southern Asia
1,AFG,2001,6116.2660,Afghanistan,Asia,Southern Asia
2,AFG,2002,5808.6480,Afghanistan,Asia,Southern Asia
3,AFG,2003,6188.5140,Afghanistan,Asia,Southern Asia
4,AFG,2004,5735.9200,Afghanistan,Asia,Southern Asia
...,...,...,...,...,...,...
3655,ZWE,2015,648.9602,Zimbabwe,Africa,Eastern Africa
3656,ZWE,2016,720.6268,Zimbabwe,Africa,Eastern Africa
3657,ZWE,2017,956.3198,Zimbabwe,Africa,Eastern Africa
3658,ZWE,2018,1046.2450,Zimbabwe,Africa,Eastern Africa


Saving the preprocessed dataset

In [13]:
# Write the final frame to CSV without the row index
output_path = "Dataset.csv"
merged_df.to_csv(output_path, index=False)