seting up and loading dataset


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8')

df = pd.read_csv('Human_Development_Index_Dataset.csv')

print(df.shape)
df.head(3)
df.info()


(6798, 30)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6798 entries, 0 to 6797
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Unnamed: 0                 6798 non-null   int64  
 1   iso3                       6798 non-null   object 
 2   country                    6798 non-null   object 
 3   year                       6798 non-null   int64  
 4   hdi                        6171 non-null   float64
 5   life_expectancy            6798 non-null   float64
 6   pop_millions               6798 non-null   float64
 7   hdi_f                      5014 non-null   float64
 8   hdi_m                      5014 non-null   float64
 9   life_expec_f               6798 non-null   float64
 10  life_expec_m               6798 non-null   float64
 11  expec_yr_school            6550 non-null   float64
 12  expec_yr_school_f          6270 non-null   float64
 13  expec_yr_school_m          6270 non-n

Problem 1A – Single year HDI exploration (2022)

1. Extract 2022 data

In [4]:
print(sorted(df['year'].unique()))

hdi_2022_df = df[df['year'] == 2022].copy()
hdi_2022_df.shape


[np.int64(1990), np.int64(1991), np.int64(1992), np.int64(1993), np.int64(1994), np.int64(1995), np.int64(1996), np.int64(1997), np.int64(1998), np.int64(1999), np.int64(2000), np.int64(2001), np.int64(2002), np.int64(2003), np.int64(2004), np.int64(2005), np.int64(2006), np.int64(2007), np.int64(2008), np.int64(2009), np.int64(2010), np.int64(2011), np.int64(2012), np.int64(2013), np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022)]


(206, 30)

2. Explore dataset

In [5]:
hdi_2022_df.head(10)
print("Rows, Columns:", hdi_2022_df.shape)
hdi_2022_df.dtypes


Rows, Columns: (206, 30)


Unnamed: 0                     int64
iso3                          object
country                       object
year                           int64
hdi                          float64
life_expectancy              float64
pop_millions                 float64
hdi_f                        float64
hdi_m                        float64
life_expec_f                 float64
life_expec_m                 float64
expec_yr_school              float64
expec_yr_school_f            float64
expec_yr_school_m            float64
mean_yr_school               float64
mean_yr_school_f             float64
mean_yr_school_m             float64
gross_inc_percap             float64
gross_inc_percap_f           float64
gross_inc_percap_m           float64
gender_development           float64
gender_inequality            float64
secondary_education_f_%      float64
secondary_education_m_%      float64
seats_in_parliament_f_%      float64
seats_in_parliament_m_%      float64
labour_participation_f_%     float64
l

3. Check missing values and clean

In [6]:
hdi_2022_df.isna().sum().sort_values(ascending=False)

dup_count = hdi_2022_df.duplicated().sum()
print("Duplicates:", dup_count)
hdi_2022_df = hdi_2022_df.drop_duplicates()

if 'country' in hdi_2022_df.columns:
    hdi_2022_df['country'] = hdi_2022_df['country'].str.strip()

num_cols = ['hdi', 'gross_inc_percap', 'life_expectancy', 'gender_development']
for c in num_cols:
    if c in hdi_2022_df.columns:
        hdi_2022_df[c] = pd.to_numeric(hdi_2022_df[c], errors='coerce')

hdi_2022_df = hdi_2022_df.dropna(subset=['hdi'])


Duplicates: 0


4. Basic statistics

In [7]:
mean_hdi = hdi_2022_df['hdi'].mean()
median_hdi = hdi_2022_df['hdi'].median()
std_hdi = hdi_2022_df['hdi'].std()
print({"mean": mean_hdi, "median": median_hdi, "std": std_hdi})


{'mean': np.float64(0.7228872549019609), 'median': 0.7395, 'std': 0.15302880386427825}


5. Extremes

In [8]:
max_row = hdi_2022_df.loc[hdi_2022_df['hdi'].idxmax()]
min_row = hdi_2022_df.loc[hdi_2022_df['hdi'].idxmin()]
print("Highest HDI:", max_row['country'], max_row['hdi'])
print("Lowest HDI:", min_row['country'], min_row['hdi'])


Highest HDI: Switzerland 0.967
Lowest HDI: Somalia 0.38


6. Filter & sort (HDI ≥ 0.800; sort by GNI)

In [None]:
high_hdi = hdi_2022_df[hdi_2022_df['hdi'] >= 0.800].copy()
if 'gross_inc_percap' in high_hdi.columns:
    top10_gni = high_hdi.sort_values('gross_inc_percap', ascending=False).head(10)
else:
    top10_gni = high_hdi.head(10)  
top10_gni[['country', 'hdi', 'gross_inc_percap']].head(10)


Unnamed: 0,country,hdi,gross_inc_percap
3332,Liechtenstein,0.942,146673.2415
4718,Qatar,0.875,95944.37754
5213,Singapore,0.949,88761.14559
2705,Ireland,0.95,87467.51391
3398,Luxembourg,0.927,78554.2364
6104,United Arab Emirates,0.937,74103.71494
5609,Switzerland,0.967,69432.78669
4322,Norway,0.966,69189.76165
6170,United States,0.927,65564.93798
2474,"Hong Kong, China (SAR)",0.956,62485.50516


7. Add HDI category column and save

In [None]:
def categorize_hdi(val):
    if val < 0.550:
        return "Low"
    elif 0.550 <= val < 0.700:
        return "Medium"
    elif 0.700 <= val < 0.800:
        return "High"
    else:
        return "Very High"

hdi_2022_df['HDI_Category'] = hdi_2022_df['hdi'].apply(categorize_hdi)

hdi_2022_df['HDI_Category'].value_counts()

hdi_2022_df.to_csv('HDI_category_added.csv', index=False)
