# Unemployment Analysis and Prediction in India and Pakistan

## Importing Libraries

In [98]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Data Collection and Preprocessing

### Importing Dataset

In [99]:
# Location of the raw unemployment CSV (by sex, age and education), relative to the working directory.
unemploy_sex_age_edu = 'data/raw/unemployment_sex_age_edu.csv'

# Read the raw file into the DataFrame that the following cells clean up.
unemploy_sex_age_edu_df = pd.read_csv(filepath_or_buffer=unemploy_sex_age_edu)


### Dropping unnecessary columns

In [100]:
# Remove the label/annotation columns that this notebook treats as unnecessary.
unemploy_sex_age_edu_df = unemploy_sex_age_edu_df.drop(
    columns=[
        'source.label',
        'indicator.label',
        'obs_status.label',
        'note_classif.label',
        'note_indicator.label',
        'note_source.label',
    ]
)

### Renaming Columns

In [101]:
# Replace the dotted source column labels with the short names used by the later cells.
# The result is reassigned (rather than using `inplace=True`) so this step follows the
# same "new frame bound to the same name" pattern as the column-drop step.
unemploy_sex_age_edu_df = unemploy_sex_age_edu_df.rename(columns={
    'ref_area.label': 'country',
    'sex.label': 'gender',
    'classif1.label': 'age_bracket',
    'classif2.label': 'education',
    'obs_value': 'value'
})

### Handling rows and null values

#### Unemployment (Gender, Age, Education) Dataset

In [111]:
# List the distinct age-bracket labels present in the data.
unemploy_sex_age_edu_df.loc[:, 'age_bracket'].unique()

array(['Age (Youth, adults): 15+', 'Age (Youth, adults): 15-64',
       'Age (Youth, adults): 15-24', 'Age (Youth, adults): 25+',
       'Age (Aggregate bands): Total', 'Age (Aggregate bands): 15-24',
       'Age (Aggregate bands): 25-54', 'Age (Aggregate bands): 55-64',
       'Age (Aggregate bands): 65+', 'Age (10-year bands): Total',
       'Age (10-year bands): 15-24', 'Age (10-year bands): 25-34',
       'Age (10-year bands): 35-44', 'Age (10-year bands): 45-54',
       'Age (10-year bands): 55-64', 'Age (10-year bands): 65+'],
      dtype=object)

In [118]:
# Sum `value` over the "Total" rows of two age-banding schemes so they can be compared.
# NOTE: despite its name, `value_youth` holds the sum for 'Age (Aggregate bands): Total'.
is_aggregate_total = unemploy_sex_age_edu_df['age_bracket'] == 'Age (Aggregate bands): Total'
is_10yr_total = unemploy_sex_age_edu_df['age_bracket'] == 'Age (10-year bands): Total'

value_youth = unemploy_sex_age_edu_df.loc[is_aggregate_total, 'value'].sum()
value_10yr = unemploy_sex_age_edu_df.loc[is_10yr_total, 'value'].sum()

# Printed order is: 10-year-bands total first, aggregate-bands total second.
print(value_10yr, value_youth)


2929406.0 5908824.0


In [103]:
# unemploy_sex_age_edu_df = unemploy_sex_age_edu_df[unemploy_sex_age_edu_df['gender'] != 'Sex: Total']
# unemploy_sex_age_edu_df['gender'] = unemploy_sex_age_edu_df['gender'].str.replace('Sex: ','')
# age_values_to_remove = [
#     'Age (Youth, adults): 15+', 'Age (Youth, adults): 15-64',
#     'Age (Youth, adults): 15-24', 'Age (Youth, adults): 25+',
#     'Age (Aggregate bands): Total', 'Age (Aggregate bands): 15-24',
#     'Age (Aggregate bands): 25-54', 'Age (Aggregate bands): 55-64',
#     'Age (Aggregate bands): 65+', 'Age (10-year bands): Total',
# ]
# unemploy_sex_age_edu_df = unemploy_sex_age_edu_df[~unemploy_sex_age_edu_df['age_bracket'].isin(age_values_to_remove)]
# unemploy_sex_age_edu_df['age_bracket'] = unemploy_sex_age_edu_df['age_bracket'].str.replace('Age (10-year bands): ','')
# unemploy_sex_age_edu_df['education'] = unemploy_sex_age_edu_df['education'].str.replace('Education (Aggregate levels): ','')

##### Change Data Types

In [104]:
# unemploy_sex_age_edu_df['value'] = unemploy_sex_age_edu_df['value'].astype('int')
# unemploy_sex_age_edu_df['gender'] = unemploy_sex_age_edu_df['gender'].astype('category')
# unemploy_sex_age_edu_df['age_bracket'] = unemploy_sex_age_edu_df['age_bracket'].astype('category')

In [105]:
# Preview the first five rows of the frame.
unemploy_sex_age_edu_df.head(n=5)

Unnamed: 0,country,gender,age_bracket,education,time,value
0,Bangladesh,Sex: Total,"Age (Youth, adults): 15+",Education (Aggregate levels): Total,2022Q4,3087.892
1,Bangladesh,Sex: Total,"Age (Youth, adults): 15+",Education (Aggregate levels): Less than basic,2022Q4,293.202
2,Bangladesh,Sex: Total,"Age (Youth, adults): 15+",Education (Aggregate levels): Basic,2022Q4,1028.186
3,Bangladesh,Sex: Total,"Age (Youth, adults): 15+",Education (Aggregate levels): Intermediate,2022Q4,975.344
4,Bangladesh,Sex: Total,"Age (Youth, adults): 15+",Education (Aggregate levels): Advanced,2022Q4,738.04


##### Handling Null Values

In [106]:
# Impute missing `value` entries in two passes, then round to whole numbers.

# Pass 1: fill each missing value with the mean of its (gender, age_bracket) group.
# `fillna` only changes entries that are already null, so a row whose `gender` or
# `age_bracket` is itself missing (and therefore belongs to no group) keeps its
# observed value instead of being replaced by the NaN the group-wise transform
# yields for such rows.
value_group_mean = (
    unemploy_sex_age_edu_df
    .groupby(['gender', 'age_bracket'])['value']
    .transform('mean')
)
unemploy_sex_age_edu_df['value'] = unemploy_sex_age_edu_df['value'].fillna(value_group_mean)

# Pass 2: anything still missing (e.g. a group with no observed values at all)
# falls back to the mean of the whole column.
unemploy_sex_age_edu_df_value_global_mean = unemploy_sex_age_edu_df['value'].mean()
unemploy_sex_age_edu_df['value'] = unemploy_sex_age_edu_df['value'].fillna(unemploy_sex_age_edu_df_value_global_mean)

# Round to the nearest whole number; the column keeps its float dtype.
unemploy_sex_age_edu_df['value'] = unemploy_sex_age_edu_df['value'].round()

In [107]:
# Show the frame after imputation and rounding (the rendered output is truncated in the middle).
unemploy_sex_age_edu_df

Unnamed: 0,country,gender,age_bracket,education,time,value
0,Bangladesh,Sex: Total,"Age (Youth, adults): 15+",Education (Aggregate levels): Total,2022Q4,3088.0
1,Bangladesh,Sex: Total,"Age (Youth, adults): 15+",Education (Aggregate levels): Less than basic,2022Q4,293.0
2,Bangladesh,Sex: Total,"Age (Youth, adults): 15+",Education (Aggregate levels): Basic,2022Q4,1028.0
3,Bangladesh,Sex: Total,"Age (Youth, adults): 15+",Education (Aggregate levels): Intermediate,2022Q4,975.0
4,Bangladesh,Sex: Total,"Age (Youth, adults): 15+",Education (Aggregate levels): Advanced,2022Q4,738.0
...,...,...,...,...,...,...
18473,Pakistan,Sex: Female,Age (10-year bands): 55-64,Education (Aggregate levels): Intermediate,2018Q1,131.0
18474,Pakistan,Sex: Female,Age (10-year bands): 65+,Education (Aggregate levels): Total,2018Q1,79.0
18475,Pakistan,Sex: Female,Age (10-year bands): 65+,Education (Aggregate levels): Less than basic,2018Q1,79.0
18476,Pakistan,Sex: Female,Age (10-year bands): 65+,Education (Aggregate levels): Basic,2018Q1,79.0
