# Unemployment Analysis and Prediction in India and Pakistan

## Importing Libraries

In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Data Collection and Preprocessing

### Importing Dataset

In [81]:
# Paths of the three raw unemployment extracts. Judging by the file names,
# each one is broken down by sex and age plus one further dimension
# (education, marital status, region).
# NOTE(review): the marital-status file name is spelled "martial" -- confirm
# that this matches the file on disk before "correcting" it.
unemploy_sex_age_edu, unemploy_sex_age_marital, unemploy_sex_age_region = (
    'data/raw/unemployment_sex_age_edu.csv',
    'data/raw/unemployment_sex_age_martial.csv',
    'data/raw/unemployment_sex_age_region.csv',
)

# Load each extract into its own DataFrame using pandas' default CSV options.
unemploy_sex_age_edu_df = pd.read_csv(filepath_or_buffer=unemploy_sex_age_edu)
unemploy_sex_age_marital_df = pd.read_csv(filepath_or_buffer=unemploy_sex_age_marital)
unemploy_sex_age_region_df = pd.read_csv(filepath_or_buffer=unemploy_sex_age_region)

### Dropping unnecessary columns

In [82]:
# Label/annotation columns that are removed from every dataset. `drop`
# raises KeyError if any of them is missing from a frame.
_label_columns_to_drop = [
    'source.label',
    'indicator.label',
    'obs_status.label',
    'note_classif.label',
    'note_indicator.label',
    'note_source.label',
]

# Each call returns a new frame, which is bound back to the same name.
unemploy_sex_age_edu_df = unemploy_sex_age_edu_df.drop(columns=_label_columns_to_drop)
unemploy_sex_age_marital_df = unemploy_sex_age_marital_df.drop(columns=_label_columns_to_drop)
unemploy_sex_age_region_df = unemploy_sex_age_region_df.drop(columns=_label_columns_to_drop)

### Renaming Columns

In [84]:
# Column names shared by all three datasets, mapped to short snake_case names.
_shared_column_names = {
    'ref_area.label': 'country',
    'sex.label': 'gender',
    'classif1.label': 'age_bracket',
    'obs_value': 'value',
}

# The second classifier column means something different in each dataset,
# so its new name is supplied per frame. Renaming is done in place.
for _frame, _second_classifier_name in (
    (unemploy_sex_age_edu_df, 'education'),
    (unemploy_sex_age_marital_df, 'marital_status'),
    (unemploy_sex_age_region_df, 'region'),
):
    _frame.rename(
        columns={**_shared_column_names, 'classif2.label': _second_classifier_name},
        inplace=True,
    )

### Handling rows and null values

#### Unemployment (Gender, Age, Education) Dataset

In [86]:
# Strip the descriptive "<dimension>: " prefix from the classifier columns so
# that only the category label itself remains (e.g. 'Sex: Male' -> 'Male').
#
# regex=False is passed explicitly on every call: several prefixes contain
# parentheses, and on pandas < 2.0 `Series.str.replace` defaulted to
# regex=True. There "(Aggregate bands)" is parsed as a capture group, the
# literal parentheses never match, and the prefix is silently left in place
# (which in turn breaks any later exact-match relabelling of these columns).
for _frame in (
    unemploy_sex_age_edu_df,
    unemploy_sex_age_marital_df,
    unemploy_sex_age_region_df,
):
    _frame['gender'] = _frame['gender'].str.replace('Sex: ', '', regex=False)
    _frame['age_bracket'] = _frame['age_bracket'].str.replace(
        'Age (Aggregate bands): ', '', regex=False
    )

# Dataset-specific second classifier columns.
unemploy_sex_age_edu_df['education'] = unemploy_sex_age_edu_df['education'].str.replace(
    'Education (Aggregate levels): ', '', regex=False
)

unemploy_sex_age_marital_df['marital_status'] = unemploy_sex_age_marital_df['marital_status'].str.replace(
    'Marital status (Detailed): ', '', regex=False
)

unemploy_sex_age_region_df['region'] = unemploy_sex_age_region_df['region'].str.replace(
    'Area type: ', '', regex=False
)

In [77]:
# Relabel the education levels with more familiar names. Values that are not
# keys of this mapping are left untouched by `Series.replace`.
# NOTE(review): in ILO-style aggregate levels 'Intermediate' usually denotes
# upper-secondary education rather than a bachelor's degree -- confirm that
# these target labels are the intended ones.
education_to_replace = dict([
    ("Less than basic", "Below High School"),
    ("Basic", "High School"),
    ("Intermediate", "Bachelor's"),
    ("Advanced", "Master's or Higher"),
    ("Level not stated", "Not Stated"),
])

_relabelled_education = unemploy_sex_age_edu_df['education'].replace(education_to_replace)
unemploy_sex_age_edu_df['education'] = _relabelled_education

##### Handling Null Values

In [78]:
def _fill_with_group_mean(values):
    """Return *values* with missing entries replaced by the mean of *values*."""
    return values.fillna(values.mean())


# --- Education dataset -------------------------------------------------------
# Step 1: impute within each (age_bracket, education) group.
_edu_values_by_group = unemploy_sex_age_edu_df.groupby(['age_bracket', 'education'])['value']
unemploy_sex_age_edu_df['value'] = _edu_values_by_group.transform(_fill_with_group_mean)
# Step 2: anything still missing (e.g. a group with no observed value at all)
# falls back to the overall mean; the result is rounded to whole numbers.
unemploy_sex_age_edu_df_value_global_mean = unemploy_sex_age_edu_df['value'].mean()
unemploy_sex_age_edu_df['value'] = (
    unemploy_sex_age_edu_df['value']
    .fillna(unemploy_sex_age_edu_df_value_global_mean)
    .round()
)

# --- Marital-status dataset --------------------------------------------------
# Same two-step imputation, grouped by (age_bracket, marital_status).
_marital_values_by_group = unemploy_sex_age_marital_df.groupby(['age_bracket', 'marital_status'])['value']
unemploy_sex_age_marital_df['value'] = _marital_values_by_group.transform(_fill_with_group_mean)
unemploy_sex_age_marital_df_value_global_mean = unemploy_sex_age_marital_df['value'].mean()
unemploy_sex_age_marital_df['value'] = (
    unemploy_sex_age_marital_df['value']
    .fillna(unemploy_sex_age_marital_df_value_global_mean)
    .round()
)

##### Convert Data Types

In [79]:
# Cast the cleaned columns to their final dtypes, one dataset at a time:
#   value                -> integer (the cast fails if missing values remain)
#   classifier columns   -> pandas categoricals
#   time                 -> quarterly periods
# NOTE(review): assumes `time` holds quarter-like labels that PeriodIndex can
# parse with freq='Q' -- confirm against the raw data.
for _frame, _categorical_columns in (
    (unemploy_sex_age_edu_df, ('gender', 'age_bracket', 'education')),
    (unemploy_sex_age_marital_df, ('gender', 'age_bracket', 'marital_status')),
):
    _frame['value'] = _frame['value'].astype('int')
    for _column in _categorical_columns:
        _frame[_column] = _frame[_column].astype('category')
    _frame['time'] = pd.PeriodIndex(_frame['time'], freq='Q')