<a href="https://colab.research.google.com/github/Muhammad-Shahzaibb/EDA-on-Pak-Employment-Data/blob/main/EDA_on_Pakistan's_Employment_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() while streaming a download to disk.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries.
# NOTE(review): the URL is a signed link carrying X-Goog-Date=20240417T170355Z and
# X-Goog-Expires=259200, so it has presumably expired; regenerate it before re-running.
DATA_SOURCE_MAPPING = 'pakistan-employment-dataset-2023:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F3696110%2F6409150%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240417%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240417T170355Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D08d0784af6ee1e885f6ab497381d5192be2d32e3ea4e4126e30fd0748443b570ddf02015e7365538b082e352e46f8a59a702e6275b1317ee8f8019853b215a661d66433cf56cb92ead6a6b1448ce472e59767681ed9ce8949147e70aa77144b5c3af1bfdb8b84bbfae7ff1c42b516b83cbabfe750616f129fb6efc8868c2944ec2e1c58b4e865faf99d7349cf945773cdb0a71ade53a161cd993ceab95248d26fa0b80e4f680ad87a8dcdf4c019b8ebeca518233c19d4f20719056fa188750a399ae698e6fb21f023e0ef4447bac2508f904f57a65b9c93e667c91533ef9f8fd1b7801f04a0174af3bf087cb66145320120c15102fb3b017eda7aee33b805ac9'

# Root under which each data source is extracted (one sub-directory per entry).
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory created next to the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced by any of the visible cells.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<encoded URL>" entry of DATA_SOURCE_MAPPING and
# unpack the archive into KAGGLE_INPUT_PATH/<directory>. Failures are reported
# and the loop moves on to the next entry.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a literal ':' in the URL part cannot break unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header can be absent; in that case the progress bar simply stays empty
            # instead of crashing on int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                # Clamp to the bar width in case more bytes arrive than were announced.
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: tarfile.open() below re-opens the file by name.
            tfile.flush()
            # NOTE(review): extractall() trusts the member paths stored in the archive;
            # only use this with archives from a trusted source.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Do not bind the archive to the name `tarfile`: that would replace the
                # module and break every later iteration that needs tarfile.open().
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is handled before its base class OSError to give the more specific hint.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading pakistan-employment-dataset-2023, 51636 bytes compressed
Downloaded and uncompressed: pakistan-employment-dataset-2023
Data source import complete.


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
# Load the employment dataset from the (symlinked) Kaggle input directory
# and preview the first ten rows.
dataset_csv = "../input/pakistan-employment-dataset-2023/GenderBasedEmploymentInPakistan2023.csv"
df = pd.read_csv(dataset_csv)
df.head(10)

Unnamed: 0,Province,Division,District,Indicator,Area Type,Total,Male,Female
0,Balochistan,Kalat Division,Kalat,Working Age Population,Rural,130000.0,70000.0,59000
1,Balochistan,Kalat Division,Kalat,Working Age Population,Urban,28000.0,17000.0,12000
2,Balochistan,Kalat Division,Kalat,Labour Force,Rural,52600.0,49600.0,3100
3,Balochistan,Kalat Division,Kalat,Labour Force,Urban,10800.0,10600.0,300
4,Balochistan,Kalat Division,Kalat,Employed,Rural,49700.0,47400.0,2300
5,Balochistan,Kalat Division,Kalat,Employed,Urban,10000.0,10000.0,-
6,Balochistan,Kalat Division,Kalat,Unemployed,Rural,3000.0,2200.0,800
7,Balochistan,Kalat Division,Kalat,Unemployed,Urban,800.0,600.0,300
8,Balochistan,Kalat Division,Kalat,Refined Participation Rate (%),Rural,40.6,70.7,5.1
9,Balochistan,Kalat Division,Kalat,Refined Participation Rate (%),Urban,38.3,63.6,2.4


In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# Per-column dtype and non-null count summary
df.info()

In [None]:
# Convert the three value columns to numeric dtypes; anything that cannot be
# parsed as a number becomes NaN.
for value_column in ('Total', 'Male', 'Female'):
    df[value_column] = pd.to_numeric(df[value_column], errors='coerce')
df.dtypes


In [None]:
# Show floats with two fixed decimals so the summary is not rendered in scientific notation.
pd.set_option('display.float_format', '{:.2f}'.format)
df.describe()

In [None]:
# Names of the columns that contain at least one null value
null_counts = df.isnull().sum()
null_counts[null_counts > 0].index.tolist()


In [None]:
# Number of null values in every column
df.isna().sum()

In [None]:
# Fill missing values with the mean of the SAME indicator.
# The value columns hold head-counts (e.g. 'Working Age Population') and
# percentages (e.g. 'Refined Participation Rate (%)') side by side, so a single
# column-wide mean would put a count-sized number into a missing rate (or the reverse).
# If an indicator has no value at all in a column, its gaps stay NaN.
for value_column in ('Total', 'Male', 'Female'):
    indicator_mean = df.groupby('Indicator')[value_column].transform('mean')
    df[value_column] = df[value_column].fillna(indicator_mean)

# Preview instead of dumping the whole frame
df.head(10)


In [None]:
# Outlier fences for 'Total': 1.5 * IQR below the first and above the third quartile
Q1, Q3 = (df['Total'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5*IQR, Q3 + 1.5*IQR
lower_bound, upper_bound


In [None]:
# Rows whose 'Total' lies strictly outside the [lower_bound, upper_bound] fences
below_fence = df['Total'].lt(lower_bound)
above_fence = df['Total'].gt(upper_bound)
outliers = df.loc[below_fence | above_fence]
outliers

In [None]:
# Population per province.
# 'Total' holds a different quantity for every indicator (head-counts, rates,
# and subsets such as 'Labour Force' / 'Employed'), so summing all rows is not
# a population figure. Only the 'Working Age Population' head-count rows are summed.
# NOTE(review): restricted to Rural/Urban rows on the assumption that any other
# area type is an aggregate of the two - confirm with df['Area Type'].unique().
population_rows = df[
    (df['Indicator'] == 'Working Age Population')
    & df['Area Type'].isin(['Rural', 'Urban'])
]
province_population = population_rows.groupby('Province', as_index=False)['Total'].sum()
province_population

In [None]:
# Bar chart of the per-province totals
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=province_population, x='Province', y='Total', ax=ax)
ax.set_title('Total Population Distribution by Province')
ax.set_xlabel('Province')
ax.set_ylabel('Total Population')


##### OBSERVATION:
Punjab has the highest population, while Balochistan has the lowest.

In [None]:
# Ten most populous districts.
# Only the 'Working Age Population' head-count rows are summed; adding up
# 'Total' across all indicators would mix head-counts with percentages and
# count the same people several times (labour force, employed, ...).
# NOTE(review): restricted to Rural/Urban rows on the assumption that any other
# area type is an aggregate of the two - confirm with df['Area Type'].unique().
district_rows = df[
    (df['Indicator'] == 'Working Age Population')
    & df['Area Type'].isin(['Rural', 'Urban'])
]
District_population = (
    district_rows.groupby('District', as_index=False)['Total'].sum()
    .sort_values(by='Total', ascending=False)
    .head(10)
)


In [None]:
# Bar chart of the ten districts with the largest totals
fig, ax = plt.subplots(figsize=(13, 4))
sns.barplot(data=District_population, x='District', y='Total', ax=ax)
ax.set_title('Top 10 Districts by Population')
ax.set_xlabel('District')
ax.set_ylabel('Total Population')


#### OBSERVATION:
Lahore is the most populous district.

In [None]:
# Five most populous divisions.
# Only the 'Working Age Population' head-count rows are summed; adding up
# 'Total' across all indicators would mix head-counts with percentages and
# count the same people several times (labour force, employed, ...).
# NOTE(review): restricted to Rural/Urban rows on the assumption that any other
# area type is an aggregate of the two - confirm with df['Area Type'].unique().
division_rows = df[
    (df['Indicator'] == 'Working Age Population')
    & df['Area Type'].isin(['Rural', 'Urban'])
]
Division_population = (
    division_rows.groupby('Division', as_index=False)['Total'].sum()
    .sort_values(by='Total', ascending=False)
    .head(5)
)

In [None]:
# Bar chart of the five divisions with the largest totals
fig, ax = plt.subplots(figsize=(13, 4))
sns.barplot(data=Division_population, x='Division', y='Total', ax=ax)
ax.set_title('Top 5 Divisions by Population')
# The x-axis shows divisions, so label it accordingly (it previously read 'District').
ax.set_xlabel('Division')
ax.set_ylabel('Total Population')
plt.show()


In [None]:
# Distinct values present in the 'Area Type' column
df['Area Type'].unique()

In [None]:
# Keep only the rows whose area type is Rural or Urban
area_types = ['Rural', 'Urban']
is_rural_or_urban = df['Area Type'].isin(area_types)
df_Area_Type = df[is_rural_or_urban]
df_Area_Type

In [None]:
# Population per area type.
# Sum only the 'Working Age Population' head-count rows: 'Total' also stores
# percentages and overlapping subsets (labour force, employed, ...) for the
# other indicators, which must not be added into a population figure.
population_by_area = df_Area_Type[df_Area_Type['Indicator'] == 'Working Age Population']
Area_Type = population_by_area.groupby('Area Type')['Total'].sum().reset_index()
Area_Type

In [None]:
# Bar chart comparing the rural and urban totals
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(data=Area_Type, x='Area Type', y='Total', ax=ax)
ax.set_title('Population in Urban vs Rural Areas')
ax.set_xlabel('Area Type')
ax.set_ylabel('Total Population')



#### OBSERVATION:
The population of rural areas is approximately double that of urban areas.

In [None]:
# Distinct indicator names available in the dataset
df['Indicator'].unique()

In [None]:
# Rows that report the employment-to-population ratio
is_employment_ratio = df['Indicator'] == 'Employment to Population Ratio (%)'
Employed_data = df[is_employment_ratio]
Employed_data.head(10)

In [None]:
# Average employment-to-population ratio per province
Employed_population = Employed_data.groupby('Province', as_index=False)['Total'].mean()
Employed_population

In [None]:
# Bar chart of the average employment-to-population ratio per province
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=Employed_population, x='Province', y='Total', ax=ax)
ax.set_title('Comparison of Employment Rate of Provinces')
ax.set_xlabel('Province')
# The plotted value is a percentage ratio, not a head-count.
ax.set_ylabel('Employment to Population Ratio (%)')
plt.show()


#### OBSERVATION:
Sindh has the highest employment-to-population ratio.

In [None]:
# Rows that report the unemployment rate
is_unemployment_rate = df['Indicator'] == 'Unemployment Rate (%)'
Unemployed_data = df[is_unemployment_rate]
Unemployed_data.head(10)

In [None]:
# Average unemployment rate per province
Unemployed_Rate = Unemployed_data.groupby('Province', as_index=False)['Total'].mean()
Unemployed_Rate

In [None]:
# Bar chart of the average unemployment rate per province
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=Unemployed_Rate, x='Province', y='Total', ax=ax)
ax.set_title('Comparison of Unemployment Rate of Provinces')
ax.set_xlabel('Province')
# The plotted value is a percentage rate, not a head-count.
ax.set_ylabel('Unemployment Rate (%)')
plt.show()


#### OBSERVATION:
KPK has the highest Unemployment Rate

In [None]:
# Rows that report the literacy rate.
# Only 'Literacy Rate (%)' is selected: also admitting the
# 'Employment to Population Ratio (%)' rows would blend employment ratios into
# every literacy average computed from this frame.
Literacy_data = df[df['Indicator'] == 'Literacy Rate (%)']
Literacy_data.head(10)

In [None]:
# Mean of 'Total' over the Literacy_data rows, per province
Literacy_Rate = Literacy_data.groupby('Province', as_index=False)['Total'].mean()
Literacy_Rate

In [None]:
# Bar chart of the per-province averages
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=Literacy_Rate, x='Province', y='Total', ax=ax)
ax.set_title('Avg Literacy Rate of provinces')
ax.set_ylabel('Literacy Rate')
ax.set_xlabel('Provinces')


#### OBSERVATION:
Punjab has the highest literacy rate and KPK has the lowest.

In [None]:
# Mean of 'Total' over the Literacy_data rows, per area type
Literacy_Rate_Areas = Literacy_data.groupby('Area Type', as_index=False)['Total'].mean()
Literacy_Rate_Areas

In [None]:
# Bar chart of the per-area-type averages
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(data=Literacy_Rate_Areas, x='Area Type', y='Total', ax=ax)
ax.set_title('Avg Literacy Rate of Urban vs Rural Areas')
ax.set_ylabel('Literacy Rate')
ax.set_xlabel('Area Types')

#### OBSERVATION:
The literacy rate of urban areas is higher than that of rural areas.

In [None]:
# Five districts with the highest mean 'Total' over the Literacy_data rows
district_means = Literacy_data.groupby('District', as_index=False)['Total'].mean()
Literacy_Rate_District = district_means.sort_values(by='Total', ascending=False).head(5)
Literacy_Rate_District

In [None]:
# Bar chart of the five top-ranked districts
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(data=Literacy_Rate_District, x='District', y='Total', ax=ax)
ax.set_title('Avg Literacy Rate of Districts(Top 5)')
ax.set_ylabel('Literacy Rate')
ax.set_xlabel('Districts')

#### OBSERVATION:
Islamabad has the highest literacy rate among all the districts.