In [8]:
#import python libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import calendar
%pip install plotly
import plotly.express as px

ModuleNotFoundError: No module named 'numpy'

In [None]:
# import csv file

ua = pd.read_csv("H:/Unemployment_Rate_upto_11_2020.csv")

# EXPLORING THE DATA

In [None]:
ua.shape

In [None]:
ua.head()

In [None]:
ua.info()

In [None]:
#check for null values

pd.isnull(ua).sum()

NameError: name 'pd' is not defined

In [None]:
#drop null values

ua.dropna(inplace=True)

NameError: name 'ua' is not defined

In [None]:
ua.shape

In [None]:
#describe() returns description of the data

ua.describe()

# DATA PREPROCESSING

In [None]:
# Updating Column Names

ua.columns = ua.columns.str.strip()  # Remove leading and trailing whitespaces from column names

ua.rename(columns={
    'Region': 'State',
    'Date': 'Date',
    'Frequency': 'Frequency',
    'Estimated Unemployment Rate (%)': 'Unemployment_Rate',
    'Estimated Employed': 'Employed',
    'Estimated Labour Participation Rate (%)': 'Labor_Participation_Rate',
    'Region.1': 'Region',
    'Longitude': 'Latitude',
    'Latitude': 'Longitude'
}, inplace=True)

print(ua.columns)

In [None]:
ua.head()

In [None]:
ua.State.value_counts()

In [None]:
ua['Date'] = pd.to_datetime(ua['Date'],dayfirst=True)  # Converting 'Date' column to datetime

# Creating a new column for the month
ua['Month'] = ua['Date'].dt.month

print(ua.head())

# DATA VISUALIZATION

In [None]:
# Histogram of the Unemployment Rate
plt.figure(figsize=(8, 6))
sns.histplot(ua['Unemployment_Rate'], bins=20, kde=True)
plt.title('Histogram of Unemployment Rate')
plt.xlabel('Unemployment Rate')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Line plot of Unemployment Rate over time (Date)
plt.figure(figsize=(10, 6))
sns.lineplot(x='Date', y='Unemployment_Rate', data=ua)
plt.title('Unemployment Rate over Time')
plt.xlabel('Month')
plt.ylabel('Unemployment Rate')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Box plot of Unemployment Rate by Region
plt.figure(figsize=(10, 6))
sns.boxplot(x='Region', y='Unemployment_Rate', data=ua)
plt.title('Unemployment Rate by Region')
plt.xlabel('Region')
plt.ylabel('Unemployment Rate')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Scatter plot of Employed vs. Labor Participation Rate
plt.figure(figsize=(8, 6))
sns.scatterplot(x='Employed', y='Labor_Participation_Rate', data=ua)
plt.title('Employed vs. Labor Participation Rate')
plt.xlabel('Employed')
plt.ylabel('Labor Participation Rate')
plt.show()

In [None]:
monthly_avg_unemployment = ua.groupby('Month')['Unemployment_Rate'].mean()
# Plotting the monthly average of Unemployment Rate
plt.figure(figsize=(8, 6))
sns.barplot(x=monthly_avg_unemployment.index, y=monthly_avg_unemployment.values)
plt.title('Monthly Average Unemployment Rate')
plt.xlabel('Month')
plt.ylabel('Average Unemployment Rate')
plt.xticks(np.arange(0, 12), calendar.month_abbr[1:13], rotation=45)
plt.show()

In [None]:
# Exclude non-numeric columns from correlation calculation
numeric_columns = ua.select_dtypes(include=[np.number])

# Calculate correlation matrix for numeric columns
correlation_matrix = numeric_columns.corr()

# Plotting the correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix Heatmap')
plt.show()

In [None]:
numerical_columns = ['Unemployment_Rate', 'Employed', 'Labor_Participation_Rate', 'longitude', 'latitude']
sns.pairplot(ua[numerical_columns])
plt.title('Pairplot of Numerical Variables')
plt.show()

In [None]:
fig = px.scatter_geo(ua, 
                     lat='longitude', lon='latitude', color="State",
                     hover_name="State", size="Unemployment_Rate",
                     animation_frame="Month", scope='asia',
                     title='Impact of Lockdown on Employment in India',
                     projection='natural earth',
                     color_continuous_scale='viridis',
                     size_max=30)
fig.update_geos(
    showcoastlines=True, coastlinecolor="RebeccaPurple", coastlinewidth=1,
    showland=True, landcolor="LightGreen", 
    showocean=True, oceancolor="LightBlue", 
    showcountries=True, countrycolor="Black", countrywidth=1, 
    showlakes=True, lakecolor="LightBlue"
)

# Setting center around India and increase the scale for closer view
fig.update_geos(center=dict(lon=78, lat=23), projection_scale=4)

fig.show()

In [None]:
state_unemployment = ua.groupby('State')['Unemployment_Rate'].mean().reset_index()

# Sort the data in descending order to find the state with the highest unemployment rate
state_unemployment_sorted = state_unemployment.sort_values(by='Unemployment_Rate', ascending=False)

# Plot the bar plot
plt.figure(figsize=(12, 6))
sns.barplot(x='State', y='Unemployment_Rate', data=state_unemployment_sorted)
plt.xticks(rotation=90)
plt.xlabel('State')
plt.ylabel('Average Unemployment Rate')
plt.title('Average Unemployment Rate by State')
plt.tight_layout()
plt.show()

# CONCLUSION

### Based on the EDA and the visualizations of the unemployment rate data, we can draw the following conclusions:

### Urban Areas: 
The analysis indicates that most of the high unemployment rates are observed in urban areas. This could be attributed to various factors such as a higher population density, more industries, and a larger labor force competing for limited job opportunities.

### Regional Variation: 
The southern region of India shows a higher average unemployment rate compared to other regions. This regional disparity may be influenced by economic and industrial variations across different parts of the country.

### States with Highest Unemployment Rates: 
The top five states with the highest average unemployment rates are Haryana, Tripura, Jharkhand, Bihar, and Delhi. These states might be facing specific economic challenges and job market issues, contributing to elevated unemployment rates.