# 📉 Unemployment Analysis with Python
## CodeAlpha Data Science Internship — Task 2

| Detail | Info |
|--------|------|
| **Intern** | Mohammed Abuzar |
| **Company** | CodeAlpha |
| **Task** | Task 2 — Unemployment Analysis |
| **Dataset** | India Unemployment 2019–2020 |

In [None]:
# Ensure both input CSVs are available before any later cell reads them.
# Every file the notebook loads is checked, so a missing second dataset is
# caught here rather than at pd.read_csv time.
import os

required_csvs = ['Unemployment_in_India.csv', 'Unemployment_Rate_upto_11_2020.csv']
missing_csvs = [name for name in required_csvs if not os.path.exists(name)]
if missing_csvs:
    try:
        # google.colab is only importable inside a Colab runtime.
        from google.colab import files
    except ImportError as err:
        # Outside Colab there is no upload widget: fail with an actionable message.
        raise FileNotFoundError(
            f'Missing CSV files: {missing_csvs}. Place them in the working directory.'
        ) from err
    print('Upload both CSV files...')
    uploaded = files.upload()
    print('Files uploaded:', list(uploaded.keys()))
else:
    print('CSV files already present!')

---
## 📦 1. Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
# Plot styling applied to every figure created after this cell.
plt.style.use('seaborn-v0_8-whitegrid')

# Accent colours reused by the charts: red for Covid-period elements,
# blue for the pre-Covid baseline.
COVID_RED = '#e74c3c'
PRE_BLUE = '#3498db'

print('All libraries imported!')

---
## 📂 2. Load & Clean Datasets

In [None]:
# Dataset 1: load, drop incomplete rows, and normalise text/date columns.
df1 = pd.read_csv('Unemployment_in_India.csv')
df1.columns = df1.columns.str.strip()
df1 = df1.dropna().reset_index(drop=True)

# Trim surrounding whitespace from the text columns so that date parsing
# and equality filters (e.g. Area == 'Rural') match exactly.
for text_col in ('Region', 'Date', 'Frequency', 'Area'):
    df1[text_col] = df1[text_col].astype(str).str.strip()

# Parse day-month-year dates and derive calendar parts used for grouping.
df1['Date'] = pd.to_datetime(df1['Date'], format='%d-%m-%Y')
df1['Month'] = df1['Date'].dt.month
df1['Year'] = df1['Date'].dt.year

# Shorter column names for the metrics used throughout the notebook.
df1.rename(
    columns={
        'Estimated Unemployment Rate (%)': 'Unemployment_Rate',
        'Estimated Employed': 'Employed',
        'Estimated Labour Participation Rate (%)': 'Labour_Participation',
    },
    inplace=True,
)

print('Dataset 1:', df1.shape)
df1.head(3)

In [None]:
# Dataset 2: load and normalise text/date columns (no rows are dropped here).
df2 = pd.read_csv('Unemployment_Rate_upto_11_2020.csv')
df2.columns = df2.columns.str.strip()

# Trim surrounding whitespace from the text columns before parsing/grouping.
for text_col in ('Region', 'Date', 'Region.1'):
    df2[text_col] = df2[text_col].astype(str).str.strip()

# Parse day-month-year dates and derive calendar parts used for grouping.
df2['Date'] = pd.to_datetime(df2['Date'], format='%d-%m-%Y')
df2['Month'] = df2['Date'].dt.month
df2['Year'] = df2['Date'].dt.year

# Shorter metric names; the second region column becomes 'Zone'.
df2.rename(
    columns={
        'Estimated Unemployment Rate (%)': 'Unemployment_Rate',
        'Estimated Employed': 'Employed',
        'Estimated Labour Participation Rate (%)': 'Labour_Participation',
        'Region.1': 'Zone',
    },
    inplace=True,
)

print('Dataset 2:', df2.shape)
df2.head(3)

---
## 🔍 3. EDA — National Trend

In [None]:
# National trend: mean unemployment rate over all df1 rows for each date,
# with the period from March 2020 onward highlighted.
national = (
    df1.groupby('Date')['Unemployment_Rate']
    .mean()
    .reset_index()
    .sort_values('Date')
)
covid_start = pd.to_datetime('2020-03-01')

fig, ax = plt.subplots(figsize=(13, 5))

# Shaded band plus dashed boundary marking the start of the highlighted period.
ax.axvspan(covid_start, national['Date'].max(), alpha=0.12, color=COVID_RED)
ax.axvline(covid_start, color=COVID_RED, linestyle='--', linewidth=2)

ax.plot(
    national['Date'],
    national['Unemployment_Rate'],
    color=PRE_BLUE,
    linewidth=2.5,
    marker='o',
    markersize=7,
    markerfacecolor='white',
    markeredgewidth=2,
    label='Avg Unemployment',
)
ax.fill_between(national['Date'], national['Unemployment_Rate'], alpha=0.15, color=PRE_BLUE)

# Annotate the date with the highest average rate; the label sits 3 points below it.
peak = national.loc[national['Unemployment_Rate'].idxmax()]
ax.annotate(
    f"Peak: {peak['Unemployment_Rate']:.1f}%",
    xy=(peak['Date'], peak['Unemployment_Rate']),
    xytext=(peak['Date'], peak['Unemployment_Rate'] - 3),
    fontsize=11,
    fontweight='bold',
    color=COVID_RED,
    arrowprops=dict(arrowstyle='->', color=COVID_RED),
)

# Title uses a real em dash / en dash (the previous text was mis-encoded).
ax.set_title('India National Unemployment Rate — Monthly Trend (2019–2020)', fontsize=14, fontweight='bold')
ax.set_xlabel('Month')
ax.set_ylabel('Unemployment Rate (%)')
ax.legend()
ax.xaxis.set_major_formatter(plt.matplotlib.dates.DateFormatter('%b %Y'))
plt.xticks(rotation=30)
plt.tight_layout()
plt.savefig('national_trend.png', dpi=100, bbox_inches='tight')
plt.show()
print('Saved: national_trend.png')

In [None]:
# Two histograms of the unemployment rate in df1:
# left = all rows with mean/median markers, right = Rural vs Urban overlay.
fig, axes = plt.subplots(1, 2, figsize=(13, 5))
fig.suptitle('Unemployment Rate Distribution', fontsize=14, fontweight='bold')
overall_ax, area_ax = axes

# --- Left panel: overall distribution ---
overall_ax.hist(df1['Unemployment_Rate'], bins=25, color=PRE_BLUE, edgecolor='white', alpha=0.8)
overall_ax.axvline(
    df1['Unemployment_Rate'].mean(),
    color=COVID_RED,
    linestyle='--',
    linewidth=2,
    label=f"Mean: {df1['Unemployment_Rate'].mean():.1f}%",
)
overall_ax.axvline(
    df1['Unemployment_Rate'].median(),
    color='#2ecc71',
    linestyle='--',
    linewidth=2,
    label=f"Median: {df1['Unemployment_Rate'].median():.1f}%",
)
overall_ax.set_title('Overall Distribution', fontsize=12, fontweight='bold')
overall_ax.set_xlabel('Unemployment Rate (%)')
overall_ax.legend()

# --- Right panel: one translucent histogram per area type ---
area_colors = {'Rural': '#2ecc71', 'Urban': '#e74c3c'}
for area_name, area_color in area_colors.items():
    area_rates = df1[df1['Area'] == area_name]['Unemployment_Rate']
    area_ax.hist(
        area_rates,
        bins=20,
        alpha=0.6,
        color=area_color,
        edgecolor='white',
        label=f"{area_name} (mean:{area_rates.mean():.1f}%)",
    )
area_ax.set_title('Rural vs Urban', fontsize=12, fontweight='bold')
area_ax.set_xlabel('Unemployment Rate (%)')
area_ax.legend()

plt.tight_layout()
plt.savefig('distribution_analysis.png', dpi=100, bbox_inches='tight')
plt.show()
print('Saved: distribution_analysis.png')

In [None]:
# Average unemployment rate per calendar month, pooled over every year in df1.
ma = df1.groupby('Month')['Unemployment_Rate'].mean()
ml = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Because months are pooled across years, a month number alone does not say
# whether a bar reflects the Covid period. Colour a bar red only when that
# calendar month actually contains observations dated March 2020 or later;
# months made up solely of earlier data stay blue.
covid_months = set(df1.loc[df1['Date'] >= pd.to_datetime('2020-03-01'), 'Month'])
cols = [COVID_RED if m in covid_months else PRE_BLUE for m in ma.index]

fig, ax = plt.subplots(figsize=(11, 5))
bars = ax.bar(ma.index, ma.values, color=cols, alpha=0.85, edgecolor='white')
for bar, value in zip(bars, ma.values):
    # Value label centred just above each bar.
    ax.text(
        bar.get_x() + bar.get_width() / 2.,
        bar.get_height() + 0.2,
        f'{value:.1f}%',
        ha='center',
        fontsize=9,
        fontweight='bold',
    )

ax.set_title('Average Unemployment Rate by Month', fontsize=14, fontweight='bold')
ax.set_xticks(ma.index)
ax.set_xticklabels([ml[m - 1] for m in ma.index])
ax.legend(handles=[
    mpatches.Patch(color=PRE_BLUE, label='Pre-Covid data only'),
    mpatches.Patch(color=COVID_RED, label='Includes Covid-period data'),
])
plt.tight_layout()
plt.savefig('seasonal_patterns.png', dpi=100, bbox_inches='tight')
plt.show()
print('Saved: seasonal_patterns.png')

---
## 🦠 4. Covid-19 Impact Analysis

In [None]:
# Split df1 at the lockdown date and compare the mean unemployment rate of
# the two periods. lockdown_date, pre_covid, post_covid, pre_mean, post_mean
# and increase are reused by later cells.
lockdown_date = pd.to_datetime('2020-03-25')
pre_covid = df1[df1['Date'] < lockdown_date].copy()
post_covid = df1[df1['Date'] >= lockdown_date].copy()

pre_mean = pre_covid['Unemployment_Rate'].mean()
post_mean = post_covid['Unemployment_Rate'].mean()
increase = ((post_mean - pre_mean) / pre_mean) * 100  # relative change, in percent

separator = '=' * 50
print(separator)
print('COVID-19 IMPACT SUMMARY')
print(separator)
print(f'Pre-Covid  Avg: {pre_mean:.2f}%')
print(f'Post-Covid Avg: {post_mean:.2f}%')
# The '+' format flag prints the correct sign for either direction, and the
# absolute difference is labelled in percentage points so it is not confused
# with the relative change shown in parentheses.
print(f'Increase      : {post_mean - pre_mean:+.2f} pp ({increase:+.1f}%)')
print(separator)

In [None]:
# Lockdown impact figure: overall before/after means (left) and a per-state
# comparison for the states with the highest pre-Covid mean (right).
# Relies on pre_mean, post_mean, pre_covid and post_covid from the summary cell.
fig, axes = plt.subplots(1, 2, figsize=(13, 5))
fig.suptitle('Covid-19 Impact on Unemployment', fontsize=14, fontweight='bold')

# --- Left panel: overall mean before vs after the lockdown date ---
# The second label uses a real en dash (the previous text was mis-encoded).
cats = ['Pre-Covid\n(up to Mar 2020)', 'Post-Lockdown\n(Mar–Jun 2020)']
period_means = [pre_mean, post_mean]
bars = axes[0].bar(cats, period_means, color=[PRE_BLUE, COVID_RED], alpha=0.85, edgecolor='white', width=0.5)
for bar, value in zip(bars, period_means):
    axes[0].text(
        bar.get_x() + bar.get_width() / 2.,
        bar.get_height() + 0.3,
        f'{value:.2f}%',
        ha='center',
        fontsize=13,
        fontweight='bold',
    )
axes[0].set_title('Before vs After Lockdown', fontsize=12, fontweight='bold')
axes[0].set_ylim(0, post_mean * 1.4)  # headroom for the value labels

# --- Right panel: top eight states by pre-Covid mean, paired with post-Covid ---
sp = pre_covid.groupby('Region')['Unemployment_Rate'].mean().sort_values(ascending=False).head(8)
spo = post_covid.groupby('Region')['Unemployment_Rate'].mean()
# Keep only states present in both periods and align the two series on them.
c2 = sp.index.intersection(spo.index)
sp = sp[c2]
spo = spo[c2].reindex(sp.index)

x = np.arange(len(c2))
w = 0.38
axes[1].bar(x - w / 2, sp.values, w, label='Pre-Covid', color=PRE_BLUE, alpha=0.8, edgecolor='white')
axes[1].bar(x + w / 2, spo.values, w, label='Post-Covid', color=COVID_RED, alpha=0.8, edgecolor='white')
axes[1].set_xticks(x)
axes[1].set_xticklabels(c2, rotation=45, ha='right', fontsize=8)
axes[1].set_title('States: Pre vs Post Covid', fontsize=12, fontweight='bold')
axes[1].legend()

plt.tight_layout()
plt.savefig('covid_impact.png', dpi=100, bbox_inches='tight')
plt.show()
print('Saved: covid_impact.png')

In [None]:
# Mean unemployment rate for each month of 2020 in df1.
m2020 = df1[df1['Year'] == 2020].groupby('Month')['Unemployment_Rate'].mean()

# Label each bar from its month number rather than its position, so the
# labels stay correct if a month is absent or data beyond June is present.
ml2 = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_labels = [ml2[m - 1] for m in m2020.index]

# April onward is drawn as "after lockdown".
bc = [COVID_RED if m >= 4 else PRE_BLUE for m in m2020.index]

fig, ax = plt.subplots(figsize=(10, 5))
bars = ax.bar(month_labels, m2020.values, color=bc, alpha=0.85, edgecolor='white')
for bar, value in zip(bars, m2020.values):
    ax.text(
        bar.get_x() + bar.get_width() / 2.,
        bar.get_height() + 0.3,
        f'{value:.1f}%',
        ha='center',
        fontsize=12,
        fontweight='bold',
    )

# Title uses a real em dash (the previous text was mis-encoded).
ax.set_title('2020 Monthly Unemployment Spike — Covid Impact', fontsize=14, fontweight='bold')
ax.legend(handles=[
    mpatches.Patch(color=PRE_BLUE, label='Before Lockdown'),
    mpatches.Patch(color=COVID_RED, label='After Lockdown'),
])
plt.tight_layout()
plt.savefig('monthly_2020_spike.png', dpi=100, bbox_inches='tight')
plt.show()
print('Saved: monthly_2020_spike.png')

---
## 🗺️ 5. Zone & State Analysis

In [None]:
# Zone-level view of df2: a trend line per zone (left) and the overall
# average per zone (right). Zones without an entry in zcols fall back to '#333'.
zcols = {
    'South': '#e74c3c',
    'North': '#3498db',
    'East': '#2ecc71',
    'West': '#f39c12',
    'Northeast': '#9b59b6',
}
zavg = df2.groupby('Zone')['Unemployment_Rate'].mean().sort_values(ascending=False)
zt = df2.groupby(['Date', 'Zone'])['Unemployment_Rate'].mean().reset_index()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle('Zone-wise Unemployment Analysis (2020)', fontsize=14, fontweight='bold')
trend_ax, avg_ax = axes

# --- Left panel: one line per zone, in alphabetical order ---
for zone in sorted(df2['Zone'].unique()):
    zone_series = zt[zt['Zone'] == zone].sort_values('Date')
    trend_ax.plot(
        zone_series['Date'],
        zone_series['Unemployment_Rate'],
        linewidth=2.5,
        marker='o',
        markersize=5,
        label=zone,
        color=zcols.get(zone, '#333'),
    )
trend_ax.axvline(
    pd.to_datetime('2020-03-25'),
    color='black',
    linestyle='--',
    linewidth=2,
    alpha=0.7,
    label='Lockdown',
)
trend_ax.set_title('Zone Trends Over Time', fontsize=12, fontweight='bold')
trend_ax.legend(fontsize=9)
trend_ax.xaxis.set_major_formatter(plt.matplotlib.dates.DateFormatter('%b'))
plt.setp(trend_ax.xaxis.get_majorticklabels(), rotation=30)

# --- Right panel: horizontal bars of the per-zone average ---
zc = [zcols.get(zone_name, '#333') for zone_name in zavg.index]
bars = avg_ax.barh(zavg.index, zavg.values, color=zc, alpha=0.85, edgecolor='white')
for bar, value in zip(bars, zavg.values):
    avg_ax.text(
        bar.get_width() + 0.3,
        bar.get_y() + bar.get_height() / 2,
        f'{value:.1f}%',
        va='center',
        fontsize=11,
        fontweight='bold',
    )
avg_ax.set_title('Average by Zone', fontsize=12, fontweight='bold')
avg_ax.set_xlim(0, zavg.max() + 8)  # room for the value labels

plt.tight_layout()
plt.savefig('zone_analysis.png', dpi=100, bbox_inches='tight')
plt.show()
print('Saved: zone_analysis.png')

In [None]:
# State-by-month matrix of the mean unemployment rate in df2, drawn as a heatmap.
sm = df2.pivot_table(values='Unemployment_Rate', index='Region', columns='Month', aggfunc='mean').round(1)

# The pivot's columns are month numbers. Translate each number to its name
# instead of assigning a fixed-length list positionally: this cannot raise a
# length mismatch when more than ten months are present and cannot shift
# labels when a month is missing.
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
sm.columns = [month_names[int(m) - 1] for m in sm.columns]

fig, ax = plt.subplots(figsize=(13, 9))
sns.heatmap(
    sm,
    annot=True,
    fmt='.1f',
    cmap='RdYlGn_r',
    linewidths=0.4,
    linecolor='white',
    ax=ax,
    annot_kws={'size': 8, 'weight': 'bold'},
)
# Title uses a real multiplication sign (the previous text was mis-encoded).
ax.set_title('State × Month Unemployment Heatmap (2020)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('state_heatmap.png', dpi=100, bbox_inches='tight')
plt.show()
print('Saved: state_heatmap.png')

In [None]:
# Rank states by mean unemployment rate in df2 (highest first) and chart the
# five highest and five lowest. state_avg_all is reused by the final summary.
state_avg_all = df2.groupby('Region')['Unemployment_Rate'].mean().sort_values(ascending=False)
highest_five = state_avg_all.head(5)
lowest_five = state_avg_all.tail(5)

fig, axes = plt.subplots(1, 2, figsize=(13, 5))
fig.suptitle('Best & Worst States by Unemployment (2020)', fontsize=14, fontweight='bold')
high_ax, low_ax = axes

# --- Left panel: five highest, reversed so the largest bar is drawn on top ---
high_values = highest_five.values[::-1]
high_bars = high_ax.barh(highest_five.index[::-1], high_values, color=COVID_RED, alpha=0.85, edgecolor='white')
for bar, value in zip(high_bars, high_values):
    high_ax.text(
        bar.get_width() + 0.3,
        bar.get_y() + bar.get_height() / 2,
        f'{value:.1f}%',
        va='center',
        fontsize=11,
        fontweight='bold',
    )
high_ax.set_title('Top 5 Highest', fontsize=12, fontweight='bold')
high_ax.set_xlim(0, highest_five.max() + 10)

# --- Right panel: five lowest, drawn on the same x-range as the left panel ---
low_bars = low_ax.barh(lowest_five.index, lowest_five.values, color='#2ecc71', alpha=0.85, edgecolor='white')
for bar, value in zip(low_bars, lowest_five.values):
    low_ax.text(
        bar.get_width() + 0.1,
        bar.get_y() + bar.get_height() / 2,
        f'{value:.1f}%',
        va='center',
        fontsize=11,
        fontweight='bold',
    )
low_ax.set_title('Top 5 Lowest', fontsize=12, fontweight='bold')
low_ax.set_xlim(0, highest_five.max() + 10)

plt.tight_layout()
plt.savefig('top_bottom_states.png', dpi=100, bbox_inches='tight')
plt.show()
print('Saved: top_bottom_states.png')

---
## 🏘️ 6. Rural vs Urban Analysis

In [None]:
# Rural vs Urban comparison on df1: per-date trend lines (left) and
# pre/post-lockdown means per area (right). Uses lockdown_date from the
# Covid summary cell.
rural = (
    df1[df1['Area'] == 'Rural']
    .groupby('Date')['Unemployment_Rate']
    .mean()
    .reset_index()
    .sort_values('Date')
)
urban = (
    df1[df1['Area'] == 'Urban']
    .groupby('Date')['Unemployment_Rate']
    .mean()
    .reset_index()
    .sort_values('Date')
)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle('Rural vs Urban Unemployment', fontsize=14, fontweight='bold')
trend_ax, compare_ax = axes

# --- Left panel: one line (with a light fill) per area type ---
for trend, area_label, line_color in ((rural, 'Rural', '#2ecc71'), (urban, 'Urban', '#e74c3c')):
    trend_ax.plot(
        trend['Date'],
        trend['Unemployment_Rate'],
        linewidth=2.5,
        marker='o',
        markersize=6,
        label=f"{area_label} (mean:{trend['Unemployment_Rate'].mean():.1f}%)",
        color=line_color,
        markerfacecolor='white',
        markeredgewidth=2,
    )
    trend_ax.fill_between(trend['Date'], trend['Unemployment_Rate'], alpha=0.1, color=line_color)
trend_ax.axvline(lockdown_date, color='black', linestyle='--', linewidth=2, alpha=0.6)
trend_ax.set_title('Trend Over Time', fontsize=12, fontweight='bold')
trend_ax.legend(fontsize=10)
trend_ax.xaxis.set_major_formatter(plt.matplotlib.dates.DateFormatter('%b %Y'))
plt.setp(trend_ax.xaxis.get_majorticklabels(), rotation=30)

# --- Right panel: grouped bars of the mean rate before/after the lockdown ---
areas = ['Rural', 'Urban']
is_before = df1['Date'] < lockdown_date
is_after = df1['Date'] >= lockdown_date
pv = [df1[(df1['Area'] == name) & is_before]['Unemployment_Rate'].mean() for name in areas]
qv = [df1[(df1['Area'] == name) & is_after]['Unemployment_Rate'].mean() for name in areas]

x = np.arange(2)
w = 0.35
compare_ax.bar(x - w / 2, pv, w, label='Pre-Covid', color=PRE_BLUE, alpha=0.85, edgecolor='white')
compare_ax.bar(x + w / 2, qv, w, label='Post-Covid', color=COVID_RED, alpha=0.85, edgecolor='white')
for position, (before_value, after_value) in enumerate(zip(pv, qv)):
    # Value labels sit just above each bar, coloured like the bar.
    compare_ax.text(
        position - w / 2,
        before_value + 0.3,
        f'{before_value:.1f}%',
        ha='center',
        fontsize=11,
        fontweight='bold',
        color=PRE_BLUE,
    )
    compare_ax.text(
        position + w / 2,
        after_value + 0.3,
        f'{after_value:.1f}%',
        ha='center',
        fontsize=11,
        fontweight='bold',
        color=COVID_RED,
    )
compare_ax.set_xticks(x)
compare_ax.set_xticklabels(areas, fontsize=13)
compare_ax.legend()
compare_ax.set_title('Pre vs Post Covid by Area', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig('rural_urban_analysis.png', dpi=100, bbox_inches='tight')
plt.show()
print('Saved: rural_urban_analysis.png')

---
## 📈 7. Forecasting

In [None]:
# Fit a cubic polynomial to the per-date mean unemployment rate in df2 and
# extrapolate four months ahead. The feature is "days since the first date".
# r2 is reused by the final summary cell.
mn = df2.groupby('Date')['Unemployment_Rate'].mean().reset_index().sort_values('Date')
start = mn['Date'].min()
mn['days'] = (mn['Date'] - start).dt.days
Xt = mn['days'].values.reshape(-1, 1)
yt = mn['Unemployment_Rate'].values

poly = make_pipeline(PolynomialFeatures(3), LinearRegression())
poly.fit(Xt, yt)

# Forecast dates are month starts following the last observation. The model
# inputs are derived from these same dates so every forecast value is plotted
# and labelled at exactly the date it was predicted for (using fixed 30-day
# steps instead would predict for different days than the ones shown).
fdates = pd.date_range(mn['Date'].max() + pd.DateOffset(months=1), periods=4, freq='MS')
fd = np.asarray((fdates - start).days).reshape(-1, 1)
fp = poly.predict(fd)
# Negative rates are not meaningful, so reported forecasts are floored at 0.
fp_shown = [max(0, p) for p in fp]

# Dense grid from the first observation to the last forecast date, used to
# draw the fitted curve through both the data and the forecast points.
ad = np.linspace(0, fd.max(), 200).reshape(-1, 1)
adt = [start + pd.Timedelta(days=int(d)) for d in ad.flatten()]
ap = poly.predict(ad)

# In-sample fit quality (computed on the training points themselves).
fitted = poly.predict(Xt)
r2 = r2_score(yt, fitted)
rmse = np.sqrt(mean_squared_error(yt, fitted))
print(f'R2={r2:.4f}  RMSE={rmse:.4f}')
print('Forecast:')
for forecast_date, forecast_value in zip(fdates, fp_shown):
    print(f'  {forecast_date.strftime("%b %Y")} -> {forecast_value:.2f}%')

fig, ax = plt.subplots(figsize=(13, 6))
lockdown = pd.to_datetime('2020-03-25')
# Background bands: before lockdown, after lockdown, and the forecast horizon.
ax.axvspan(mn['Date'].min(), lockdown, alpha=0.08, color=PRE_BLUE)
ax.axvspan(lockdown, mn['Date'].max(), alpha=0.08, color=COVID_RED)
ax.axvspan(mn['Date'].max(), fdates[-1], alpha=0.08, color='#9b59b6')
ax.axvline(lockdown, color=COVID_RED, linestyle='--', linewidth=2)

ax.scatter(mn['Date'], yt, color='black', s=70, zorder=5, label='Actual', edgecolors='white')
ax.plot(adt, ap, color='#9b59b6', linewidth=2.5, label='Poly Fit+Forecast')
ax.scatter(fdates, fp_shown, color='#9b59b6', s=90, zorder=6, marker='D', edgecolors='white', label='Forecast')
for forecast_date, forecast_value in zip(fdates, fp_shown):
    ax.text(
        forecast_date,
        forecast_value + 0.5,
        f'{forecast_value:.1f}%',
        ha='center',
        fontsize=9,
        fontweight='bold',
        color='#9b59b6',
    )

# Text uses a real superscript two and em dash (previously mis-encoded).
ax.text(
    0.02,
    0.97,
    f'R²={r2:.3f} | RMSE={rmse:.2f}%',
    transform=ax.transAxes,
    fontsize=11,
    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8),
)
ax.set_title('Unemployment Forecast — Polynomial Regression', fontsize=14, fontweight='bold')
ax.set_xlabel('Month')
ax.set_ylabel('Unemployment Rate (%)')
ax.legend()
ax.xaxis.set_major_formatter(plt.matplotlib.dates.DateFormatter('%b %Y'))
plt.xticks(rotation=30)
plt.tight_layout()
plt.savefig('forecast.png', dpi=100, bbox_inches='tight')
plt.show()
print('Saved: forecast.png')

---
## ✅ 8. Policy Insights & Conclusions

### 🔑 Key Findings
1. **Covid-19 caused the sharpest unemployment spike** in the dataset
2. **Urban unemployment was consistently higher** than rural
3. **Labour Force Participation dropped sharply** during lockdown
4. **Recovery was underway by June 2020**
5. **Significant state-level variation** — targeted interventions needed

### 🏛️ Policy Recommendations
1. Expand MGNREGS rural job guarantee during crises
2. Portable benefits for urban informal/gig workers
3. State-specific programs for high-unemployment regions
4. Women re-entry programs to recover Labour Force Participation
5. Real-time unemployment dashboards for faster responses

---
*📉 CodeAlpha Data Science Internship | Task 2*

In [None]:
# Final recap of the headline numbers computed in the earlier cells
# (df1, df2, pre_mean, post_mean, increase, state_avg_all, r2).
rule = '=' * 55
print(rule)
# Real em dash in the banner (the previous text was mis-encoded).
print('  UNEMPLOYMENT ANALYSIS — FINAL SUMMARY')
print(rule)
print(f'  Dataset 1 records : {df1.shape[0]}')
print(f'  Dataset 2 records : {df2.shape[0]}')
print(f'  Pre-Covid Avg     : {pre_mean:.2f}%')
print(f'  Post-Covid Avg    : {post_mean:.2f}%')
# '+' format flag so the sign is right whether the rate rose or fell.
print(f'  Covid Impact      : {increase:+.1f}%')
# state_avg_all is sorted highest-first: last entry = lowest rate, first = highest.
print(f'  Best State        : {state_avg_all.index[-1]} ({state_avg_all.iloc[-1]:.1f}%)')
print(f'  Worst State       : {state_avg_all.index[0]} ({state_avg_all.iloc[0]:.1f}%)')
print(f'  Forecast R2       : {r2:.4f}')
print('  Plots: national_trend, distribution_analysis, seasonal_patterns,')
print('         covid_impact, monthly_2020_spike, zone_analysis,')
print('         state_heatmap, top_bottom_states, rural_urban_analysis, forecast')
print(rule)
print('Task 2 Complete — CodeAlpha Internship!')