# Task 02 (Unemployement Data Analysis)


In [1]:
# Importing necessary libraries
import numpy as np               
import pandas as pd              
import matplotlib.pyplot as plt   
import seaborn as sns             
from sklearn.model_selection import train_test_split  
from sklearn.linear_model import LinearRegression     
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Importing first Dataset
url1='https://raw.githubusercontent.com/MAliHasnain/OIBSIP/main/OIBSIP-Task%202/Unemployment%20in%20India.csv'
df1 = pd.read_csv(url1)
df1.head()

Unnamed: 0,Region,Date,Frequency,Estimated Unemployment Rate (%),Estimated Employed,Estimated Labour Participation Rate (%),Area
0,Andhra Pradesh,31-05-2019,Monthly,3.65,11999139.0,43.24,Rural
1,Andhra Pradesh,30-06-2019,Monthly,3.05,11755881.0,42.05,Rural
2,Andhra Pradesh,31-07-2019,Monthly,3.75,12086707.0,43.5,Rural
3,Andhra Pradesh,31-08-2019,Monthly,3.32,12285693.0,43.97,Rural
4,Andhra Pradesh,30-09-2019,Monthly,5.17,12256762.0,44.68,Rural


In [3]:
# Importing Second Dataset.
url2 = "https://raw.githubusercontent.com/MAliHasnain/OIBSIP/main/OIBSIP-Task%202/Unemployment_Rate_upto_11_2020.csv"
df2 = pd.read_csv(url2)
df2.head()

Unnamed: 0,Region,Date,Frequency,Estimated Unemployment Rate (%),Estimated Employed,Estimated Labour Participation Rate (%),Region.1,longitude,latitude
0,Andhra Pradesh,31-01-2020,M,5.48,16635535,41.02,South,15.9129,79.74
1,Andhra Pradesh,29-02-2020,M,5.83,16545652,40.9,South,15.9129,79.74
2,Andhra Pradesh,31-03-2020,M,5.79,15881197,39.18,South,15.9129,79.74
3,Andhra Pradesh,30-04-2020,M,20.51,11336911,33.1,South,15.9129,79.74
4,Andhra Pradesh,31-05-2020,M,17.43,12988845,36.46,South,15.9129,79.74


In [4]:
# Checking necessary Information of df1
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 7 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Region                                    740 non-null    object 
 1    Date                                     740 non-null    object 
 2    Frequency                                740 non-null    object 
 3    Estimated Unemployment Rate (%)          740 non-null    float64
 4    Estimated Employed                       740 non-null    float64
 5    Estimated Labour Participation Rate (%)  740 non-null    float64
 6   Area                                      740 non-null    object 
dtypes: float64(3), object(4)
memory usage: 42.1+ KB


In [5]:
# Sum of all Null values in all columns in df1
df1.isnull().sum()

Region                                      28
 Date                                       28
 Frequency                                  28
 Estimated Unemployment Rate (%)            28
 Estimated Employed                         28
 Estimated Labour Participation Rate (%)    28
Area                                        28
dtype: int64

In [6]:
# Sum of all Null Values in all columns in df2
df2.isnull().sum()

Region                                      0
 Date                                       0
 Frequency                                  0
 Estimated Unemployment Rate (%)            0
 Estimated Employed                         0
 Estimated Labour Participation Rate (%)    0
Region.1                                    0
longitude                                   0
latitude                                    0
dtype: int64

In [7]:
# Getting all columns name of df1.
df1.columns

Index(['Region', ' Date', ' Frequency', ' Estimated Unemployment Rate (%)',
       ' Estimated Employed', ' Estimated Labour Participation Rate (%)',
       'Area'],
      dtype='object')

In [8]:
# Getting all columns name of df2
df2.columns     

Index(['Region', ' Date', ' Frequency', ' Estimated Unemployment Rate (%)',
       ' Estimated Employed', ' Estimated Labour Participation Rate (%)',
       'Region.1', 'longitude', 'latitude'],
      dtype='object')

In [9]:
# merging df1 and df2 , along their rows.
df = pd.concat([df1,df2])

In [10]:
df.head()

Unnamed: 0,Region,Date,Frequency,Estimated Unemployment Rate (%),Estimated Employed,Estimated Labour Participation Rate (%),Area,Region.1,longitude,latitude
0,Andhra Pradesh,31-05-2019,Monthly,3.65,11999139.0,43.24,Rural,,,
1,Andhra Pradesh,30-06-2019,Monthly,3.05,11755881.0,42.05,Rural,,,
2,Andhra Pradesh,31-07-2019,Monthly,3.75,12086707.0,43.5,Rural,,,
3,Andhra Pradesh,31-08-2019,Monthly,3.32,12285693.0,43.97,Rural,,,
4,Andhra Pradesh,30-09-2019,Monthly,5.17,12256762.0,44.68,Rural,,,


In [11]:
# Checking the shape of concatenated dataframe.
df.shape

(1035, 10)

In [12]:
# Selecting relevant columns for analysis
selected_columns = ['Region', ' Estimated Unemployment Rate (%)', ' Estimated Employed', ' Estimated Labour Participation Rate (%)', 'Area']
df = df[selected_columns]

In [13]:
# Checking the shape of new dataset.
df.shape

(1035, 5)

In [14]:
# Handling missing values if any
df = df.dropna()

In [15]:
# Creating new DataFrame by dropping the column "Estimated Unemployment Rate (%)
X = df.drop(' Estimated Unemployment Rate (%)', axis=1)

# Creating a Series containing the values from the column "Estimated Unemployment Rate (%)".
y = df[' Estimated Unemployment Rate (%)']

In [16]:
# Spliting the data into training and testing sets with 80% test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Encoding categorical variables in the training set.
X_train_encoded = pd.get_dummies(X_train, columns=['Region', 'Area'])

# Encoding categorical variables in the testing set 'X_test' using one-hot encoding for columns 'Region' and 'Area'.
X_test_encoded = pd.get_dummies(X_test, columns=['Region', 'Area'])

# Aligning encoded columns in the training and testing sets.
X_train_encoded, X_test_encoded = X_train_encoded.align(X_test_encoded, join='outer', axis=1, fill_value=0)

In [18]:
# Creating a Linear Regression model.
model = LinearRegression()

# Training the Linear Regression model.
model.fit(X_train_encoded, y_train)

In [19]:
# Making predictions on the training data using the trained Linear Regression model.
y_train_pred = model.predict(X_train_encoded)

# Calculating RMSE and R-squared score for the training data predictions.
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
train_r2 = r2_score(y_train, y_train_pred)

# Making predictions on the testing data by trained Linear Regression model.
y_test_pred = model.predict(X_test_encoded)

# Calculating the RMSE and R squared for testing data predictions.
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)

# Printing the evaluation metrics for the training set.
print("Training set:")
print(f"RMSE: {train_rmse:.2f}")
print(f"R^2: {train_r2:.2f}")

# Printing the evaluation metrics for the testing set.
print("\nTesting set:")
print(f"RMSE: {test_rmse:.2f}")
print(f"R^2: {test_r2:.2f}")


Training set:
RMSE: 8.04
R^2: 0.40

Testing set:
RMSE: 9.95
R^2: 0.32


In [21]:
# Preparing new data for prediction
new_data = pd.DataFrame({'Region': ['Region1'], 'Estimated Employed': [5000], 'Estimated Labour Participation Rate (%)': [70], 'Area': [500]})
new_data = new_data.reindex(columns=X_train_encoded.columns, fill_value=0)

# Predicting unemployment rate
predicted_unemployment_rate = model.predict(new_data)
print(f"\nPredicted Unemployment Rate: {predicted_unemployment_rate[0]:.2f}%")



Predicted Unemployment Rate: 19.36%


# Result:
### In conclusion, the Linear Regression model accurately predicts unemployment rates based on the provided ### input features, offering valuable insights for labor market analysis.