# OIBSIP
## TASK 2: UNEMPLOYMENT ANALYSIS WITH PYTHON

<br>

**AUTHOR :** Mohammed Khubaib
<br>
**MODEL :** Linear Regression

## Import Required :
 - Python Modules
 - Advertising Dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
df1 = pd.read_csv('Unemployment in India.csv')

In [4]:
df2 = pd.read_csv('Unemployment_Rate_upto_11_2020.csv')

## Data Processing

In [5]:
df1.columns

Index(['Region', ' Date', ' Frequency', ' Estimated Unemployment Rate (%)',
       ' Estimated Employed', ' Estimated Labour Participation Rate (%)',
       'Area'],
      dtype='object')

In [6]:
df2.columns

Index(['Region', ' Date', ' Frequency', ' Estimated Unemployment Rate (%)',
       ' Estimated Employed', ' Estimated Labour Participation Rate (%)',
       'Region.1', 'longitude', 'latitude'],
      dtype='object')

**concating the datasets**

In [7]:
df = pd.concat([df1,df2])

In [None]:
df

In [8]:
df.columns

Index(['Region', ' Date', ' Frequency', ' Estimated Unemployment Rate (%)',
       ' Estimated Employed', ' Estimated Labour Participation Rate (%)',
       'Area', 'Region.1', 'longitude', 'latitude'],
      dtype='object')

In [9]:
# Select relevant columns for analysis
selected_columns = ['Region', ' Estimated Unemployment Rate (%)', ' Estimated Employed', ' Estimated Labour Participation Rate (%)', 'Area']
df = df[selected_columns]

# Handle missing values if any
df = df.dropna()


**Split the dataset into input features and target variable**

In [10]:
X = df.drop(' Estimated Unemployment Rate (%)', axis=1)
y = df[' Estimated Unemployment Rate (%)']


**Split the dataset into training and testing sets**

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
print(X_train.dtypes)
print(y_train.dtypes)


Region                                       object
 Estimated Employed                         float64
 Estimated Labour Participation Rate (%)    float64
Area                                         object
dtype: object
float64


In [13]:
X_train_encoded = pd.get_dummies(X_train, columns=['Region', 'Area'])
X_test_encoded = pd.get_dummies(X_test, columns=['Region', 'Area'])
X_train_encoded, X_test_encoded = X_train_encoded.align(X_test_encoded, join='outer', axis=1, fill_value=0)

## Train the model

In [14]:
model = LinearRegression()
model.fit(X_train_encoded, y_train)


In [15]:
y_train_pred = model.predict(X_train_encoded)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
train_r2 = r2_score(y_train, y_train_pred)

y_test_pred = model.predict(X_test_encoded)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)

print("Training set:")
print(f"RMSE: {train_rmse:.2f}")
print(f"R^2: {train_r2:.2f}")

print("\nTesting set:")
print(f"RMSE: {test_rmse:.2f}")
print(f"R^2: {test_r2:.2f}")


Training set:
RMSE: 8.04
R^2: 0.40

Testing set:
RMSE: 9.95
R^2: 0.32


## Custom Testing

In [16]:
# Prepare new data for prediction
new_data = pd.DataFrame({'Region': ['Region1'], 'Estimated Employed': [5000], 'Estimated Labour Participation Rate (%)': [70], 'Area': [500]})
new_data = new_data.reindex(columns=X_train_encoded.columns, fill_value=0)

# Predict unemployment rate
predicted_unemployment_rate = model.predict(new_data)
print(f"\nPredicted Unemployment Rate: {predicted_unemployment_rate[0]:.2f}%")



Predicted Unemployment Rate: 19.36%
