In [None]:

# If you are running this in Google Colab, you don't need to pip install the modules and libraries below.
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# --- Data Loading and Preprocessing ---
# Mount Google Drive so files stored there can be read from the Colab runtime.
from google.colab import drive

drive.mount('/content/drive')

# Locations of the two CSV files; adjust these to match your own Drive layout.
file_pathone = '/content/drive/MyDrive/Dataset/Unemployment_in_India.csv'
file_pathtwo = '/content/drive/MyDrive/Dataset/Unemployment_Rate_upto_11_2020.csv'

# Read both files in order: df1 comes from the first path, df2 from the second.
df1, df2 = (pd.read_csv(csv_path) for csv_path in (file_pathone, file_pathtwo))

# Data Exploration and Cleaning
# Show the structure and the first few rows of each dataset.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df1.info()
print(df1.head())
df2.info()
print(df2.head())

# Count missing values per column
print(df1.isnull().sum())
print(df2.isnull().sum())

# Handle missing values (if any) - for simplicity, drop every row that has
# at least one missing value. Both frames are modified in place.
df1.dropna(inplace=True)
df2.dropna(inplace=True)

# Feature Engineering
# The same clean-up is applied, in place, to both dataframes.
for frame in (df1, df2):
    # Strip surrounding whitespace from the column names
    frame.columns = frame.columns.str.strip()

    # Strip surrounding whitespace from the date strings, then parse them
    # as day-month-year datetimes.
    trimmed_dates = frame['Date'].str.strip()
    frame['Date'] = pd.to_datetime(trimmed_dates, format='%d-%m-%Y')

    # Month and year as separate columns for potential seasonality analysis
    parsed_dates = frame['Date']
    frame['Month'] = parsed_dates.dt.month
    frame['Year'] = parsed_dates.dt.year


# Data Visualization (all charts are drawn from df1)

# Column labels reused by several of the charts below.
rate_col = 'Estimated Unemployment Rate (%)'
participation_col = 'Estimated Labour Participation Rate (%)'

# Line chart: unemployment rate over time
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=df1, x='Date', y=rate_col, ax=ax)
ax.set_title("Unemployment Trend in India")
ax.set_xlabel("Date")
ax.set_ylabel("Unemployment Rate (%)")
plt.show()

# Histogram with a KDE curve: distribution of the unemployment rate
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df1[rate_col], kde=True, ax=ax)
ax.set_title("Distribution of Unemployment Rates")
ax.set_xlabel("Unemployment Rate (%)")
plt.show()

# Bar chart: unemployment rate per region
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df1, x='Region', y=rate_col, ax=ax)
ax.set_title("Unemployment Rate by Region")
ax.set_xlabel("Region")
ax.set_ylabel("Unemployment Rate (%)")
# Rotate the x tick labels so they are drawn vertically
plt.xticks(rotation=90)
plt.show()

# Heatmap: pairwise correlations between the numeric columns only
fig, ax = plt.subplots(figsize=(10, 8))
numerical_df = df1.select_dtypes(include=['number'])
correlations = numerical_df.corr()
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

# Box plot: unemployment rate grouped by year
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df1, x='Year', y=rate_col, ax=ax)
ax.set_title("Unemployment Rate by Year")
ax.set_xlabel("Year")
ax.set_ylabel("Unemployment Rate (%)")
plt.show()

# Scatter plot: unemployment rate against labour participation rate
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df1, x=participation_col, y=rate_col, ax=ax)
ax.set_title("Unemployment vs. Labor Participation Rate")
ax.set_xlabel("Labor Participation Rate (%)")
ax.set_ylabel("Unemployment Rate (%)")
plt.show()


# Model Building (df1 is used here; the same steps can be repeated for df2)
# Predictor columns and the target column
feature_columns = [
    'Month',
    'Year',
    'Estimated Employed',
    'Estimated Labour Participation Rate (%)',
]
target_column = 'Estimated Unemployment Rate (%)'
X = df1[feature_columns]
y = df1[target_column]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Each model is fitted on the training split and then used to predict the
# held-out test split. The tree-based models receive the same seed as the
# train/test split (42) so that their results are reproducible across runs;
# without a seed they give different metrics every time the cell is executed.

# 1. Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
linear_pred = linear_model.predict(X_test)

# 2. Decision Tree Regression
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train, y_train)
tree_pred = tree_model.predict(X_test)

# 3. Random Forest Regression
forest_model = RandomForestRegressor(random_state=42)
forest_model.fit(X_train, y_train)
forest_pred = forest_model.predict(X_test)

# Model Evaluation
# Metrics for each model, keyed by the model's display name.
model_results = {}

# Score every model's test-set predictions against the true target values.
for model_name, predictions in (
    ('Linear Regression', linear_pred),
    ('Decision Tree', tree_pred),
    ('Random Forest', forest_pred),
):
    mse = mean_squared_error(y_test, predictions)
    # RMSE is taken as the square root of the MSE instead of passing
    # `squared=False`: that keyword was deprecated in scikit-learn 1.4 and
    # removed in 1.6, where it raises a TypeError.
    rmse = mse ** 0.5
    r2 = r2_score(y_test, predictions)
    model_results[model_name] = {'MSE': mse, 'RMSE': rmse, 'R-squared': r2}

# Print the metrics of each model, four decimal places per value.
for model_name, metrics in model_results.items():
    print(f"--- {model_name} ---")
    for metric_name, value in metrics.items():
        print(f"{metric_name}: {value:.4f}")
    print()