# Machine Learning Project on Indian Non Metro Jobs Dataset

## Import the required libraries

In [3]:
# import pandas
# this library help us to convert different data files (structured or semi structured) into pandas data frame
import pandas as pd

# import numpy
# this library help us to perform different numerical and statistical operations on different data structures
import numpy as np

# import seaborn and matplotlib
# these both libraries help us to create visualization
import seaborn as sns
import matplotlib.pyplot as plt

## Load the Dataset

In [5]:
# Load the file from local
df_indian_non_metro_jobs_dataset = pd.read_csv('indian_non_metro_jobs_dataset.csv')

In [6]:
# Show the first five observations
df_indian_non_metro_jobs_dataset.head()

Unnamed: 0,Job_ID,City,Industry,Job_Role,Skill_Set,Experience_Level,Salary_Range,Company_Name,Talent_Inflow,Talent_Outflow,Infrastructure_Score,Smart_City_Investment,GCC_Presence,MSME_Growth_Rate,Unemployment_Rate,Education_Hubs,Cost_of_Living_Index,Job_Posting_Date
0,1,Ranchi,Retail,Retail,"Customer Service, English",Mid-Level,622099.18,Reliance Retail,1159,1583,74.25,9162.21,0,6.2,9.28,29,74.65,2023-10-15
1,2,Vijayawada,Tech,AI/ML,"Python, Deep Learning, TensorFlow, NLP",Entry-Level,613072.56,HCLTech,3779,618,84.11,4862.53,1,2.78,3.32,7,96.37,2023-03-31
2,3,Ranchi,Retail,Retail,"MS Office, Customer Service",Entry-Level,466698.78,Reliance Retail,1596,1565,54.9,5226.14,0,4.85,5.99,34,60.63,2024-12-25
3,4,Vadodara,Manufacturing,Sales,"Negotiation, Communication",Entry-Level,352905.66,Reliance Retail,1318,1227,73.16,4400.15,0,4.97,9.14,7,64.78,2023-08-11
4,5,Agra,Education,Data Science,"SQL, Python, Data Visualization",Senior-Level,1948630.28,Reliance Retail,1169,2511,52.7,2257.42,0,3.99,6.6,7,66.45,2023-06-03


In [7]:
# Show the last five obseravtions
df_indian_non_metro_jobs_dataset.tail()

Unnamed: 0,Job_ID,City,Industry,Job_Role,Skill_Set,Experience_Level,Salary_Range,Company_Name,Talent_Inflow,Talent_Outflow,Infrastructure_Score,Smart_City_Investment,GCC_Presence,MSME_Growth_Rate,Unemployment_Rate,Education_Hubs,Cost_of_Living_Index,Job_Posting_Date
11995,11996,Raipur,Finance,Business Development,"Sales, Market Analysis, CRM",Senior-Level,1915750.24,Reliance Retail,538,1534,50.47,5232.16,0,8.35,9.44,19,68.9,2025-02-14
11996,11997,Vadodara,Finance,Data Science,"Data Visualization, Statistics",Entry-Level,543950.21,Others,1663,2265,56.52,2564.89,0,2.63,7.62,15,90.58,2023-07-07
11997,11998,Vijayawada,Tech,Engineering,"Python, Java",Mid-Level,1871356.44,Reliance Retail,2184,0,81.68,4351.64,1,4.01,4.2,17,65.8,2023-08-31
11998,11999,Vijayawada,Others,Data Science,"SQL, Statistics",Senior-Level,2165847.48,Reliance Retail,2330,487,87.71,1088.28,1,4.76,4.43,13,77.11,2025-02-25
11999,12000,Vadodara,Manufacturing,Data Science,"Data Visualization, Statistics, SQL, Python",Senior-Level,1619926.58,Reliance Retail,1164,1297,69.29,2114.9,0,2.48,7.09,18,69.32,2024-01-30


## Basic data overview

In [9]:
# Check the shape of the data
df_indian_non_metro_jobs_dataset.shape

(12000, 18)

**Inference**
 - We have 12000 observations and 18 attributes.

In [11]:
# Check the columns of the data
df_indian_non_metro_jobs_dataset.columns

Index(['Job_ID', 'City', 'Industry', 'Job_Role', 'Skill_Set',
       'Experience_Level', 'Salary_Range', 'Company_Name', 'Talent_Inflow',
       'Talent_Outflow', 'Infrastructure_Score', 'Smart_City_Investment',
       'GCC_Presence', 'MSME_Growth_Rate', 'Unemployment_Rate',
       'Education_Hubs', 'Cost_of_Living_Index', 'Job_Posting_Date'],
      dtype='object')

In [12]:
# Check the basic info
df_indian_non_metro_jobs_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Job_ID                 12000 non-null  int64  
 1   City                   12000 non-null  object 
 2   Industry               12000 non-null  object 
 3   Job_Role               12000 non-null  object 
 4   Skill_Set              12000 non-null  object 
 5   Experience_Level       12000 non-null  object 
 6   Salary_Range           12000 non-null  float64
 7   Company_Name           12000 non-null  object 
 8   Talent_Inflow          12000 non-null  int64  
 9   Talent_Outflow         12000 non-null  int64  
 10  Infrastructure_Score   12000 non-null  float64
 11  Smart_City_Investment  12000 non-null  float64
 12  GCC_Presence           12000 non-null  int64  
 13  MSME_Growth_Rate       12000 non-null  float64
 14  Unemployment_Rate      12000 non-null  float64
 15  Ed

In [13]:
# Example of user defined function
def addition1(x,y):
    sum1 = x + y
    return sum1

In [14]:
# Call the function
addition1(10,20)

30

**Interpretation**
 - We have 11 numerical columns and 7 catergorical columns.
 - The data we have, acquires 1.6+ MB storage space.
 - We don't have any valid null records in the data.

In [16]:
# Describe the Dataset
df_indian_non_metro_jobs_dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Job_ID,12000.0,6000.5,3464.24595,1.0,3000.75,6000.5,9000.25,12000.0
Salary_Range,12000.0,846147.641146,607037.970547,300000.0,335759.525,667431.47,1171641.0,3985713.38
Talent_Inflow,12000.0,1305.965167,783.722309,0.0,751.0,1155.0,1731.0,4204.0
Talent_Outflow,12000.0,1300.928083,498.169449,0.0,1058.0,1404.0,1643.25,2843.0
Infrastructure_Score,12000.0,67.177056,11.323789,50.0,58.08,65.87,73.65,94.99
Smart_City_Investment,12000.0,3922.670814,2180.646913,1000.66,2253.26,3546.27,4788.977,9995.7
GCC_Presence,12000.0,0.299417,0.458022,0.0,0.0,0.0,1.0,1.0
MSME_Growth_Rate,12000.0,7.273655,3.383855,2.0,4.45,6.98,9.46,14.99
Unemployment_Rate,12000.0,7.483524,2.730499,2.0,5.4,7.59,9.82,12.0
Education_Hubs,12000.0,14.78125,7.292199,5.0,9.0,14.0,18.0,39.0


**Interpretation**
 - From this, we can easily identify the outliers, by looking at the difference between mean and 50%(median)

# Data Preprocessing

### Null Value Handling

In [20]:
# Check the count of null record present in data frame
df_indian_non_metro_jobs_dataset.isnull().sum()

Job_ID                   0
City                     0
Industry                 0
Job_Role                 0
Skill_Set                0
Experience_Level         0
Salary_Range             0
Company_Name             0
Talent_Inflow            0
Talent_Outflow           0
Infrastructure_Score     0
Smart_City_Investment    0
GCC_Presence             0
MSME_Growth_Rate         0
Unemployment_Rate        0
Education_Hubs           0
Cost_of_Living_Index     0
Job_Posting_Date         0
dtype: int64

**Interpretation**
 - In the data we don't have the null records

## EDA(Exploratory Data Analysis)

In [23]:
# Step 1: Seggregate the data based on the data type
df_num = df_indian_non_metro_jobs_dataset.select_dtypes(include = 'number')

In [24]:
# Step 2: Make a separate data frame for categorical variables
df_cat = df_indian_non_metro_jobs_dataset.select_dtypes(include = 'object')

In [25]:
# Check the columns present in categorical data frame
df_cat.columns

Index(['City', 'Industry', 'Job_Role', 'Skill_Set', 'Experience_Level',
       'Company_Name', 'Job_Posting_Date'],
      dtype='object')

In [26]:
# Check the categories present in categorical columns
df_cat.City.value_counts()

City
Raipur           1235
Vijayawada       1227
Jodhpur          1222
Visakhapatnam    1218
Madurai          1205
Ranchi           1195
Rajkot           1192
Agra             1183
Vadodara         1175
Nashik           1148
Name: count, dtype: int64

In [27]:
# Check the columns present in categorical data frame
df_cat.columns

Index(['City', 'Industry', 'Job_Role', 'Skill_Set', 'Experience_Level',
       'Company_Name', 'Job_Posting_Date'],
      dtype='object')

In [28]:
# Check the categories present in categorical columns
df_cat.Industry.value_counts()

Industry
Tech             2572
Pharma           1786
Others           1544
Retail           1450
Education        1436
Manufacturing    1134
Healthcare       1055
Finance          1023
Name: count, dtype: int64

In [29]:
# Check the categories present in categorical columns
df_cat.columns

Index(['City', 'Industry', 'Job_Role', 'Skill_Set', 'Experience_Level',
       'Company_Name', 'Job_Posting_Date'],
      dtype='object')

In [30]:
# Check the categories present in categorical columns
df_cat.Job_Role.value_counts()

Job_Role
Business Development    2381
Engineering             2259
AI/ML                   1776
Operations              1518
Data Science            1500
Others                  1018
Sales                   1005
Retail                   543
Name: count, dtype: int64

In [31]:
# Check the categories present in categorical columns
df_cat.columns

Index(['City', 'Industry', 'Job_Role', 'Skill_Set', 'Experience_Level',
       'Company_Name', 'Job_Posting_Date'],
      dtype='object')

In [32]:
# Check the categories present in categorical columns
df_cat.Skill_Set.value_counts()

Skill_Set
Market Analysis, Sales, CRM                    279
Sales, Market Analysis, CRM                    274
CRM, Market Analysis, Sales                    265
CRM, Sales, Market Analysis                    265
Market Analysis, CRM, Sales                    259
                                              ... 
Statistics, Data Visualization, Python, SQL     13
Statistics, Python, Data Visualization, SQL     13
SQL, Data Visualization, Statistics, Python     13
Python, Data Visualization, Statistics          12
SQL, Python, Data Visualization, Statistics     10
Name: count, Length: 226, dtype: int64

In [33]:
# Check the categories present in categorical columns
df_cat.columns

Index(['City', 'Industry', 'Job_Role', 'Skill_Set', 'Experience_Level',
       'Company_Name', 'Job_Posting_Date'],
      dtype='object')

In [34]:
# Check the categories present in categorical columns
df_cat.Experience_Level.value_counts()

Experience_Level
Entry-Level     4887
Mid-Level       4164
Senior-Level    2346
Internship       603
Name: count, dtype: int64

In [35]:
# Check the categories present in categorical columns
df_cat.columns

Index(['City', 'Industry', 'Job_Role', 'Skill_Set', 'Experience_Level',
       'Company_Name', 'Job_Posting_Date'],
      dtype='object')

In [36]:
# Check the categories present in categorical columns
df_cat.Company_Name.value_counts()

Company_Name
Reliance Retail    5541
Others             5539
HCLTech             920
Name: count, dtype: int64

In [37]:
# Check the categories present in categorical columns
df_cat.columns

Index(['City', 'Industry', 'Job_Role', 'Skill_Set', 'Experience_Level',
       'Company_Name', 'Job_Posting_Date'],
      dtype='object')

In [38]:
# Check the categories present in categorical columns
df_cat.Job_Posting_Date.value_counts()

Job_Posting_Date
2025-02-12    33
2024-07-27    32
2023-07-18    30
2025-01-09    30
2023-06-23    29
              ..
2024-06-09     7
2023-08-26     7
2024-10-21     7
2024-07-11     6
2024-08-08     4
Name: count, Length: 731, dtype: int64

In [39]:
# Ensure Job_Posting_Date is in datetime format
df_cat['Job_Posting_Date'] = pd.to_datetime(df_cat['Job_Posting_Date'])

In [40]:
# Extracting standard date components
df_cat['Day'] = df_cat['Job_Posting_Date'].dt.day
df_cat['Month'] = df_cat['Job_Posting_Date'].dt.month
df_cat['Year'] = df_cat['Job_Posting_Date'].dt.year
df_cat['Weekday'] = df_cat['Job_Posting_Date'].dt.day_name()
df_cat['Quarter'] = df_cat['Job_Posting_Date'].dt.quarter

In [41]:
# Define function to get season
def get_season(date):
    month = date.month
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Summer'
    elif month in [6, 7, 8, 9]:
        return 'Monsoon'
    else:
        return 'Autumn'  # Optional, in case you want 4 seasons
# Apply the function
df_cat['Season'] = df_cat['Job_Posting_Date'].apply(get_season)

In [42]:
# Check the categorical data frame
df_cat.head()

Unnamed: 0,City,Industry,Job_Role,Skill_Set,Experience_Level,Company_Name,Job_Posting_Date,Day,Month,Year,Weekday,Quarter,Season
0,Ranchi,Retail,Retail,"Customer Service, English",Mid-Level,Reliance Retail,2023-10-15,15,10,2023,Sunday,4,Autumn
1,Vijayawada,Tech,AI/ML,"Python, Deep Learning, TensorFlow, NLP",Entry-Level,HCLTech,2023-03-31,31,3,2023,Friday,1,Summer
2,Ranchi,Retail,Retail,"MS Office, Customer Service",Entry-Level,Reliance Retail,2024-12-25,25,12,2024,Wednesday,4,Winter
3,Vadodara,Manufacturing,Sales,"Negotiation, Communication",Entry-Level,Reliance Retail,2023-08-11,11,8,2023,Friday,3,Monsoon
4,Agra,Education,Data Science,"SQL, Python, Data Visualization",Senior-Level,Reliance Retail,2023-06-03,3,6,2023,Saturday,2,Monsoon


In [43]:
import pandas as pd

# Ensure Job_Posting_Date is in datetime format
df_indian_non_metro_jobs_dataset['Job_Posting_Date'] = pd.to_datetime(df_indian_non_metro_jobs_dataset['Job_Posting_Date'])

# Extracting standard date components
df_indian_non_metro_jobs_dataset['Day'] = df_indian_non_metro_jobs_dataset['Job_Posting_Date'].dt.day
df_indian_non_metro_jobs_dataset['Month'] = df_indian_non_metro_jobs_dataset['Job_Posting_Date'].dt.month
df_indian_non_metro_jobs_dataset['Year'] = df_indian_non_metro_jobs_dataset['Job_Posting_Date'].dt.year
df_indian_non_metro_jobs_dataset['Weekday'] = df_indian_non_metro_jobs_dataset['Job_Posting_Date'].dt.day_name()
df_indian_non_metro_jobs_dataset['Quarter'] = df_indian_non_metro_jobs_dataset['Job_Posting_Date'].dt.quarter

# Define function to get season
def get_season(date):
    month = date.month
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Summer'
    elif month in [6, 7, 8, 9]:
        return 'Monsoon'
    else:
        return 'Autumn'  # Optional, in case you want 4 seasons

# Apply the function
df_indian_non_metro_jobs_dataset['Season'] = df_indian_non_metro_jobs_dataset['Job_Posting_Date'].apply(get_season)

# Display updated data frames
df_indian_non_metro_jobs_dataset.head()

Unnamed: 0,Job_ID,City,Industry,Job_Role,Skill_Set,Experience_Level,Salary_Range,Company_Name,Talent_Inflow,Talent_Outflow,...,Unemployment_Rate,Education_Hubs,Cost_of_Living_Index,Job_Posting_Date,Day,Month,Year,Weekday,Quarter,Season
0,1,Ranchi,Retail,Retail,"Customer Service, English",Mid-Level,622099.18,Reliance Retail,1159,1583,...,9.28,29,74.65,2023-10-15,15,10,2023,Sunday,4,Autumn
1,2,Vijayawada,Tech,AI/ML,"Python, Deep Learning, TensorFlow, NLP",Entry-Level,613072.56,HCLTech,3779,618,...,3.32,7,96.37,2023-03-31,31,3,2023,Friday,1,Summer
2,3,Ranchi,Retail,Retail,"MS Office, Customer Service",Entry-Level,466698.78,Reliance Retail,1596,1565,...,5.99,34,60.63,2024-12-25,25,12,2024,Wednesday,4,Winter
3,4,Vadodara,Manufacturing,Sales,"Negotiation, Communication",Entry-Level,352905.66,Reliance Retail,1318,1227,...,9.14,7,64.78,2023-08-11,11,8,2023,Friday,3,Monsoon
4,5,Agra,Education,Data Science,"SQL, Python, Data Visualization",Senior-Level,1948630.28,Reliance Retail,1169,2511,...,6.6,7,66.45,2023-06-03,3,6,2023,Saturday,2,Monsoon


### Decide the target variable

In [45]:
# Check the data frame to decide the target variable
df_indian_non_metro_jobs_dataset.head()

Unnamed: 0,Job_ID,City,Industry,Job_Role,Skill_Set,Experience_Level,Salary_Range,Company_Name,Talent_Inflow,Talent_Outflow,...,Unemployment_Rate,Education_Hubs,Cost_of_Living_Index,Job_Posting_Date,Day,Month,Year,Weekday,Quarter,Season
0,1,Ranchi,Retail,Retail,"Customer Service, English",Mid-Level,622099.18,Reliance Retail,1159,1583,...,9.28,29,74.65,2023-10-15,15,10,2023,Sunday,4,Autumn
1,2,Vijayawada,Tech,AI/ML,"Python, Deep Learning, TensorFlow, NLP",Entry-Level,613072.56,HCLTech,3779,618,...,3.32,7,96.37,2023-03-31,31,3,2023,Friday,1,Summer
2,3,Ranchi,Retail,Retail,"MS Office, Customer Service",Entry-Level,466698.78,Reliance Retail,1596,1565,...,5.99,34,60.63,2024-12-25,25,12,2024,Wednesday,4,Winter
3,4,Vadodara,Manufacturing,Sales,"Negotiation, Communication",Entry-Level,352905.66,Reliance Retail,1318,1227,...,9.14,7,64.78,2023-08-11,11,8,2023,Friday,3,Monsoon
4,5,Agra,Education,Data Science,"SQL, Python, Data Visualization",Senior-Level,1948630.28,Reliance Retail,1169,2511,...,6.6,7,66.45,2023-06-03,3,6,2023,Saturday,2,Monsoon


In [46]:
# After discussion, we decided to take Salary_Range as our target variable(dependent variable)
target = df_indian_non_metro_jobs_dataset.Salary_Range

# After storing this variable into target, we have to remove it from the main data frame(independent variable)
df_indian_non_metro_jobs_dataset = df_indian_non_metro_jobs_dataset.drop('Salary_Range', axis = 1)

## Prepare the data for Machine Learning

In [48]:
# Step 1: Seggregate the data based on the data type
df_num = df_indian_non_metro_jobs_dataset.select_dtypes(include = 'number')

In [49]:
# Step 2: Make a separate data frame for categorical variables
df_cat = df_indian_non_metro_jobs_dataset.select_dtypes(include = 'object')

### Perform encoding on categorical variables

In [51]:
# Perform encoding(one hot encoding)
df_encoded = pd.get_dummies(df_cat)

In [52]:
# Check the encoded data
df_encoded.head()

Unnamed: 0,City_Agra,City_Jodhpur,City_Madurai,City_Nashik,City_Raipur,City_Rajkot,City_Ranchi,City_Vadodara,City_Vijayawada,City_Visakhapatnam,...,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday,Season_Autumn,Season_Monsoon,Season_Summer,Season_Winter
0,False,False,False,False,False,False,True,False,False,False,...,False,False,True,False,False,False,True,False,False,False
1,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,True,False,False,False,True
3,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
4,True,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,True,False,False


### Scale the numerical variables

In [54]:
# import the standard scaler subpackage for performing the scaling
from sklearn.preprocessing import StandardScaler

In [55]:
# Initiate the standard scaler
ss = StandardScaler()

In [56]:
# Transform the numerical variables
scale = ss.fit_transform(df_num)

# Check the transformed data
scale

array([[-1.73190648, -0.1875298 ,  0.5662404 , ...,  1.01376383,
        -1.0585089 ,  1.3430051 ],
       [-1.7316178 ,  3.15563032, -1.3709322 , ..., -1.00760347,
        -1.0585089 , -1.33600989],
       [-1.73132913,  0.37008889,  0.53010661, ...,  1.59129735,
         0.5442806 ,  1.3430051 ],
       ...,
       [ 1.73132913,  1.1203859 , -2.61152564, ...,  0.43623032,
        -1.0585089 ,  0.4500001 ],
       [ 1.7316178 ,  1.30668413, -1.63390589, ..., -1.29637023,
         2.1470701 , -1.33600989],
       [ 1.73190648, -0.18114973, -0.00788536, ..., -1.58513699,
         0.5442806 , -1.33600989]])

In [57]:
# Create a new data frame for scale data
df_scaled = pd.DataFrame(scale, columns = df_num.columns)

In [58]:
# Check the scaled data frame
df_scaled.head()

Unnamed: 0,Job_ID,Talent_Inflow,Talent_Outflow,Infrastructure_Score,Smart_City_Investment,GCC_Presence,MSME_Growth_Rate,Unemployment_Rate,Education_Hubs,Cost_of_Living_Index,Day,Month,Year,Quarter
0,-1.731906,-0.18753,0.56624,0.624635,2.402845,-0.653745,-0.317301,0.657957,1.949939,-0.463827,-0.08555,1.013764,-1.058509,1.343005
1,-1.731618,3.15563,-1.370932,1.495405,0.431018,1.529649,-1.328024,-1.524885,-1.067109,1.424717,1.728422,-1.007603,-1.058509,-1.33601
2,-1.731329,0.370089,0.530107,-1.084228,0.597769,-0.653745,-0.716271,-0.547001,2.635632,-1.682859,1.048183,1.591297,0.544281,1.343005
3,-1.73104,0.015357,-0.148406,0.528374,0.218971,-0.653745,-0.680807,0.606682,-1.067109,-1.322019,-0.539043,0.43623,-1.058509,0.45
4,-1.730752,-0.17477,2.429138,-1.278517,-0.763682,-0.653745,-0.970429,-0.32359,-1.067109,-1.176813,-1.446029,-0.141303,-1.058509,-0.443005


### Concatenate both the encoded and scaled data frame

In [60]:
# Do the concatenation
X = pd.concat([df_encoded, df_scaled], axis=1)

# Check the concatenated data
X.head()

Unnamed: 0,City_Agra,City_Jodhpur,City_Madurai,City_Nashik,City_Raipur,City_Rajkot,City_Ranchi,City_Vadodara,City_Vijayawada,City_Visakhapatnam,...,Smart_City_Investment,GCC_Presence,MSME_Growth_Rate,Unemployment_Rate,Education_Hubs,Cost_of_Living_Index,Day,Month,Year,Quarter
0,False,False,False,False,False,False,True,False,False,False,...,2.402845,-0.653745,-0.317301,0.657957,1.949939,-0.463827,-0.08555,1.013764,-1.058509,1.343005
1,False,False,False,False,False,False,False,False,True,False,...,0.431018,1.529649,-1.328024,-1.524885,-1.067109,1.424717,1.728422,-1.007603,-1.058509,-1.33601
2,False,False,False,False,False,False,True,False,False,False,...,0.597769,-0.653745,-0.716271,-0.547001,2.635632,-1.682859,1.048183,1.591297,0.544281,1.343005
3,False,False,False,False,False,False,False,True,False,False,...,0.218971,-0.653745,-0.680807,0.606682,-1.067109,-1.322019,-0.539043,0.43623,-1.058509,0.45
4,True,False,False,False,False,False,False,False,False,False,...,-0.763682,-0.653745,-0.970429,-0.32359,-1.067109,-1.176813,-1.446029,-0.141303,-1.058509,-0.443005


## Split the data in training set and testing set

In [62]:
# import the train test split
from sklearn.model_selection import train_test_split

In [63]:
# Create xtrain, xtest, ytrain, ytest
X_train, X_test, y_train, y_test = train_test_split(X,target,test_size=0.20,random_state=42)

In [64]:
# Authentic check training set
print(X_train.shape)

print(y_train.shape)

(9600, 284)
(9600,)


In [65]:
# Authentic check for testing set
print(X_test.shape)

print(y_test.shape)

(2400, 284)
(2400,)


## Find the base Machine Learning Algorithm

### Linear Regression

In [68]:
# import Linear Regression Algorithm and evaluation matrix 
from sklearn.linear_model import LinearRegression

# Here we are using MSE as evaluation matrix
from sklearn.metrics import mean_squared_error

In [69]:
# Create a linear regression model
model = LinearRegression()

In [70]:
# Fit the model to the training data
model.fit(X_train, y_train)

In [71]:
# Predict on the testing data
predictions = model.predict(X_test)

In [72]:
# Calculate mean squared error
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

Mean Squared Error: 55891774273.06084


### Ridge Model

In [74]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Assuming X_train, X_test, y_train, y_test are already defined from train_test_split

# Create a Ridge regression model
ridge_model = Ridge(alpha=1.0)  # alpha is the regularization strength

# Fit the model to the training data
ridge_model.fit(X_train, y_train)

# Predict on the testing data
predictions = ridge_model.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

Mean Squared Error: 55759250422.22612


### Lasso Model

In [76]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

# Assuming X_train, X_test, y_train, y_test are already defined from train_test_split

# Create a Lasso regression model
lasso_model = Lasso(alpha=1.0)  # alpha is the regularization strength

# Fit the model to the training data
lasso_model.fit(X_train, y_train)

# Predict on the testing data
predictions = lasso_model.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

Mean Squared Error: 55873369303.79538


  model = cd_fast.enet_coordinate_descent(


### Decision Tree Regressor

In [78]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Assuming X_train, X_test, y_train, y_test are already defined from train_test_split

# Create a Decision Tree regression model
tree_model = DecisionTreeRegressor()

# Fit the model to the training data
tree_model.fit(X_train, y_train)

# Predict on the testing data
predictions = tree_model.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

Mean Squared Error: 80720841495.81865


### Random Forest Regressor

In [80]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Assuming X_train, X_test, y_train, y_test are already defined from train_test_split

# Create a Random Forest regression model
forest_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model to the training data
forest_model.fit(X_train, y_train)

# Predict on the testing data
predictions = forest_model.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

Mean Squared Error: 42859702369.44861


### SVR

In [82]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Assuming X_train, X_test, y_train, y_test are already defined from train_test_split

# Create a SVM regression model
svm_model = SVR(kernel='rbf')  # 'rbf' kernel is commonly used for non-linear regression

# Fit the model to the training data
svm_model.fit(X_train, y_train)

# Predict on the testing data
predictions = svm_model.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

Mean Squared Error: 397191726197.3311


**Interpretation**
 - As per this method, we can conclude that Random Forest Regressor is our base algorithm because it has least MSE as compared to others.

## Hyper Parameter Tuning using cross validation

In [85]:
from sklearn.model_selection import RandomizedSearchCV

# Random Forest Regressor
rf = RandomForestRegressor(random_state=42)

# Define hyperparameter grid
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=10,  # Number of parameter settings that are sampled
    scoring='neg_mean_squared_error',  # Use MSE
    cv=5,  # 5-fold cross-validation
    verbose=1,
    random_state=42,
    n_jobs=1,  # Use all available cores
    error_score='raise'
)

# Fit on training data
random_search.fit(X_train, y_train)

# Best model
best_rf = random_search.best_estimator_

# Predict and evaluate
y_pred = best_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


### Feature selection for base algorithm

In [87]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, r2_score

# 1. Define the base model
rf = RandomForestRegressor(random_state=42)

# 2. Define RFE - you can set n_features_to_select to a specific number
n_features = 10  # Keeping top 10 features
rfe = RFE(estimator=rf, n_features_to_select=n_features)

# 3. Fit RFE on the training data
rfe.fit(X_train, y_train)

# 4. Transform the datasets
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

# 5. Train the model again using selected features
rf.fit(X_train_rfe, y_train)

# 6. Predict and evaluate
y_pred = rf.predict(X_test_rfe)

# 7. Evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Selected Features: {rfe.support_}")
print(f"Feature Ranking: {rfe.ranking_}")
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")


KeyboardInterrupt



In [11]:
import joblib

# Assuming 'rf' is your trained model
joblib.dump(rf, 'model.pkl')

ModuleNotFoundError: No module named 'joblib'