In [1]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import r2_score,mean_squared_error as mse 
from sklearn.linear_model import LinearRegression

In [2]:
#loading data 
train=pd.read_csv('Data_file.csv')
test=pd.read_csv('Test_data.csv')

In [3]:
# Looking at the first five rows
train.head()

Unnamed: 0,S.No.,College,Role,City type,Previous CTC,Previous job changes,Graduation marks,Exp (Months),CTC
0,1,Tier 1,Manager,Non-Metro,55523.0,3,66,19,71406.58
1,2,Tier 2,Executive,Metro,57081.0,1,84,18,68005.87
2,3,Tier 2,Executive,Metro,60347.0,2,52,28,76764.02
3,4,Tier 3,Executive,Metro,49010.0,2,81,33,82092.39
4,5,Tier 3,Executive,Metro,57879.0,4,74,32,73878.1


In [4]:
# types of values are stored in the columns.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   S.No.                 1338 non-null   int64 
 1   College               1338 non-null   object
 2   Role                  1338 non-null   object
 3   City type             1338 non-null   object
 4   Previous CTC          1338 non-null   object
 5   Previous job changes  1338 non-null   int64 
 6   Graduation marks      1338 non-null   int64 
 7   Exp (Months)          1338 non-null   int64 
 8   CTC                   1338 non-null   object
dtypes: int64(4), object(5)
memory usage: 94.2+ KB


In [5]:
#checking for null values
train.isnull().sum()

S.No.                   0
College                 0
Role                    0
City type               0
Previous CTC            0
Previous job changes    0
Graduation marks        0
Exp (Months)            0
CTC                     0
dtype: int64

In [6]:
#checking for duplicated rows
train.duplicated().sum()

0

In [7]:
#5 point summary of numerical featues
train.describe()

Unnamed: 0,S.No.,Previous job changes,Graduation marks,Exp (Months)
count,1338.0,1338.0,1338.0,1338.0
mean,669.5,2.525411,59.890882,39.207025
std,386.391641,1.123502,14.894696,14.04996
min,1.0,1.0,35.0,18.0
25%,335.25,2.0,47.0,27.0
50%,669.5,3.0,60.0,39.0
75%,1003.75,4.0,73.0,51.0
max,1338.0,4.0,85.0,64.0


In [8]:
#dropping S.No. column
train.drop('S.No.',axis=1,inplace=True)

In [9]:
#converting categorical features into numerical
train=pd.get_dummies(columns=['College','Role','City type'],drop_first=True,data=train)

In [10]:
#converting CTC into numerical
train['CTC']=train['CTC'].str.replace(',', '')
train['Previous CTC']=train['Previous CTC'].str.replace(',', '')

In [11]:
train['CTC']=train['CTC'].astype('float')
train['Previous CTC']=train['Previous CTC'].astype('float')

In [12]:
#checking datatype and first 5 rows
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Previous CTC          1338 non-null   float64
 1   Previous job changes  1338 non-null   int64  
 2   Graduation marks      1338 non-null   int64  
 3   Exp (Months)          1338 non-null   int64  
 4   CTC                   1338 non-null   float64
 5   College_Tier 2        1338 non-null   uint8  
 6   College_Tier 3        1338 non-null   uint8  
 7   Role_Manager          1338 non-null   uint8  
 8   City type_Non-Metro   1338 non-null   uint8  
dtypes: float64(2), int64(3), uint8(4)
memory usage: 57.6 KB


In [13]:
X_train=train.drop('CTC',axis=1)
y_train=train[['CTC']]

In [14]:
#builing model using Simple linear regression
model=LinearRegression(normalize=True)
model.fit(X_train,y_train)


LinearRegression(normalize=True)

In [15]:
#builing model using Random forest regression
from sklearn.ensemble import RandomForestRegressor
model2=RandomForestRegressor()
model2.fit(X_train,y_train)

  model2.fit(X_train,y_train)


RandomForestRegressor()

In [16]:
#checking test data
test.head()

Unnamed: 0,College,Role,City type,College_T1,College_T2,Role_Manager,City_Metro,previous CTC,previous job changes,Graduation marks,Exp,Actual CTC,Predicted CTC
0,Tier 1,Manager,Non-Metro,1,0,1,0,55523,3,66,19,71406.57653,
1,Tier 2,Executive,Metro,0,1,0,1,57081,1,84,18,68005.87063,
2,Tier 2,Executive,Metro,0,1,0,1,60347,2,52,28,76764.02028,
3,Tier 3,Executive,Metro,0,0,0,1,49010,2,81,33,82092.38688,
4,Tier 3,Executive,Metro,0,0,0,1,57879,4,74,32,73878.09773,


In [17]:
#Rearranging the test dataset columns
test=test[['College','Role','City type','previous CTC','previous job changes','Graduation marks','Exp','Actual CTC']]

In [18]:
test.head()

Unnamed: 0,College,Role,City type,previous CTC,previous job changes,Graduation marks,Exp,Actual CTC
0,Tier 1,Manager,Non-Metro,55523,3,66,19,71406.57653
1,Tier 2,Executive,Metro,57081,1,84,18,68005.87063
2,Tier 2,Executive,Metro,60347,2,52,28,76764.02028
3,Tier 3,Executive,Metro,49010,2,81,33,82092.38688
4,Tier 3,Executive,Metro,57879,4,74,32,73878.09773


In [19]:
#converting categorical features into numerical
test=pd.get_dummies(columns=['College','Role','City type'],drop_first=True,data=test)

In [20]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   previous CTC          1338 non-null   int64  
 1   previous job changes  1338 non-null   int64  
 2   Graduation marks      1338 non-null   int64  
 3   Exp                   1338 non-null   int64  
 4   Actual CTC            1338 non-null   float64
 5   College_Tier 2        1338 non-null   uint8  
 6   College_Tier 3        1338 non-null   uint8  
 7   Role_Manager          1338 non-null   uint8  
 8   City type_Non-Metro   1338 non-null   uint8  
dtypes: float64(1), int64(4), uint8(4)
memory usage: 57.6 KB


In [21]:
X_test=test.drop('Actual CTC',axis=1)
y_test=test['Actual CTC']

In [22]:
#prediction with linear regression model
y_pred=model.predict(X_test)
print("R2 score",r2_score(y_test,y_pred))
print("MSE",mse(y_test,y_pred))

R2 score 0.6081873160460818
MSE 61677937.04476765


In [23]:
model.score(X_test,y_test)

0.6081873160460818

In [25]:
model2.score(X_test,y_test)

0.9468514471691791

In [26]:
#prediction with random forest regression model
y_predR=model2.predict(X_test)
print("R2 score",r2_score(y_test,y_predR))
print("MSE",mse(y_test,y_predR))

R2 score 0.9468514471691791
MSE 8366480.2845061105


In [64]:
#Predicated CTC
new_df=pd.DataFrame(y_predR,columns=["Predicated CTC"]).head()

In [68]:
final_df=pd.concat([test,new_df],axis=1).head()
final_df=final_df[['previous CTC','previous job changes','Graduation marks','Exp','College_Tier 2','College_Tier 3','Role_Manager','City type_Non-Metro','Actual CTC','Predicated CTC']]

In [69]:
final_df

Unnamed: 0,previous CTC,previous job changes,Graduation marks,Exp,College_Tier 2,College_Tier 3,Role_Manager,City type_Non-Metro,Actual CTC,Predicated CTC
0,55523,3,66,19,0,0,1,1,71406.57653,78050.6639
1,57081,1,84,18,1,0,0,0,68005.87063,65430.6628
2,60347,2,52,28,1,0,0,0,76764.02028,73877.9635
3,49010,2,81,33,0,1,0,0,82092.38688,76486.2355
4,57879,4,74,32,0,1,0,0,73878.09773,72325.6462
