ML Code for Predicting CTC

Importing Required Packages

In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error as mse 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

Loading required files for training and predicting

In [2]:
#file for training
# The location can be overridden with the CTC_TRAIN_CSV environment variable so the
# notebook is not tied to one user's Downloads folder; when the variable is unset the
# original path is used, so existing behaviour is unchanged.
train_csv_path = os.environ.get(
    'CTC_TRAIN_CSV',
    'C:\\Users\\Jayti Bansal\\Downloads\\Data_file.xlsx - Data.csv',
)
data = pd.read_csv(train_csv_path)

In [3]:
#file for testing
# The location can be overridden with the CTC_TEST_CSV environment variable so the
# notebook is not tied to one user's Downloads folder; when the variable is unset the
# original path is used, so existing behaviour is unchanged.
test_csv_path = os.environ.get(
    'CTC_TEST_CSV',
    "C:\\Users\\Jayti Bansal\\Downloads\\Test_data_file.xlsx - Prediction.csv",
)
data_test = pd.read_csv(test_csv_path)

Exploring Data

In [4]:
# Display the training frame; pandas abbreviates the middle rows of a long frame.
data

Unnamed: 0,S.No.,College,Role,City type,Previous CTC,Previous job changes,Graduation marks,Exp (Months),CTC
0,1,Tier 1,Manager,Non-Metro,55523.00,3,66,19,71406.58
1,2,Tier 2,Executive,Metro,57081.00,1,84,18,68005.87
2,3,Tier 2,Executive,Metro,60347.00,2,52,28,76764.02
3,4,Tier 3,Executive,Metro,49010.00,2,81,33,82092.39
4,5,Tier 3,Executive,Metro,57879.00,4,74,32,73878.10
...,...,...,...,...,...,...,...,...,...
1333,1334,Tier 3,Executive,Metro,59661.00,4,68,50,69712.40
1334,1335,Tier 1,Executive,Non-Metro,53714.00,1,67,18,69298.75
1335,1336,Tier 2,Executive,Non-Metro,61957.00,1,47,18,66397.77
1336,1337,Tier 1,Executive,Non-Metro,53203.00,3,69,21,64044.38


Details about Data

In [5]:
# Dimensions of the training frame as a (rows, columns) tuple.
data.shape

(1338, 9)

In [6]:
# List every column with its non-null count and dtype, to spot columns that were
# read as text ('object') instead of numbers.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   S.No.                 1338 non-null   int64 
 1   College               1338 non-null   object
 2   Role                  1338 non-null   object
 3   City type             1338 non-null   object
 4   Previous CTC          1338 non-null   object
 5   Previous job changes  1338 non-null   int64 
 6   Graduation marks      1338 non-null   int64 
 7   Exp (Months)          1338 non-null   int64 
 8   CTC                   1338 non-null   object
dtypes: int64(4), object(5)
memory usage: 94.2+ KB


In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for each column.
# By default describe() only covers numeric columns, so columns still stored as
# 'object' are absent from this table.
data.describe()

Unnamed: 0,S.No.,Previous job changes,Graduation marks,Exp (Months)
count,1338.0,1338.0,1338.0,1338.0
mean,669.5,2.525411,59.890882,39.207025
std,386.391641,1.123502,14.894696,14.04996
min,1.0,1.0,35.0,18.0
25%,335.25,2.0,47.0,27.0
50%,669.5,3.0,60.0,39.0
75%,1003.75,4.0,73.0,51.0
max,1338.0,4.0,85.0,64.0


In [8]:
# Count the missing values in each column of the training frame.
data.isna().sum()

S.No.                   0
College                 0
Role                    0
City type               0
Previous CTC            0
Previous job changes    0
Graduation marks        0
Exp (Months)            0
CTC                     0
dtype: int64

In [9]:
# Remove the 'S.No.' column, which is irrelevant for modelling, then preview the
# first rows of the result.
data = data.drop(columns=['S.No.'])
data.head()

Unnamed: 0,College,Role,City type,Previous CTC,Previous job changes,Graduation marks,Exp (Months),CTC
0,Tier 1,Manager,Non-Metro,55523.0,3,66,19,71406.58
1,Tier 2,Executive,Metro,57081.0,1,84,18,68005.87
2,Tier 2,Executive,Metro,60347.0,2,52,28,76764.02
3,Tier 3,Executive,Metro,49010.0,2,81,33,82092.39
4,Tier 3,Executive,Metro,57879.0,4,74,32,73878.1


In [10]:
# One-hot (dummy-variable) encode the categorical columns with pandas.get_dummies.
# drop_first=True omits the first level of each column: that level becomes the
# implicit baseline, and the remaining indicator columns are not perfectly collinear.
categorical_columns = ['College', 'Role', 'City type']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

In [11]:
# 'CTC' should be numeric, but ',' characters in its values make pandas keep the
# column as strings. Remove the commas (literal match, not a regex) so the text can
# be converted to numbers afterwards.
ctc_text = data['CTC']
data['CTC'] = ctc_text.str.replace(',', '', regex=False)

In [12]:
# Remove ',' characters (literal match, not a regex) from 'Previous CTC' so its text
# values can be converted to numbers afterwards.
previous_ctc_text = data['Previous CTC']
data['Previous CTC'] = previous_ctc_text.str.replace(',', '', regex=False)

In [13]:
# Cast both salary columns from text to floating point in a single astype call.
data = data.astype({'CTC': 'float', 'Previous CTC': 'float'})

In [14]:
# Re-check the column dtypes after the conversion of the salary columns.

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Previous CTC          1338 non-null   float64
 1   Previous job changes  1338 non-null   int64  
 2   Graduation marks      1338 non-null   int64  
 3   Exp (Months)          1338 non-null   int64  
 4   CTC                   1338 non-null   float64
 5   College_Tier 2        1338 non-null   uint8  
 6   College_Tier 3        1338 non-null   uint8  
 7   Role_Manager          1338 non-null   uint8  
 8   City type_Non-Metro   1338 non-null   uint8  
dtypes: float64(2), int64(3), uint8(4)
memory usage: 57.6 KB


In [15]:
# Separate the target from the predictors: Y is the 'CTC' column to be predicted,
# X is every remaining column.
Y = data['CTC']
X = data.drop(columns=['CTC'])

In [16]:
# Hold out 30% of the rows for evaluation; random_state fixes the shuffle so the
# split is reproducible between runs.
# train_test_split is already imported in the imports cell at the top of the
# notebook, so it is not imported again here.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=0
)

In [17]:
# Build a linear regression estimator and fit it on the training split.
# The fit call is left as the last expression so the fitted estimator is echoed.

model = LinearRegression()
model.fit(X=x_train, y=y_train)

LinearRegression()

Testing and Predicting

In [18]:
# Predict CTC for the held-out rows with the fitted model.
Y_pred = model.predict(X=x_test)

In [19]:
# Side-by-side table of the held-out target values and the model's predictions.
actual_and_predicted = np.column_stack((y_test, Y_pred))
df = pd.DataFrame(actual_and_predicted, columns=["Actual", "Predicted"])
df

Unnamed: 0,Actual,Predicted
0,71105.71,79485.124518
1,62426.39,69054.039220
2,99734.64,96749.061695
3,85083.58,79052.020421
4,59721.74,65331.612536
...,...,...
397,70052.79,77403.366471
398,82925.12,87027.025189
399,76028.50,75917.437885
400,81015.32,81227.922598


In [20]:
# Evaluate on the held-out split. For a regressor, score() reports the coefficient
# of determination (R^2), not a classification accuracy.
holdout_score = model.score(x_test, y_test)
print("Model-Score :", holdout_score)

Model-Score : 0.6657383809347185
