## Model Trainer Salary of Data Professions

In [1]:
# importing librares
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import accuracy_score,r2_score
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# importing data
df=pd.read_csv('data/cleaned_data.csv')
df.head()

Unnamed: 0,FIRST NAME,LAST NAME,SEX,DOJ,CURRENT DATE,DESIGNATION,AGE,UNIT,LEAVES USED,LEAVES REMAINING,RATINGS,PAST EXP,SALARY,DAY,DAY NAME,MONTH,YEAR
0,TOMASA,ARMEN,F,2014-05-18,2016-01-07,Analyst,21.0,Finance,24,6,2.0,0,44570,18,Sunday,5,2014
1,OLIVE,ANCY,F,2014-07-28,2016-01-07,Analyst,21.0,Finance,23,7,3.0,0,40955,28,Monday,7,2014
2,CHERRY,AQUILAR,F,2013-04-03,2016-01-07,Analyst,22.0,IT,22,8,3.0,0,45550,3,Wednesday,4,2013
3,LEON,ABOULAHOUD,M,2014-11-20,2016-01-07,Analyst,24.0,Operations,27,3,0.0,3,43161,20,Thursday,11,2014
4,VICTORIA,-,F,2013-02-19,2016-01-07,Analyst,22.0,Marketing,20,10,4.0,0,48736,19,Tuesday,2,2013


In [3]:
col_names = df.columns.tolist()
col_names = col_names[0:12] + col_names[13:] + [col_names[12]]
X = df[col_names[:-1]]
y = df[col_names[-1]]

In [4]:
# removing unnecessary columns
X = X.drop(columns=['FIRST NAME','LAST NAME','DOJ','CURRENT DATE','LEAVES USED','LEAVES REMAINING','DAY','DAY NAME'])

In [5]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2477 entries, 0 to 2476
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   SEX          2477 non-null   object 
 1   DESIGNATION  2477 non-null   object 
 2   AGE          2477 non-null   float64
 3   UNIT         2477 non-null   object 
 4   RATINGS      2477 non-null   float64
 5   PAST EXP     2477 non-null   int64  
 6   MONTH        2477 non-null   int64  
 7   YEAR         2477 non-null   int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 154.9+ KB


In [6]:
num_features = X.select_dtypes(exclude='object').columns
cat_features = X.select_dtypes(include='object').columns

from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ('OneHotEncoder',categorical_transformer,cat_features),
        ('StandardScaler',numeric_transformer,num_features),
    ]
)

In [7]:
X_t = preprocessor.fit_transform(X)

In [8]:
X_t.shape

(2477, 19)

In [9]:
X.shape

(2477, 8)

In [10]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_t,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((1981, 19), (496, 19))

In [11]:
model = LinearRegression()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
r2_score(y_test,y_pred)

0.9629434307435873

In [12]:
from sklearn.model_selection import cross_val_score
print(cross_val_score(model,X_train,y_train,cv=3))

[0.95847121 0.9549807  0.94712537]


In [13]:
models = {
    'Linear Regression':LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'KNeighborsRegressor':KNeighborsRegressor(),
    'Ridge': Ridge(),
    'Lasso':Lasso(),
    'RandomForestRegressor':RandomForestRegressor()
}
model_list = list(models.keys())
train_r2_score = []
test_r2_score = []
l = pd.DataFrame()
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    
    train_r2_score.append(r2_score(y_train,y_train_pred))
    test_r2_score.append(r2_score(y_test,y_test_pred))
    # l.append(cross_val_score(model,X_train,y_train,cv=3).tolist())

In [14]:
pd.DataFrame({'Models':model_list,'Train R2':train_r2_score,'Test R2':test_r2_score})

Unnamed: 0,Models,Train R2,Test R2
0,Linear Regression,0.956749,0.962943
1,Decision Tree,0.999493,0.903023
2,KNeighborsRegressor,0.940405,0.93434
3,Ridge,0.955509,0.959793
4,Lasso,0.956748,0.962924
5,RandomForestRegressor,0.992316,0.949825


## Linear Regression

In [15]:
lin_model = LinearRegression(fit_intercept=True)
lin_model = lin_model.fit(X_train, y_train)
y_pred = lin_model.predict(X_test)
score = r2_score(y_test, y_pred)*100
print(" Accuracy of the model is %.2f" %score)

 Accuracy of the model is 96.29


In [16]:
# exporting model and preprocessor
import pickle

data ={'data':df,'model':lin_model,'preprocessor':preprocessor}
with open('model/model.pkl', 'wb') as file:
    pickle.dump(data, file)