In [193]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [194]:
# Load the salary dataset from the CSV file that sits next to the notebook.
salary_csv = 'Salary_Data.csv'
df = pd.read_csv(salary_csv)

In [195]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [196]:
# Summarise column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6704 entries, 0 to 6703
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  6702 non-null   float64
 1   Gender               6702 non-null   object 
 2   Education Level      6701 non-null   object 
 3   Job Title            6702 non-null   object 
 4   Years of Experience  6701 non-null   float64
 5   Salary               6699 non-null   float64
dtypes: float64(3), object(3)
memory usage: 314.4+ KB


In [197]:
# (rows, columns) of the raw frame.
df.shape

(6704, 6)

In [198]:
# Count rows that are exact duplicates of an earlier row.
# NOTE(review): the count is only displayed; if duplicates are never dropped,
# identical rows can end up on both sides of a train/test split — confirm
# that keeping them is intended.
df.duplicated().sum()

4912

In [199]:
# Missing-value count per column.
df.isna().sum()

Age                    2
Gender                 2
Education Level        3
Job Title              2
Years of Experience    3
Salary                 5
dtype: int64

In [200]:
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.preprocessing import StandardScaler

In [201]:
# Discard every row that has at least one missing value (modifies `df` itself).
df.dropna(axis='index', how='any', inplace=True)

In [202]:
import matplotlib.pyplot as plt
import seaborn as sns

In [203]:
# List the column labels before renaming them.
df.columns

Index(['Age', 'Gender', 'Education Level', 'Job Title', 'Years of Experience',
       'Salary'],
      dtype='object')

In [204]:
# Replace the spaces in column labels with underscores (modifies `df` itself).
underscored_names = {
    'Education Level': 'Education_Level',
    'Job Title': 'Job_Title',
    'Years of Experience': 'Years_of_Experience',
}
df.rename(columns=underscored_names, inplace=True)

In [205]:
# Distinct education labels present in the column.
df['Education_Level'].unique()

array(["Bachelor's", "Master's", 'PhD', "Bachelor's Degree",
       "Master's Degree", 'High School', 'phD'], dtype=object)

In [206]:
# Collapse the spelling variants of each education level into a single label.
# The result is assigned back to the column rather than calling
# `.replace(..., inplace=True)` on `df.Education_Level`: that chained in-place
# call operates on an intermediate Series, which pandas warns about and which
# leaves `df` unchanged once Copy-on-Write is active.
education_aliases = {
    "Bachelor's": "Bachelor",
    "Bachelor's Degree": "Bachelor",
    "Master's": "Masters",
    "Master's Degree": "Masters",
    "phD": "PhD",
    "Master": "Masters",
}
df['Education_Level'] = df['Education_Level'].replace(education_aliases)

In [207]:
# Preview the frame after the education labels were normalised.
df.head()

Unnamed: 0,Age,Gender,Education_Level,Job_Title,Years_of_Experience,Salary
0,32.0,Male,Bachelor,Software Engineer,5.0,90000.0
1,28.0,Female,Masters,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor,Sales Associate,7.0,60000.0
4,52.0,Male,Masters,Director,20.0,200000.0


In [208]:
# Re-check the distinct education labels after normalisation.
df['Education_Level'].unique()

array(['Bachelor', 'Masters', 'PhD', 'High School'], dtype=object)

In [209]:
import plotly.express as px

# Interactive box plot of the Age column.
px.box(data_frame=df, y='Age')

In [210]:
# Cap Age to the closed range [21, 53]; values outside it are pulled to the nearest bound.
age_floor, age_ceiling = 21, 53
df['Age'] = df['Age'].clip(lower=age_floor, upper=age_ceiling)

In [211]:
# Preview the frame after Age was clipped.
df.head()

Unnamed: 0,Age,Gender,Education_Level,Job_Title,Years_of_Experience,Salary
0,32.0,Male,Bachelor,Software Engineer,5.0,90000.0
1,28.0,Female,Masters,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor,Sales Associate,7.0,60000.0
4,52.0,Male,Masters,Director,20.0,200000.0


In [212]:
# Let pandas render up to 200 rows before truncating displayed output.
pd.set_option('display.max_rows', 200)

In [213]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import BinaryEncoder

In [214]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.model_selection import train_test_split

In [215]:
# Separate the target (Salary) from the features, then hold out 25% of the
# rows for evaluation; the fixed random_state keeps the split repeatable.
y = df['Salary']
X = df.drop(columns='Salary')
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

In [114]:
# Column groups, one per preprocessing strategy.
num_cols = ['Age', 'Years_of_Experience']
cat_cols = ['Gender', 'Job_Title']
ord_cols = ['Education_Level']

# Ordinal codes follow list position: 'High School' is first, 'PhD' is last.
education_order = ['High School', 'Bachelor', 'Masters', 'PhD']

# One single-step pipeline per column group.
num_pipe = Pipeline(steps=[('standard', StandardScaler())])
ord_pipe = Pipeline(steps=[('ordinal', OrdinalEncoder(categories=[education_order]))])
cat_pipe = Pipeline(steps=[('cat', BinaryEncoder(cols=cat_cols))])

# Route each column group to its pipeline; any column not listed is passed through.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', num_pipe, num_cols),
        ('ordin', ord_pipe, ord_cols),
        ('categ', cat_pipe, cat_cols),
    ],
    remainder='passthrough',
)

In [115]:
# Fit the preprocessor on the training features only, then apply the fitted
# transformation to both splits.
# NOTE(review): both names are rebound to the transformer's output, so after
# this cell X_train / X_test no longer hold the raw feature frames, and
# re-running the cell would feed already-transformed data back into the
# preprocessor.
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

In [116]:
# Baseline: ordinary least-squares regression on the preprocessed training data.
model = LinearRegression()
model.fit(X=X_train, y=Y_train)

LinearRegression()

In [117]:
# Baseline predictions for the held-out rows.
y_pred = model.predict(X=X_test)

In [118]:
# Show the held-out target values (the index labels are the original row labels).
Y_test

1883    150000.0
2630     75969.0
498     100000.0
5973     60000.0
4108     80000.0
          ...   
5021    120000.0
2592    185462.0
5706     70000.0
2929    150729.0
499     190000.0
Name: Salary, Length: 1675, dtype: float64

In [119]:
# Score of the baseline linear model on the held-out split.
model.score(X=X_test, y=Y_test)

0.716698338901687

In [143]:
# Lasso regression with regularisation strength 100, capped at 1000 iterations.
ls = Lasso(max_iter=1000, alpha=100)
ls.fit(X=X_train, y=Y_train)

Lasso(alpha=100)

In [144]:
# Score of the Lasso model on the held-out split.
ls.score(X=X_test, y=Y_test)

0.7175153008212871

In [147]:
# Lasso predictions for the held-out rows.
ls_pred = ls.predict(X=X_test)

In [145]:
# Ridge regression with regularisation strength 100, capped at 1000 iterations.
rg = Ridge(max_iter=1000, alpha=100)
rg.fit(X=X_train, y=Y_train)

Ridge(alpha=100, max_iter=1000)

In [146]:
# Score of the Ridge model on the held-out split.
rg.score(X=X_test, y=Y_test)

0.7161934235844925

In [148]:
# Ridge predictions for the held-out rows.
rg_pred = rg.predict(X=X_test)

In [159]:
import joblib

# Persist the fitted Lasso model (`ls`) to disk; the call returns the list of files written.
joblib.dump(value=ls, filename='salary_prediction.pickle')

['salary_prediction.pickle']

In [160]:
# Persist the fitted preprocessor so the same transformation can be reapplied later.
joblib.dump(value=preprocessor, filename='model_transformer.pickle')

['model_transformer.pickle']

In [221]:
# Show the raw (untransformed) features of the first held-out row.
# X_test was rebound to the preprocessor's array output in an earlier cell, so
# it no longer supports `.iloc`; the row is taken from the untouched feature
# frame `X` using the first index label of `Y_test`, which still carries the
# original row labels. Selecting with a one-element list keeps the result a
# one-row DataFrame with its column dtypes intact.
X.loc[[Y_test.index[0]]]

Unnamed: 0,Age,Gender,Education_Level,Job_Title,Years_of_Experience
0,36.0,Male,Masters,Back end Developer,9.0


In [222]:
# Reload the saved preprocessor and check it on the first held-out row.
# The path is passed straight to joblib.load so no file handle is left open
# (the earlier `open(...)` was never closed).
# Only load pickle files from a trusted source: unpickling can execute code.
processor = joblib.load('model_transformer.pickle')
# X_test holds the already-transformed array at this point, so the raw row is
# taken from the untouched feature frame `X` via the first label of `Y_test`.
processor.transform(X.loc[[Y_test.index[0]]])

array([[0.32533386, 0.15297122, 2.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 1.        , 0.        ,
        1.        , 0.        , 0.        ]])

In [223]:
# Hand-written example input: one person described by the five feature values.
Age = 25
Gender = "Male"
Education_Level = "Bachelor"
Job_Title = "Data Analyst"
Years_of_experience = 3

# Assemble the values into a one-row frame whose column labels match the
# training features.
sample_record = {
    'Age': Age,
    'Years_of_Experience': Years_of_experience,
    'Education_Level': Education_Level,
    'Gender': Gender,
    'Job_Title': Job_Title,
}
frame = pd.DataFrame([sample_record])

In [224]:
# Run the hand-written example through the reloaded preprocessor.
processor.transform(X=frame)

array([[-1.14759894, -0.84784211,  1.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  1.        ,  1.        ]])