In [276]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [277]:
# Load the salary dataset from CSV and print a summary of its columns,
# non-null counts and dtypes.
df = pd.read_csv(filepath_or_buffer="Salary Data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  373 non-null    float64
 1   Gender               373 non-null    object 
 2   Education Level      373 non-null    object 
 3   Job Title            373 non-null    object 
 4   Years of Experience  373 non-null    float64
 5   Salary               373 non-null    float64
dtypes: float64(3), object(3)
memory usage: 17.7+ KB


In [278]:
# Discard every row that has at least one missing value
# (axis=0 / how="any" are the dropna defaults, spelled out for clarity).
df = df.dropna(axis=0, how="any")

In [279]:
# Work on an independent (deep) copy so the encoding steps below do not
# touch `df`, then preview the first five rows.
employee_data = df.copy(deep=True)
employee_data.head(n=5)

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [280]:
#feature encoding
#1] One Hot encoding
# drop_first=True drops the first Gender category so only the remaining
# indicator column(s) are kept; dtype="int64" emits 0/1 integers directly
# instead of booleans that need a separate cast afterwards.
employee_data=pd.get_dummies(employee_data,columns=["Gender"],drop_first=True,dtype="int64")
#2] Label Encoding
# Ordinal codes for the education levels, lowest to highest.
mapping={
    "Bachelor's":1,
    "Master's":2,
    "PhD":3
}
education_encoded=employee_data["Education Level"].map(mapping)
# Series.map() silently yields NaN for any label that is not a key of
# `mapping`. Fail here, naming the offending labels, rather than letting
# NaNs reach the model-fitting step.
unmapped_levels=employee_data.loc[education_encoded.isna(),"Education Level"].unique()
if len(unmapped_levels)>0:
    raise ValueError(f"Unmapped Education Level values: {list(unmapped_levels)}")
employee_data["Education Level"]=education_encoded.astype("int64")

In [281]:
#3] Frequency Encoding
# Replace each job title with the number of rows that share that title.
# NOTE(review): the counts are taken over all of employee_data, i.e. before
# the train/test split performed in a later cell, so test rows contribute
# to the encoding -- confirm whether that is acceptable.
# NOTE(review): re-running this cell would re-encode the already-encoded
# counts; run it once per fresh employee_data.
freq = employee_data["Job Title"].value_counts()
job_title_counts = employee_data["Job Title"].map(freq)
employee_data["Job Title"] = job_title_counts

In [282]:
# Reorder the columns so the target ('Salary') sits at the far right,
# after the columns introduced by one-hot encoding.
cols = employee_data.columns.drop('Salary').tolist()
cols.append('Salary')
employee_data = employee_data[cols]

In [283]:
#train_test_split
# No other visible cell imports train_test_split, so import it here;
# without this the cell raises NameError on a fresh kernel.
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is Salary.
X=employee_data.drop(["Salary"],axis=1)
y=employee_data["Salary"]
# Hold out 20% of the rows for testing; random_state fixes the shuffle so
# the split is reproducible.
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=0.2)
X_train.head()

Unnamed: 0,Age,Education Level,Job Title,Years of Experience,Gender_Male
193,34.0,1,6,7.0,1
75,37.0,1,2,10.0,1
84,29.0,1,1,2.0,0
363,33.0,1,5,5.0,1
16,33.0,2,1,7.0,0


In [284]:
#perform Standardization
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply the same
# fitted scaler to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [285]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the scaled training data (fit() returns
# the estimator itself), then predict salaries for the scaled test set.
model = LinearRegression().fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)


In [299]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,root_mean_squared_error


In [290]:
# Report the model's score() (R2) on the training and the test split.
for split_label, split_features, split_target in (
    ("Train R2:", X_train_scaled, y_train),
    ("Test R2:", X_test_scaled, y_test),
):
    print(split_label, model.score(split_features, split_target))


Train R2: 0.9033612765419364
Test R2: 0.8998057975106805


In [291]:
# Report the error metrics of the linear model's test-set predictions.
for metric_label, metric_fn in (
    ('MAE :', mean_absolute_error),
    ("MSE :", mean_squared_error),
    ("RMSE :", root_mean_squared_error),
):
    print(metric_label, metric_fn(y_test, y_pred))

MAE : 10649.032393185065
MSE : 240223838.65812597
RMSE : 15499.156062770837


In [294]:
import numpy as np
from sklearn.metrics import mean_absolute_error

# Express the MAE as a percentage of the mean test-set salary. The MAE is
# computed from the current predictions instead of a pasted constant, so
# the figure stays correct when the data, split or model changes.
mean_salary = np.mean(y_test)
mae = mean_absolute_error(y_test, y_pred)
print("Mean of y_test:", mean_salary)
print("MAE %:", (mae / mean_salary) * 100)


Mean of y_test: 102466.66666666667
MAE %: 10.392648015614833


In [300]:
#Perform LassoCV
from sklearn.linear_model import LassoCV

# Candidate regularisation strengths; LassoCV picks among them using
# 5-fold cross-validation on the scaled training data.
alphas = [0.01, 0.1, 2, 5, 10, 20, 30, 40, 50, 100]
LassoCV_model = LassoCV(alphas=alphas, cv=5, max_iter=1000, random_state=42).fit(
    X_train_scaled, y_train
)

# Evaluate the cross-validated Lasso model on the held-out test set.
y_pred_lasso = LassoCV_model.predict(X_test_scaled)
lasso_mse = mean_squared_error(y_test, y_pred_lasso)
print("MSE using LassoCV : ", lasso_mse)

MSE using LassoCV :  240225013.51707652


In [298]:
# Compare test-set R2 of the plain linear model and the LassoCV model.
for r2_label, predictions in (("Linear R2:", y_pred), ("Lasso R2:", y_pred_lasso)):
    print(r2_label, r2_score(y_test, predictions))


Linear R2: 0.8998057975106805
Lasso R2: 0.8993615243280902
