In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import mean_squared_log_error, median_absolute_error

import warnings
warnings.filterwarnings('ignore')

In [None]:
#reading the data 
df=pd.read_csv("D:\Beinex\Python\Dataset-Kaggle\Task_26-06\Salary_Data.csv")
df

##### Understanding the data and its distribution

In [None]:
df.describe()

In [None]:
df.info()

##### Data cleaning & Feature Engineering

*Since the data does not have any unique id or we will not remove duplicates*

In [None]:
#checkking whether there is any na values in the data
df.isna().sum()

In [None]:
#finding rows where education level is na 
df[df['Education Level'].isna()]

In [None]:
#drop rows where all values are na
df=df.dropna(how='all',axis=0)
#checking the na values in data after dropping 
df.isna().sum()

In [None]:
#labels in eductaion distribution in the data
edu=df.value_counts(df['Education Level'])
print(edu)

In [None]:
#found multiple category labels, combined and mapped them
map_var={"High School":1,"Bachelor's Degree":2,"Bachelor's":2,"Master's Degree":3,"Master's":3,"PhD":4,"phD":4}
df['Education'] = df['Education Level'].transform(lambda x: x.map(map_var))
df['Education'].unique()
df.head()

In [None]:
df[df['Salary'].isna()]

In [None]:
#fill the missing salary considering the job title and eductaion
df['Salary']=df.groupby(['Job Title','Education'])['Education'].transform(lambda x: x.fillna(x.mode().iloc[0]))

In [None]:
#dropping the rows where still na values are present 
df.dropna(axis=0,inplace=True)
df.isna().sum()


In [None]:
#plotting the gender distribution in the data
gender=df.value_counts(df['Gender'])
print(gender)
plt.pie(gender,labels=gender,autopct='%1.1f%%')
plt.title('Gender distribution in Salary data')
plt.show()

*The data has almost an equaal distribution of male and female, however the 'other' categories are found to be significantly less*

In [None]:
#The gender labels are changed to numercal values using LabelEncoder method
df['Gender']=LabelEncoder().fit_transform(df['Gender']) #male as 1, female 0 and others and 2
df.head()

In [None]:
print("No of unique jobs titles are: ",df['Job Title'].nunique())
#The lables under job title are changed to numercal values using LabelEncoder method
df['Job Title']=LabelEncoder().fit_transform(df['Job Title']) 
df.info()

In [None]:
sns.displot(data=df, x='Years of Experience')
plt.title('Distribution of Years of Experience')
plt.show()


*Skewed to the right which implies need of normalization before applying ML algorithms as well as the presense of outliers in the data*

In [None]:
#from sklearn.preprocessing import MinMaxScaler
#Normalize the data (age,years of experience)
#df[['Age','Years of Experience']]=MinMaxScaler().fit_transform(df[['Age','Years of Experience']])

sns.pairplot(df, x_vars=['Age','Gender','Education','Job Title','Years of Experience'], y_vars=["Salary",'Years of Experience'])

*Strong correlation has been observed between education & salary, age & years of experience*

In [None]:
#checking outliers in the data
sns.boxplot(df['Salary'])

*No outliers found in salary variable. In case of age and years of experience, some outliers are found above the third quartile. But I believe removing those values might affect the prediction model.*

#### Applying regression models 

In [None]:
df.info()

In [None]:
#Assigning the dependent and independent variables 
X=df[['Age','Gender','Education','Job Title','Years of Experience']]
Y=df['Salary']

In [None]:
from sklearn import linear_model
from statsmodels.formula.api import ols
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [None]:

MLR=linear_model.LinearRegression().fit(X,Y)

#Multiple linear regression model
#Values 
print('The intercept: ', MLR.intercept_)
print('The coefficiants are: ',MLR.coef_)

#score
print("Variability in Y explained by X: ",MLR.score(X,Y))

In [None]:
LM=ols('Y~X',df).fit()
print(LM.summary())

*From the summary table, it is found that P>|t| value is greater than 0.05 for the variable 'gender', which implies the variable 'gender' is in-significant in predicting the salary*.

*Hence, we create another model without 'gender' in feature variables*

In [None]:
#Assigning the dependent and independent variables 
X=df[['Age','Education','Job Title','Years of Experience']]
Y=df['Salary']

#fitting the model
LM=ols('Y~X',df).fit()
print(LM.summary())

*All variables are found significant with R^2 value 1*

##### Different loss functions

1) **Mean Squared Error (MSE)**: Mean Squared Error is a commonly used loss function for regression problems. It calculates the average of the squared differences between the predicted values and the actual values. MSE penalizes larger errors more heavily due to the squaring operation.The lower the MSE value, the better the model's performance, with zero indicating a perfect fit.

2) **Mean Absolute Error (MAE)**: Mean Absolute Error calculates the average of the absolute differences between the predicted values and the actual values. MAE treats all errors equally and does not differentiate between small and large errors. The lower the MAE value, the better the model's performance, with zero indicating a perfect fit.

3) **R-squared (Coefficient of Determination)**: R-squared is a metric used to measure the proportion of the variance in the target variable that is explained by the model. It represents the goodness of fit of the regression model. R-squared ranges from 0 to 1, where 0 indicates that the model does not explain any variability and 1 indicates a perfect fit.

4) **Root Mean Squared Error (RMSE)**:RMSE is the square root of the Mean Squared Error (MSE).It represents the average magnitude of the errors made by the model in the same units as the target variable.RMSE is useful when we want to evaluate the model's performance in a more interpretable scale.

5) **Mean Absolute Percentage Error (MAPE)**: MAPE measures the average percentage difference between the predicted and actual values.It calculates the absolute percentage difference for each data point and then takes the average.MAPE is useful when you want to assess the relative error in percentage terms.

6) **Huber Loss**: Huber Loss is a combination of MSE and MAE. It behaves like MSE for small errors but switches to MAE for larger errors.
Huber Loss is less sensitive to outliers compared to MSE and provides a compromise between MSE and MAE.


*In this data, I believe all these listed loss functions are usable. However, since the varaible 'Salary' does not have any outliers Mean Absolute Error and Huber loss function will be the best to understand the performance of the regression based prediction.*


**Using linear regression model for the analysis**

In [None]:
#Split the data into training and testing sets (0.7:0.3)
X_train, X_test, Y_train, Y_test=train_test_split(X, Y, train_size=0.7,test_size=0.3, random_state=33)

#Create an instance of Linear Regression
model=LinearRegression()

#Train the Linear Regression model
model.fit(X_train, Y_train)
Y_pred=model.predict(X_test)

#Evaluate the model using different loss functions
mse=mean_squared_error(Y_test,Y_pred)
r2=r2_score(Y_test,Y_pred)
rmse=np.sqrt(mse)
mae=mean_absolute_error(Y_test,Y_pred)
mape=np.mean(np.abs((Y_test-Y_pred) / Y_test))*100
huber_loss=mean_squared_error(Y_test,Y_pred, squared=False)

#Print the loss values
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R-squared: {r2}")
print(f"MAE: {mae}")
print(f"MAPE: {mape}")
print(f"Huber Loss: {huber_loss}")

In [None]:
#Split the data into training and testing sets (0.8:0.2)
X_train, X_test, Y_train, Y_test=train_test_split(X, Y, train_size=0.8,test_size=0.2, random_state=33)

#Create an instance of Linear Regression
model=LinearRegression()

#Train the Linear Regression model
model.fit(X_train, Y_train)
Y_pred=model.predict(X_test)

#Evaluate the model using different loss functions
mse=mean_squared_error(Y_test,Y_pred)
r2=r2_score(Y_test,Y_pred)
rmse=np.sqrt(mse)
mae=mean_absolute_error(Y_test,Y_pred)
mape=np.mean(np.abs((Y_test-Y_pred) / Y_test))*100
huber_loss=mean_squared_error(Y_test,Y_pred, squared=False)

#Print the loss values
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R-squared: {r2}")
print(f"MAE: {mae}")
print(f"MAPE: {mape}")
print(f"Huber Loss: {huber_loss}")

In [None]:
#Split the data into training and testing sets (0.9:0.1)
X_train, X_test, Y_train, Y_test=train_test_split(X, Y, train_size=0.9,test_size=0.1, random_state=33)

#Create an instance of Linear Regression
model=LinearRegression()

#Train the Linear Regression model
model.fit(X_train, Y_train)
Y_pred=model.predict(X_test)

#Evaluate the model using different loss functions
mse=mean_squared_error(Y_test,Y_pred)
r2=r2_score(Y_test,Y_pred)
rmse=np.sqrt(mse)
mae=mean_absolute_error(Y_test,Y_pred)
mape=np.mean(np.abs((Y_test-Y_pred) / Y_test))*100
huber_loss=mean_squared_error(Y_test,Y_pred, squared=False)


#Print the loss values
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R-squared: {r2}")
print(f"MAE: {mae}")
print(f"MAPE: {mape}")
print(f"Huber Loss: {huber_loss}")

*The model was performing better with a train test split ration of 70:30. Therefore we take this as the best split ratio*

*Lower loss values indicate better performance in all the loss functions, except for R-squared where higher values indicate better fit. We are considering MSE loss function as the best fit loss fn according to the data.*

*The MAE value and Huber loss value corresponding the 70:30 split is* **4.765559110910889e-15**

In [None]:
#Calculate residuals
#Split the data into training and testing sets (0.7:0.3)
X_train, X_test, Y_train, Y_test=train_test_split(X, Y, train_size=0.7,test_size=0.3, random_state=33)

#Create an instance of Linear Regression
model=LinearRegression()

#Train the Linear Regression model
model.fit(X_train, Y_train)
Y_pred=model.predict(X_test)

Residual=Y_pred-Y_test
Residual

'''
#plotting 
plt.scatter(Y_pred,Residual,color='green')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residuals vs Predicted Values')
plt.show()
'''