# MLR


### Import packages & Loading Data


In [None]:
import pandas as pd
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import statsmodels
from scipy.stats import iqr
from sklearn.model_selection import train_test_split
from scipy import stats
from scipy.stats import pearsonr
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from math import sqrt
from statistics import mean,stdev
import warnings
warnings.filterwarnings("ignore")


In [None]:
# Load the raw salary dataset from the working directory.
salary_csv = 'salary.csv'
sal = pd.read_csv(salary_csv)

### Missing Values

In [None]:
# Number of missing values per column (isna is the alias of isnull).
sal.isna().sum()

In [None]:
# Impute missing salaries with the column mean. The result is assigned back
# to the column instead of calling fillna(inplace=True) on the selected
# Series: that chained in-place form can operate on a temporary object and
# leave `sal` unchanged (it is a no-op under pandas Copy-on-Write).
# NOTE(review): the mean is computed over the full dataset, i.e. before the
# train/test split - confirm this is acceptable for the analysis.
sal['salary'] = sal['salary'].fillna(sal['salary'].mean())

# Remove fully duplicated rows.
sal = sal.drop_duplicates()

* Filling in missing values and dropping duplicates

### Splitting Data

In [None]:
# Hold out 30% of the rows for testing (fixed seed for reproducibility).
# `x` is the full frame, so it still carries the salary column alongside
# the predictors; `y` is the salary column on its own.
x = sal
y = sal['salary']
(x_train, x_test, y_train, y_test) = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Correlation of every training column with salary, sorted ascending.
# The salary column is selected by name rather than by position so the
# result does not depend on the column order of the CSV.
data_corr = x_train.corr()['salary'].sort_values()
# Bare expression: displayed as the cell output.
data_corr

In [None]:
# Keep only the columns whose absolute correlation with salary exceeds 0.45,
# strongest positive correlation first.
is_strong = data_corr.abs() > 0.45
strong_data = data_corr[is_strong].sort_values(ascending=False)


*The features of interest with regard to salary are:*
- Years worked in this field
- Years worked at current rank 
- Position
- Field of work

*as they have the higher correlation*

*We further need to see if the relationships are significant.* 

### Descriptive table

In [None]:
# Training-set view restricted to salary and the features of interest.
features_of_interest = ['salary', 'yearsworked', 'yearsrank', 'position', 'Field']
data_num = x_train[features_of_interest]

In [None]:
# Summary statistics for the selected columns; the bare expression is
# displayed as the cell output.
data_num.describe()

### Histograms

In [None]:
# One histogram per selected column, 20 bins each, with small tick labels.
hist_options = {
    'figsize': (16, 20),
    'bins': 20,
    'xlabelsize': 8,
    'ylabelsize': 8,
}
data_num.hist(**hist_options)

*From the above histograms of the features of interest it is clear that Position and Field are categorical data.*
- Salary 
    - There are possible outliers
    - It is skewed to the right
    - A lot of individuals earn between 33000 dollars & 75000 dollars


- Years worked in this field
    - There are possible outliers
    - It is skewed to the right
    - A lot of individuals have worked between 0 years & 29 years
    

- Years worked at current rank
    - It is skewed to the right
    - There are possible outliers
    - Most individuals have worked at the current rank for years between 0 & 5 years.
    

### Scatter plots

In [None]:
# Pairwise scatter plots of the selected columns (histograms on the diagonal).
pd.plotting.scatter_matrix(data_num, figsize=(10, 10), alpha=0.2)

*Looking at the above scatter matrix:*
- The first row from top of the matrix
    - It shows the relationship of salary with other features of interest.
        - There is a linear relationship between salary & years worked in the current field.
        - There is a linear relationship between salary & Years worked at current rank.
        - There is no defined relationship with salary for position and Field.
- The matrix can be used to see if there is a relationship amongst the features other than the relationship with salary 
    

*The Field feature and Position feature are categorical. We therefore have to transform the data.*

### Transforming data (get_dummies)

In [None]:
# Build a labelled, one-hot encoded version of the full dataset.
# `data` starts as a copy of `sal` so the cell runs from a fresh kernel and
# the cleaned source frame is left untouched.
data = sal.copy()

# Numeric code -> readable label for each categorical column.
dict1 = {1: 'Engineering', 2: 'Finance', 3: 'Human Resources', 4: 'Marketing'}
dict2 = {0: 'female', 1: 'male'}
dict3 = {1: 'Junior', 2: 'Manager', 3: 'Executive'}
dict4 = {0: 'no other qual', 1: 'yes other qual'}
dict5 = {0: 'no degree', 1: 'yes degree'}

for column, labels in (
    ('Field', dict1),
    ('male', dict2),
    ('position', dict3),
    ('otherqual', dict4),
    ('degree', dict5),
):
    # Codes that have no entry in the mapping become NaN.
    data[column] = data[column].map(labels)

# One indicator column per label; drop_first removes the first level of each
# categorical column to avoid perfectly collinear dummies.
data = pd.get_dummies(data, prefix_sep='_', drop_first=True)


In [None]:
# Replace the numeric category codes in the training frame with readable
# labels, one mapping per categorical column.
dict1 = {1: 'Engineering', 2: 'Finance', 3: 'Human Resources', 4: 'Marketing'}
dict2 = {0: 'female', 1: 'male'}
dict3 = {1: 'Junior', 2: 'Manager', 3: 'Executive'}
dict4 = {0: 'no other qual', 1: 'yes other qual'}
dict5 = {0: 'no degree', 1: 'yes degree'}

for column, labels in (
    ('Field', dict1),
    ('male', dict2),
    ('position', dict3),
    ('otherqual', dict4),
    ('degree', dict5),
):
    x_train[column] = x_train[column].map(labels)


In [None]:
# One-hot encode the labelled training frame, dropping the first level of
# each categorical column.
x_train1 = pd.get_dummies(data=x_train, drop_first=True, prefix_sep='_')

In [None]:
# Salary plus the candidate predictors (numeric columns and dummy columns)
# whose correlations are inspected next.
desired_columns = [
    'salary',
    'yearsworked',
    'yearsrank',
    'market',
    'position_Junior',
    'position_Manager',
    'Field_Finance',
    'Field_Marketing',
    'Field_Human Resources',
]
desired_features = x_train1[desired_columns]

In [None]:
# Pairwise correlation matrix of the candidate features, rendered as a
# colour-graded table.
desired_features_corr = desired_features.corr()
# Matplotlib colormap names are case-sensitive; the registered name is 'pink'.
desired_features_corr.style.background_gradient(cmap='pink')

In [None]:
# Pearson correlation (and its p-value) between salary and each candidate
# predictor. Both series are read from the same encoded training frame.
salary_train = x_train1['salary']
pearson_targets = (
    ('yearsworked', 'yearsworked'),
    ('yearsrank', 'yearsrank'),
    ('junior position', 'position_Junior'),
    ('manager position', 'position_Manager'),
    ('finance field', 'Field_Finance'),
    ('marketing field', 'Field_Marketing'),
    ('HR field', 'Field_Human Resources'),
)
for label, column in pearson_targets:
    print(
        'For {}, the correlation and p_value are:'.format(label),
        stats.pearsonr(x_train1[column], salary_train),
    )


*The above correlation map shows the relationship between the features.* 
- It can show whether multicollinearity exists.
- The darker the color on the block the higher the correlation.
    
*From the features we assume to be desired, the ones with strong correlation with salary are:*
- Yearsworked
- Yearsrank
- Position 

*But there is a strong correlation between yearsworked and yearsrank.
Therefore the features chosen for fitting a model are:*
- yearsworked
- position

Due to the low correlation between salary and the Field categories, we decided to not include Field in our model.

In [None]:
# Apply the same code -> label mappings to the test frame, then one-hot
# encode it.
# NOTE(review): the test frame is encoded independently of the training
# frame, so a label absent from the test rows yields no dummy column for it -
# confirm the columns line up with the training design matrix.
dict1 = {1: 'Engineering', 2: 'Finance', 3: 'Human Resources', 4: 'Marketing'}
dict2 = {0: 'female', 1: 'male'}
dict3 = {1: 'Junior', 2: 'Manager', 3: 'Executive'}
dict4 = {0: 'no other qual', 1: 'yes other qual'}
dict5 = {0: 'no degree', 1: 'yes degree'}

for column, labels in (
    ('Field', dict1),
    ('male', dict2),
    ('position', dict3),
    ('otherqual', dict4),
    ('degree', dict5),
):
    x_test[column] = x_test[column].map(labels)

x_test1 = pd.get_dummies(data=x_test, drop_first=True, prefix_sep='_')

In [None]:
# Field = pd.get_dummies(x_train['Field'],prefix_sep='_',drop_first=False)
# Field.set_index('Human Resources')

In [None]:
# dat_num = data_num.drop(['salary'],axis=1)
# plt.figure(figsize=(15,15))
# ax = sns.boxplot(data=dat_num,orient='h' ,palette='Set2')


## Fitting the model for all the features

In [None]:
# Remove the target column so only predictors remain in the design matrix.
x_train1 = x_train1.drop(columns=['salary'])

In [None]:
import statsmodels.api as sm

# Ordinary least squares fit of salary on every available predictor.
# get_dummies may produce boolean indicator columns (the default in recent
# pandas); casting to float keeps the design matrix purely numeric, which
# statsmodels requires.
x_train2 = sm.add_constant(x_train1.astype(float))
model1 = sm.OLS(y_train, x_train2).fit()
# Bare expression: the regression summary is displayed as the cell output.
model1.summary()


*The R-squared is 0.823, which means about 82% of the variance in salary is explained by this model.*
*Looking at the P-values, most variables do not have a significant relationship with salary.*

## Fitting the model for the desired features

In [None]:
# Reduced OLS model: years at current rank plus the two position dummies.
# The dummy columns may be boolean, so cast to float to keep the design
# matrix numeric for statsmodels.
# NOTE(review): the text above selects yearsworked, but this model uses
# yearsrank - confirm which predictor is intended.
x_train3 = x_train1[['yearsrank', 'position_Junior', 'position_Manager']].astype(float)
x_train3 = sm.add_constant(x_train3)
model2 = sm.OLS(y_train, x_train3).fit()
# Bare expression: the regression summary is displayed as the cell output.
model2.summary()

*The R-squared value of this model is 0.619, which tells us that the model explains about 62% of the variance in salary.*


*What we get from this model is that:*
 - For every 1 unit increase in yearsrank the employee salary will increase by 699.3902 dollars.
 - For a junior position the employee will earn 17020 dollars less than an executive (the baseline position).
 - For a manager position the employee will earn 11580 dollars less than an executive.
 - According to this model the lower the rank of your position the lower the employee salary.
 - The intercept, 54510 dollars, is the predicted salary for an executive with zero years at the current rank.
    

### Predictions


In [None]:
# Build the test design matrix with the same columns as the reduced model.
# The dummy columns may be boolean, so cast to float for statsmodels.
x_test1 = x_test1[['yearsrank', 'position_Junior', 'position_Manager']].astype(float)
# has_constant='add' always inserts the intercept column; the default
# ('skip') omits it when a predictor happens to be constant in the test
# rows, which would leave the matrix one column short for model2.
x_test1 = sm.add_constant(x_test1, has_constant='add')

In [None]:
# Predict salaries for the held-out test rows with the reduced model.
# (The training-set counterpart, salary_prediction1, is disabled.)
salary_prediction2 = model2.predict(exog=x_test1)


Calculating residuals


In [None]:
# Test-set residuals: observed salary minus predicted salary, together with
# their mean and standard deviation.
# (The training-set counterparts, residual1*, are disabled.)
observed_test = y_test.values
residual2 = observed_test - salary_prediction2
residual2_mean, residual2_std = residual2.mean(), residual2.std()

In [None]:
# Scale the test residuals by their standard deviation (no mean-centering
# is applied here).
standardised_residual2 =  residual2/residual2_std
# Bare expression: displayed as the cell output.
standardised_residual2

In [None]:
# z-score the test predictions: subtract their mean, divide by their
# standard deviation.
prediction_mean = salary_prediction2.mean()
prediction_std = salary_prediction2.std()
standardised_predicted = (salary_prediction2 - prediction_mean) / prediction_std

In [None]:

# Residual diagnostic plot: standardised residuals against standardised
# fitted values, with a LOWESS smoother overlaid.
# A real figure is created here (2:1 aspect ratio) so the size takes effect.
plt.figure(figsize=(12, 6))
# x and y are passed by keyword; current seaborn releases do not accept
# them positionally.
sns.residplot(x=standardised_predicted, y=standardised_residual2, lowess=True)
# Call the pyplot labelling functions; assigning to plt.xlabel would replace
# the function itself and leave the axis unlabelled.
plt.xlabel('fitted values')
plt.ylabel('standardised residuals')
plt.title('Standardised residuals vs fitted values')

We added the LOWESS line, which is the locally weighted smoothing line. The LOWESS line is a tool which draws a smooth curve through our residual plot to help us see any relationship or trend between our variables. There is no clear relationship in the plot, since the points are scattered (randomly dispersed); thus, the addition of the LOWESS line supports that a linear model is appropriate for the data. 
There are no problems with the regression.

In [None]:
# Root mean squared error of the reduced model on the test set.
r = mean_squared_error(y_true=y_test, y_pred=salary_prediction2)
root_mean_error2 = sqrt(r)
print('The RMSE for test set is:', root_mean_error2)