In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import sklearn.metrics as sm

import pickle

from DataExploration import *
from DataExploration import descriptive_statistics as ds
from DataExploration import plots as pl

## Guidelines

Udfyld variablerne i nedenstående kodeblok med de ønskede data, og kør den derefter.

**Indsæt variabler nedenfor**


In [None]:
# Load the HR attrition CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='WA_Fn-UseC_-HR-Employee-Attrition.csv')

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Descriptive statistics from the project's DataExploration helper,
# in verbose mode and rounded to two digits.
ds.describe_data(
    df,
    round_digits=2,
    verbose=True,
)

In [None]:
# Remove the listed columns from df before the plots and the regression.
# The frame is reassigned instead of mutated with inplace=True, and
# errors='ignore' skips columns that are already gone, so re-running this
# cell no longer raises KeyError.
df = df.drop(
    columns=[
        'Attrition',
        'BusinessTravel',
        'Department',
        'EducationField',
        'Gender',
        'JobRole',
        'MaritalStatus',
        'Over18',
        'OverTime',
        'StandardHours',
    ],
    errors='ignore',
)

In [None]:
# Histograms of df via the project's plots helper, with the bell-curve
# overlay enabled and the 'grid' layout option.
pl.show_histograms(
    df,
    bell_curve=True,
    layout='grid',
)

In [None]:
# Box plots of df via the project's plots helper, using the 'grid' layout option.
# NOTE(review): presumably one plot per numeric column — confirm in DataExploration.plots.
pl.show_boxplots(df, layout='grid')

In [None]:
# Correlation heatmap over every column currently in df.
pl.show_correlation_heatmap(df)

In [None]:
# Correlation heatmap restricted to the age, income, job-level and
# tenure-related columns, to make the relevant relationships easier to read.
pl.show_correlation_heatmap(
    df[
        [
            'TotalWorkingYears',
            'Age',
            'MonthlyIncome',
            'MonthlyRate',
            'JobLevel',
            'YearsAtCompany',
            'YearsInCurrentRole',
            'YearsSinceLastPromotion',
            'YearsWithCurrManager',
        ]
    ]
)

Looking at the correlation heatmap, we've deduced that monthly income is a suitable dependent variable for a linear regression. Since job level is a categorical variable, the only relevant correlation we see is with total working years, making that our independent variable.

In [None]:
# Scatter of monthly income against total working years; the points are
# drawn first and the figure is annotated afterwards.
plt.scatter(x=df['TotalWorkingYears'], y=df['MonthlyIncome'], color='blue')
plt.title('Monthly Income vs Total Working Years')
plt.xlabel('Total Working Years')
plt.ylabel('Monthly Income')
plt.show()

## Linear Regression

In [None]:
# Independent variables: a two-column feature matrix
# (total working years, job level), one row per record.
X = df[['TotalWorkingYears', 'JobLevel']].to_numpy()
# Dependent variable: monthly income as a single-column 2-D array.
y = df[['MonthlyIncome']].to_numpy()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Ordinary least-squares regressor with default settings.
myreg = linear_model.LinearRegression()

In [None]:
# Fit on the training split; fit() returns the estimator itself, so this
# single expression also displays the fitted model as the cell output.
myreg.fit(X_train, y_train)

In [None]:
# Fitted parameters: a holds the coefficients, b the intercept.
a, b = myreg.coef_, myreg.intercept_

In [None]:
# Predict the target for the held-out test rows.
y_predicted = myreg.predict(X=X_test)

In [None]:
# Score the test-set predictions against the true values.
mae = sm.mean_absolute_error(y_true=y_test, y_pred=y_predicted)
mse = sm.mean_squared_error(y_true=y_test, y_pred=y_predicted)
# RMSE is the square root of the MSE computed above.
rmse = np.sqrt(mse)
# Only the explained-variance score is rounded (to two decimals).
eV = round(sm.explained_variance_score(y_true=y_test, y_pred=y_predicted), 2)
r2 = sm.r2_score(y_true=y_test, y_pred=y_predicted)

# Report every metric, one per line.
print('Mean Absolute Error ', mae)
print('Mean Squared Error ', mse)
print('Root Mean Squared Error ', rmse)
print('Explained variance score ', eV)
print('R2 score ', r2)



In [None]:
import ShowLinearRegression as slr

In [None]:
# Run the ShowLinearRegression helper with JobLevel and TotalWorkingYears as
# inputs and MonthlyIncome as the target, overriding the test share to 25%.
# NOTE(review): the original note states the helper defaults to test_size=0.2
# and a "test_shape" of 42 — presumably the random seed; confirm against
# show_model.
slr.show_model(
    df,
    ['JobLevel', 'TotalWorkingYears'],
    ['MonthlyIncome'],
    test_size=0.25,
)