In [1]:
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

salary_data = pd.read_csv('salary.csv')

# Remove Nan values
salary_data.dropna(how='any',axis=0)

salary_data.shape

y = salary_data.salary

X_train, X_test, y_train, y_test = train_test_split(salary_data, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)

lm = smf.ols(formula='salary ~ yearsworked', data=X_train).fit() 

lm.params

(411, 11)
(103, 11)


Intercept      39853.42221
yearsworked      852.88278
dtype: float64

# What does the unstandardized coefficient (B or 'coef' in statsmodels) tell you about the relationship between Years Worked and Salary?

#### There is a positive relationship in between the salary and the years worked in that if you increase the years worked by 1 "unit" the salary would increase by approximately 852.88

# Calculate the expected salary for someone with 12 years’ work experience.

In [2]:
pre_df = pd.DataFrame({'yearsworked': [12]})
lm.predict(pre_df)

0    50088.015569
dtype: float64

#### Predicted salary for 80 years work experience: 50088.01

# Calculate the expected salary for someone with 80 years’ work experience. Are there any problems with this prediction? If so, what are they?

In [3]:
ei = pd.DataFrame({'yearsworked': [80]})
lm.predict(ei)

0    108084.044605
dtype: float64

#### Predicted salary for 80 years work experience: 108084.04

#### It is very unlikely for someone to to have 80 years work experience.

#### The prediction amount is very low for someone with some much experience wheres someone with 12 years experince would earn about half of this prediction. The problem may be that salary is also affected by other factors within the data set

In [19]:
# create a DataFrame with the minimum and maximum values
predictions = lm.predict(X_train.yearsworked)

In [5]:

# print the confidence intervals for the model coefficients
lm.conf_int()

Unnamed: 0,0,1
Intercept,38241.973844,41464.870575
yearsworked,752.014712,953.750848


# What do the 95% confidence intervals [0.025, 0.975] mean?

####  approximately 95% of the confidence intervals contain the "true" coefficient.

In [6]:
# print the p-values for the model coefficients
lm.pvalues

Intercept      4.551765e-172
yearsworked     9.006810e-48
dtype: float64

# Does the model significantly predict the dependent variable? Report the amount of variance explained (R^2) and significance value (p) to support your answer.

#### The model significantly oredicts the depended variable given that is has an an r2 of 0.4 and a p-value that is less than the threshold of 0.5

# What percentage of the variance in employees’ salaries is accounted for by the number of years they have worked

#### 38%

In [7]:
# print the R-squared value for the model
lm.rsquared

0.40315931469675204

In [8]:
lm.summary()

0,1,2,3
Dep. Variable:,salary,R-squared:,0.403
Model:,OLS,Adj. R-squared:,0.402
Method:,Least Squares,F-statistic:,276.3
Date:,"Mon, 01 Apr 2019",Prob (F-statistic):,9.01e-48
Time:,15:30:57,Log-Likelihood:,-4359.2
No. Observations:,411,AIC:,8722.0
Df Residuals:,409,BIC:,8730.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.985e+04,819.750,48.617,0.000,3.82e+04,4.15e+04
yearsworked,852.8828,51.312,16.622,0.000,752.015,953.751

0,1,2,3
Omnibus:,59.0,Durbin-Watson:,2.084
Prob(Omnibus):,0.0,Jarque-Bera (JB):,81.15
Skew:,1.009,Prob(JB):,2.39e-18
Kurtosis:,3.818,Cond. No.,27.2


In [9]:
salary_data.corr()

Unnamed: 0,salary,exprior,yearsworked,yearsrank,market,degree,otherqual,position,male,Field,yearsabs
salary,1.0,0.119989,0.623589,0.610706,0.407214,0.044554,0.167137,0.702122,0.361564,-0.499316,-0.06963
exprior,0.119989,1.0,-0.246549,-0.066403,-0.035041,-0.049373,0.264804,0.116596,0.046558,-0.048315,0.13238
yearsworked,0.623589,-0.246549,1.0,0.813471,-0.070896,0.028421,-0.04049,0.746736,0.278963,-0.261379,0.055468
yearsrank,0.610706,-0.066403,0.813471,1.0,-0.026975,0.006516,0.007036,0.48521,0.237787,-0.263953,0.035632
market,0.407214,-0.035041,-0.070896,-0.026975,1.0,0.036408,0.021692,-0.013358,0.181201,-0.223827,-0.167068
degree,0.044554,-0.049373,0.028421,0.006516,0.036408,1.0,-0.214717,0.04368,0.061611,-0.098424,0.029311
otherqual,0.167137,0.264804,-0.04049,0.007036,0.021692,-0.214717,1.0,0.160311,0.015833,-0.076623,-0.017639
position,0.702122,0.116596,0.746736,0.48521,-0.013358,0.04368,0.160311,1.0,0.318129,-0.288812,0.029751
male,0.361564,0.046558,0.278963,0.237787,0.181201,0.061611,0.015833,0.318129,1.0,-0.128874,-0.622179
Field,-0.499316,-0.048315,-0.261379,-0.263953,-0.223827,-0.098424,-0.076623,-0.288812,-0.128874,1.0,-0.000127


# We have only looked at the number of years an employee has worked. What other employee characteristics might influence their salary?

#### Gender
#### Education
#### exprior
#### Field
#### Yearsrank

In [10]:
np.sqrt(mean_squared_error(y_train, predictions))


9772.459691268014

In [11]:
nm = smf.ols(formula='salary ~ yearsworked', data=X_test).fit()
nm.summary()

0,1,2,3
Dep. Variable:,salary,R-squared:,0.335
Model:,OLS,Adj. R-squared:,0.329
Method:,Least Squares,F-statistic:,50.45
Date:,"Mon, 01 Apr 2019",Prob (F-statistic):,1.82e-10
Time:,15:30:58,Log-Likelihood:,-1088.3
No. Observations:,102,AIC:,2181.0
Df Residuals:,100,BIC:,2186.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4.112e+04,1723.426,23.859,0.000,3.77e+04,4.45e+04
yearsworked,776.8937,109.379,7.103,0.000,559.888,993.899

0,1,2,3
Omnibus:,26.696,Durbin-Watson:,1.792
Prob(Omnibus):,0.0,Jarque-Bera (JB):,42.186
Skew:,1.163,Prob(JB):,6.91e-10
Kurtosis:,5.126,Cond. No.,26.2


In [17]:
test_predictions = nm.predict(X_test)
test_predictions

304    58988.008995
497    55880.434158
440    48888.390775
153    48111.497066
499    55103.540449
131    46557.709648
204    56657.327867
508    55880.434158
325    61318.690122
247    57434.221576
362    62872.477541
352    57434.221576
289    60541.796413
84     46557.709648
10     43450.134811
324    45003.922229
78     46557.709648
30     41896.347393
184    54326.646740
195    54326.646740
222    61318.690122
209    53549.753030
281    59764.902704
208    57434.221576
124    45003.922229
250    55880.434158
76     44227.028520
381    58211.115286
342    62872.477541
323    60541.796413
           ...     
104    48111.497066
296    43450.134811
101    47334.603357
79     41119.453683
428    45003.922229
388    41896.347393
18     43450.134811
176    54326.646740
231    56657.327867
504    60541.796413
132    44227.028520
39     43450.134811
471    52772.859321
46     44227.028520
356    63649.371250
277    58211.115286
361    61318.690122
93     47334.603357
377    50442.178194


In [None]:
np.sqrt(mean_squared_error(y_test, test_predictions))