In [2]:
# Import all dependencies required for the problem.
from __future__ import print_function
from plotly.offline import iplot, init_notebook_mode

import numpy as np
import pandas as pd
import plotly.graph_objs as go

from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [3]:
# Fix the NumPy random seed up front so that results are reproducible.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Initialise plotly's notebook mode for the figures rendered with iplot.
init_notebook_mode(connected=True)

In [4]:
# Read the salary dataset CSV into a pandas DataFrame.
SALARY_CSV = '../../data/salary_dataset.csv'
df = pd.read_csv(SALARY_CSV)

In [5]:
# Preview the first five rows to check which columns were loaded.
df.head(n=5)

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0


In [7]:
# Split the dataset into the input feature (years of experience) and the
# target to predict (salary).
# Select the feature column by name: a label slice ending at 'YearsExperience'
# would also pull in the 'Unnamed: 0' row-number column that precedes it, and
# the model would then be trained on the row index as well.
# Double brackets keep x a single-column DataFrame rather than a Series.
x = df[['YearsExperience']]
y = df['Salary']

In [12]:
# Hold out 20% of the rows as a test set; the model is fitted on the remaining
# rows and evaluated on the held-out ones. Use the notebook-wide RANDOM_SEED
# instead of repeating the literal, so the seed is defined in one place.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=RANDOM_SEED)

In [14]:
# Fit a linear regression model on the training split. fit() returns the
# estimator itself, so construction and fitting are chained; the trailing
# expression displays the fitted estimator as the cell output.
regressor = LinearRegression().fit(X_train, y_train)
regressor

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [16]:
from math import sqrt
from sklearn.metrics import mean_squared_error

# Score the fitted model on the held-out split: RMSE is the square root of the
# mean squared difference between the actual and predicted target values.
test_predictions = regressor.predict(X_test)
test_rmse = sqrt(mean_squared_error(y_test.values, test_predictions))
print("Root Mean Squared Error Linear Regression Model: {:.2f}".format(test_rmse))

Root Mean Squared Error Linear Regression Model: 7059.04


In [17]:
# Plot the training observations together with the model's predictions.

# Training observations, drawn as individual markers.
plot1 = go.Scatter(
    x = X_train['YearsExperience'].values,
    y = y_train.values,
    name='Training Data',
    mode='markers',
    connectgaps=True
)

# The prediction trace is drawn as a connected line, and train_test_split
# shuffles the rows by default, so order the points by experience first;
# otherwise the line is traced back and forth in shuffled row order.
train_sorted = X_train.sort_values('YearsExperience')

plot2 = go.Scatter(
    x = train_sorted['YearsExperience'].values,
    # Predict from the DataFrame itself, as the RMSE cell does, so the input
    # has the same form as the data the model was fitted on.
    y = regressor.predict(train_sorted),
    name='Model Prediction',
    connectgaps=True
)

# Assemble both traces with a title and axis labels, then render inline.
fig = dict(data=[plot1, plot2], layout=dict(
    title='Salary vs Experience',
    xaxis=dict(title='Experience (years)'),
    yaxis=dict(title='Salary')))
iplot(fig)