In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the tab-separated sales file. header=None makes pandas treat the
# first line as data rather than as column names.
url = "C:/project_datasets/SALES.txt"
ds = pd.read_csv(
    url,
    sep="\t",
    header=None,
)

In [None]:
# Display the (rows, columns) dimensions of the loaded DataFrame.
ds.shape

In [None]:
# Give the two unnamed columns descriptive labels.

column_names = ["Sales", "Advertising"]
ds.columns = column_names

In [None]:
# Independent variable = Feature Variable / Input Variable / X / Predictor Variable
# Dependent variable = Target Variable / Output Variable / Y / Response Variable

In [None]:
# Pull the independent variable (x, the "Sales" column) and the dependent
# variable (y, the "Advertising" column) out of the DataFrame.
# The `.values` attribute hands back each column as a NumPy array.

x, y = (ds[column].values for column in ("Sales", "Advertising"))

In [None]:
# Visual exploratory data analysis: scatter of Advertising against Sales.

fig, ax = plt.subplots()
ax.scatter(x, y, color='blue', label='Scatter Plot')
ax.set_title('Relationship between Sales and Advertising')
ax.set_xlabel('Sales')
ax.set_ylabel('Advertising')
ax.legend(loc=4)  # loc=4 places the legend in the lower-right corner
plt.show()

In [None]:
# Checking dimensions of x and y.
# A notebook cell only displays its final expression, so both shapes are
# returned together as one tuple; on separate lines x.shape would be
# evaluated and silently discarded.

x.shape, y.shape

In [None]:
# With a single feature variable the arrays are 1-D, so reshape each of them
# into a single-column 2-D array.
# A dimension of -1 means "unspecified": NumPy infers it from the array size.

X = np.reshape(x, (-1, 1))
y = np.reshape(y, (-1, 1))

In [None]:
# Split the data, holding out 30% of the rows for testing.
# The fixed random_state makes the split repeatable across runs.

from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.3, random_state=4)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Fit a linear regression model on the training split.

from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and fitting can be chained.
lm = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out test rows.

y_pred = lm.predict(X_test)

In [None]:
# Extract the fitted parameters: b is the intercept, c the slope coefficient.

b, c = lm.intercept_, lm.coef_

# The fitted regression line has the form
#     y = c * x + b
# Values recorded by the original author for this dataset:
#     y = 1.60509347 * x - 11.16003616

# That is our linear model.

In [None]:
# Predicted Advertising values for the first five Sales values.

all_predictions = lm.predict(X)
all_predictions[:5]

In [None]:
# To make an individual prediction using the linear regression model.
# predict() requires a 2-D array of shape (n_samples, n_features); a bare
# scalar such as 24 raises a ValueError, so the value is wrapped as one row
# with one column.

single_sale = np.array([[24]])
print(str(lm.predict(single_sale)))

In [None]:
# Regression metrics for model performance on the test split:
# RMSE, MAE and the R2 score.

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Coefficient of determination.
r2 = r2_score(y_test, y_pred)

# Mean absolute error.
mae = mean_absolute_error(y_test, y_pred)

# Mean squared error and its square root (RMSE).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

In [None]:
# Plot the fitted regression line on top of all observations.

fig, ax = plt.subplots()
ax.scatter(X, y, color='blue', label='Scatter Plot')
# The line is drawn through the test-set predictions.
ax.plot(X_test, y_pred, color='black', linewidth=3, label='Regression Line')
ax.set_title('Relationship between Sales and Advertising')
ax.set_xlabel('Sales')
ax.set_ylabel('Advertising')
ax.legend(loc=4)  # loc=4 places the legend in the lower-right corner
plt.show()

In [None]:
# Plot residual errors (prediction minus actual) against the predicted
# value, separately for the training and the test split.

train_fitted = lm.predict(X_train)
test_fitted = lm.predict(X_test)

plt.scatter(train_fitted, train_fitted - y_train, color='red', label='Train data')
plt.scatter(test_fitted, test_fitted - y_test, color='blue', label='Test data')
# Zero-error reference line drawn from x = 0 to x = 50.
plt.hlines(y=0, xmin=0, xmax=50, linewidth=3)
plt.title('Residual errors')
plt.legend(loc=4)
plt.show()

In [None]:
# Score the fitted model on each split with LinearRegression.score().

test_score = lm.score(X_test, y_test)
train_score = lm.score(X_train, y_train)

# The original author observed a training score lower than the testing score
# and interpreted that as underfitting.

In [None]:
# Save the model for future use.
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23, so import the standalone joblib package (a scikit-learn dependency)
# and fall back to the bundled copy only on old installations.

try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(lm, 'lm_regressor.pkl')

# To load the model (only load pickle files that come from a trusted source):

# lm2=joblib.load('lm_regressor.pkl')