In [12]:
# When evaluating our models, not only do we want to visualize the results, 
# but we also want a quantitative measure to determine how accurate the model is.

# Two very important measures that are often used in Statistics to determine 
# the accuracy of a model are:

# R^2 / R-squared
# Mean Squared Error (MSE)

# R-squared

# R-squared, also known as the coefficient of determination, is a measure of 
# how close the data are to the fitted regression line.

# The value of R-squared is the proportion (often quoted as a percentage) of the variation 
# in the response variable (y) that is explained by the linear model.

# Mean Squared Error (MSE)

# The Mean Squared Error is the average of the squared errors, where each error is 
# the difference between the actual value (y) and the estimated value (ŷ):
# MSE = (1/n) * sum((y_i - ŷ_i)^2)

In [19]:
import numpy as np
import pandas as pd
from sklearn import linear_model
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score

In [14]:
# Load the automobile EDA CSV from its remote URL.
raw_dataset = pd.read_csv(
    "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-SkillsNetwork/labs/Data%20files/automobileEDA.csv"
)
# Keep only the predictor (highway-mpg) and the target (price).
raw_dataset = raw_dataset[['highway-mpg', 'price']]

# Work on a copy and discard every row that contains a missing value.
dataset = raw_dataset.copy().dropna()

# 80% of the rows, drawn with a fixed seed, form the training split;
# every row that was not drawn goes to the test split.
train_dataset = dataset.sample(frac=0.8, random_state=1)
test_dataset = dataset.drop(index=train_dataset.index)

# Split each subset into features (independent variable) and labels
# (dependent variable); pop() removes 'price' from the feature frame.
train_features = train_dataset.copy()
train_labels = train_features.pop('price')

test_features = test_dataset.copy()
test_labels = test_features.pop('price')

In [16]:
# Build a linear regression estimator and fit it on the training split
# (fit() returns the estimator itself, so the two calls can be chained).
regr = linear_model.LinearRegression().fit(train_features, train_labels)

# Predicted prices for the held-out test rows.
price_pred = regr.predict(test_features)

# score() reports R^2 on the test split; the closer it is to 1, the larger
# the share of the price variation that the model explains.
print(
    'The R-square is: ',
    regr.score(test_features, test_labels),
)

The R-square is:  0.521090174566289


In [17]:
# Predict prices for the test features and show the first four of them.
Yhat = regr.predict(test_features)
print(
    'The output of the first four predicted value is: ',
    Yhat[:4],
)

The output of the first four predicted value is:  [16306.03066416 17974.31114462 22145.01234576  4628.06730096]


In [26]:
# Mean squared error between the actual test prices and the predictions.
mse = mean_squared_error(y_true=test_labels, y_pred=Yhat)
print(
    'The mean square error of price and predicted value is: ',
    mse,
)

The mean square error of price and predicted value is:  29640542.54717908


In [None]:
# Decision Making: Determining a Good Model Fit

# Now that we have visualized the different models, and generated the R-squared and MSE values for the fits, how do we determine a good model fit?

# What is a good R-squared value?
# When comparing models, the model with the higher R-squared value is a better fit for the data.

# What is a good MSE?
# When comparing models, the model with the smallest MSE value is a better fit for the data.

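# Let's take a look at the values for the different models.
# Note: the figures quoted below do not match the held-out test-split results
# printed above (R-squared of about 0.521, MSE of about 2.96 x 10^7).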
# Simple Linear Regression: Using Highway-mpg as a Predictor Variable of Price.

# R-squared: 0.49659118843391759
# MSE: 3.16 x 10^7
# Multiple Linear Regression: Using Horsepower, Curb-weight, Engine-size, and Highway-mpg as Predictor Variables of Price.

# R-squared: 0.80896354913783497
# MSE: 1.2 x 10^7
# Polynomial Fit: Using Highway-mpg as a Predictor Variable of Price.

# R-squared: 0.6741946663906514
# MSE: 2.05 x 10^7