### Multiple Linear Regression

#### Step 1. Import packages

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
import numpy as np
%matplotlib inline

#### Step 2. Read the data in

In [2]:
# download the source dataset here: path= "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%202/data/FuelConsumptionCo2.csv"
source = "/Users/pc/Desktop/IBM AI Engineer/Machine Learning with Python"
path = source + "FuelConsumptionCo2.csv"

df = pd.read_csv("FuelConsumptionCo2.csv")

#### Step 3. Data Exploration

In [3]:
# summarize the data
df.describe()

Unnamed: 0,MODELYEAR,ENGINESIZE,CYLINDERS,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
count,1067.0,1067.0,1067.0,1067.0,1067.0,1067.0,1067.0,1067.0
mean,2014.0,3.346298,5.794752,13.296532,9.474602,11.580881,26.441425,256.228679
std,0.0,1.415895,1.797447,4.101253,2.79451,3.485595,7.468702,63.372304
min,2014.0,1.0,3.0,4.6,4.9,4.7,11.0,108.0
25%,2014.0,2.0,4.0,10.25,7.5,9.0,21.0,207.0
50%,2014.0,3.4,6.0,12.6,8.8,10.9,26.0,251.0
75%,2014.0,4.3,8.0,15.55,10.85,13.35,31.0,294.0
max,2014.0,8.4,12.0,30.2,20.5,25.8,60.0,488.0


#### Step 4. Creating train and test dataset

- 80% train & 20% test
- select random rows using np.random.rand()

In [5]:
cdf = df[['ENGINESIZE','CYLINDERS','FUELCONSUMPTION_CITY','FUELCONSUMPTION_HWY','FUELCONSUMPTION_COMB','CO2EMISSIONS']]

msk = np.random.rand(len(df)) < 0.8
train = cdf[msk]
test  = cdf[~msk]

#### Step 5. Build the Simple Regression Model 
linear regression fits a linear model with coefficients B = (B1, ..., Bn) to minimize the MSE.

In [8]:
# Modelling: use sklearn package to model data
from sklearn import linear_model
regr = linear_model.LinearRegression()
x = np.asanyarray(train[["ENGINESIZE", "CYLINDERS", "FUELCONSUMPTION_COMB"]])
y = np.asanyarray(train[["CO2EMISSIONS"]])
regr.fit(x, y)

# Coefficients
print("Coefficients: ", regr.coef_)

Coefficients:  [[9.99933053 7.94880197 9.86734343]]


#### Step 6. Prediction

In [10]:
y_pred = regr.predict(test[["ENGINESIZE", "CYLINDERS", "FUELCONSUMPTION_COMB"]])
x = np.asanyarray(test[["ENGINESIZE", "CYLINDERS", "FUELCONSUMPTION_COMB"]])
y = np.asanyarray(test[["CO2EMISSIONS"]])
print("Residual sum of squares: %.2f"%np.mean((y_pred - y) ** 2))


Residual sum of squares: 521.43




#### Step 7. Evaluation

$$explainedVariance(y, \hat y) = 1 - \frac{Var\{y - \hat y\}}{Var\{y\}}$$

In [11]:
# Explained variance score: 1 is prefect prediction
print("Variance score: %.2f" % regr.score(x, y))


Variance score: 0.88


#### Step 8. Example

Another model:
- "ENGINESIZE", "CYLINDERS", "FUELCONSUMPTION_CITY"
- vs
- "CO2EMISSIONS"

In [13]:
# Build the Simple Regression Model 
from sklearn import linear_model
regr = linear_model.LinearRegression()
x = np.asanyarray(train[["ENGINESIZE", "CYLINDERS", "FUELCONSUMPTION_CITY"]])
y = np.asanyarray(train[["CO2EMISSIONS"]])
regr.fit(x, y)

# Prediction
y_pred = regr.predict(test[["ENGINESIZE", "CYLINDERS", "FUELCONSUMPTION_CITY"]])
x = np.asanyarray(test[["ENGINESIZE", "CYLINDERS", "FUELCONSUMPTION_CITY"]])
y = np.asanyarray(test[["CO2EMISSIONS"]])
print("Residual sum of squares: %.2f"%np.mean((y_pred - y) ** 2))

# Evaluation
print("Variance score: %.2f" % regr.score(x, y))


Residual sum of squares: 511.95
Variance score: 0.88


