In [None]:
# Importing Needed packages
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
import numpy as np
%matplotlib inline

In [None]:
# Data source: URL of the fuel-consumption / CO2-emission CSV file.
# Nothing is fetched in this cell; the URL is only stored for the reader cell.
from urllib.request import urlopen
FuelConsumption = (
    'https://s3-api.us-geo.objectstorage.softlayer.net/'
    'cf-courses-data/CognitiveClass/ML0101ENv3/labs/FuelConsumptionCo2.csv'
)


In [None]:
# Load the CSV located at the FuelConsumption URL into a DataFrame
dataset = pd.read_csv(filepath_or_buffer=FuelConsumption)
# Preview the first five rows
dataset.head(n=5)

In [None]:
# Data exploration: a descriptive summary of the data.
# describe() reports summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns of the frame.
dataset.describe()

In [None]:
# Keep a small subset of columns to explore: three candidate predictors
# plus the CO2 emission column.
choosen_features = dataset[
    [
        'ENGINESIZE',
        'CYLINDERS',
        'FUELCONSUMPTION_COMB',
        'CO2EMISSIONS',
    ]
]

# Preview the first 8 rows of the 4 selected columns
choosen_features.head(n=8)


In [None]:
# Visualize the distribution of each selected feature with histograms
import seaborn as sns  # imported only to give the plots a nicer default style
sns.set(style="ticks")

# Select the four columns to plot, then draw one histogram per column
veja = choosen_features[
    ['CYLINDERS', 'ENGINESIZE', 'CO2EMISSIONS', 'FUELCONSUMPTION_COMB']
]
veja.hist()
plt.show()

In [None]:
# Plot each feature against the emission to judge how linear the relationship is.
# First: FUELCONSUMPTION_COMB vs CO2EMISSIONS

plt.scatter(
    x=choosen_features['FUELCONSUMPTION_COMB'],
    y=choosen_features['CO2EMISSIONS'],
    color='blue',
)
plt.xlabel("FUELCONSUMPTION_COMB")
plt.ylabel("Emission")
plt.show()

In [None]:
# Engine size against CO2 emissions
plt.scatter(
    x=choosen_features['ENGINESIZE'],
    y=choosen_features['CO2EMISSIONS'],
    color='blue',
)
plt.xlabel("Engine size")
plt.ylabel("Emission")
plt.show()

In [None]:
# Number of cylinders against CO2 emissions
plt.scatter(
    x=choosen_features['CYLINDERS'],
    y=choosen_features['CO2EMISSIONS'],
    color='blue',
)
plt.xlabel("CYLINDERS")
plt.ylabel("Emission")
plt.show()

In [None]:
# Train/test split: roughly 80% of the rows for training, the remaining ~20% for testing.
# Each row is assigned independently through a random boolean mask, so the
# proportions are approximate rather than exact.
# A fixed seed makes the split (and every number derived from it) reproducible
# under Restart Kernel -> Run All.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8

# A private generator keeps the seeding local and leaves NumPy's global random state alone.
split_rng = np.random.RandomState(SPLIT_SEED)

# The mask is sized from the frame it indexes, so the two cannot get out of step.
mask = split_rng.rand(len(choosen_features)) < TRAIN_FRACTION  # True -> row goes to the training set
train = choosen_features[mask]   # rows selected for training
test = choosen_features[~mask]   # the remaining rows form the test set

In [None]:
# Distribution of the training data, shown for a single feature (engine size)
plt.scatter(
    x=train['ENGINESIZE'],
    y=train['CO2EMISSIONS'],
    color='blue',
)
plt.xlabel("Engine size")
plt.ylabel("Emission")
plt.show()

In [None]:
# Modeling: simple linear regression of CO2 emissions on engine size, using scikit-learn
from sklearn import linear_model

# Double-bracket selection keeps each array two-dimensional (a single column)
train_x = np.asanyarray(train[['ENGINESIZE']])
train_y = np.asanyarray(train[['CO2EMISSIONS']])

# fit() returns the estimator itself, so construction and fitting are chained
regr = linear_model.LinearRegression().fit(train_x, train_y)

# Slope and y-intercept found by the model
Coefficients, Intercept = regr.coef_, regr.intercept_

# Report the fitted parameters
print('Coefficients: ', Coefficients)
print('Intercept: ', Intercept)

In [None]:
# Predicted CO2 emission on the training set, computed by hand from the
# fitted line: y = slope * x + intercept
predicted_CO2 = Coefficients[0][0] * train_x + Intercept[0]

# Number of rows in the training set (the value depends on the random split)
size_train_x = train_x.shape[0]
size_train_x

In [None]:
# Overlay the fitted regression line on the training data
plt.scatter(train.ENGINESIZE, train.CO2EMISSIONS,  color='blue')
plt.plot(train_x, predicted_CO2, '-r')  # '-r' draws a solid red line
plt.xlabel("Engine size")
plt.ylabel("Emission_CO2")
# Render the figure explicitly, like every other plotting cell; without this the
# cell's last expression (the Text object returned by ylabel) is echoed as output.
plt.show()

In [None]:
# Predict CO2 emissions for the held-out test set.
# Both arrays are built the same way: a single-column, two-dimensional selection.
test_x, test_y = (
    np.asanyarray(test[[column]])
    for column in ('ENGINESIZE', 'CO2EMISSIONS')
)
predicted_CO2_Emission = regr.predict(test_x)

In [None]:
# Evaluating the model's performance on the test set with three common regression metrics
from sklearn.metrics import r2_score

# Residuals are computed once and shared by both error metrics
residuals = predicted_CO2_Emission - test_y

print("Mean absolute error: %.2f" % np.mean(np.absolute(residuals)))
# The mean of the squared residuals is the mean squared error, not a residual *sum* of squares
print("Mean squared error (MSE): %.2f" % np.mean(residuals ** 2))
# r2_score expects (y_true, y_pred); R2 is not symmetric, so the argument order matters
print("R2-score: %.2f" % r2_score(test_y, predicted_CO2_Emission))