In [1]:
"""
Created on Mon May 30 16:50:58 2022

AI Test 3 - Multivariable inputs to a single output using a linear regression
model. Upgraded version of AI Test 2

Isaac Burkholder
"""

import pandas as pd
import numpy as np
from sklearn import linear_model

# Imported here, directly below the other imports, instead of halfway through
# the cell, so every dependency is visible at the top.
from sklearn.metrics import r2_score

# --- Configuration -----------------------------------------------------------
# Open data set available online; any CSV with the same columns will work.
DATA_URL = "https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/FuelConsumptionCo2.csv"

# The input features we want to analyze, declared once so that the training
# and the testing matrices are guaranteed to use the same columns in the same
# order.
# Note that one of the interesting features of AI is its ability to overcome
# overfitting with additional parameters.
# See: https://www.youtube.com/watch?v=lK5LcwmMn9Y, minute 47
FEATURE_COLUMNS = ['ENGINESIZE', 'CYLINDERS', 'FUELCONSUMPTION_CITY',
                   'FUELCONSUMPTION_HWY', 'FUELCONSUMPTION_COMB',
                   'FUELCONSUMPTION_COMB_MPG']
# The single output the model learns to predict.
TARGET_COLUMN = "CO2EMISSIONS"
# Fraction of the rows used for training; the remainder is used for testing.
TRAIN_FRACTION = 0.8

# --- Load --------------------------------------------------------------------
data = pd.read_csv(DATA_URL)
# A bare `data.head()` in the middle of a cell displays nothing, so the preview
# is printed explicitly.
print(data.head())

X = data[FEATURE_COLUMNS]
Y = data[TARGET_COLUMN]

# --- Train / test split ------------------------------------------------------
# Sequential split: the first 80% of the rows train the model and the last 20%
# test it. The rows are NOT shuffled, so the result depends on the row order
# of the CSV.
split_index = int(len(data) * TRAIN_FRACTION)
train = data[:split_index]
test = data[split_index:]

train_x = np.array(train[FEATURE_COLUMNS])
train_y = np.array(train[TARGET_COLUMN])
test_x = np.array(test[FEATURE_COLUMNS])
test_y = np.array(test[TARGET_COLUMN])

# --- Modeling ----------------------------------------------------------------
# Using the sklearn package to fit a linear model to the training rows.
regr = linear_model.LinearRegression()
regr.fit(train_x, train_y)

# Print the coefficient values, one row per input feature.
coeff_data = pd.DataFrame(regr.coef_, X.columns, columns=["Coefficients"])
print(coeff_data)

# --- Evaluation --------------------------------------------------------------
# Now let's make a prediction from the held-out testing rows:
Y_pred = regr.predict(test_x)

# Check accuracy with the R² score (coefficient of determination). The true
# values go first and the predictions second.
R = r2_score(test_y, Y_pred)
print ("R² :",R)

R² : 0.9362912548588906


## A simple visualization of what is happening. Note that this uses a lower-dimensional space (a single input feature), because we cannot plot more than three dimensions.

In [None]:
"""
Created on Mon May 30 15:12:28 2022

AI Test 2

This is a linear regression model on a two dimensional plane.
Our 2 dimensions are CO2EMISSIONS and ENGINESIZE. Essentially what we are doing
is plotting our data on a two-dimensional plane and finding the line of best
fit through it.

Pros to this model:
    Linear regression and linear algebra are easy to understand and because this
    is only in 2 dimensions we can visualize it quite simply
    
Cons:
    This model cannot handle multidimensional spaces. It is also quite probable
    that it is not robust enough to accurately model much

@author: Isaac Burkholder
"""
#For working with databases
import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt

#To let python learn from linear models
from sklearn import linear_model

#Get feedback on prediction accuracy
from sklearn.metrics import r2_score

#Evaluates the fitted line at the given input: the input is multiplied by the
#weight (slope) the model has learned and shifted by the intercept it has
#learned. Both of these values come from the line of best fit.
def get_regression_predictions(input_features,intercept,slope):
    """Return the prediction ``input_features*slope + intercept``.

    Parameters
    ----------
    input_features : value(s) to predict for; anything supporting ``*`` with
        ``slope`` and ``+`` with ``intercept`` (a number or a NumPy array).
    intercept : offset of the fitted line.
    slope : weight of the fitted line.

    Returns
    -------
    The predicted value(s), with the type produced by the arithmetic above.
    """
    scaled_input = input_features * slope
    return scaled_input + intercept

#This is the dataset that we are using. It is read from the public URL of the
#FuelConsumptionCo2.csv file rather than from a user-specific absolute path on
#one machine, so the cell can be re-run anywhere with network access. Point
#DATA_URL at a local copy of the same file to work offline.
DATA_URL = "https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/FuelConsumptionCo2.csv"
data = pd.read_csv(DATA_URL)
#A bare `data.head()` in the middle of a cell displays nothing, so the preview
#is printed explicitly.
print(data.head())

#this separates out the two columns that we are looking for
data = data[["ENGINESIZE","CO2EMISSIONS"]]

#plot of the original data that we have.
plt.scatter(data["ENGINESIZE"], data["CO2EMISSIONS"], color='blue')
plt.xlabel("ENGINESIZE")
plt.ylabel("CO2EMISSIONS")
plt.title("CO2EMISSIONS vs. ENGINESIZE")
plt.show()

#Separating the data into training data, used to train the AI, and testing
#data, used to see how accurate the AI is. The first 80% of the rows (in file
#order, not shuffled) are for training and the rest are for testing.
split_at = int(len(data)*.8)
train, test = data[:split_at], data[split_at:]

#The double brackets keep each selection two-dimensional (one column), so the
#input and the target are both column arrays.
train_x = np.array(train[['ENGINESIZE']])
train_y = np.array(train[['CO2EMISSIONS']])

#This is linear regression: fitting the model to the training data is where we
#tell the AI to find the least squares solution, or the line of best fit.
#In linear algebra terms, the observed emissions are projected onto the set of
#outputs that a straight line through the inputs can produce.
regr = linear_model.LinearRegression().fit(train_x, train_y)

#To get a prediction about emissions, we pick an engine size and ask the
#fitted line for the matching emission value.
my_engine_size = 3.5

#Pull the learned intercept and slope out of the fitted model. They are
#indexed because the model stores them as arrays rather than plain numbers.
fitted_intercept = regr.intercept_[0]
fitted_slope = regr.coef_[0][0]

#Now pass our variables to the function...
estimatd_emission = get_regression_predictions(
    my_engine_size,
    fitted_intercept,
    fitted_slope,
)
#...and print our results
print ("Estimated Emission :",estimatd_emission)

#This is how we get our metrics on how well our AI prediction model is
#working: the testing rows are data the model has not been trained on.
test_x = np.array(test[['ENGINESIZE']])
test_y = np.array(test[['CO2EMISSIONS']])

#This is where we feed our trained AI the known testing inputs, let it guess
#the emissions, and compare its guesses to the real results.
test_y_predictions = regr.predict(test_x)
print("Mean absolute error: %.2f" % np.mean(np.absolute(test_y_predictions - test_y)))
print("Mean sum of squares (MSE): %.2f" % np.mean((test_y_predictions - test_y) ** 2))
#r2_score expects the true values first and the predictions second. The score
#is not symmetric, so swapping the two arguments reports a different (wrong)
#number.
print("R2-score: %.2f" % r2_score(test_y , test_y_predictions) )