<a href="https://colab.research.google.com/github/KevinHern/AI-Crash-Course/blob/main/AI_Crash_Course_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Metrics for Regressions

[Presentation: AI Crash Course 02](https://view.genial.ly/6196efb01bfa3c0dac953b67/presentation-ai-crashcourse02)

## 0) Preparations

In [None]:
# ----- Libraries ----- #

# For graph plotting
import matplotlib.pyplot as plt

# For dataset manipulation
import pandas as pd

# For visualizing more complex maps
import seaborn as sns

# For statistical analysis and Models
import statsmodels.api as sm
import statsmodels.formula.api as smapi
import numpy as np

In [None]:
'''
All the information regarding the dataset used for this demo can be found in the following link:
https://archive.ics.uci.edu/ml/datasets/auto+mpg
'''

# Getting Dataset
!wget http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data

In [None]:
# Column labels to assign, since they are passed explicitly via `names`
column_names = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower',
    'Weight', 'Acceleration', 'Model_Year', 'Origin',
]

# Parse the space-separated file: "?" marks a missing value, and everything
# from a tab character to the end of the line is treated as a comment
raw_dataset = pd.read_csv(
    "auto-mpg.data",
    sep=" ",
    skipinitialspace=True,
    names=column_names,
    na_values="?",
    comment='\t',
)

# Brief statistical summary of the dataset
raw_dataset.describe()

In [None]:
# Let's check the column labels that were assigned when the file was loaded
raw_dataset.columns

In [None]:
# Preview of the first rows of the dataset (head() shows 5 rows by default)
raw_dataset.head()

In [None]:
# shape is a tuple of the form (# rows, # columns)
raw_dataset.shape

In [None]:
# Let's check for null values. Only the last expression of a cell is displayed
# automatically, so the per-column counts are printed explicitly.
print(raw_dataset.isna().sum())

# Dropping null rows. dropna() returns a new DataFrame, so raw_dataset is left
# untouched and no explicit copy is needed beforehand.
new_dataset = raw_dataset.dropna()

# Checking new dataset
new_dataset.head()

In [None]:
# Let's visualize the data: pairwise plots of the numeric columns, with a
# kernel density estimate on the diagonal. The trailing semicolon keeps the
# PairGrid object's text repr out of the cell output; the figure still renders.
sns.pairplot(
    new_dataset[["Cylinders", "Displacement", "Weight", "Acceleration", "Horsepower", "MPG"]],
    diag_kind="kde",
);

## 1) Linear Models

In [None]:
# Simple linear regression: predict MPG from Acceleration alone
model = smapi.ols(formula="MPG ~ Acceleration", data=new_dataset).fit()
print(model.summary())

In [None]:
# Simple linear regression: predict MPG from Weight alone
model = smapi.ols(formula="MPG ~ Weight", data=new_dataset).fit()
print(model.summary())

In [None]:
# Simple linear regression: predict MPG from Displacement alone
model = smapi.ols(formula="MPG ~ Displacement", data=new_dataset).fit()
print(model.summary())

## 2) Multilinear Models

In [None]:
# Two predictors: Acceleration and Weight
model = smapi.ols(formula="MPG ~ Acceleration + Weight", data=new_dataset).fit()
print(model.summary())

In [None]:
# Three predictors: Acceleration, Weight and Displacement
model = smapi.ols(
    formula="MPG ~ Acceleration + Weight + Displacement",
    data=new_dataset,
).fit()
print(model.summary())

In [None]:
# Four predictors: Acceleration, Weight, Displacement and Cylinders
model = smapi.ols(
    formula="MPG ~ Acceleration + Weight + Displacement + Cylinders",
    data=new_dataset,
).fit()
print(model.summary())

In [None]:
# Five predictors: Acceleration, Weight, Displacement, Horsepower and Cylinders
model = smapi.ols(
    formula="MPG ~ Acceleration + Weight + Displacement + Horsepower + Cylinders",
    data=new_dataset,
).fit()
print(model.summary())

## 3) Polynomial Models

In [None]:
# Polynomial term: I(...) makes the formula use Horsepower squared as a predictor
model = smapi.ols(
    formula="MPG ~ I(Horsepower**2) + Weight + Acceleration",
    data=new_dataset,
).fit()
print(model.summary())

## 4) Metrics

### Dataset Split

In [51]:
# This will be useful to split our dataset into training and testing
from sklearn.model_selection import train_test_split

# Let's say test is 20%. The fixed random_state pins the shuffle, so the same
# rows land in train/test on every run and the metrics computed from this
# split are reproducible.
train, test = train_test_split(new_dataset, test_size=0.2, random_state=42)

### Training models

In [None]:
# Model 1: a simple linear regression, fitted on the training split
model1 = smapi.ols(formula="MPG ~ Weight", data=train).fit()
print(model1.summary())

In [None]:
# Model 2: a multilinear regression, fitted on the training split
model2 = smapi.ols(
    formula="MPG ~ Acceleration + Weight + Displacement + Cylinders",
    data=train,
).fit()
print(model2.summary())

In [None]:
# Model 3: a polynomial regression, fitted on the training split
model3 = smapi.ols(
    formula="MPG ~ I(Horsepower**2) + Weight + Acceleration",
    data=train,
).fit()
print(model3.summary())

### Making predictions

In [55]:
# Predict on the test split with each of the three fitted models
model1_predictions, model2_predictions, model3_predictions = (
    fitted.predict(test) for fitted in (model1, model2, model3)
)

### RMSE

In [None]:
# RMSE helper from statsmodels
from statsmodels.tools.eval_measures import rmse

# Compare each model's test-split predictions against the test MPG column
model1_rmse, model2_rmse, model3_rmse = (
    rmse(test['MPG'], predictions)
    for predictions in (model1_predictions, model2_predictions, model3_predictions)
)

print("Model1 RMSE: {}\nModel2 RMSE: {}\nModel3 RMSE: {}".format(model1_rmse, model2_rmse, model3_rmse))

### MAE

In [57]:
# MAE helper from statsmodels
from statsmodels.tools.eval_measures import meanabs

# Compare each model's test-split predictions against the test MPG column
model1_mae, model2_mae, model3_mae = (
    meanabs(test['MPG'], predictions)
    for predictions in (model1_predictions, model2_predictions, model3_predictions)
)

print("Model1 MAE: {}\nModel2 MAE: {}\nModel3 MAE: {}".format(model1_mae, model2_mae, model3_mae))

Model1 MAE: 2.890330807364871
Model2 MAE: 2.775852434918535
Model3 MAE: 2.8648236187058687


## 5) Stepwise

In [63]:
def stepwise_selection(dataset, target, significance_level = 0.05, debug=False):
  """Build an OLS model for `target` by forward stepwise selection.

  Starting with no predictors, each round fits one candidate model per
  remaining column (the terms selected so far plus that column). Among the
  candidates whose newly added term has a p-value <= `significance_level`,
  the one with the largest absolute t-value is kept. The search stops when no
  remaining column qualifies or when every column has been added.

  Parameters
  ----------
  dataset : object exposing `.columns` and usable as `data` for `smapi.ols`
      (a pandas DataFrame in this notebook). Its columns are the target plus
      the candidate predictors.
  target : str
      Name of the dependent-variable column.
  significance_level : float
      Largest p-value accepted for a newly added term.
  debug : bool
      When True, print the summary of every candidate model.

  Returns
  -------
  The fitted OLS results for the selected formula, fitted on `dataset`. When
  no predictor qualifies, an intercept-only model (`target ~ 1`) is returned.

  Raises
  ------
  ValueError
      If `target` is not one of the columns of `dataset`.
  """
  candidates = list(dataset.columns)
  candidates.remove(target)
  selected = []

  def build_formula(terms):
    # Use an explicit intercept-only right-hand side instead of leaving it empty
    return target + " ~ " + (" + ".join(terms) if terms else "1")

  # At most one variable is added per round, so len(candidates) rounds suffice
  for _ in range(len(candidates)):
    best_var = None
    max_t_value = 0

    for exog_variable in candidates:
      # Fit on the `dataset` argument (not on a notebook-level global) so the
      # selection actually reflects the data the caller passed in
      model = smapi.ols(
          formula=build_formula(selected + [exog_variable]),
          data=dataset,
      ).fit()

      if debug:
        print(model.summary())

      # Only significant terms compete; the strongest |t| wins the round
      if model.pvalues[exog_variable] <= significance_level:
        t_value = abs(model.tvalues[exog_variable])
        if t_value > max_t_value:
          best_var = exog_variable
          max_t_value = t_value

    # No significant candidate left: stop the search
    if best_var is None:
      break

    selected.append(best_var)
    candidates.remove(best_var)

  return smapi.ols(formula=build_formula(selected), data=dataset).fit()

In [None]:
# Run the stepwise search with the training split and show the resulting model
best_model = stepwise_selection(dataset=train, target='MPG')
print(best_model.summary())
print('\n')

# Score the selected model on the test split with both metrics (RMSE, then MAE)
best_model_predictions = best_model.predict(test)
print(
    "Best Model RMSE: {}\nBest Model MAE: {}".format(
        *(metric(test['MPG'], best_model_predictions) for metric in (rmse, meanabs))
    )
)