# Load and inspect

In [None]:
import sklearn
import numpy as np
import os

# Ignore useless warnings (see SciPy issue #5998).
# Only warnings whose message starts with "internal gelsd" are silenced;
# every other warning is still shown.
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [None]:
# This function creates a datasets/auto-mpg directory in your workspace and downloads the auto-mpg.data file into it.
import os
import tarfile
import urllib

# Location of the Auto MPG dataset in the UCI repository and the local
# directory (relative to the working directory) where it is stored.
DOWNLOAD_ROOT = "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/"
AUTO_PATH = os.path.join("datasets", "auto-mpg")
AUTO_URL = DOWNLOAD_ROOT + "auto-mpg.data"


def fetch_auto_data(auto_url=AUTO_URL, auto_path=AUTO_PATH):
    """Download the Auto MPG data file into ``auto_path``.

    The target directory is created (including parents) when it does not
    exist yet. The file is always saved as ``auto-mpg.data`` and is
    overwritten on every call.

    Args:
        auto_url: URL of the data file to download.
        auto_path: Directory in which the file is stored.

    Returns:
        None.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule, so
    # ``urllib.request`` is only available if some other library happened to
    # import it first. Import it explicitly to make the function reliable.
    import urllib.request

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(auto_path, exist_ok=True)
    data_path = os.path.join(auto_path, "auto-mpg.data")
    urllib.request.urlretrieve(auto_url, data_path)

In [None]:
# Download the dataset into AUTO_PATH using the default URL.
# Note: the file is fetched again every time this cell is run.
fetch_auto_data()

In [None]:
# This function loads the dataset from the auto-mpg.data file.
# It returns a pandas DataFrame object containing all the data.

import pandas as pd

def load_auto_data(auto_path=AUTO_PATH):
    """Read ``auto-mpg.data`` from ``auto_path`` as a fixed-width text file.

    The file is parsed without a header row, so the resulting columns are
    labelled with integers starting at 0.

    Args:
        auto_path: Directory that contains ``auto-mpg.data``.

    Returns:
        A pandas DataFrame holding the contents of the file.
    """
    auto_file = os.path.join(auto_path, "auto-mpg.data")
    frame = pd.read_fwf(auto_file, header=None)
    return frame

In [None]:
# Load the dataset from the default location (AUTO_PATH) into a DataFrame.
auto_mpg = load_auto_data()

# Display the top five rows. Columns are labelled 0, 1, 2, ... because the
# file is read without a header row.
auto_mpg.head()

In [None]:
# Get a quick description of the data: column dtypes and non-null counts.
# (Look for null values and non-numerical data, which require special data preparation.)
auto_mpg.info()

In [None]:
# Look at the non-numerical feature in column 8: count how often each value occurs.
auto_mpg[8].value_counts()

In [None]:
# Display basic summary statistics for the features.
auto_mpg.describe()

In [None]:
# Plot a histogram of each numerical feature, using 50 bins per plot.
# (The first line determines which graphical backend to use - in this case Jupyter's own.)
%matplotlib inline
import matplotlib.pyplot as plt
auto_mpg.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# The correlation coefficient ranges from -1 (100% negative correlation) to 1 (100% positive correlation).
# Only numeric columns can be correlated. The frame also holds non-numeric
# columns (column 8 is text and column 3 contains '?' markers), and recent
# pandas versions raise on those instead of silently skipping them, so the
# numeric columns are selected explicitly.
corr_matrix = auto_mpg.select_dtypes(include="number").corr()
# Rank every numeric feature by its correlation with column 0, the column
# that is later used as the label.
corr_matrix[0].sort_values(ascending=False)

# Preparation

In [None]:
# Split the dataset randomly into a training set (80%) and a test set (20%).
# A fixed random seed (42) makes the split reproducible between runs.
# As a rule of thumb, pick 20% for the test set, unless the dataset is very large.
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(auto_mpg, test_size=0.2, random_state=42)

## Stratified sampling

In [None]:
# Predictors: every column of the training set except the label column 0.
# drop() returns a new DataFrame, so train_set itself is left untouched.
auto_mpg_predictors = train_set.drop(columns=0)

# Labels: an independent copy of column 0 of the training set.
auto_mpg_labels = train_set[0].copy()

In [None]:
# The median can only be computed on numerical attributes, so leave out the
# text attribute stored in column 8.
auto_mpg_num = auto_mpg_predictors.drop(columns=8)

In [None]:
# Handle the missing data marked by '?'.
# Alternative (disabled): remove the affected rows instead of replacing the marker.
#auto_mpg_num = auto_mpg_num[auto_mpg_num[3] != '?']
# Replace every '?' with NaN so that the imputer can fill the gaps later.
auto_mpg_num.replace('?', np.nan, inplace=True)

In [None]:
# Create an imputer that fills missing values using the "median" strategy.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")

In [None]:
# Compute the median of each attribute of the numerical training predictors.
# The fitted imputer is reused later to transform both training and test data.
imputer.fit(auto_mpg_num)

In [None]:
# Inspect dtypes and non-null counts of the numerical predictors after the
# '?' markers were replaced by NaN (the imputer has not been applied yet).
auto_mpg_num.info()

In [None]:
# Fill the missing values using the fitted imputer.
# Note that auto_mpg_num is rebound to the imputer's output here.
auto_mpg_num = imputer.transform(auto_mpg_num)

## Feature scaling

In [None]:
# Scale the imputed training predictors. The scaler is fitted on the training
# data only and reused unchanged for the test set later on.
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = StandardScaler()
# Alternative scaler (disabled):
# scaler = MinMaxScaler()
auto_mpg_num_prepared = scaler.fit_transform(auto_mpg_num)

# Select and train a model

## Linear Regression

In [None]:
# Train a Linear Regression model on the prepared (imputed and scaled)
# training predictors and the training labels.
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(auto_mpg_num_prepared, auto_mpg_labels)

In [None]:
# Measure the model's RMSE (root mean squared error) on the training set.
from sklearn.metrics import mean_squared_error

auto_mpg_predictions = lin_reg.predict(auto_mpg_num_prepared)
lin_mse = mean_squared_error(auto_mpg_labels, auto_mpg_predictions)
# RMSE is the square root of the MSE.
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
# Measure the model's MAE (mean absolute error) on the training set.
# This reuses the linear-regression predictions computed in the previous cell.
from sklearn.metrics import mean_absolute_error

lin_mae = mean_absolute_error(auto_mpg_labels, auto_mpg_predictions)
lin_mae

## Decision Tree Regressor

In [None]:
# Train a DecisionTreeRegressor on the same prepared training data.
# The fixed random_state (42) keeps the fitted tree reproducible between runs.
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(auto_mpg_num_prepared, auto_mpg_labels)

In [None]:
# Measure the model's RMSE on the training set.
# Note: auto_mpg_predictions is overwritten here with the tree's predictions,
# replacing the linear-regression predictions stored earlier.
auto_mpg_predictions = tree_reg.predict(auto_mpg_num_prepared)
tree_mse = mean_squared_error(auto_mpg_labels, auto_mpg_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

## Evaluate the model on the test set

In [None]:
# Test predictors: every column of the test set except the label column 0.
X_test = test_set.drop(columns=0)
# Test labels: an independent copy of column 0 of the test set.
y_test = test_set[0].copy()

In [None]:
# Apply to the test set the same preparation steps that were used for the
# training set: drop the text column 8 and mark '?' entries as missing ...
X_test = X_test.drop(columns=8).replace('?', np.nan)
# ... then reuse the imputer and scaler fitted on the training data
# (transform only - nothing is re-fitted on the test set).
X_test = imputer.transform(X_test)
X_test_prepared = scaler.transform(X_test)

In [None]:
# Evaluate the linear regression model on the prepared test set:
# predict, compute the MSE against the test labels, and take its square root (RMSE).
lin_reg_predictions = lin_reg.predict(X_test_prepared)
lin_reg_mse = mean_squared_error(y_test, lin_reg_predictions)
lin_reg_rmse = np.sqrt(lin_reg_mse)
lin_reg_rmse

In [None]:
# Evaluate the DecisionTreeRegressor on the prepared test set:
# predict, compute the MSE against the test labels, and take its square root (RMSE).
tree_reg_predictions = tree_reg.predict(X_test_prepared)
tree_reg_mse = mean_squared_error(y_test, tree_reg_predictions)
tree_reg_rmse = np.sqrt(tree_reg_mse)
tree_reg_rmse