In [1]:
#importing a few general use case libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
#reading the csv file using pandas
df = pd.read_csv('auto-mpg.csv', na_values='?', skipinitialspace=True)
df = df.drop('car name', axis=1)
data = df.copy()

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data['cylinders']):
  strat_train_data = data.loc[train_index]
  strat_test_data = data.loc[test_index]

In [3]:
data = strat_train_data.drop('mpg', axis=1)
labels = strat_train_data['mpg'].copy()
data

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin
145,4,83.0,61.0,2003,19.0,74,3
151,4,79.0,67.0,2000,16.0,74,2
388,4,156.0,92.0,2585,14.5,82,1
48,6,250.0,88.0,3139,14.5,71,1
114,4,98.0,90.0,2265,15.5,73,2
...,...,...,...,...,...,...,...
147,4,90.0,75.0,2108,15.5,74,2
156,8,400.0,170.0,4668,11.5,75,1
395,4,135.0,84.0,2295,11.6,82,1
14,4,113.0,95.0,2372,15.0,70,3


In [4]:
##preprocess the Origin column in data
def preprocess_origin_cols(df):
    df["origin"] = df["origin"].map({1: "India", 2: "USA", 3: "Germany"})
    return df

In [5]:
#creating custom attribute adder class
acc_ix, hpower_ix, cyl_ix = 4, 2, 0

class CustomAttrAdder(BaseEstimator, TransformerMixin):
    def __init__(self, acc_on_power=True):
        self.acc_on_power = acc_on_power
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        acc_on_cyl = X[:, acc_ix] / X[:, cyl_ix]
        if self.acc_on_power:
            acc_on_power = X[:, acc_ix] / X[:, hpower_ix]
            return np.c_[X, acc_on_power, acc_on_cyl]
        
        return np.c_[X, acc_on_cyl]

# **Transformation pipeline for both numerical and categorical data**

In [6]:
def num_pipeline_transformer(data):
    numerics = ['float64', 'int64']

    num_attrs = data.select_dtypes(include=numerics)

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attrs_adder', CustomAttrAdder()),
        ('std_scaler', StandardScaler()),
        ])
    return num_attrs, num_pipeline


def pipeline_transformer(data):
    cat_attrs = ["origin"]
    num_attrs, num_pipeline = num_pipeline_transformer(data)
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, list(num_attrs)),
        ("cat", OneHotEncoder(), cat_attrs),
        ])
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data

In [7]:
preprocessed_df = preprocess_origin_cols(data)
prepared_data = pipeline_transformer(preprocessed_df)
prepared_data

array([[-0.85657842, -1.07804475, -1.15192977, ...,  1.        ,
         0.        ,  0.        ],
       [-0.85657842, -1.1174582 , -0.9900351 , ...,  0.        ,
         0.        ,  1.        ],
       [-0.85657842, -0.3587492 , -0.31547399, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.85657842, -0.56566984, -0.53133355, ...,  0.        ,
         1.        ,  0.        ],
       [-0.85657842, -0.78244384, -0.23452666, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.32260746, -0.45728283,  0.44003446, ...,  1.        ,
         0.        ,  0.        ]])

# **Training Models**
**Linear Regression**

In [8]:
from sklearn.linear_model import LinearRegression

linear_reg = LinearRegression()
linear_reg.fit(prepared_data, labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [9]:
#testing the predictions with sample data
sample_data = data.iloc[:5]
sample_labels = labels.iloc[:5]

sample_data_prepared = pipeline_transformer(sample_data)

print("Prediction of samples: ", linear_reg.predict(sample_data_prepared))

Prediction of samples:  [29.08069379 27.78336755 26.08031176 12.70419279 22.23454159]


In [10]:
print("Actual Labels of samples: ", list(sample_labels))

Actual Labels of samples:  [32.0, 31.0, 26.0, 18.0, 26.0]


**Calculating Mean Squared Error**

In [11]:
from sklearn.metrics import mean_squared_error

mpg_predictions = linear_reg.predict(prepared_data)
linear_mse = mean_squared_error(labels, mpg_predictions)
linear_rmse = np.sqrt(linear_mse)
linear_rmse

2.9590402225760872

**Decision Tree**

In [12]:
from sklearn.tree import DecisionTreeRegressor

decision_tree = DecisionTreeRegressor()
decision_tree.fit(prepared_data, labels)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [13]:
mpg_predictions = decision_tree.predict(prepared_data)
decision_tree_mse = mean_squared_error(labels, mpg_predictions)
decision_tree_rmse = np.sqrt(decision_tree_mse)
decision_tree_rmse

0.0

**Model Evaluation using Cross Validation**

In [14]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(decision_tree, 
                         prepared_data, 
                         labels, 
                         scoring="neg_mean_squared_error", 
                         cv = 10)
decision_tree_reg_rmse_scores = np.sqrt(-scores)
decision_tree_reg_rmse_scores.mean()

3.331801632345358

In [15]:
scores = cross_val_score(linear_reg,
                         prepared_data,
                         labels,
                         scoring="neg_mean_squared_error",
                         cv = 10)
linear_reg_rmse_scores = np.sqrt(-scores)
linear_reg_rmse_scores.mean()

3.0757081793709333

**Random Forest Model**

In [16]:
from sklearn.ensemble import RandomForestRegressor

random_forest_reg = RandomForestRegressor()
random_forest_reg.fit(prepared_data, labels)
random_forest_reg_cv_scores = cross_val_score(random_forest_reg,
                                         prepared_data,
                                         labels,
                                         scoring='neg_mean_squared_error',
                                         cv = 10)

random_forest_reg_rmse_scores = np.sqrt(-random_forest_reg_cv_scores)
random_forest_reg_rmse_scores.mean()

2.5562414602151504

**SVM Regressor**

In [17]:
from sklearn.svm import SVR

svm_reg = SVR(kernel='linear')
svm_reg.fit(prepared_data, labels)
svm_cv_scores = cross_val_score(svm_reg, prepared_data, labels,
                                scoring='neg_mean_squared_error',
                                cv = 10)
svm_rmse_scores = np.sqrt(-svm_cv_scores)
svm_rmse_scores.mean()

3.08659162080283

**Hyperparameter Tuning using GridSearchCV**

In [18]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid,
                           scoring='neg_mean_squared_error',
                           return_train_score=True,
                           cv=10,
                          )

grid_search.fit(prepared_data, labels)

GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_j

In [19]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 10}

In [20]:
cv_scores = grid_search.cv_results_

for mean_score, params in zip(cv_scores['mean_test_score'], cv_scores["params"]):
    print(np.sqrt(-mean_score), params)

3.5437441250512327 {'max_features': 2, 'n_estimators': 3}
2.9613990161855113 {'max_features': 2, 'n_estimators': 10}
2.963355761833492 {'max_features': 2, 'n_estimators': 30}
3.26504128973217 {'max_features': 4, 'n_estimators': 3}
2.843723771591204 {'max_features': 4, 'n_estimators': 10}
2.703923526107172 {'max_features': 4, 'n_estimators': 30}
3.134647598853245 {'max_features': 6, 'n_estimators': 3}
2.912120100171451 {'max_features': 6, 'n_estimators': 10}
2.708864671204459 {'max_features': 6, 'n_estimators': 30}
3.0922223655012413 {'max_features': 8, 'n_estimators': 3}
2.6714411131387004 {'max_features': 8, 'n_estimators': 10}
2.7120968672897225 {'max_features': 8, 'n_estimators': 30}
3.1817593827845116 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
2.8624697401008157 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
3.2926754783239045 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
2.8972545663399027 {'bootstrap': False, 'max_features': 3, 'n_estimat

**Checking Feature importance**

In [21]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([0.07031194, 0.15261158, 0.31573678, 0.28152823, 0.01602729,
       0.10866914, 0.01824629, 0.02870433, 0.00367918, 0.0010471 ,
       0.00343816])

In [22]:
extra_attrs = ["acc_on_power", "acc_on_cyl"]
numerics = ['float64', 'int64']
num_attrs = list(data.select_dtypes(include=numerics))

attrs = num_attrs + extra_attrs
sorted(zip(attrs, feature_importances), reverse=True)

[('weight', 0.2815282287980176),
 ('model year', 0.10866914024801619),
 ('horsepower', 0.3157367764785398),
 ('displacement', 0.15261158121467852),
 ('cylinders', 0.0703119352039172),
 ('acceleration', 0.016027285992239928),
 ('acc_on_power', 0.018246294246787052),
 ('acc_on_cyl', 0.02870432611818777)]

**Test Data Evaluation**

In [23]:
final_model = grid_search.best_estimator_

X_test = strat_test_data.drop("mpg", axis=1)
y_test = strat_test_data["mpg"].copy()

X_test_preprocessed = preprocess_origin_cols(X_test)
X_test_prepared = pipeline_transformer(X_test_preprocessed)

final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

3.058174700699749

In [24]:
def predict_mpg(config, model):   
    if type(config) == dict:
        df = pd.DataFrame(config)
    else:
        df = config
    
    preproc_df = preprocess_origin_cols(df)
    prepared_df = pipeline_transformer(preproc_df)
    y_pred = model.predict(prepared_df)
    return y_pred

In [25]:
vehicle_config = {
    'cylinders': [4, 6, 8],
    'displacement': [155.0, 160.0, 165.5],
    'horsepower': [93.0, 130.0, 98.0],
    'weight': [2500.0, 3150.0, 2600.0],
    'acceleration': [15.0, 14.0, 16.0],
    'model year': [81, 80, 78],
    'origin': [3, 2, 1]
}

predict_mpg(vehicle_config, final_model)

array([35.92, 16.55, 21.55])

# **Saving the model**

In [26]:
import pickle

with open("model.bin", 'wb') as f_out:
    pickle.dump(final_model, f_out)
    f_out.close()

In [27]:
with open('model.bin', 'rb') as f_in:
    model = pickle.load(f_in)

predict_mpg(vehicle_config, model)

array([35.92, 16.55, 21.55])

# **Deployed the trained model on Heroku by using Flask**

In [29]:
import requests

url = "https://prediction-auto-mpg.herokuapp.com/predict"
response = requests.post(url, json = vehicle_config)

response.text.strip()

'{"mpg_prediction":[35.92,16.55,21.55]}'