In [16]:
import pandas as pd
import numpy as np

In [17]:
# adding column names
cols = ['MPG','Cylinders','Displacement','Horsepower','Weight', 'Acceleration', 'Model Year', 'Origin']

# reading the .data file 
df = pd.read_csv('./auto-mpg.data', na_values='?', names=cols, comment='\t', sep=' ', skipinitialspace=True)

# make a copy
df_original = df.copy()

In [18]:
# splitting data into stratified train/test to keep distributions between categories
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits = 1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(df, df['Cylinders']):
    strat_train_set = df.iloc[train_index]
    strat_test_set = df.iloc[test_index]

In [19]:
# split the training and test set into features (X) and label (y)
X_train = strat_train_set.drop('MPG', axis=1)
y_train = strat_train_set['MPG'].copy()

X_test = strat_test_set.drop('MPG', axis=1)
y_test = strat_test_set['MPG'].copy()

In [20]:
# preprocess origin column
def preprocess_origin_col(df):
    df['Origin'] = df['Origin'].map({1: 'India', 2: 'USA', 3: 'Germany'})
    return df

In [21]:
# feature engineering with the BaseEstimator and Transformer
from sklearn.base import BaseEstimator, TransformerMixin

acceleration_pos, horsepower_pos, cylinders_pos = 4, 2, 0

class FeatureCreator(BaseEstimator, TransformerMixin):
    def __init__(self, acc_on_power=True): 
        self.acc_on_power = acc_on_power
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        acc_on_cyl = X[:, acceleration_pos] / X[:, horsepower_pos]
        if self.acc_on_power:
            acc_on_power = X[:, acceleration_pos] / X[:, horsepower_pos]
            return np.c_[X, acc_on_power, acc_on_cyl]
        return np.c_[X, acc_on_cyl]

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

def numerical_pipeline_transformer(df):
    '''Preprocesses numerical columns in the DataFrame

    Args:
        df: DataFrame
    
    Returns:
        numerical_attr: DataFrame with only numerical columns
        numerical_pipeline: The pipeline object
    '''
    numerical = ['float', 'int64']

    numerical_data = df.select_dtypes(include=numerical)

    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('feature_creator', FeatureCreator()),
        ('std_scaler', StandardScaler())
    ])
    return numerical_data, numerical_pipeline


In [23]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

def full_pipeline(df):
    '''Completely preprocesses the DataFrame (numerical and categorical columns)
    
    Args:
        df: DataFrame

    Returns:
        preprocessed_data: Preprocessed DataFrame
    '''
    numerical_attributes, numerical_pipeline = numerical_pipeline_transformer(df)
    numerical_attributes = list(numerical_attributes)
    cat_attributes = ['Origin']

    full_pipeline = ColumnTransformer([
        ('numerical', numerical_pipeline, numerical_attributes),
        ('cat', OneHotEncoder(), cat_attributes)
    ])
    preprocessed_data = full_pipeline.fit_transform(df)
    return preprocessed_data
    

In [24]:
X_train_prepr = preprocess_origin_col(X_train)
X_train_prepr = full_pipeline(X_train_prepr)

# Modeling

In [25]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

lreg = LinearRegression()
lreg.fit(X_train_prepr, y_train)

X_test_prepr = preprocess_origin_col(X_test)
X_test_prepr = full_pipeline(X_test_prepr)

y_pred_lreg = lreg.predict(X_test_prepr)

print('MSE: ', mean_squared_error(y_pred_lreg, y_test))
print('RMSE: ', np.sqrt(mean_squared_error(y_pred_lreg, y_test)))

MSE:  10.737753984271482
RMSE:  3.2768512301097044


In [27]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(lreg, X_train_prepr, y_train, scoring='neg_mean_squared_error',
cv=10)
np.sqrt(-scores).mean()

3.052042580702212

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
]

rf_reg = RandomForestRegressor()

grid_search = GridSearchCV(rf_reg, param_grid, scoring='neg_mean_squared_error', return_train_score=True, cv=10)

grid_search.fit(X_train_prepr, y_train)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [29]:
cv_scores = grid_search.cv_results_

for mean_score, params in zip(cv_scores['mean_test_score'], cv_scores['params']):
    print(f'RMSE: {np.sqrt(-mean_score)} with {(params)}')

RMSE: 3.4776328471438798 with {'max_features': 2, 'n_estimators': 3}
RMSE: 2.9242253816457597 with {'max_features': 2, 'n_estimators': 10}
RMSE: 2.899842097457752 with {'max_features': 2, 'n_estimators': 30}
RMSE: 3.2799203441787586 with {'max_features': 4, 'n_estimators': 3}
RMSE: 2.7902839435955435 with {'max_features': 4, 'n_estimators': 10}
RMSE: 2.8169584110774775 with {'max_features': 4, 'n_estimators': 30}
RMSE: 3.2182991602346456 with {'max_features': 6, 'n_estimators': 3}
RMSE: 2.8974951936418147 with {'max_features': 6, 'n_estimators': 10}
RMSE: 2.7022684392454304 with {'max_features': 6, 'n_estimators': 30}
RMSE: 3.012998683422919 with {'max_features': 8, 'n_estimators': 3}
RMSE: 2.816742060731431 with {'max_features': 8, 'n_estimators': 10}
RMSE: 2.6680899259670006 with {'max_features': 8, 'n_estimators': 30}
RMSE: 3.3715944088111907 with {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
RMSE: 3.0058357705362884 with {'bootstrap': False, 'max_features': 2, 'n_estim

In [30]:
feature_importances = grid_search.best_estimator_.feature_importances_

extra_features = ['acc_on_power', 'acc_on_cyl']
numerical = ['float64', 'int64']
numerical_features = list(X_train.select_dtypes(include=numerical))

numerical_features = numerical_features + extra_features
sorted(zip(numerical_features, feature_importances), reverse=True)

[('acc_on_power', 0.023740526158846476),
 ('acc_on_cyl', 0.01637800280434033),
 ('Weight', 0.19584452225887036),
 ('Model Year', 0.11485079241459735),
 ('Horsepower', 0.1326374072778818),
 ('Displacement', 0.27648206423683364),
 ('Cylinders', 0.21844532595469468),
 ('Acceleration', 0.015680319872427295)]

In [31]:
final_model = grid_search.best_estimator_

y_pred_rf = final_model.predict(X_test_prepr)
final_mse = mean_squared_error(y_test, y_pred_rf)
final_rmse = np.sqrt(final_mse)


In [32]:
def predict_y(input_data, model):
    if type(input_data) == dict:
        df = pd.DataFrame(input_data)
    else:
        df = input_data
    
    df = preprocess_origin_col(df)
    df = full_pipeline(df)
    y_pred = model.predict(df)
    return y_pred

In [128]:
import pickle

# saving the model
with open('model.bin', 'wb') as f_out:
    pickle.dump(final_model, f_out)
    f_out.close()

In [33]:
# open the model
import pickle
with open('model.bin', 'rb') as f_in:
    model = pickle.load(f_in)
    f_in.close()
model

RandomForestRegressor(max_features=8, n_estimators=30)

In [34]:
vehicle_data = {
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    'Origin': [3, 2, 1]
}
predict_y(vehicle_data, model)

array([33.54333333, 17.64333333, 21.29333333])