In [60]:
import sklearn
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

In [61]:
dataset = pd.read_csv("auto-mpg.csv")
dataset = pd.DataFrame(dataset)
# dataset = dataset.drop(columns="horsepower")
# dataset.shape
dataset.horsepower = pd.to_numeric(dataset.horsepower, errors='coerce')
dataset = dataset.dropna()
dataset.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [62]:
dataset["car name"].value_counts()

amc matador                          5
ford pinto                           5
toyota corolla                       5
chevrolet impala                     4
amc gremlin                          4
chevrolet chevette                   4
amc hornet                           4
ford maverick                        4
peugeot 504                          4
toyota corona                        4
ford galaxie 500                     3
honda civic                          3
chevrolet caprice classic            3
chevrolet nova                       3
chevrolet vega                       3
ford gran torino                     3
dodge colt                           3
plymouth fury iii                    3
pontiac catalina                     3
chevrolet citation                   3
volkswagen dasher                    3
plymouth duster                      3
buick estate wagon (sw)              2
ford ltd                             2
oldsmobile cutlass salon brougham    2
audi 100ls               

In [63]:
dataset['constant'] = 1

In [64]:
X = dataset[
    ["model year", "weight", "acceleration", "horsepower",
     "displacement", "cylinders", "constant", "car name"]]
y = dataset[["mpg"]]

X_train, X_test, y_train, y_test, = train_test_split(X, y, test_size=0.2, random_state=0)
X_train_car_name = X_train["car name"]
X_train = X_train.drop("car name", axis=1)
X_test_car_name = X_test["car name"]
X_test = X_test.drop("car name", axis=1)

In [65]:
forest = RandomForestRegressor()
forest.fit(X_train, y_train)

  


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [66]:
linear_regressor = LinearRegression()
linear_regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [67]:
linear_regressor.coef_.transpose()

array([[ 0.73921536],
       [-0.00706142],
       [ 0.14878918],
       [ 0.00398814],
       [ 0.00725555],
       [-0.24724021],
       [ 0.        ]])

In [68]:
coeff = pd.DataFrame(linear_regressor.coef_.transpose(),
                    index=X_train.columns,
                    columns=["Coefficient"])
coeff

Unnamed: 0,Coefficient
model year,0.739215
weight,-0.007061
acceleration,0.148789
horsepower,0.003988
displacement,0.007256
cylinders,-0.24724
constant,0.0


In [69]:
forest_pred_y = forest.predict(X_test)

df = pd.DataFrame({"Actual: " : y_test.values.ravel(),
                "Predicted: " : forest_pred_y.ravel()})
df.head()

Unnamed: 0,Actual:,Predicted:
0,28.0,28.45
1,22.3,27.27
2,12.0,13.0
3,38.0,35.21
4,33.8,34.15


In [70]:
linear_pred_y = linear_regressor.predict(X_test)

df = pd.DataFrame({"Actual: " : y_test.values.ravel(),
                "Predicted: " : linear_pred_y.ravel()})
df.head()

Unnamed: 0,Actual:,Predicted:
0,28.0,27.273449
1,22.3,26.39877
2,12.0,10.422526
3,38.0,33.277905
4,33.8,32.107158


In [71]:
print("MAE: ",metrics.mean_absolute_error(y_test, forest_pred_y))
print("MSE: ",metrics.mean_squared_error(y_test, forest_pred_y))
print("RMSE: ",np.sqrt(metrics.mean_squared_error(y_test, forest_pred_y)))

MAE:  1.9310126582278482
MSE:  7.5863
RMSE:  2.7543238734760296


In [72]:
print("MAE: ",metrics.mean_absolute_error(y_test, linear_pred_y))
print("MSE: ",metrics.mean_squared_error(y_test, linear_pred_y))
print("RMSE: ",np.sqrt(metrics.mean_squared_error(y_test, linear_pred_y)))

MAE:  2.6724608826273144
MSE:  11.878262288865692
RMSE:  3.4464854981365716


In [73]:
df = {'Actual MPG': y_test.values.ravel(),
        ' Forest Predicted MPG': forest_pred_y.ravel(),
        'Linear Predicted MPG': linear_pred_y.ravel()}
df = pd.DataFrame(df)
df["car name"] = list(X_test_car_name)
df.head()

Unnamed: 0,Actual MPG,Forest Predicted MPG,Linear Predicted MPG,car name
0,28.0,28.45,27.273449,dodge colt
1,22.3,27.27,26.39877,ford fairmont 4
2,12.0,13.0,10.422526,oldsmobile delta 88 royale
3,38.0,35.21,33.277905,plymouth horizon miser
4,33.8,34.15,32.107158,subaru dl


In [74]:
export_csv = df.to_csv("Actual vs Predicted MPG.csv", index=True)

In [75]:
rf_model = open("Random_Forest_Model.pkl", "wb")
pickle.dump(forest, rf_model)
rf_model.close()

In [76]:
import_model = open("Random_Forest_Model.pkl", "rb")
model = pickle.load(import_model)
model

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [77]:
model.feature_importances_

array([0.10147174, 0.36651222, 0.03634578, 0.15987663, 0.14646003,
       0.18933359, 0.        ])

In [79]:
# "model year", "weight", "acceleration", "horsepower", "displacement", "cylinders", "constant"
new_car = [[2010, 3500, 13, 155, 350, 8, 1]]
model.predict(new_car)

array([19.94])