In [None]:
#IMPORT ALL REQUIRED LIBRARIES 
# Pandas
import pandas as pd
# Numpy
import numpy as np
# Libraries for plotting
import matplotlib.pyplot as plt
import seaborn as sns
# Train-test split
from sklearn.model_selection import train_test_split
# Min-max scaling
from sklearn.preprocessing import MinMaxScaler
# Statsmodel 
import statsmodels.api as sm
# VIF 
from statsmodels.stats.outliers_influence import variance_inflation_factor
#R-squared
from sklearn.metrics import r2_score
# Label encoding
from sklearn.preprocessing import LabelEncoder
# Importing RFE
from sklearn.feature_selection import RFE
# Importing LinearRegression
from sklearn.linear_model import LinearRegression
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Libraries for cross validation 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

from sklearn import datasets
from sklearn.model_selection import cross_val_score, cross_val_predict

# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns',None)
# Render matplotlib figures inside the notebook output.
%matplotlib inline

# Read the data
# The path is relative, so the CSV must sit in the notebook's working directory.
df_car = pd.read_csv('TrainData.csv')
# Preview the first rows of the raw training frame.
df_car.head()

In [None]:
# Size of the raw training frame as (rows, columns).
df_car.shape

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
df_car.info()

In [None]:
# Summary statistics of the raw frame.
df_car.describe()

In [None]:
#Check for null values
# NOTE: this only counts real NaN values. Entries holding the "?" placeholder are
# not counted here; they are turned into NaN in a later cell.
df_car.isnull().sum()

In [None]:
#Repalce "?"" with highest value from the column for number of doors
df_car['num-of-doors'] = df_car['num-of-doors'].map({'four':1, 'two':0,'?':1})

In [None]:
#Check
df_car['num-of-doors'].values

In [None]:
#find all frames with "?"
df_car=df_car.replace("?",np.NaN)

In [None]:
#remove column ID 
df_car=df_car.drop(["ID"],axis=1)

In [None]:
df_car.info()

In [None]:
#CONVERT all STRING based numeric columns to Numeric
df_car['normalized-losses']=df_car['normalized-losses'].astype(np.float)

In [None]:
df_car['normalized-losses']

In [None]:
df_car['bore']=df_car['bore'].astype(float)

In [None]:
df_car['bore']

In [None]:
df_car['stroke']=df_car['stroke'].astype(float)

In [None]:
df_car['stroke']

In [None]:
df_car['horsepower']=df_car['horsepower'].astype(np.float)

In [None]:
df_car['horsepower']

In [None]:
df_car['peak-rpm']=df_car['peak-rpm'].astype(np.float)

In [None]:
df_car['peak-rpm']

In [None]:
df_car.info()

In [None]:
#Replace all Null values with mean from the respective columns
df_car.fillna(df_car.mean(),inplace=True)

In [None]:
df_car.info()

In [None]:
df_car.describe()

In [None]:
# Names of the columns treated as numerical (including the 0/1 door flag and price).
columns_num = [
    'price',
    'symboling',
    'wheel-base',
    'length',
    'width',
    'height',
    'curb-weight',
    'engine-size',
    'compression-ratio',
    'city-mpg',
    'highway-mpg',
    'peak-rpm',
    'horsepower',
    'stroke',
    'bore',
    'normalized-losses',
    'num-of-doors',
]

In [None]:
# Names of the columns treated as categorical.
columns_cat = (
    "make fuel-type aspiration body-style drive-wheels "
    "engine-location engine-type num-of-cylinders fuel-system"
).split()

In [None]:
#Plot Categorical variable bar plots
#k=0
#plt.figure(figsize=(20,25))
#for col in columns_cat:    
#    k=k+1
 #   plt.subplot(4,4,k)    
  #  df_car[col].value_counts().plot(kind='bar');
   # plt.title(col)

In [None]:
#Plot Pair Plots for all numerical variables
#plt.figure(figsize=(12,12))
#sns.pairplot(df_car[columns_num])
#plt.show()

In [None]:
#Plot box plots of price against each categorical variable
#k=0
#plt.figure(figsize=(25,100))
#for col in range (len(columns_cat)-1):    
#   k=k+1
#   plt.subplot(16,1, k)   
#   ax = sns.boxplot(x =columns_cat[col], y = 'price', data = df_car)

In [None]:
#Convert All left categorical variables in to numerical variables
# Convert "fuel-type" to 1 and "diesel" to 0
df_car['fuel-type'] = df_car['fuel-type'].map({'gas': 1, 'diesel': 0})
df_car.head()

In [None]:
#Convert Engine Location
df_car['engine-location'] = df_car['engine-location'].map({'front': 1, 'rear': 0})
df_car.head()

In [None]:
# Convert aspiration "std" to 1 and "turbo" to 0
df_car['aspiration'] = df_car['aspiration'].map({'std':1, 'turbo':0})
df_car.head()

In [None]:
# Create Dummies for MAKE Column
make_status = pd.get_dummies(df_car['make'],drop_first=True)
make_status.head()

In [None]:
#Rename Column names
make_status = make_status.rename(columns={    'audi':'make(audi)','bmw':'make(BMW)', 
                                             'chevrolet':'make(chevrolet)','dodge':'make(dodge)', 
                                             'honda':'make(honda)','jaguar':'make(jaguar)',
                                             'mazda':'make(mazda)','mercedes-benz':'make(mercedes-benz)', 
                                             'mitsubishi':'make(mitsubishi)','nissan':'make(nissan)',
                                             'peugot':'make(peugot)','plymouth':'make(plymouth)',
                                            'porsche':'make(porsche)','saab':'make(saab)', 
                                             'subaru':'make(subaru)','toyota':'make(toyota)',
                                             'volkswagen':'make(volkswagen)', 'volvo':'make(volvo)',
                                             'alfa-romero':'make(alfa-romero)','isuzu':'make(isuzu)',
                                            'mercury':'make(mercury)','renault':'make(renault)' })
make_status.head()


In [None]:
# Concatinating the dummy dataframe with original dataframe
df_car = pd.concat([df_car,make_status], axis=1)
df_car.head()

In [None]:
# Dropping the 'make' column
df_car = df_car.drop('make',axis=1)
df_car.head()

In [None]:
# Creating dummy variables for 'body style'
body_status = pd.get_dummies(df_car['body-style'],drop_first=True)
body_status.head()

In [None]:
# Renaming column names 
body_status = body_status.rename(columns={'convertible':'body(convertible)','hardtop':'body(hardtop)', 'hatchback':'body(hatchback)', 'sedan':'body(sedan)','wagon':'body(wagon)'})
body_status.head()

In [None]:
# Concatinating the dummy dataframe with original dataframe
df_car = pd.concat([df_car,body_status], axis=1)
df_car.head()

In [None]:
# Dropping the 'body-style' column as we don't need it
df_car = df_car.drop('body-style',axis=1)
df_car.head()

In [None]:
# Creating dummy variables for 'drive-wheels'
drivewheel_status = pd.get_dummies(df_car['drive-wheels'],drop_first=True)
drivewheel_status.head()

In [None]:
# Renaming column names for better readability
drivewheel_status = drivewheel_status.rename(columns={'4wd':'drive(4wd)', 'fwd':'drive(fwd)', 'rwd':'drive(rwd)'})
drivewheel_status.head()

In [None]:
# Concatinating the dummy dataframe 
df_car = pd.concat([df_car,drivewheel_status], axis=1)
df_car.head()

In [None]:
#Remove Column Drive Wheels
df_car = df_car.drop('drive-wheels',axis=1)
df_car.head()

In [None]:
# Creating dummy variables for 'engine-type'
enginetype_status = pd.get_dummies(df_car['engine-type'], drop_first=True)
enginetype_status.head()

In [None]:
# Renaming column name 
enginetype_status = enginetype_status.rename(columns={'dohc':'enginetype(dohc)', 'l':'enginetype(l)', 'ohc':'enginetype(ohc)', 
                                                      'ohcf':'enginetype(ohcf)','ohcv':'enginetype(ohcv)',
                                                        'rotor':'enginetype(rotor)'})
enginetype_status.head()

In [None]:
# Concating the dummy dataframe with original dataframe
df_car = pd.concat([df_car,enginetype_status], axis=1)
df_car.head()

In [None]:
# Dropping the 'enginetype' column 
df_car = df_car.drop('engine-type',axis=1)
df_car.head()

In [None]:
# Creating dummy variables for 'cylinders'
cylinders_status = pd.get_dummies(df_car['num-of-cylinders'], drop_first=True)
cylinders_status.head()

In [None]:
# Creating dummy variables for 'cylindernumber'
cylinders_status = cylinders_status.rename(columns={'eight':'cylindernumber(eight)','five':'cylindernumber(five)', 'four':'cylindernumber(four)', 'six':'cylindernumber(six)', 
                                                      'three':'cylindernumber(three)','twelve':'cylindernumber(twelve)',
                                                       'two':'cylindernumber(two)'})
cylinders_status.head()

In [None]:
# Concating the dummy dataframe with original dataframe
df_car = pd.concat([df_car,cylinders_status], axis=1)
df_car.head()

In [None]:
# Dropping the 'cylindernumber' column 
df_car = df_car.drop('num-of-cylinders',axis=1)
df_car.head()

In [None]:
# Creating dummy variables for 'fuelsystem'
fuelsystem_status = pd.get_dummies(df_car['fuel-system'], drop_first=True)
fuelsystem_status.head()

In [None]:
# Renaming column name 
fuelsystem_status = fuelsystem_status.rename(columns={'1bbl':'fuelsystem(1bbl)','2bbl':'fuelsystem(2bbl)','idi':'fuelsystem(idi)', 
                                                      'mfi':'fuelsystem(mfi)','mpfi':'fuelsystem(mpfi)' ,'spdi':'fuelsystem(spdi)',
                                                     'spfi':'fuelsystem(spfi)','4bbl':'fuelsystem(4bbl)'})
fuelsystem_status.head()

In [None]:
# Concating the dummy dataframe with original dataframe
df_car = pd.concat([df_car,fuelsystem_status], axis=1)
df_car.head()

In [None]:
# Dropping the 'fuelsystem' column 
df_car = df_car.drop('fuel-system',axis=1)
df_car.head()

In [None]:
df_car.info()

In [None]:
# Splitting train and test dataset into 80:20 percent ratio.
df_train, df_test = train_test_split(df_car, train_size=0.8, random_state=100)
print(df_train.shape)
print(df_test.shape)
df_train.info()

In [None]:
df_car.head()

In [None]:
# Columns that get min-max scaled. The 0/1 encoded columns are left out because they
# already lie between 0 and 1. 'price' is part of the list, so the target is scaled too.
num_vars = (
    "symboling normalized-losses wheel-base length width height curb-weight "
    "engine-size bore stroke compression-ratio horsepower peak-rpm city-mpg "
    "highway-mpg price"
).split()

# Learn the per-column minimum and maximum from the training split only ...
scaler = MinMaxScaler()
scaler.fit(df_train[num_vars])

# ... and apply them to the training split.
df_train[num_vars] = scaler.transform(df_train[num_vars])
df_train.head()

In [None]:
# Popping out the 'price' column for y_train
y_train = df_train.pop('price') 
# Creating X_train
X_train = df_train

In [None]:
# Creating the model using scikit learn 
lm = LinearRegression()
lm.fit(X_train, y_train)

In [None]:
# RFE with the number of variables 20. We will select top 20 features.
rfe = RFE(lm, 20)
ref = rfe.fit(X_train, y_train)

In [None]:
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
# Top 20 features selected by the RFE
rfe_cols = X_train.columns[rfe.support_]
rfe_cols

In [None]:
# Creating X_train dataframe with RFE selected variables
X_train_1 = X_train[rfe_cols]
X_train_1.head()

In [None]:
# Adding constant  
X_train_sm_1 = sm.add_constant(X_train_1)
# Creating model
lr_1 = sm.OLS(y_train, X_train_sm_1)
# Fit the model
lr_model_1 = lr_1.fit()
print(lr_model_1.summary())

In [None]:
# Create a dataframe that will contain the names of all the feature variables and their respective VIFs
vif = pd.DataFrame()
vif['Features'] = X_train_1.columns
vif['VIF'] = [variance_inflation_factor(X_train_1.values, i) for i in range(X_train_1.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# High p-value and high VIF :-  remove this first.
# High p-value and low VIF :- remove this next. Because once we remove this variable there may be possiblity that when we again create the VIF, the VIF may become low. Because VIF is the corealtion of a variable with rest of the other variables.
# Low p-value and high VIF :- Remove this variable after the ones above.
# Low p-value and low VIF :-Keep this variable.

# Removing CITY-MPG based on Pvalue and VIF
X_train_2 = X_train_1.drop('city-mpg',axis=1)

# Adding constant 
X_train_sm_2 = sm.add_constant(X_train_2)
# Creating and fitting the model
lr_model_2 = sm.OLS(y_train, X_train_sm_2).fit()
print(lr_model_2.summary())

# Create VIF for Model-2
vif = pd.DataFrame()
vif['Features'] = X_train_2.columns
vif['VIF'] = [variance_inflation_factor(X_train_2.values, i) for i in range(X_train_2.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Removing Cylinder Number SIX  based on Pvalue and VIF
X_train_3 = X_train_2.drop('cylindernumber(six)',axis=1)

# Adding constant 
X_train_sm_3 = sm.add_constant(X_train_3)
# Creating and fitting the model
lr_model_3 = sm.OLS(y_train, X_train_sm_3).fit()
print(lr_model_3.summary())

# Create VIF for Model-3
vif = pd.DataFrame()
vif['Features'] = X_train_3.columns
vif['VIF'] = [variance_inflation_factor(X_train_3.values, i) for i in range(X_train_3.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
#removing Highway-MPG based on Pvalue and VIF
X_train_4 = X_train_3.drop('highway-mpg',axis=1)

# Adding constant 
X_train_sm_4 = sm.add_constant(X_train_4)
# Creating and fitting the model
lr_model_4 = sm.OLS(y_train, X_train_sm_4).fit()
print(lr_model_4.summary())

# Create VIF for Model-4
vif = pd.DataFrame()
vif['Features'] = X_train_4.columns
vif['VIF'] = [variance_inflation_factor(X_train_4.values, i) for i in range(X_train_4.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Removing Cylinder Number TWO based on Pvalue and VIF
X_train_5= X_train_4.drop('cylindernumber(two)',axis=1)

# Adding constant 
X_train_sm_5 = sm.add_constant(X_train_5)
# Creating and fitting the model
lr_model_5 = sm.OLS(y_train, X_train_sm_5).fit()
print(lr_model_5.summary())

# Create VIF for Model-2
vif = pd.DataFrame()
vif['Features'] = X_train_5.columns
vif['VIF'] = [variance_inflation_factor(X_train_5.values, i) for i in range(X_train_5.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Removing Engine Location based on Pvalue and VIF
X_train_6= X_train_5.drop('engine-location',axis=1)

# Adding constant 
X_train_sm_6 = sm.add_constant(X_train_6)
# Creating and fitting the model
lr_model_6 = sm.OLS(y_train, X_train_sm_6).fit()
print(lr_model_6.summary())

# Create VIF for Model-2
vif = pd.DataFrame()
vif['Features'] = X_train_6.columns
vif['VIF'] = [variance_inflation_factor(X_train_6.values, i) for i in range(X_train_6.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Removing engine size based on Pvalue and VIF
X_train_7= X_train_6.drop('engine-size',axis=1)

# Adding constant 
X_train_sm_7 = sm.add_constant(X_train_7)
# Creating and fitting the model
lr_model_7 = sm.OLS(y_train, X_train_sm_7).fit()
print(lr_model_7.summary())

# Create VIF for Model-2
vif = pd.DataFrame()
vif['Features'] = X_train_7.columns
vif['VIF'] = [variance_inflation_factor(X_train_7.values, i) for i in range(X_train_7.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Removing BORE based on Pvalue and VIF
X_train_8= X_train_7.drop('bore',axis=1)

# Adding constant 
X_train_sm_8 = sm.add_constant(X_train_8)
# Creating and fitting the model
lr_model_8 = sm.OLS(y_train, X_train_sm_8).fit()
print(lr_model_8.summary())

# Create VIF for Model-2
vif = pd.DataFrame()
vif['Features'] = X_train_8.columns
vif['VIF'] = [variance_inflation_factor(X_train_8.values, i) for i in range(X_train_8.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Removing Engine Type ROTOR based on Pvalue and VIF
X_train_9= X_train_8.drop('enginetype(rotor)',axis=1)

# Adding constant 
X_train_sm_9 = sm.add_constant(X_train_9)
# Creating and fitting the model
lr_model_9 = sm.OLS(y_train, X_train_sm_9).fit()
print(lr_model_9.summary())

# Create VIF for Model-2
vif = pd.DataFrame()
vif['Features'] = X_train_9.columns
vif['VIF'] = [variance_inflation_factor(X_train_9.values, i) for i in range(X_train_9.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Removing make(mercury) based on Pvalue and Ptype
X_train_10= X_train_9.drop('make(mercury)',axis=1)

# Adding constant 
X_train_sm_10 = sm.add_constant(X_train_10)
# Creating and fitting the model
lr_model_10 = sm.OLS(y_train, X_train_sm_10).fit()
print(lr_model_10.summary())

# Create VIF for Model-2
vif = pd.DataFrame()
vif['Features'] = X_train_10.columns
vif['VIF'] = [variance_inflation_factor(X_train_10.values, i) for i in range(X_train_10.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Removing Stroke based on Pvalue and VIF
X_train_11= X_train_10.drop('stroke',axis=1)

# Adding constant 
X_train_sm_11 = sm.add_constant(X_train_11)
# Creating and fitting the model
lr_model_11 = sm.OLS(y_train, X_train_sm_11).fit()
print(lr_model_11.summary())

# Create VIF for Model-2
vif = pd.DataFrame()
vif['Features'] = X_train_11.columns
vif['VIF'] = [variance_inflation_factor(X_train_11.values, i) for i in range(X_train_11.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
#RESIDUAL Analysis
# Find y_train prediction
y_train_pred = lr_model_11.predict(X_train_sm_11)

In [None]:
# Residual
residual = y_train - y_train_pred

In [None]:
# Distribution of residuals or error terms
sns.distplot(residual)

In [None]:
# Scale the test set variables with min-max scaler
# Transform the data
df_test[num_vars] = scaler.transform(df_test[num_vars])
df_test.head()

In [None]:
# Popping out the 'price' column for y_test
y_test = df_test.pop('price')
# Creating X_test
X_test = df_test

In [None]:
# Taking only the columns from the final model.
X_test = X_test[X_train_11.columns]
X_test.head()

In [None]:
# Add constant
X_test_sm = sm.add_constant(X_test)
X_test_sm.head()

In [None]:
# Predict the model on the test set
y_test_pred = lr_model_11.predict(X_test_sm)

In [None]:
y_test_pred

In [None]:
# Evaluate the model with r-squared on the test set
r2 = r2_score(y_test, y_test_pred)
r2

In [None]:
# Adjusted R-squared: 1 - (1 - R2) * (n - 1) / (n - p - 1)
# r2 was computed on the hold-out set, so n is the number of hold-out rows.
# p is the number of predictors of the final model (X_test holds exactly those
# columns, without the intercept), instead of a hard-coded count.
n_obs = len(X_test)
n_predictors = X_test.shape[1]
adj_r2 = 1 - ((1 - r2) * (n_obs - 1) / (n_obs - n_predictors - 1))
adj_r2

In [None]:
#RMSE Calculation
# Both series are on the min-max scaled price, so the RMSE is expressed in scaled
# units, not in the original price unit.
MSE = np.square(np.subtract(y_test,y_test_pred)).mean() 

# numpy is already imported at the top of the notebook, so no extra import is
# needed for the square root; float() keeps RMSE a plain Python float.
RMSE = float(np.sqrt(MSE))
print("Root Mean Square Error:\n")
print(RMSE)

In [None]:
#Test Data read and process for prediction
# Read the data
# NOTE: this rebinds df_test. Until this cell the name held the hold-out part of the
# training split; from here on it holds the contents of TestData.csv. Re-running the
# hold-out evaluation cells after this cell would operate on the wrong frame.
df_test= pd.read_csv('TestData.csv')
df_test.head()

In [None]:
# Display the prediction data.
df_test

In [None]:
# Feature columns of the final model (model 11); the prediction data has to provide
# each of them.
X_train_11.columns

In [None]:
df_test['num-of-doors'] = df_test['num-of-doors'].map({'four':1, 'two':0,'?':1})

In [None]:
#find all frames with "?"
df_test=df_test.replace("?",np.NaN)

#remove column ID
df_test=df_test.drop(["ID"],axis=1)

In [None]:
df_test['normalized-losses']=df_test['normalized-losses'].astype(np.float)

In [None]:
df_test['bore']=df_test['bore'].astype(float)

In [None]:
df_test['stroke']=df_test['stroke'].astype(float)

In [None]:
df_test['horsepower']=df_test['horsepower'].astype(np.float)

In [None]:
df_test['peak-rpm']=df_test['peak-rpm'].astype(np.float)

In [None]:
df_test.fillna(df_test.mean(),inplace=True)

In [None]:
df_test.info()

In [None]:
# Turn the two-level categorical columns into 0/1 flags, using the same codes as
# for the training data. Values that are not listed become NaN.
binary_codes = {
    'fuel-type': {'gas': 1, 'diesel': 0},
    'engine-location': {'front': 1, 'rear': 0},
    'aspiration': {'std':1, 'turbo':0},
}
for column, codes in binary_codes.items():
    df_test[column] = df_test[column].map(codes)
df_test.head()



In [None]:
# Creating dummy variables for 'make'
# Dropping the redundant dummy variable (convertible)
make_status = pd.get_dummies(df_test['make'],drop_first=True)
make_status.head()


# Renaming column names for better readability
make_status = make_status.rename(columns={    'audi':'make(audi)','bmw':'make(BMW)', 
                                             'chevrolet':'make(chevrolet)','dodge':'make(dodge)', 
                                             'honda':'make(honda)','jaguar':'make(jaguar)',
                                             'mazda':'make(mazda)','mercedes-benz':'make(mercedes-benz)', 
                                             'mitsubishi':'make(mitsubishi)','nissan':'make(nissan)',
                                             'peugot':'make(peugot)','plymouth':'make(plymouth)',
                                            'porsche':'make(porsche)','saab':'make(saab)', 
                                             'subaru':'make(subaru)','toyota':'make(toyota)',
                                             'volkswagen':'make(volkswagen)', 'volvo':'make(volvo)',
                                             'alfa-romero':'make(alfa-romero)','isuzu':'make(isuzu)',
                                            'mercury':'make(mercury)','renault':'make(renault)' })
make_status.head()


# Concatinating the dummy dataframe with original dataframe
df_test = pd.concat([df_test,make_status], axis=1)
df_test.head()


# Dropping the 'make' column as we don't need it
df_test = df_test.drop('make',axis=1)
df_test.head()

In [None]:
#add missing columns in make 
df_test['make(chevrolet)']=0
df_test['make(dodge)']=0
df_test['make(isuzu)']=0
df_test['make(mercedes-benz)']=0
df_test['make(mercury)']=0
df_test['make(porsche)']=0
df_test['make(renault)']=0
df_test[ 'make(subaru)']=0
df_test['make(jaguar)']=0
df_test['make(audi)']=0
df_test

In [None]:
# Creating dummy variables for 'body style'
body_status = pd.get_dummies(df_test['body-style'],drop_first=True)
body_status.head()

# Renaming column names for better readability
body_status = body_status.rename(columns={'convertible':'body(convertible)','hardtop':'body(hardtop)', 'hatchback':'body(hatchback)', 'sedan':'body(sedan)','wagon':'body(wagon)'})
body_status.head()

# Concatinating the dummy dataframe with original dataframe
df_test = pd.concat([df_test,body_status], axis=1)
df_test.head()

# Dropping the 'body-style' column as we don't need it
df_test = df_test.drop('body-style',axis=1)
df_test.head()


In [None]:
df_test['body(hardtop)']=0
df_test['body(hatchback)']=0

In [None]:
df_test

In [None]:
# Creating dummy variables for 'drive-wheels'
drivewheel_status = pd.get_dummies(df_test['drive-wheels'],drop_first=True)
drivewheel_status.head()


# Renaming column names for better readability
drivewheel_status = drivewheel_status.rename(columns={'4wd':'drive(4wd)', 'fwd':'drive(fwd)', 'rwd':'drive(rwd)'})
drivewheel_status.head()

# Concatinating the dummy dataframe with original dataframe
df_test = pd.concat([df_test,drivewheel_status], axis=1)
df_test.head()

# Dropping the 'body-style' column as we don't need it
df_test = df_test.drop('drive-wheels',axis=1)
df_test.head()


In [None]:
df_test['drive(fwd)']=0


In [None]:
df_test

In [None]:
# Creating dummy variables for 'engine-type'
enginetype_status = pd.get_dummies(df_test['engine-type'], drop_first=True)
enginetype_status.head()

# Renaming column name for better readability
enginetype_status = enginetype_status.rename(columns={'dohc':'enginetype(dohc)', 'l':'enginetype(l)', 'ohc':'enginetype(ohc)', 
                                                      'ohcf':'enginetype(ohcf)','ohcv':'enginetype(ohcv)',
                                                        'rotor':'enginetype(rotor)'})
enginetype_status.head()


# Concating the dummy dataframe with original dataframe
df_test = pd.concat([df_test,enginetype_status], axis=1)
df_test.head()

# Dropping the 'enginetype' column as we don't need it
df_test = df_test.drop('engine-type',axis=1)
df_test.head()

In [None]:
df_test['enginetype(ohcf)']=0
df_test['enginetype(ohcv)']=0
df_test['enginetype(rotor)']=0


In [None]:
df_test

In [None]:
# Creating dummy variables for 'cylinders'
cylinders_status = pd.get_dummies(df_test['num-of-cylinders'], drop_first=True)
cylinders_status.head()

# Creating dummy variables for 'cylindernumber'
cylinders_status = cylinders_status.rename(columns={'eight':'cylindernumber(eight)','five':'cylindernumber(five)', 'four':'cylindernumber(four)', 'six':'cylindernumber(six)', 
                                                      'three':'cylindernumber(three)','twelve':'cylindernumber(twelve)',
                                                       'two':'cylindernumber(two)'})
cylinders_status.head()

# Concating the dummy dataframe with original dataframe
df_test = pd.concat([df_test,cylinders_status], axis=1)
df_test.head()

#Dropping the 'cylindernumber' column as we don't need it
df_test = df_test.drop('num-of-cylinders',axis=1)
df_test.head()

In [None]:
df_test['cylindernumber(three)']=0
df_test['cylindernumber(five)']=0
df_test['cylindernumber(two)']=0
df_test['cylindernumber(twelve)']=0

In [None]:
df_test

In [None]:
# Creating dummy variables for 'fuelsystem'
fuelsystem_status = pd.get_dummies(df_test['fuel-system'], drop_first=True)
fuelsystem_status.head()

# Renaming column name for better readability
fuelsystem_status = fuelsystem_status.rename(columns={'1bbl':'fuelsystem(1bbl)','2bbl':'fuelsystem(2bbl)','idi':'fuelsystem(idi)', 
                                                      'mfi':'fuelsystem(mfi)','mpfi':'fuelsystem(mpfi)' ,'spdi':'fuelsystem(spdi)',
                                                     'spfi':'fuelsystem(spfi)','4bbl':'fuelsystem(4bbl)'})
fuelsystem_status.head()

# Concating the dummy dataframe with original dataframe
df_test = pd.concat([df_test,fuelsystem_status], axis=1)
df_test.head()

# Dropping the 'fuelsystem' column as we don't need it
df_test = df_test.drop('fuel-system',axis=1)
df_test.head()

In [None]:
df_test['fuelsystem(4bbl)']=0
df_test['fuelsystem(mfi)']=0
df_test['fuelsystem(spdi)']=0
df_test['fuelsystem(spfi)']=0

In [None]:
df_test.info()

In [None]:
df_train.info()

In [None]:
df_test['price']=0

In [None]:
# Create a list of numeric variables. We don't need categorical variables because they are already scalled in 0 and 1.
num_vars1 = ['symboling','normalized-losses','wheel-base','length','width','height','curb-weight','engine-size',
            'bore','stroke','compression-ratio','horsepower','peak-rpm','city-mpg','highway-mpg','price']

# Instantiate an object
scaler1 = MinMaxScaler()

# Fit the data in the object
df_test[num_vars1] = scaler1.fit_transform(df_test[num_vars1])
df_test

In [None]:
X_test_1 = df_test

In [None]:
# Popping out the 'price' column for y_train
y_test_1= df_test.pop('price') 
# Creating X_train
X_test_1 = df_test

In [None]:
X_test_1

In [None]:
y_test_1

In [None]:
# Taking only the columns from the final model.
X_test_1= X_test_1[X_train_11.columns]
X_test_1.head()

In [None]:
# Add constant
X_test_1_sm = sm.add_constant(X_test_1)
X_test_1_sm.head()

In [None]:
#predict
y_test_pred1 = lr_model_11.predict(X_test_1_sm)

In [None]:
#FINAL Predicted numbers for test set
y_test_pred1

In [None]:
#join the predicted values
df=pd.concat([df_test,y_test_pred1],axis=1, sort=False)

In [None]:
df.rename(columns={0: 'price'},inplace=True)

In [None]:
df

In [None]:
#inverse transform
df[num_vars1]= scaler.inverse_transform(df[num_vars1])

In [None]:
#final predicted price from the model
df

In [2580]:
#Move to CSV for final predicted prices
df.to_csv('Predicted_final.csv',index=False, header=False)