In [7]:
import pandas as pd
import tensorflow as tf
import numpy as np
import keras
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Preprocessing
def preprocess(df):
    # nan_cols = df.isna().any()
    # print("Empty Cols = ")
    # print(nan_cols[nan_cols == True].index.tolist())
    df.dropna(inplace=True)
    df['Fuel_Type_Encoded']=df['Fuel_Type'].astype('category').cat.codes
    df['Transmission_Encoded']=df['Transmission'].astype('category').cat.codes
    df['Owner_Type_Encoded']=df['Owner_Type'].astype('category').cat.codes
    df['New_Price'] = df['New_Price'].str.replace('Lakh', '')
    df['New_Price'] = df['New_Price'].str.replace('Cr', '')
    df['Power'] = df['Power'].str.replace('bhp', '')
    df['Engine'] = df['Engine'].str.replace('CC', '')
    df['Mileage'] = df['Mileage'].str.replace('kmpl', '')
    df['Mileage'] = df['Mileage'].str.replace('km/kg', '')
    df['New_Price'] = df['New_Price'].str.strip()
    df['Power'] = df['Power'].str.strip()
    df['Engine'] = df['Engine'].str.strip()
    df['Mileage'] = df['Mileage'].str.strip()
    df['Mileage']=df['Mileage'].astype(float)
    df['Engine']=df['Engine'].astype(float)
    df['Power']=df['Power'].astype(float)
    df['New_Price']=df['New_Price'].astype(float)
    df=df.reset_index(drop=True)
    return df

# Feature Engineering
def calcAge(year):
    current_year = datetime.now().year
    return current_year-year

def norm(srs):
    scaler = MinMaxScaler()
    scaled_srs = scaler.fit_transform(srs.values.reshape(-1,1))
    return scaled_srs
    
def feature_engineering(df):
    df['Age']=df['Year'].apply(calcAge)
    df['Kilometers_Driven']=norm(df['Kilometers_Driven'])
    df['Engine']=norm(df["Engine"])
    df['Power']=norm(df["Power"])
    df['Mileage']=norm(df["Mileage"])
    df['Price_Drop']=df['New_Price']-df['Price']
    df['Price']=norm(df["Price"])
    df['Price_Drop']=norm(df["Price_Drop"])
    df['New_Price']=norm(df["New_Price"])
    df['Fuel_Type']=df['Fuel_Type_Encoded']
    df['Owner_Type']=df['Owner_Type_Encoded']
    df['Transmission']=df['Transmission_Encoded']
    df['Name']=df['Name'].str.extract('(\w+)')
    df['Name']=df['Name'].astype('category').cat.codes
    df.drop(['Transmission_Encoded','Owner_Type_Encoded','Fuel_Type_Encoded','Location','New_Price','Price','Year'],axis=1,inplace=True)
    return df

def test_features(df):
    df['Age']=df['Year'].apply(calcAge)
    df['Kilometers_Driven']=norm(df['Kilometers_Driven'])
    df['Engine']=norm(df["Engine"])
    df['Power']=norm(df["Power"])
    df['Mileage']=norm(df["Mileage"])
    # df['Price_Drop']=df['New_Price']-df['Price']
    # df['Price']=norm(df["Price"])
    # df['Price_Drop']=norm(df["Price_Drop"])
    df['New_Price']=norm(df["New_Price"])
    df['Fuel_Type']=df['Fuel_Type_Encoded']
    df['Owner_Type']=df['Owner_Type_Encoded']
    df['Transmission']=df['Transmission_Encoded']
    df['Name']=df['Name'].str.extract('(\w+)')
    df['Name']=df['Name'].astype('category').cat.codes
    df.drop(['Transmission_Encoded','Owner_Type_Encoded','Fuel_Type_Encoded','Location','Year'],axis=1,inplace=True)
    return df

traindf=preprocess(pd.read_csv('./Datasets/train_data.csv'))
traindf=feature_engineering(traindf)
testdf=preprocess(pd.read_csv('./Datasets/test_data.csv'))
testdf=test_features(testdf)


# X = df.drop('target_variable', axis=1)
# y = df['target_variable']

In [8]:
traindf

Unnamed: 0,Name,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Age,Price_Drop
0,5,0.210280,2,1,0,0.542636,0.181818,0.090607,5.0,12,0.671489
1,21,0.163551,1,0,0,0.338700,0.889091,0.302534,8.0,7,0.667384
2,12,0.115383,2,1,0,0.642815,0.301364,0.127847,5.0,5,0.648543
3,15,0.509346,1,1,0,0.402504,0.762727,0.312926,7.0,9,0.758294
4,1,0.149449,1,0,0,0.676506,0.543636,0.349885,5.0,9,0.841128
...,...,...,...,...,...,...,...,...,...,...,...
476,5,0.146019,2,1,0,0.521765,0.181818,0.090607,5.0,6,0.657762
477,6,0.281893,2,1,0,0.563506,0.180909,0.073100,5.0,9,0.668730
478,5,0.317757,2,1,0,0.581395,0.181818,0.090760,5.0,9,0.664154
479,22,0.207907,1,0,0,0.641026,0.317727,0.141285,5.0,7,0.680035


In [9]:
testdf

Unnamed: 0,Name,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Age
0,1,0.380165,1,0,0,0.670244,0.288054,0.350384,5.0,0.675081,10
1,17,0.093689,2,1,1,0.425760,0.240607,0.217468,5.0,0.194647,7
2,4,0.023022,2,0,0,0.536673,0.168112,0.164450,5.0,0.140815,5
3,14,0.248205,1,0,0,0.584377,0.159441,0.141816,5.0,0.139599,6
4,9,0.370189,1,1,0,0.846750,0.108141,0.053708,5.0,0.069444,8
...,...,...,...,...,...,...,...,...,...,...,...
337,16,0.055490,2,1,0,0.523852,0.094894,0.091304,5.0,0.065795,7
338,18,0.220779,2,1,0,0.479726,0.192437,0.129156,5.0,0.107766,12
339,9,0.216104,1,1,0,0.724508,0.108141,0.090793,5.0,0.087693,7
340,4,0.195667,2,1,0,0.551580,0.096098,0.086445,5.0,0.054238,10


In [5]:
import pandas as pd
import tensorflow as tf
import numpy as np
import keras
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Preprocessing


def preprocess(df):
    # nan_cols = df.isna().any()
    # print("Empty Cols = ")
    # print(nan_cols[nan_cols == True].index.tolist())
    df.dropna(inplace=True)
    df['Fuel_Type_Encoded'] = df['Fuel_Type'].astype('category').cat.codes
    df['Transmission_Encoded'] = df['Transmission'].astype(
        'category').cat.codes
    df['Owner_Type_Encoded'] = df['Owner_Type'].astype('category').cat.codes
    df['New_Price'] = df['New_Price'].str.replace('Lakh', '')
    df['New_Price'] = df['New_Price'].str.replace('Cr', '')
    df['Power'] = df['Power'].str.replace('bhp', '')
    df['Engine'] = df['Engine'].str.replace('CC', '')
    df['Mileage'] = df['Mileage'].str.replace('kmpl', '')
    df['Mileage'] = df['Mileage'].str.replace('km/kg', '')
    df['New_Price'] = df['New_Price'].str.strip()
    df['Power'] = df['Power'].str.strip()
    df['Engine'] = df['Engine'].str.strip()
    df['Mileage'] = df['Mileage'].str.strip()
    df['Mileage'] = df['Mileage'].astype(float)
    df['Engine'] = df['Engine'].astype(float)
    df['Power'] = df['Power'].astype(float)
    df['New_Price'] = df['New_Price'].astype(float)
    df = df.reset_index(drop=True)
    return df

# Feature Engineering


def calcAge(year):
    current_year = datetime.now().year
    return current_year-year


def norm(srs):
    scaler = MinMaxScaler()
    scaled_srs = scaler.fit_transform(srs.values.reshape(-1, 1))
    return scaled_srs


def feature_engineering(df):
    df['Age'] = df['Year'].apply(calcAge)
    df['Kilometers_Driven'] = norm(df['Kilometers_Driven'])
    df['Engine'] = norm(df["Engine"])
    df['Power'] = norm(df["Power"])
    df['Mileage'] = norm(df["Mileage"])
    # df['Price_Drop']=df['New_Price']-df['Price']
    # df['Price']=norm(df["Price"])
    # df['Price_Drop']=norm(df["Price_Drop"])
    df['New_Price'] = norm(df["New_Price"])
    df['Fuel_Type'] = df['Fuel_Type_Encoded']
    df['Owner_Type'] = df['Owner_Type_Encoded']
    df['Transmission'] = df['Transmission_Encoded']
    df['Name'] = df['Name'].str.extract('(\w+)')
    df['Name'] = df['Name'].astype('category').cat.codes
    # df.drop(['Transmission_Encoded','Owner_Type_Encoded','Fuel_Type_Encoded','Location','New_Price','Price','Year'],axis=1,inplace=True)
    df.drop(['Transmission_Encoded', 'Owner_Type_Encoded',
            'Fuel_Type_Encoded', 'Location', 'Year'], axis=1, inplace=True)
    return df


def test_features(df):
    df['Age'] = df['Year'].apply(calcAge)
    df['Kilometers_Driven'] = norm(df['Kilometers_Driven'])
    df['Engine'] = norm(df["Engine"])
    df['Power'] = norm(df["Power"])
    df['Mileage'] = norm(df["Mileage"])
    # df['Price_Drop']=df['New_Price']-df['Price']
    # df['Price']=norm(df["Price"])
    # df['Price_Drop']=norm(df["Price_Drop"])
    df['New_Price'] = norm(df["New_Price"])
    df['Fuel_Type'] = df['Fuel_Type_Encoded']
    df['Owner_Type'] = df['Owner_Type_Encoded']
    df['Transmission'] = df['Transmission_Encoded']
    df['Name'] = df['Name'].str.extract('(\w+)')
    df['Name'] = df['Name'].astype('category').cat.codes
    df.drop(['Transmission_Encoded', 'Owner_Type_Encoded',
            'Fuel_Type_Encoded', 'Location', 'Year'], axis=1, inplace=True)
    return df


traindf = preprocess(pd.read_csv('./Datasets/train_data.csv'))
traindf = feature_engineering(traindf)
testdf = preprocess(pd.read_csv('./Datasets/test_data.csv'))
testdf = test_features(testdf)


X = traindf.drop('Price', axis=1)
y = traindf['Price']
# X = traindf.drop('Price_Drop', axis=1)
# y = traindf['Price_Drop']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
lr_rmse = mean_squared_error(y_test, lr_pred, squared=False)
lr_r2 = r2_score(y_test, lr_pred)

# Random Forest Regressor
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr_pred = rfr.predict(X_test)
rfr_rmse = mean_squared_error(y_test, rfr_pred, squared=False)
rfr_r2 = r2_score(y_test, rfr_pred)

# SVM
# svr = SVR()
# svr.fit(X_train, y_train)
# svr_pred = svr.predict(X_test)
# svr_rmse = mean_squared_error(y_test, svr_pred, squared=False)
# svr_r2 = r2_score(y_test, svr_pred)

print('Linear Regression RMSE:', lr_rmse)
print('Linear Regression R^2:', lr_r2)
print('Random Forest Regression RMSE:', rfr_rmse)
print('Random Forest Regression R^2:', rfr_r2)
# print('Support Vector Regression RMSE:', svr_rmse)
# print('Support Vector Regression R^2:', svr_r2)

# Price Drop Pred
# Linear Regression RMSE: 0.06854988080410929
# Linear Regression R^2: 0.0007094859253358177
# Random Forest Regression RMSE: 0.047372304163081906
# Random Forest Regression R^2: 0.5227703975083304
# Support Vector Regression RMSE: 0.07531974990854874
# Support Vector Regression R^2: -0.20641324201886424

# Final Price Pred
# Linear Regression RMSE: 6.182162793359798
# Linear Regression R^2: 0.8060657719191427
# Random Forest Regression RMSE: 3.2735414592396417
# Random Forest Regression R^2: 0.9456237463209227
# Support Vector Regression RMSE: 14.09856302734958
# Support Vector Regression R^2: -0.00861005204204579

# Therefore a Random Forest Regression is a better choice to calculate a price drop whereas a Linear Regression Model as well as Random Forest both are good at calculating new Prices but RF is slightly better.


# from joblib import dump
# # dump(lr, './Models/linear_regression_price.joblib')
# # dump(rfr, './Models/random_forest_price.joblib')
# dump(lr, './Models/linear_regression_drop.joblib')
# dump(rfr, './Models/random_forest_drop.joblib')


y_pred_lr = lr.predict(testdf)
y_pred_rf = rfr.predict(testdf)
testdf = preprocess(pd.read_csv('./Datasets/test_data.csv'))
testdf['Price']=y_pred_rf
# df=pd.DataFrame({'LR':y_pred_lr,'RF':y_pred_rf})
# df

Linear Regression RMSE: 6.182162793359798
Linear Regression R^2: 0.8060657719191427
Random Forest Regression RMSE: 3.683996583021125
Random Forest Regression R^2: 0.9311328628910225


In [6]:
testdf

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Fuel_Type_Encoded,Transmission_Encoded,Owner_Type_Encoded,Price
0,BMW 5 Series 520d Luxury Line,Delhi,2013,65000,Diesel,Automatic,First,22.48,1995.0,190.00,5.0,67.87,1,0,0,18.9679
1,Toyota Corolla Altis 1.8 G,Bangalore,2016,16471,Petrol,Manual,Second,14.28,1798.0,138.03,5.0,20.48,2,1,1,13.8277
2,Honda City i-VTEC CVT VX,Pune,2018,4500,Petrol,Automatic,First,18.00,1497.0,117.30,5.0,15.17,2,0,0,11.3622
3,Renault Duster 110PS Diesel RxZ AMT,Coimbatore,2017,42646,Diesel,Automatic,First,19.60,1461.0,108.45,5.0,15.05,1,0,0,10.1561
4,Maruti Swift VDI,Jaipur,2015,63310,Diesel,Manual,First,28.40,1248.0,74.00,5.0,8.13,1,1,0,4.3551
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
337,Tata Bolt Revotron XT,Chennai,2016,10000,Petrol,Manual,First,17.57,1193.0,88.70,5.0,7.77,2,1,0,5.3283
338,Volkswagen Vento 1.6 Highline,Mumbai,2011,38000,Petrol,Manual,First,16.09,1598.0,103.50,5.0,11.91,2,1,0,4.6694
339,Maruti Vitara Brezza VDi,Pune,2016,37208,Diesel,Manual,First,24.30,1248.0,88.50,5.0,9.93,1,1,0,5.4863
340,Honda Brio 1.2 VX MT,Delhi,2013,33746,Petrol,Manual,First,18.50,1198.0,86.80,5.0,6.63,2,1,0,3.5602
