# Predicting Used Car Prices

In [142]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## 1) Importing the Dataset

In [143]:
# Load the raw listings and discard the first column of the file
# (presumably a saved row index -- TODO confirm against the CSV).
df = pd.read_csv("./car_price.csv").pipe(
    lambda frame: frame.drop(columns=frame.columns[0])
)

In [144]:
# Preview the first rows to see the raw, still-textual column formats.
df.head()

Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats
0,Jeep Compass 2.0 Longitude Option BSIV,10.03 Lakh,"86,226 kms",Diesel,Manual,1st Owner,2017,1956 cc,5 Seats
1,Renault Duster RXZ Turbo CVT,12.83 Lakh,"13,248 kms",Petrol,Automatic,1st Owner,2021,1330 cc,5 Seats
2,Toyota Camry 2.5 G,16.40 Lakh,"60,343 kms",Petrol,Automatic,1st Owner,2016,2494 cc,5 Seats
3,Honda Jazz VX CVT,7.77 Lakh,"26,696 kms",Petrol,Automatic,1st Owner,2018,1199 cc,5 Seats
4,Volkswagen Polo 1.2 MPI Highline,5.15 Lakh,"69,414 kms",Petrol,Manual,1st Owner,2016,1199 cc,5 Seats


In [145]:
# Column dtypes and non-null counts before any cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5512 entries, 0 to 5511
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   car_name             5512 non-null   object
 1   car_prices_in_rupee  5512 non-null   object
 2   kms_driven           5512 non-null   object
 3   fuel_type            5512 non-null   object
 4   transmission         5512 non-null   object
 5   ownership            5512 non-null   object
 6   manufacture          5512 non-null   int64 
 7   engine               5512 non-null   object
 8   Seats                5512 non-null   object
dtypes: int64(1), object(8)
memory usage: 387.7+ KB


In [146]:
# How often each distinct car_name value occurs.
df["car_name"].value_counts()

Maruti Alto 800 LXI          53
Maruti Swift VXI             46
Maruti Wagon R VXI BS IV     43
Maruti Swift Dzire VDI       42
Maruti Swift Dzire VXI       42
                             ..
Tata New Safari XT            1
Hyundai Creta 1.6 E Plus      1
Toyota Etios 2014-2016 GD     1
Mahindra Marazzo M8 8Str      1
BMW M Series M4 Coupe         1
Name: car_name, Length: 1896, dtype: int64

In [147]:
# Show the first rows again as a reference point before preprocessing.
df.head()

Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats
0,Jeep Compass 2.0 Longitude Option BSIV,10.03 Lakh,"86,226 kms",Diesel,Manual,1st Owner,2017,1956 cc,5 Seats
1,Renault Duster RXZ Turbo CVT,12.83 Lakh,"13,248 kms",Petrol,Automatic,1st Owner,2021,1330 cc,5 Seats
2,Toyota Camry 2.5 G,16.40 Lakh,"60,343 kms",Petrol,Automatic,1st Owner,2016,2494 cc,5 Seats
3,Honda Jazz VX CVT,7.77 Lakh,"26,696 kms",Petrol,Automatic,1st Owner,2018,1199 cc,5 Seats
4,Volkswagen Polo 1.2 MPI Highline,5.15 Lakh,"69,414 kms",Petrol,Manual,1st Owner,2016,1199 cc,5 Seats


## 2) Data Preprocessing

### Converting car_prices_in_rupee (Lakh / Crore) to Numeric

In [148]:
def LakhtoNumber(x):
    """Convert a textual price into an amount in rupees.

    Recognised forms are ``"<n> Lakh"`` (n * 100,000), ``"<n> Crore"``
    (n * 10,000,000) and a bare number; thousands commas are ignored.

    Args:
        x: Price string such as ``"10.03 Lakh"``, ``"1.5 Crore"`` or
            ``"95,000"``. A value that is already numeric is passed through,
            so applying the function to a converted column is harmless.

    Returns:
        float: The price in rupees. Lakh/Crore amounts are rounded to two
        decimals (paise).

    Raises:
        ValueError: If the numeric part cannot be parsed by ``float``.
    """
    if not isinstance(x, str):
        # Already numeric, e.g. when the conversion cell is executed again.
        return float(x)

    x = x.replace(",", "").strip()

    # "Lakh" is checked before "Crore", matching the original branch order.
    for unit, rupees_per_unit in (("Lakh", 100000), ("Crore", 10000000)):
        if unit in x:
            amount = float(x.replace(unit, "").strip())
            # Binary floats make 10.03 * 100000 evaluate to
            # 1002999.9999999999; rounding to paise restores 1003000.0.
            return round(amount * rupees_per_unit, 2)

    return float(x)

# Replace the textual prices with their numeric rupee amounts, element by element.
df["car_prices_in_rupee"] = df["car_prices_in_rupee"].map(LakhtoNumber)

### Converting kms_driven to Numeric

In [149]:
def kms_drivenTonNumeric(x):
    """Convert an odometer reading such as ``"86,226 kms"`` to a float.

    Args:
        x: Distance string with optional thousands commas and a ``kms``
            suffix. A value that is already numeric is passed through, so
            applying the function to a converted column is harmless.

    Returns:
        float: The distance with commas and the ``kms`` suffix removed.

    Raises:
        ValueError: If the remaining text cannot be parsed by ``float``.
    """
    if not isinstance(x, str):
        # Already numeric, e.g. when the conversion cell is executed again.
        return float(x)

    digits = x.replace(",", "").replace("kms", "").strip()
    return float(digits)

# Replace the textual odometer readings with floats, element by element.
df["kms_driven"] = df["kms_driven"].map(kms_drivenTonNumeric)

### Dummy Variables

In [150]:
# Indicator (dummy) columns for each categorical feature; drop_first=True
# keeps k-1 columns for a feature with k levels.
categorical_cols = ["fuel_type", "transmission", "ownership"]
fuel_type, transmission, ownership = (
    pd.get_dummies(df[col], drop_first=True) for col in categorical_cols
)

# Append the indicator columns, then remove the original text columns.
df = pd.concat([df, fuel_type, transmission, ownership], axis=1).drop(
    columns=categorical_cols
)

### Label Encoding manufacture

In [151]:
from sklearn.preprocessing import LabelEncoder
# Map each distinct manufacture value to its rank among the sorted unique values.
# NOTE(review): scikit-learn documents LabelEncoder for target labels rather
# than input features; confirm that rank-encoding this integer column is intended.
le = LabelEncoder().fit(df["manufacture"])
df["manufacture"] = le.transform(df["manufacture"])

### Converting Engine, Seats to Numeric

In [152]:
def engineToNumeric(x):
    """Convert an engine size such as ``"1956 cc"`` to a float.

    Args:
        x: Engine displacement string with a ``cc`` suffix. A value that is
            already numeric is passed through, so applying the function to a
            converted column is harmless.

    Returns:
        float: The number with the ``cc`` suffix removed.

    Raises:
        ValueError: If the remaining text cannot be parsed by ``float``.
    """
    if not isinstance(x, str):
        # Already numeric, e.g. when the conversion cell is executed again.
        return float(x)

    return float(x.replace("cc", "").strip())

def SeatsToNumeric(x):
    """Convert a seat count such as ``"5 Seats"`` to a float.

    Args:
        x: Seat-count string with a ``Seats`` suffix. A value that is already
            numeric is passed through, so applying the function to a
            converted column is harmless.

    Returns:
        float: The number with the ``Seats`` suffix removed.

    Raises:
        ValueError: If the remaining text cannot be parsed by ``float``.
    """
    if not isinstance(x, str):
        # Already numeric, e.g. when the conversion cell is executed again.
        return float(x)

    return float(x.replace("Seats", "").strip())

# Run each text column through its parser, replacing the strings with floats.
for column_name, parser in (("engine", engineToNumeric), ("Seats", SeatsToNumeric)):
    df[column_name] = df[column_name].apply(parser)

### Dropping car_name

In [153]:
# car_name is removed here, so it is not part of the features built below.
df = df.drop(columns=["car_name"])

In [154]:
# Inspect the first rows of the fully numeric frame after preprocessing.
df.head()

Unnamed: 0,car_prices_in_rupee,kms_driven,manufacture,engine,Seats,Diesel,Electric,Lpg,Petrol,Manual,1st Owner,2nd Owner,3rd Owner,4th Owner,5th Owner
0,1003000.0,86226.0,20,1956.0,5.0,1,0,0,0,1,1,0,0,0,0
1,1283000.0,13248.0,24,1330.0,5.0,0,0,0,1,0,1,0,0,0,0
2,1640000.0,60343.0,19,2494.0,5.0,0,0,0,1,0,1,0,0,0,0
3,777000.0,26696.0,21,1199.0,5.0,0,0,0,1,0,1,0,0,0,0
4,515000.0,69414.0,19,1199.0,5.0,0,0,0,1,1,1,0,0,0,0


In [155]:
# Confirm that every remaining column has a numeric dtype.
df.dtypes

car_prices_in_rupee    float64
kms_driven             float64
manufacture              int64
engine                 float64
Seats                  float64
Diesel                   uint8
Electric                 uint8
Lpg                      uint8
Petrol                   uint8
Manual                   uint8
1st Owner                uint8
2nd Owner                uint8
3rd Owner                uint8
4th Owner                uint8
5th Owner                uint8
dtype: object

## 3) Building the Model

### Splitting into X and y

In [156]:
# The price column is the regression target; every other column is a feature.
target_col = "car_prices_in_rupee"
y = df[target_col]
X = df.drop(columns=[target_col])

### Splitting into Train and Test

In [157]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [158]:
# (rows, features) of the training and test feature matrices.
X_train.shape, X_test.shape

((4409, 14), (1103, 14))

In [159]:
# Target vector lengths; they should match the row counts of X_train / X_test.
y_train.shape, y_test.shape

((4409,), (1103,))

### Linear Regression

In [160]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares linear regression with scikit-learn's default settings.
model = LinearRegression()

### Training the Model

In [161]:
# Fit the regression on the training split only.
model.fit(X_train, y_train)

## 4) Evaluating the Model

In [162]:
# R^2 (coefficient of determination) on the data the model was fitted on.
model_score_train = model.score(X_train, y_train)
model_score_train

0.3536948582725108

In [163]:
# R^2 on the held-out test split.
model_score_test = model.score(X_test, y_test)
model_score_test

0.36738242123658904

In [164]:
# Predicted prices for the held-out test rows.
predictions = model.predict(X_test)

In [165]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Mean squared error of the test-set predictions (in squared rupees).
mse = mean_squared_error(y_test, predictions)
mse

2547681923195.9385

In [166]:
# Root mean squared error: the square root of the MSE, in the target's own units (rupees).
rmse = np.sqrt(mse)
rmse

1596145.959239298