In [1]:
import warnings 
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import torch 
import torch.nn as nn

# NOTE(review): this silences every warning for the rest of the notebook, not
# only the noisy ones; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
%matplotlib inline

In [2]:
# Load the train and test splits and summarise their shape, dtypes and
# non-null counts.
train = pd.read_csv('train-data.csv')
test = pd.read_csv('test-data.csv')
print(train.shape)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output.
train.info()
print(test.shape)
test.info()

(6019, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         6019 non-null   int64  
 1   Name               6019 non-null   object 
 2   Location           6019 non-null   object 
 3   Year               6019 non-null   int64  
 4   Kilometers_Driven  6019 non-null   int64  
 5   Fuel_Type          6019 non-null   object 
 6   Transmission       6019 non-null   object 
 7   Owner_Type         6019 non-null   object 
 8   Mileage            6017 non-null   object 
 9   Engine             5983 non-null   object 
 10  Power              5983 non-null   object 
 11  Seats              5977 non-null   float64
 12  New_Price          824 non-null    object 
 13  Price              6019 non-null   float64
dtypes: float64(2), int64(3), object(9)
memory usage: 658.5+ KB
None
(1234, 13)
<class 'pandas.core.frame.DataFram

In [3]:
# Preview the first five training rows.
train.head()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [4]:
# Preview the first five test rows: same columns as train, minus the Price target.
# Note the literal 'null bhp' in the Power column of row 3 in the output.
test.head()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price
0,0,Maruti Alto K10 LXI CNG,Delhi,2014,40929,CNG,Manual,First,32.26 km/kg,998 CC,58.2 bhp,4.0,
1,1,Maruti Alto 800 2016-2019 LXI,Coimbatore,2013,54493,Petrol,Manual,Second,24.7 kmpl,796 CC,47.3 bhp,5.0,
2,2,Toyota Innova Crysta Touring Sport 2.4 MT,Mumbai,2017,34000,Diesel,Manual,First,13.68 kmpl,2393 CC,147.8 bhp,7.0,25.27 Lakh
3,3,Toyota Etios Liva GD,Hyderabad,2012,139000,Diesel,Manual,First,23.59 kmpl,1364 CC,null bhp,5.0,
4,4,Hyundai i20 Magna,Mumbai,2014,29000,Petrol,Manual,First,18.5 kmpl,1197 CC,82.85 bhp,5.0,


In [5]:
# New_Price is populated for only 824 of the 6019 training rows (see the
# train.info() report), so the column is removed from both frames.
# Shapes are printed before and after to confirm exactly one column is gone.
print(train.shape, test.shape)
for frame in (train, test):
    frame.drop(columns='New_Price', inplace=True)
print(train.shape, test.shape)

(6019, 14) (1234, 13)
(6019, 13) (1234, 12)


# Data Cleaning and Feature Engineering

In [6]:
# Missing-value counts per column: train first, then test, separated by a rule.
print(
    train.isna().sum(),
    "------------------------",
    test.isna().sum(),
    sep="\n",
)

Unnamed: 0            0
Name                  0
Location              0
Year                  0
Kilometers_Driven     0
Fuel_Type             0
Transmission          0
Owner_Type            0
Mileage               2
Engine               36
Power                36
Seats                42
Price                 0
dtype: int64
------------------------
Unnamed: 0            0
Name                  0
Location              0
Year                  0
Kilometers_Driven     0
Fuel_Type             0
Transmission          0
Owner_Type            0
Mileage               0
Engine               10
Power                10
Seats                11
dtype: int64


## Unnamed

In [7]:
# Every value occurs exactly once (6019 distinct values for 6019 rows), so
# 'Unnamed: 0' looks like a row identifier rather than a feature.
print(train['Unnamed: 0'].value_counts())

2047    1
2720    1
2708    1
661     1
4759    1
       ..
5436    1
1342    1
3391    1
5440    1
0       1
Name: Unnamed: 0, Length: 6019, dtype: int64


In [8]:
# Print every 'Unnamed: 0' id that occurs in test but not in train.
# The saved run printed nothing, i.e. every test id also exists in train.
unnamed_train = list(train['Unnamed: 0'].unique())
unnamed_test = list(test['Unnamed: 0'].unique())
# Membership is checked against a set (constant-time lookups) instead of
# rescanning the whole train list for every test id; iteration still follows
# the order of first appearance in test, so the printed order is unchanged.
known_train_ids = set(unnamed_train)
for test_id in unnamed_test:
    if test_id not in known_train_ids:
        print(test_id)

## Name

In [9]:
# Frequency of each car name in the training set (1876 distinct names in the output).
print(train['Name'].value_counts())
# NOTE(review): this repeats the 'Unnamed: 0' counts already printed in the
# "Unnamed" section and is unrelated to Name; it looks like a leftover.
print(train['Unnamed: 0'].value_counts())

Mahindra XUV500 W8 2WD          49
Maruti Swift VDI                45
Honda City 1.5 S MT             34
Maruti Swift Dzire VDI          34
Maruti Swift VDI BSIV           31
                                ..
BMW X3 xDrive 20d Expedition     1
Maruti Wagon R Duo Lxi           1
BMW 3 Series 328i Sport Line     1
Audi A4 2.0 TFSI                 1
Maruti Wagon R Duo LX BSIII      1
Name: Name, Length: 1876, dtype: int64
2047    1
2720    1
2708    1
661     1
4759    1
       ..
5436    1
1342    1
3391    1
5440    1
0       1
Name: Unnamed: 0, Length: 6019, dtype: int64


In [10]:
# Frequency of each car name in the test set (768 distinct names in the output).
print(test['Name'].value_counts())

Maruti Alto LXi                     9
Honda City 1.5 V MT                 8
Volkswagen Polo 1.2 MPI Highline    8
Maruti Swift Dzire VDI              8
Maruti Wagon R LXI                  7
                                   ..
Renault Duster 85PS Diesel RxZ      1
Nissan Terrano XE 85 PS             1
Tata Indigo eCS eLX BS IV           1
Tata Nano Cx BSIV                   1
Honda Amaze S AT i-Vtech            1
Name: Name, Length: 768, dtype: int64


In [11]:
# Print every car name that occurs in the test set but never in the training set.
car_names_train = list(train['Name'].unique())
car_names_test = list(test['Name'].unique())
# Membership is checked against a set (constant-time lookups) instead of
# rescanning the whole train list for every test name; iteration still follows
# the order of first appearance in test, so the printed order is unchanged.
known_train_names = set(car_names_train)
for test_name in car_names_test:
    if test_name not in known_train_names:
        print(test_name)

Toyota Innova Crysta Touring Sport 2.4 MT
Maruti Swift AMT ZXI
Skoda Laura 1.8 TSI Ambition
Honda Civic 2010-2013 1.8 S MT Inspire
Toyota Etios Liva 1.4 VXD
Maruti Celerio X VXI Option
Chevrolet Sail Hatchback 1.2
Mahindra Scorpio VLX Special Edition BS-IV
Tata Tiago AMT 1.2 Revotron XTA
Maruti Ciaz VXi
Nissan Teana XL
Tata Manza Club Class Safire90 LX
Ford Fiesta Classic 1.6 SXI Duratec
Mercedes-Benz B Class B180 Sports
Nissan Micra XL CVT
Tata Indica V2 DiCOR DLG BS-III
Tata Indica Vista Quadrajet LX
Audi Q3 30 TDI S Edition
Honda BRV i-DTEC V MT
BMW 3 Series GT 320d Sport Line
Toyota Etios Liva VD
Datsun GO T Petrol
Maruti A-Star Zxi
Mahindra KUV 100 mFALCON G80 K4 5str
Ford Fiesta 1.4 SXI Duratorq
Fiat Avventura FIRE Dynamic
Honda CR-V Diesel
Mahindra Xylo E9
Hyundai Creta 1.6 SX Automatic
Honda Mobilio V i VTEC
Tata Indica Vista Aqua TDI BSIII
Skoda Laura 1.9 TDI MT Elegance
Honda City ZX VTEC Plus
BMW 7 Series 730Ld DPE Signature
Hindustan Motors Contessa 2.0 DSL
Fiat Linea Dynam

The test set contains car names that never appear in the training set. We cannot handle all of these unseen values, so we drop the `Name` column.

In [12]:
# Remove the free-text Name column from both frames; shapes are printed before
# and after to confirm exactly one column is gone.
print(train.shape, test.shape)
for frame in (train, test):
    frame.drop(columns='Name', inplace=True)
print(train.shape, test.shape)

(6019, 13) (1234, 12)
(6019, 12) (1234, 11)


## Location

In [13]:
# First 20 Location values of the training set.
train['Location'].head(20)

0         Mumbai
1           Pune
2        Chennai
3        Chennai
4     Coimbatore
5      Hyderabad
6         Jaipur
7         Mumbai
8           Pune
9        Chennai
10         Kochi
11       Kolkata
12        Jaipur
13         Delhi
14          Pune
15         Delhi
16         Kochi
17        Jaipur
18         Kochi
19     Bangalore
Name: Location, dtype: object

In [14]:
# Number of training rows per city (11 distinct locations in the output).
train['Location'].value_counts()

Mumbai        790
Hyderabad     742
Kochi         651
Coimbatore    636
Pune          622
Delhi         554
Kolkata       535
Chennai       494
Jaipur        413
Bangalore     358
Ahmedabad     224
Name: Location, dtype: int64

In [15]:
# Number of test rows per city; the output lists the same 11 cities as train.
test['Location'].value_counts()

Mumbai        159
Pune          143
Coimbatore    136
Hyderabad     134
Kochi         121
Kolkata       119
Delhi         106
Chennai        97
Jaipur         86
Bangalore      82
Ahmedabad      51
Name: Location, dtype: int64

## Fuel Type

In [16]:
# First 20 Fuel_Type values of the training set.
train['Fuel_Type'].head(20)

0        CNG
1     Diesel
2     Petrol
3     Diesel
4     Diesel
5        LPG
6     Diesel
7     Diesel
8     Diesel
9     Diesel
10    Petrol
11    Petrol
12    Diesel
13    Diesel
14    Diesel
15    Diesel
16    Diesel
17    Diesel
18    Diesel
19    Diesel
Name: Fuel_Type, dtype: object

In [17]:
# Number of training rows per fuel type; Diesel and Petrol dominate, and only
# two rows are Electric.
train['Fuel_Type'].value_counts()

Diesel      3205
Petrol      2746
CNG           56
LPG           10
Electric       2
Name: Fuel_Type, dtype: int64

In [18]:
# Number of test rows per fuel type. The output lists only Diesel, Petrol, CNG
# and LPG: the Electric category seen in train does not occur in test.
test['Fuel_Type'].value_counts()

Diesel    647
Petrol    579
CNG         6
LPG         2
Name: Fuel_Type, dtype: int64

## Transmission

In [19]:
# First 20 Transmission values of the training set.
train['Transmission'].head(20)

0        Manual
1        Manual
2        Manual
3        Manual
4     Automatic
5        Manual
6        Manual
7     Automatic
8        Manual
9        Manual
10       Manual
11    Automatic
12       Manual
13    Automatic
14    Automatic
15       Manual
16       Manual
17       Manual
18       Manual
19    Automatic
Name: Transmission, dtype: object

In [20]:
# Number of training rows per transmission type (two categories: Manual, Automatic).
train['Transmission'].value_counts()

Manual       4299
Automatic    1720
Name: Transmission, dtype: int64

In [21]:
# Number of test rows per transmission type; same two categories as train.
test['Transmission'].value_counts()

Manual       905
Automatic    329
Name: Transmission, dtype: int64

## Owner_Type

In [22]:
# First 20 Owner_Type values of the training set.
train['Owner_Type'].head(20)

0      First
1      First
2      First
3      First
4     Second
5      First
6      First
7      First
8      First
9     Second
10     First
11     First
12     First
13     First
14    Second
15     First
16     First
17     First
18     First
19     First
Name: Owner_Type, dtype: object

In [23]:
# Number of training rows per ownership category (First through 'Fourth & Above').
train['Owner_Type'].value_counts()

First             4929
Second             968
Third              113
Fourth & Above       9
Name: Owner_Type, dtype: int64

In [24]:
# Number of test rows per ownership category; same four categories as train.
test['Owner_Type'].value_counts()

First             1023
Second             184
Third               24
Fourth & Above       3
Name: Owner_Type, dtype: int64

## Mileage

In [25]:
# First 20 Mileage values of the training set; each is text of the form
# "<number> <unit>".
train['Mileage'].head(20)

0     26.6 km/kg
1     19.67 kmpl
2      18.2 kmpl
3     20.77 kmpl
4      15.2 kmpl
5     21.1 km/kg
6     23.08 kmpl
7     11.36 kmpl
8     20.54 kmpl
9      22.3 kmpl
10    21.56 kmpl
11     16.8 kmpl
12     25.2 kmpl
13     12.7 kmpl
14      0.0 kmpl
15     13.5 kmpl
16     25.8 kmpl
17     28.4 kmpl
18    20.45 kmpl
19    14.84 kmpl
Name: Mileage, dtype: object

- km/kg: Kilometers per Kilogram
- kmpl: Kilometers per Litre

In [26]:
# Frequency of each Mileage string in train; dropna=False makes missing values
# count as their own category instead of being left out.
train['Mileage'].value_counts(dropna=False)

17.0 kmpl     172
18.9 kmpl     172
18.6 kmpl     119
20.36 kmpl     88
21.1 kmpl      86
             ... 
20.62 kmpl      1
13.58 kmpl      1
21.2 kmpl       1
23.01 kmpl      1
12.37 kmpl      1
Name: Mileage, Length: 443, dtype: int64

In [27]:
# Null Values
# Fill missing Mileage with the most frequent training value instead of a
# hard-coded literal. In the counts above '17.0 kmpl' and '18.9 kmpl' tie at
# 172 rows; mode() returns tied modes in sorted order, so [0] is '17.0 kmpl',
# the value that was previously written out by hand.
mileage_mode = train['Mileage'].mode()[0]
train['Mileage'] = train['Mileage'].fillna(mileage_mode)
# The test split is filled with the same training-derived value so both frames
# are treated alike (it currently has no missing Mileage, so this is a no-op).
test['Mileage'] = test['Mileage'].fillna(mileage_mode)

To handle the `Mileage` column, we create two new columns, one for the numeric value and one for the unit of measurement, and then drop the original column.

In [28]:
# Peek at how a single Mileage entry splits into its numeric part and its unit.
first_mileage_parts = train['Mileage'][0].split()
first_mileage_parts[0], first_mileage_parts[1]

('26.6', 'km/kg')

In [29]:
# Split "<number> <unit>" strings such as "26.6 km/kg" into a Mileage_value
# column and a Mileage_measure column on both frames, then drop the original
# text column.
#
# The split uses vectorised string methods and whole-column assignment. A
# row-by-row `frame[col][i] = ...` loop is chained assignment: it is slow,
# raises SettingWithCopyWarning, and writes nothing once pandas copy-on-write
# is active.
for frame in (train, test):
    # Whitespace split, exactly like str.split() with no arguments.
    mileage_tokens = frame['Mileage'].str.split()
    # Values stay strings (e.g. '26.6'); a row whose Mileage is missing yields
    # NaN in both new columns instead of raising.
    frame['Mileage_value'] = mileage_tokens.str[0]
    frame['Mileage_measure'] = mileage_tokens.str[1]

train.drop('Mileage', axis=1, inplace=True)
test.drop('Mileage', axis=1, inplace=True)

## Engine

In [30]:
# First 20 Engine values of the training set; each is text of the form
# "<number> CC".
train['Engine'].head(20)

0      998 CC
1     1582 CC
2     1199 CC
3     1248 CC
4     1968 CC
5      814 CC
6     1461 CC
7     2755 CC
8     1598 CC
9     1248 CC
10    1462 CC
11    1497 CC
12    1248 CC
13    2179 CC
14    2179 CC
15    2477 CC
16    1498 CC
17    1248 CC
18    1461 CC
19    2143 CC
Name: Engine, dtype: object

In [31]:
# Frequency of each Engine string in train; '1197 CC' is the most common value.
train['Engine'].value_counts()

1197 CC    606
1248 CC    512
1498 CC    304
998 CC     259
2179 CC    240
          ... 
3200 CC      1
2995 CC      1
2349 CC      1
2112 CC      1
2773 CC      1
Name: Engine, Length: 146, dtype: int64

In [32]:
# Frequency of each Engine string in test; '1197 CC' is the most common value here too.
test['Engine'].value_counts()

1197 CC    126
1248 CC     98
1498 CC     66
1198 CC     54
1968 CC     50
          ... 
1997 CC      1
1047 CC      1
2979 CC      1
4395 CC      1
3498 CC      1
Name: Engine, Length: 104, dtype: int64

In [33]:
# Null Values
# Fill missing Engine values with the most frequent training value instead of
# a hard-coded literal; per the counts above this is '1197 CC' (606 rows).
# The same training-derived value is used for the test split, so nothing is
# learned from test data.
engine_mode = train['Engine'].mode()[0]
train['Engine'] = train['Engine'].fillna(engine_mode)
test['Engine'] = test['Engine'].fillna(engine_mode)

We will handle `Engine` the same way we handled `Mileage`: split it into a value column and a unit column, then drop the original column.

In [34]:
# Peek at how a single Engine entry splits into its numeric part and its unit.
first_engine_parts = train['Engine'][0].split()
first_engine_parts[0], first_engine_parts[1]

('998', 'CC')

In [35]:
# Split "<number> <unit>" strings such as "998 CC" into an Engine_value column
# and an Engine_measure column on both frames, then drop the original text
# column.
#
# The split uses vectorised string methods and whole-column assignment. A
# row-by-row `frame[col][i] = ...` loop is chained assignment: it is slow,
# raises SettingWithCopyWarning, and writes nothing once pandas copy-on-write
# is active.
for frame in (train, test):
    # Whitespace split, exactly like str.split() with no arguments.
    engine_tokens = frame['Engine'].str.split()
    # Values stay strings (e.g. '998'); a row whose Engine is missing yields
    # NaN in both new columns instead of raising.
    frame['Engine_value'] = engine_tokens.str[0]
    frame['Engine_measure'] = engine_tokens.str[1]

train.drop('Engine', axis=1, inplace=True)
test.drop('Engine', axis=1, inplace=True)

In [39]:
# Check which units appear in Engine_measure; the output shows 'CC' for all 6019 rows.
train['Engine_measure'].value_counts()

CC    6019
Name: Engine_measure, dtype: int64

In [40]:
# Engine_measure is 'CC' for every training row, so it carries no information;
# remove it from both frames.
for frame in (train, test):
    frame.drop(columns='Engine_measure', inplace=True)

In [41]:
# (rows, columns) of train and test after the Engine clean-up.
tuple(frame.shape for frame in (train, test))

((6019, 13), (1234, 12))

In [36]:
# NOTE(review): this cell carries execution count In [36] but sits after
# In [41]; its saved output still shows the Engine_measure column that In [40]
# drops. Re-run the notebook top to bottom to refresh the output.
train.head()

Unnamed: 0.1,Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Power,Seats,Price,Mileage_value,Mileage_measure,Engine_value,Engine_measure
0,0,Mumbai,2010,72000,CNG,Manual,First,58.16 bhp,5.0,1.75,26.6,km/kg,998,CC
1,1,Pune,2015,41000,Diesel,Manual,First,126.2 bhp,5.0,12.5,19.67,kmpl,1582,CC
2,2,Chennai,2011,46000,Petrol,Manual,First,88.7 bhp,5.0,4.5,18.2,kmpl,1199,CC
3,3,Chennai,2012,87000,Diesel,Manual,First,88.76 bhp,7.0,6.0,20.77,kmpl,1248,CC
4,4,Coimbatore,2013,40670,Diesel,Automatic,Second,140.8 bhp,5.0,17.74,15.2,kmpl,1968,CC
