In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [2]:
car_resale_dataset = pd.read_csv("./data/train.csv")
car_resale_test = pd.read_csv("./data/test.csv")
car_resale_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16784 entries, 0 to 16783
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   listing_id         16784 non-null  int64  
 1   title              16784 non-null  object 
 2   make               14624 non-null  object 
 3   model              16784 non-null  object 
 4   description        16439 non-null  object 
 5   manufactured       16590 non-null  float64
 6   original_reg_date  318 non-null    object 
 7   reg_date           16583 non-null  object 
 8   type_of_vehicle    16784 non-null  object 
 9   category           16784 non-null  object 
 10  transmission       16784 non-null  object 
 11  curb_weight        16205 non-null  float64
 12  power              14447 non-null  float64
 13  fuel_type          3490 non-null   object 
 14  engine_cap         16731 non-null  float64
 15  no_of_owners       16608 non-null  float64
 16  depreciation       163

In [3]:
titles = car_resale_dataset["title"].str.split(pat=" ", expand=True)
titles.rename(columns={0:'title_make'},inplace=True)

car_resale_dataset_cleaned = car_resale_dataset.copy()
car_resale_dataset_cleaned.make.fillna(titles.title_make,inplace=True)
car_resale_dataset_cleaned.make = car_resale_dataset_cleaned.make.str.lower()
car_resale_dataset.make.value_counts(), car_resale_dataset_cleaned.make.value_counts()

(toyota           2163
 mercedes-benz    1967
 bmw              1636
 honda            1519
 nissan           1091
                  ... 
 chrysler            1
 international       1
 rover               1
 daf                 1
 riley               1
 Name: make, Length: 77, dtype: int64,
 toyota           2914
 mercedes-benz    2208
 honda            2156
 bmw              1636
 nissan           1091
                  ... 
 chrysler            1
 international       1
 rover               1
 daf                 1
 riley               1
 Name: make, Length: 77, dtype: int64)

In [4]:
car_resale_dataset_cleaned = car_resale_dataset_cleaned[car_resale_dataset_cleaned.manufactured <= 2021]

In [5]:
car_resale_dataset_cleaned.isnull().sum()

listing_id               0
title                    0
make                     0
model                    0
description            343
manufactured             0
original_reg_date    16459
reg_date                15
type_of_vehicle          0
category                 0
transmission             0
curb_weight            389
power                 2179
fuel_type            13150
engine_cap              50
no_of_owners            17
depreciation           245
coe                    887
road_tax              2117
dereg_value            999
mileage               3654
omv                     36
arf                     95
opc_scheme           16382
lifespan             14887
eco_category             0
features               441
accessories           2783
indicative_price     16589
price                    0
dtype: int64

In [6]:
car_resale_dataset_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16589 entries, 0 to 16783
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   listing_id         16589 non-null  int64  
 1   title              16589 non-null  object 
 2   make               16589 non-null  object 
 3   model              16589 non-null  object 
 4   description        16246 non-null  object 
 5   manufactured       16589 non-null  float64
 6   original_reg_date  130 non-null    object 
 7   reg_date           16574 non-null  object 
 8   type_of_vehicle    16589 non-null  object 
 9   category           16589 non-null  object 
 10  transmission       16589 non-null  object 
 11  curb_weight        16200 non-null  float64
 12  power              14410 non-null  float64
 13  fuel_type          3439 non-null   object 
 14  engine_cap         16539 non-null  float64
 15  no_of_owners       16572 non-null  float64
 16  depreciation       163

In [7]:
numeric_features = car_resale_dataset_cleaned.dtypes[car_resale_dataset_cleaned.dtypes != 'object'].index
numeric_features

Index(['listing_id', 'manufactured', 'curb_weight', 'power', 'engine_cap',
       'no_of_owners', 'depreciation', 'coe', 'road_tax', 'dereg_value',
       'mileage', 'omv', 'arf', 'indicative_price', 'price'],
      dtype='object')

In [8]:
car_resale_dataset_cleaned.reg_date = pd.to_datetime(car_resale_dataset_cleaned.reg_date)
car_resale_dataset_cleaned.reg_date = car_resale_dataset_cleaned.reg_date.apply(lambda x: float(x.year))
car_resale_dataset_cleaned.reg_date.describe()

count    16574.000000
mean      2014.648968
std          4.900726
min       1969.000000
25%       2011.000000
50%       2016.000000
75%       2018.000000
max       2022.000000
Name: reg_date, dtype: float64

In [9]:
car_resale_dataset_cleaned = car_resale_dataset_cleaned.drop(columns=['original_reg_date','fuel_type','opc_scheme','lifespan','eco_category','indicative_price'])
car_resale_dataset_cleaned.head()

Unnamed: 0,listing_id,title,make,model,description,manufactured,reg_date,type_of_vehicle,category,transmission,...,depreciation,coe,road_tax,dereg_value,mileage,omv,arf,features,accessories,price
0,1030324,BMW 3 Series 320i Gran Turismo M-Sport,bmw,320i,1 owner! 320i gt m-sports model! big brake kit...,2013.0,2013.0,luxury sedan,"parf car, premium ad car, low mileage car",auto,...,17700.0,77100.0,1210.0,47514.0,73000.0,45330.0,50462.0,"5 doors gt, powerful and fuel efficient 2.0l t...","bmw i-drive, navigation, bluetooth/aux/usb inp...",71300.0
1,1021510,Toyota Hiace 3.0M,toyota,hiace,high loan available! low mileage unit. wear an...,2014.0,2015.0,van,premium ad car,manual,...,11630.0,10660.0,,3648.0,110112.0,27502.0,1376.0,low mileage unit. well maintained vehicle. vie...,factory radio setting. front recording camera....,43800.0
2,1026909,Mercedes-Benz CLA-Class CLA180,mercedes-benz,cla180,1 owner c&c unit. full agent service with 1 mo...,2016.0,2016.0,luxury sedan,"parf car, premium ad car",auto,...,15070.0,53694.0,740.0,44517.0,80000.0,27886.0,26041.0,responsive and fuel efficient 1.6l inline 4 cy...,dual electric/memory seats. factory fitted aud...,95500.0
3,1019371,Mercedes-Benz E-Class E180 Avantgarde,mercedes-benz,e180,"fully agent maintained, 3 years warranty 10 ye...",2019.0,2020.0,luxury sedan,"parf car, almost new car, consignment car",auto,...,16400.0,40690.0,684.0,80301.0,9800.0,46412.0,56977.0,"1.5l inline-4 twin scroll turbocharged engine,...",64 colour ambient lighting. active parking ass...,197900.0
4,1031014,Honda Civic 1.6A VTi,honda,civic,"kah motor unit! 1 owner, lowest 1.98% for full...",2019.0,2019.0,mid-sized sedan,parf car,auto,...,10450.0,26667.0,742.0,36453.0,40000.0,20072.0,20101.0,"1.6l i-vtec engine, 123 bhp, earth dreams cvt ...","s/rims, premium leather seats, factory touch s...",103200.0


In [10]:
car_resale_dataset_cleaned = car_resale_dataset_cleaned.dropna()
car_resale_dataset_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10312 entries, 0 to 16783
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   listing_id       10312 non-null  int64  
 1   title            10312 non-null  object 
 2   make             10312 non-null  object 
 3   model            10312 non-null  object 
 4   description      10312 non-null  object 
 5   manufactured     10312 non-null  float64
 6   reg_date         10312 non-null  float64
 7   type_of_vehicle  10312 non-null  object 
 8   category         10312 non-null  object 
 9   transmission     10312 non-null  object 
 10  curb_weight      10312 non-null  float64
 11  power            10312 non-null  float64
 12  engine_cap       10312 non-null  float64
 13  no_of_owners     10312 non-null  float64
 14  depreciation     10312 non-null  float64
 15  coe              10312 non-null  float64
 16  road_tax         10312 non-null  float64
 17  dereg_value 

In [11]:
car_resale_dataset_cleaned[(car_resale_dataset_cleaned.price < 80000) & (car_resale_dataset_cleaned.make == "bmw")].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 236 entries, 0 to 16727
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   listing_id       236 non-null    int64  
 1   title            236 non-null    object 
 2   make             236 non-null    object 
 3   model            236 non-null    object 
 4   description      236 non-null    object 
 5   manufactured     236 non-null    float64
 6   reg_date         236 non-null    float64
 7   type_of_vehicle  236 non-null    object 
 8   category         236 non-null    object 
 9   transmission     236 non-null    object 
 10  curb_weight      236 non-null    float64
 11  power            236 non-null    float64
 12  engine_cap       236 non-null    float64
 13  no_of_owners     236 non-null    float64
 14  depreciation     236 non-null    float64
 15  coe              236 non-null    float64
 16  road_tax         236 non-null    float64
 17  dereg_value   

In [12]:
user_select_data = car_resale_dataset_cleaned[(car_resale_dataset_cleaned.price < 80000) & (car_resale_dataset_cleaned.make == "bmw")]
user_select_data.iloc[0:5]

Unnamed: 0,listing_id,title,make,model,description,manufactured,reg_date,type_of_vehicle,category,transmission,...,depreciation,coe,road_tax,dereg_value,mileage,omv,arf,features,accessories,price
0,1030324,BMW 3 Series 320i Gran Turismo M-Sport,bmw,320i,1 owner! 320i gt m-sports model! big brake kit...,2013.0,2013.0,luxury sedan,"parf car, premium ad car, low mileage car",auto,...,17700.0,77100.0,1210.0,47514.0,73000.0,45330.0,50462.0,"5 doors gt, powerful and fuel efficient 2.0l t...","bmw i-drive, navigation, bluetooth/aux/usb inp...",71300.0
57,1023935,BMW 1 Series 116i Urban,bmw,116i,low mileage and service regularly.,2012.0,2012.0,hatchback,"parf car, direct owner sale, low mileage car",auto,...,27010.0,56501.0,742.0,17432.0,67000.0,27217.0,27217.0,view specs of the bmw 1 series hatchback,"m-sport bodykit, m-sport brake, mitin suspensi...",35100.0
95,1024395,BMW 3 Series 316i,bmw,316i,perfect condition. interior nice and clean wit...,2013.0,2013.0,luxury sedan,parf car,auto,...,15130.0,79223.0,742.0,31297.0,80000.0,31108.0,25552.0,powerful and economical 1.6l turbocharged engi...,keyless entry/start. multi-function steering w...,48900.0
137,991635,BMW 3 Series 318i,bmw,318i,powerful and yet economical in 1.5l bmw 318i s...,2015.0,2016.0,luxury sedan,parf car,auto,...,12710.0,44001.0,684.0,34770.0,116000.0,26979.0,24771.0,powered by 1.5l turbocharged producing 134 bhp...,"factory fitted black leather seat, factory fit...",76800.0
238,1008086,BMW 1 Series 116d,bmw,116d,superb white on black fully pml maintained uni...,2016.0,2016.0,hatchback,"parf car, consignment car",auto,...,11800.0,49501.0,1082.0,29244.0,79000.0,23325.0,9655.0,powerful bmw 1.5l diesel twin power turbo deli...,"retractable side mirrors, drl, comfort access,...",67000.0


In [13]:
features = car_resale_dataset_cleaned.dtypes.index
features

Index(['listing_id', 'title', 'make', 'model', 'description', 'manufactured',
       'reg_date', 'type_of_vehicle', 'category', 'transmission',
       'curb_weight', 'power', 'engine_cap', 'no_of_owners', 'depreciation',
       'coe', 'road_tax', 'dereg_value', 'mileage', 'omv', 'arf', 'features',
       'accessories', 'price'],
      dtype='object')

In [14]:
selected_features = ['listing_id','make', 'manufactured',
       'reg_date', 'type_of_vehicle', 'transmission',
       'power', 'engine_cap', 'no_of_owners', 'depreciation',
       'coe', 'road_tax', 'dereg_value', 'mileage', 'omv', 'arf', 'price']


car_resale_simplified_dataset = car_resale_dataset_cleaned[selected_features]
car_resale_simplified_dataset.head()

Unnamed: 0,listing_id,make,manufactured,reg_date,type_of_vehicle,transmission,power,engine_cap,no_of_owners,depreciation,coe,road_tax,dereg_value,mileage,omv,arf,price
0,1030324,bmw,2013.0,2013.0,luxury sedan,auto,135.0,1997.0,1.0,17700.0,77100.0,1210.0,47514.0,73000.0,45330.0,50462.0,71300.0
2,1026909,mercedes-benz,2016.0,2016.0,luxury sedan,auto,90.0,1595.0,1.0,15070.0,53694.0,740.0,44517.0,80000.0,27886.0,26041.0,95500.0
3,1019371,mercedes-benz,2019.0,2020.0,luxury sedan,auto,115.0,1497.0,1.0,16400.0,40690.0,684.0,80301.0,9800.0,46412.0,56977.0,197900.0
4,1031014,honda,2019.0,2019.0,mid-sized sedan,auto,92.0,1597.0,1.0,10450.0,26667.0,742.0,36453.0,40000.0,20072.0,20101.0,103200.0
6,1012998,volvo,2015.0,2015.0,hatchback,auto,90.0,1498.0,3.0,11020.0,56001.0,684.0,37311.0,77777.0,22809.0,18933.0,62500.0


In [15]:
car_resale_simplified_dataset.type_of_vehicle.value_counts()

suv                2450
luxury sedan       2156
mid-sized sedan    1731
hatchback          1342
mpv                1205
sports car         1182
stationwagon        245
others                1
Name: type_of_vehicle, dtype: int64

In [16]:
car_resale_simplified_dataset.make.value_counts()

honda            1494
mercedes-benz    1448
toyota           1442
bmw              1201
audi              532
mazda             449
volkswagen        381
hyundai           376
nissan            353
kia               349
mitsubishi        290
porsche           250
lexus             239
subaru            237
volvo             163
mini              130
suzuki             94
jaguar             81
land rover         72
maserati           69
bentley            65
renault            55
skoda              52
peugeot            52
citroen            44
lamborghini        43
ford               41
ferrari            39
opel               31
ssangyong          31
chevrolet          30
rolls-royce        28
seat               26
infiniti           24
aston martin       19
mclaren            17
jeep               10
proton             10
alfa romeo          7
daihatsu            7
perodua             5
lotus               3
hummer              3
mercedes            3
mg                  2
cupra     

In [17]:
car_resale_simplified_dataset.make[car_resale_simplified_dataset.make == "mercedes"] = "mercedes-benz"
car_resale_simplified_dataset.make.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(~key, value, inplace=True)


honda            1494
mercedes-benz    1451
toyota           1442
bmw              1201
audi              532
mazda             449
volkswagen        381
hyundai           376
nissan            353
kia               349
mitsubishi        290
porsche           250
lexus             239
subaru            237
volvo             163
mini              130
suzuki             94
jaguar             81
land rover         72
maserati           69
bentley            65
renault            55
skoda              52
peugeot            52
citroen            44
lamborghini        43
ford               41
ferrari            39
opel               31
ssangyong          31
chevrolet          30
rolls-royce        28
seat               26
infiniti           24
aston martin       19
mclaren            17
proton             10
jeep               10
daihatsu            7
alfa romeo          7
perodua             5
lotus               3
hummer              3
mg                  2
alpine              2
cupra     

In [18]:
car_resale_simplified_dataset.to_csv("./data/preprocessed_recommendation_data.csv",index=None)
car_resale_simplified_dataset.shape

(10312, 17)