In [448]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import warnings

warnings.filterwarnings("ignore")

In [450]:
df = pd.read_csv('usedCarListing.csv')
del df['Unnamed: 0']
print(df.shape)
df.head()

(9841, 19)


Unnamed: 0,year,make,model,sub_model,city,state,mileage,price,exterior_color,interior_color,mpg_city,mpg_hwy,engine,transmission,drive_type,fuel_type,popular_feature,vehicle_history,cpo
0,2017,Ford,Fusion,SE FWD,Blue Mound,TX,32253,"$14,400",White Platinum Metallic Tri-Coat,Medium Light Stone,21,32,1.5L Inline-4 Gas Turbocharged,Automatic,FWD,Gas,"['Navigation', 'Backup Camera', 'Cruise Contro...","['0', ' reported accidents', 'Fleet or Mixed U...",False
1,2007,BMW,6,650i Convertible,Stanton,CA,59664,"$11,499",Mineral Silver Metallic,Tan,16,22,4.8L V-8 Gas,Automatic,RWD,Gas,[],"['0', ' reported accidents', 'Fleet or Mixed U...",False
2,2019,Kia,Optima,LX,Thornton,CO,4816,"$18,586",Ebony Black,Black,24,33,2.4L Inline-4 Gas,Automatic,FWD,Gas,"['Cruise Control', 'Lane Keep Assist', 'Blueto...","['1', ' reported accidents', 'Personal or Rent...",True
3,2017,Ford,Fusion,SE FWD,Denver,NC,99515,"$8,999",Shadow Black,Black,21,32,2.5L Inline-4 Gas,Automatic,FWD,Gas,"['Cruise Control', 'Bluetooth', 'Backup Camera']","['1', ' reported accidents', 'Fleet or Mixed U...",False
4,2017,Hyundai,Elantra,SE 2.0L Sedan Automatic (alt),Wesley Chapel,FL,117200,"$6,109",Quartz White Pearl,Beige,29,38,2.0L Inline-4 Gas,Automatic,FWD,Gas,"['Cruise Control', 'Bluetooth']","['1', ' reported accidents', 'Personal or Rent...",False


## create new features

In [451]:
# split vehicle history into four variables(num_accident, usage_type, title, num_owner)
split_data = df["vehicle_history"].str.split(",")
data = split_data.to_list()
names = ["1", "2", "3", "4", '5', '6']
new_df = pd.DataFrame(data, columns=names)
df['num_accident'] = new_df['1'].apply(lambda x:x[2:-1])
df['usage_type'] = new_df['3'].apply(lambda x:x[2:-1]) 
df['title'] = new_df['4'].apply(lambda x:x[2:-1])
df['num_owner'] = new_df['5'].apply(lambda x:x[2:-2])

# delete the history column
del df['vehicle_history']

df.head()

Unnamed: 0,year,make,model,sub_model,city,state,mileage,price,exterior_color,interior_color,...,engine,transmission,drive_type,fuel_type,popular_feature,cpo,num_accident,usage_type,title,num_owner
0,2017,Ford,Fusion,SE FWD,Blue Mound,TX,32253,"$14,400",White Platinum Metallic Tri-Coat,Medium Light Stone,...,1.5L Inline-4 Gas Turbocharged,Automatic,FWD,Gas,"['Navigation', 'Backup Camera', 'Cruise Contro...",False,0,Fleet or Mixed Use,Clean,1
1,2007,BMW,6,650i Convertible,Stanton,CA,59664,"$11,499",Mineral Silver Metallic,Tan,...,4.8L V-8 Gas,Automatic,RWD,Gas,[],False,0,Fleet or Mixed Use,Clean,3
2,2019,Kia,Optima,LX,Thornton,CO,4816,"$18,586",Ebony Black,Black,...,2.4L Inline-4 Gas,Automatic,FWD,Gas,"['Cruise Control', 'Lane Keep Assist', 'Blueto...",True,1,Personal or Rental Use,Clean,2
3,2017,Ford,Fusion,SE FWD,Denver,NC,99515,"$8,999",Shadow Black,Black,...,2.5L Inline-4 Gas,Automatic,FWD,Gas,"['Cruise Control', 'Bluetooth', 'Backup Camera']",False,1,Fleet or Mixed Use,Clean,2
4,2017,Hyundai,Elantra,SE 2.0L Sedan Automatic (alt),Wesley Chapel,FL,117200,"$6,109",Quartz White Pearl,Beige,...,2.0L Inline-4 Gas,Automatic,FWD,Gas,"['Cruise Control', 'Bluetooth']",False,1,Personal or Rental Use,Clean,1


In [452]:
df.num_owner.unique(),df.num_accident.unique(),df.title.unique(),df.usage_type.unique()

(array(['1', '3', '2', '5', '4', '0', '6', '7', '8', 'alvage', '9', '13',
        'rame Damage'], dtype=object),
 array(['0', '1', '2', '3', '4', '5'], dtype=object),
 array(['Clean', 'Theft Recovered', 'Salvaged', 'Frame Damaged',
        'Theft Recovere', 'Lemon', 'Frame Damage'], dtype=object),
 array(['Fleet or Mixed Use', 'Personal or Rental Use'], dtype=object))

In [453]:
### Deal with title (combine theft recovered with theft recovere, combine frame damaged with frame damage)
df["title"]= df["title"].replace('Theft Recovere', "Theft Recovered") 
df["title"]= df["title"].replace('Frame Damage', "Frame Damaged") 
print(df.title.unique())

### Deal with num_owner that are strings
df["num_owner"]= df["num_owner"].replace('alvage', np.nan) 
df["num_owner"]= df["num_owner"].replace('rame Damage', np.nan) 
print(df.num_owner.unique())

['Clean' 'Theft Recovered' 'Salvaged' 'Frame Damaged' 'Lemon']
['1' '3' '2' '5' '4' '0' '6' '7' '8' nan '9' '13']


## check for missing values

In [454]:
df.isna().sum()

year               0
make               0
model              0
sub_model          0
city               0
state              0
mileage            0
price              0
exterior_color     0
interior_color     1
mpg_city           0
mpg_hwy            0
engine             0
transmission       0
drive_type         0
fuel_type          0
popular_feature    0
cpo                0
num_accident       0
usage_type         0
title              0
num_owner          5
dtype: int64

In [455]:
# categorical missing value, take the mode to fillna
df['interior_color'].fillna(df['interior_color'].value_counts().index[0], inplace=True)
# num_owern, numeric missing value, take the median to fillna
df['num_owner'].fillna(df['num_owner'].median(), inplace=True)
df.isna().sum()
# 

year               0
make               0
model              0
sub_model          0
city               0
state              0
mileage            0
price              0
exterior_color     0
interior_color     0
mpg_city           0
mpg_hwy            0
engine             0
transmission       0
drive_type         0
fuel_type          0
popular_feature    0
cpo                0
num_accident       0
usage_type         0
title              0
num_owner          0
dtype: int64

In [456]:
## look at mpg
print(df.mpg_city.unique())
print(df.mpg_hwy.unique())
## convert 'N' and 'cty' to nan.
df["mpg_city"]= df["mpg_city"].replace('N', np.nan) 
df["mpg_hwy"]= df["mpg_hwy"].replace('cty', np.nan) 
print(df.mpg_city.unique())
print(df.mpg_hwy.unique())

['21' '16' '24' '29' '31' '18' '17' '25' '23' '26' '19' '22' '15' 'N' '20'
 '14' '28' '27' '110' '30' '104' '13' '12' '101' '43' '44' '106' '34' '10'
 '95' '126' '85' '121' '32' '11' '9' '48' '122' '120' '41' '129' '40' '35'
 '38' '51' '49' '36' '66' '42' '33' '60' '124' '53' '128' '39' '91' '54'
 '47' '37' '8' '102']
['32' '22' '33' '38' '40' '25' '23' '30' '36' '31' '41' '29' '28' '34'
 '19' 'cty' '26' '21' '24' '37' '39' '99' '91' '18' '17' '93' '35' '44'
 '16' '92' '27' '15' '20' '81' '82' '103' '42' '45' '90' '108' '105' '102'
 '43' '48' '47' '66' '49' '51' '46' '109' '101' '100' '14' '50' '13' '110'
 '94' '12']
['21' '16' '24' '29' '31' '18' '17' '25' '23' '26' '19' '22' '15' nan '20'
 '14' '28' '27' '110' '30' '104' '13' '12' '101' '43' '44' '106' '34' '10'
 '95' '126' '85' '121' '32' '11' '9' '48' '122' '120' '41' '129' '40' '35'
 '38' '51' '49' '36' '66' '42' '33' '60' '124' '53' '128' '39' '91' '54'
 '47' '37' '8' '102']
['32' '22' '33' '38' '40' '25' '23' '30' '36' '31' '41'

In [457]:
# deal with mpg_city, mpg_hwy
# first convert mpg from string to numeric values
df['mpg_city'] = df['mpg_city'].astype(float)
df['mpg_hwy'] = df['mpg_hwy'].astype(float)

In [458]:
# since mpg is strongly correlated with vehicle fuel_type, especially for alternative fuel vehicles.
# Use vehicle with same fuel_type to fillna.
df[df['mpg_city'].isna()]['fuel_type'].unique()

array(['Gas', 'Hydrogen', 'Diesel', 'Plug-In', 'Hybrid'], dtype=object)

In [459]:
df[(df['fuel_type']=='Gas') & (df['mpg_city'].isna())].head()

Unnamed: 0,year,make,model,sub_model,city,state,mileage,price,exterior_color,interior_color,...,engine,transmission,drive_type,fuel_type,popular_feature,cpo,num_accident,usage_type,title,num_owner
40,2017,Ford,Super,XL Crew Cab 8' Bed 4WD,Lexington,NC,94903,"$29,987",White,Medium Earth Gray,...,6.2L V-8 Gas,Automatic,4WD,Gas,[],False,0,Personal or Rental Use,Clean,1
159,2003,HUMMER,H2,SUV,Belton,TX,139739,"$11,900",Black,Wheat,...,6.0L V-8 Gas,Automatic,4WD,Gas,[],False,0,Fleet or Mixed Use,Clean,6
240,2004,HUMMER,H2,SUV,McCook,IL,149795,"$9,950",White,Wheat,...,6.0L V-8 Gas,Automatic,4WD,Gas,[],False,2,Personal or Rental Use,Clean,1
269,2017,Ford,Super,Lariat Crew Cab 6.75' Bed 4WD,Boise,ID,70634,"$35,995",Race Red,Unknown,...,6.2L V-8 Gas,Automatic,4WD,Gas,[],False,0,Personal or Rental Use,Clean,1
278,2004,Dodge,Ram,SLT Quad Cab Regular Bed 4WD,Colorado Springs,CO,159551,"$10,300",Gray,Black,...,5.7L V-8 Gas,Automatic,4WD,Gas,[],False,0,Fleet or Mixed Use,Clean,3


In [460]:
# use fuel_type (electric, plug-in, hybrid, hydrogen) to fillna first. 
# deal with mpg_city first.
for i in range(len(df)):
    if (df['fuel_type'][i] == 'Hydrogen' and np.isnan(df['mpg_city'][i])):
        df['mpg_city'][i]= df[df['fuel_type']=='Hydrogen']['mpg_city'].mean()
    elif (df['fuel_type'][i] == 'Plug-In' and np.isnan(df['mpg_city'][i])):
        df['mpg_city'][i]=df[df['fuel_type']=='Plug-In']['mpg_city'].mean()
    elif (df['fuel_type'][i] == 'Hybrid' and np.isnan(df['mpg_city'][i])):
        df['mpg_city'][i]=df[df['fuel_type']=='Hybrid']['mpg_city'].mean()
    # The missing values for Diesel and Gas vehicles are more like gas-gazzlers. So I can't use mean or median
    # to replace the missing values. 
    # I decided to use the 95% quantile values to fillna
    elif (df['fuel_type'][i] == 'Gas' and np.isnan(df['mpg_city'][i])):
        df['mpg_city'][i]=df[df['fuel_type']=='Gas']['mpg_city'].quantile(q=0.95)
    elif (df['fuel_type'][i] == 'Diesel' and np.isnan(df['mpg_city'][i])):
        df['mpg_city'][i]=df[df['fuel_type']=='Diesel']['mpg_city'].quantile(q=0.95)

# then deal with mpg_hwy.      
for i in range(len(df)):
    if (df['fuel_type'][i] == 'Hydrogen' and np.isnan(df['mpg_hwy'][i])):
        df['mpg_hwy'][i]=df[df['fuel_type']=='Hydrogen']['mpg_hwy'].mean()
    elif (df['fuel_type'][i] == 'Plug-In' and np.isnan(df['mpg_hwy'][i])):
        df['mpg_hwy'][i]=df[df['fuel_type']=='Plug-In']['mpg_hwy'].mean()
    elif (df['fuel_type'][i] == 'Hybrid' and np.isnan(df['mpg_hwy'][i])):
        df['mpg_hwy'][i]=df[df['fuel_type']=='Hybrid']['mpg_hwy'].mean()        
    
    # For diesel vehicles with missing values, I found all diesel vehicles are missing the mpg_hwy. 
    elif (df['fuel_type'][i] == 'Gas' and np.isnan(df['mpg_hwy'][i])):
        df['mpg_hwy'][i]=df[df['fuel_type']=='Gas']['mpg_hwy'].mean()
    elif (df['fuel_type'][i] == 'Diesel' and np.isnan(df['mpg_hwy'][i])):
        df['mpg_hwy'][i]=df[df['fuel_type']=='Diesel']['mpg_hwy'].mean()        
        
df.head()

Unnamed: 0,year,make,model,sub_model,city,state,mileage,price,exterior_color,interior_color,...,engine,transmission,drive_type,fuel_type,popular_feature,cpo,num_accident,usage_type,title,num_owner
0,2017,Ford,Fusion,SE FWD,Blue Mound,TX,32253,"$14,400",White Platinum Metallic Tri-Coat,Medium Light Stone,...,1.5L Inline-4 Gas Turbocharged,Automatic,FWD,Gas,"['Navigation', 'Backup Camera', 'Cruise Contro...",False,0,Fleet or Mixed Use,Clean,1
1,2007,BMW,6,650i Convertible,Stanton,CA,59664,"$11,499",Mineral Silver Metallic,Tan,...,4.8L V-8 Gas,Automatic,RWD,Gas,[],False,0,Fleet or Mixed Use,Clean,3
2,2019,Kia,Optima,LX,Thornton,CO,4816,"$18,586",Ebony Black,Black,...,2.4L Inline-4 Gas,Automatic,FWD,Gas,"['Cruise Control', 'Lane Keep Assist', 'Blueto...",True,1,Personal or Rental Use,Clean,2
3,2017,Ford,Fusion,SE FWD,Denver,NC,99515,"$8,999",Shadow Black,Black,...,2.5L Inline-4 Gas,Automatic,FWD,Gas,"['Cruise Control', 'Bluetooth', 'Backup Camera']",False,1,Fleet or Mixed Use,Clean,2
4,2017,Hyundai,Elantra,SE 2.0L Sedan Automatic (alt),Wesley Chapel,FL,117200,"$6,109",Quartz White Pearl,Beige,...,2.0L Inline-4 Gas,Automatic,FWD,Gas,"['Cruise Control', 'Bluetooth']",False,1,Personal or Rental Use,Clean,1


In [461]:
df.isna().sum()

year               0
make               0
model              0
sub_model          0
city               0
state              0
mileage            0
price              0
exterior_color     0
interior_color     0
mpg_city           0
mpg_hwy            0
engine             0
transmission       0
drive_type         0
fuel_type          0
popular_feature    0
cpo                0
num_accident       0
usage_type         0
title              0
num_owner          0
dtype: int64

In [462]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9841 entries, 0 to 9840
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   year             9841 non-null   int64  
 1   make             9841 non-null   object 
 2   model            9841 non-null   object 
 3   sub_model        9841 non-null   object 
 4   city             9841 non-null   object 
 5   state            9841 non-null   object 
 6   mileage          9841 non-null   object 
 7   price            9841 non-null   object 
 8   exterior_color   9841 non-null   object 
 9   interior_color   9841 non-null   object 
 10  mpg_city         9841 non-null   float64
 11  mpg_hwy          9841 non-null   float64
 12  engine           9841 non-null   object 
 13  transmission     9841 non-null   object 
 14  drive_type       9841 non-null   object 
 15  fuel_type        9841 non-null   object 
 16  popular_feature  9841 non-null   object 
 17  cpo           

## Convert some categorical variables into numeric
Numeric variable: year, mileage, price, mpg_city, mpg_hwy, num_accident, num_owner

In [463]:
# deal with mileage for the comma.
df['mileage'] = df['mileage'].apply(lambda x: ''.join(x.split(',')))
df['mileage'] = df['mileage'].astype(int)
# deal with price
df['price'] = df['price'].apply(lambda x: ''.join(x[1:].split(',')))
df['price'] = df['price'].astype(int)
# deal with num_accident and num_owner
df['num_accident'] = df['num_accident'].astype(int)
df['num_owner'] = df['num_owner'].astype(int)

In [464]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9841 entries, 0 to 9840
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   year             9841 non-null   int64  
 1   make             9841 non-null   object 
 2   model            9841 non-null   object 
 3   sub_model        9841 non-null   object 
 4   city             9841 non-null   object 
 5   state            9841 non-null   object 
 6   mileage          9841 non-null   int64  
 7   price            9841 non-null   int64  
 8   exterior_color   9841 non-null   object 
 9   interior_color   9841 non-null   object 
 10  mpg_city         9841 non-null   float64
 11  mpg_hwy          9841 non-null   float64
 12  engine           9841 non-null   object 
 13  transmission     9841 non-null   object 
 14  drive_type       9841 non-null   object 
 15  fuel_type        9841 non-null   object 
 16  popular_feature  9841 non-null   object 
 17  cpo           

## drop duplicate rows

In [465]:
# There are totally 139 duplcated rows
print(df.duplicated().sum())
# drop duplicated rows and keep the first row.
print(df.shape)
df = df.drop_duplicates(keep='first')
print(df.shape)

139
(9841, 22)
(9702, 22)


## Deal with potential wrong data (mileage, price)

In [471]:
# Check very low mileage and very high mileage
display(df.sort_values(by='mileage',ascending=False).head())
df.sort_values(by='mileage',ascending=True).head()
## The data seems making sense to me. I won't drop any data from mileage here.

Unnamed: 0,year,make,model,sub_model,city,state,mileage,price,exterior_color,interior_color,...,engine,transmission,drive_type,fuel_type,popular_feature,cpo,num_accident,usage_type,title,num_owner
1346,2012,Ram,3500,SLT Crew Cab 8' Box 2WD,Charlotte,NC,484773,14995,Bright White,Dark Slate/Medium Graystone Interior,...,6.7L Inline-6 Diesel Turbocharged,Automatic,RWD,Diesel,"['Remote Engine Start', 'Cruise Control']",False,1,Fleet or Mixed Use,Clean,2
9194,2012,Volkswagen,Jetta,TDI SportWagen DSG,Kingsport,TN,445612,4900,Silver,Titan Black,...,2.0L Inline-4 Diesel Turbocharged,Automatic,FWD,Diesel,[],False,1,Personal or Rental Use,Clean,1
1916,2012,Ford,Super,"Lariat Crew Cab 172"" DRW 4WD",Phoenix,AZ,366633,15888,White,Unknown,...,6.7L V-8 Diesel Turbocharged,Automatic,4WD,Diesel,[],False,0,Fleet or Mixed Use,Clean,1
8470,2004,Toyota,Sequoia,Limited 4WD,East Landsdowne,PA,341000,3475,Pewter,Gray,...,4.7L V-8 Gas,Automatic,4WD,Gas,[],False,0,Personal or Rental Use,Clean,5
2427,2015,Chevrolet,Impala,LTZ with 2LZ,San Antonio,TX,325926,8250,Silver Ice Metallic,Jet Black/Dark Titanium,...,3.6L V-6 Gas,Automatic,FWD,Gas,"['Moonroof', 'Front Cooled Seats', 'Cruise Con...",False,0,Personal or Rental Use,Clean,1


Unnamed: 0,year,make,model,sub_model,city,state,mileage,price,exterior_color,interior_color,...,engine,transmission,drive_type,fuel_type,popular_feature,cpo,num_accident,usage_type,title,num_owner
7003,2020,Mercedes-Benz,GLE,GLE 350 4MATIC,Little Rock,AR,7,52878,Polar White,Black,...,2.0L Inline-4 Gas Turbocharged,Automatic,AWD,Gas,[],False,0,Personal or Rental Use,Clean,0
2729,2019,Mercedes-Benz,Sprinter,"4500 170""",Monroe,NC,8,28998,White,Unknown,...,3.0L V-6 Diesel Turbocharged,Automatic,RWD,Diesel,[],False,0,Personal or Rental Use,Clean,0
8554,2018,Dodge,Journey,SE FWD,Dallas,GA,8,14767,Redline 2 Coat Pearl,Black,...,2.4L Inline-4 Gas,Automatic,FWD,Gas,"['Cruise Control', 'Bluetooth', 'Backup Camera']",False,0,Personal or Rental Use,Clean,1
880,2019,Mercedes-Benz,Sprinter,"3500XD 170""",Houston,TX,10,33995,White,Gray,...,3.0L V-6 Diesel Turbocharged,Automatic,RWD,Diesel,[],False,0,Personal or Rental Use,Clean,1
4521,2017,Ford,F-150,Lariat SuperCrew 5.5' Box 4WD,Somerville,NJ,10,31500,White Platinum Metallic Tri-Coat,Black,...,5.0L V-8 Gas Turbocharged,Automatic,4WD,Gas,"['Moonroof', 'Front Heated Seats', 'Remote Eng...",False,0,Personal or Rental Use,Clean,1


In [476]:
# Check very low price and very high price
display(df.sort_values(by='price',ascending=False).head(10))
df.sort_values(by='price',ascending=True).head(10)
## The high and low prices seems making sense to me. Luxury used cars have very high price. 
## Old used cars have very low prices
## I won't drop any data from mileage here.

Unnamed: 0,year,make,model,sub_model,city,state,mileage,price,exterior_color,interior_color,...,engine,transmission,drive_type,fuel_type,popular_feature,cpo,num_accident,usage_type,title,num_owner
5513,2018,Bentley,Bentayga,W12 Signature,Downers Grove,IL,4118,169992,Magnolia,Beluga,...,6.0L W-12 Gas Turbocharged,Automatic,AWD,Gas,[],False,0,Personal or Rental Use,Clean,1
6761,2018,Bentley,Bentayga,Mulliner,Westlake Village,CA,9160,156491,Granite Metallic,Brown,...,6.0L W-12 Gas Turbocharged,Automatic,AWD,Gas,[],False,0,Personal or Rental Use,Clean,1
6392,2005,Lamborghini,Murcielago,Coupe,Boerne,TX,4565,148888,Grey Metallic,Red,...,6.2L V-12 Gas,Automatic,AWD,Gas,[],False,0,Personal or Rental Use,Clean,2
1898,2015,Rolls-Royce,Wraith,RWD,San Antonio,TX,14077,141968,Infinity Black,Black,...,6.6L V-12 Gas Turbocharged,Automatic,RWD,Gas,[],False,0,Personal or Rental Use,Clean,2
2647,2010,Ferrari,599,Coupe,Ontario,CA,30061,139500,Nero Daytona Metallic,Nero,...,6.0L V-12 Gas,Manual,RWD,Gas,[],False,0,Personal or Rental Use,Clean,5
434,2008,Ferrari,599,Coupe,Boerne,TX,18075,129888,Corsa Red,Interior Color,...,6.0L V-12 Gas,Automatic,RWD,Gas,[],False,0,Personal or Rental Use,Clean,5
436,2015,Ferrari,California,Convertible,Melbourne,FL,5835,128998,Nero Daytona Metallic,Sabbia,...,3.9L V-8 Gas Turbocharged,Automatic,RWD,Gas,[],False,0,Personal or Rental Use,Clean,2
6255,2008,Ferrari,599,Coupe,Marietta,GA,16148,124900,Silverstone Gray,Interior Color,...,6.0L V-12 Gas,Manual,RWD,Gas,[],False,0,Personal or Rental Use,Clean,4
1918,2012,Ferrari,FF,Hatchback,Pittsburgh,PA,25400,118075,Gray,Tan,...,6.3L V-12 Gas,Automatic,AWD,Gas,[],False,0,Personal or Rental Use,Clean,4
442,2008,Bentley,Azure,Convertible,Naples,FL,10393,109995,Beluga Black,Standard Interior,...,6.8L V-8 Gas Turbocharged,Automatic,RWD,Gas,[],False,0,Personal or Rental Use,Clean,2


Unnamed: 0,year,make,model,sub_model,city,state,mileage,price,exterior_color,interior_color,...,engine,transmission,drive_type,fuel_type,popular_feature,cpo,num_accident,usage_type,title,num_owner
779,1999,Suzuki,Grand,4dr JLX Auto 4WD,Manchester,MD,180410,1500,Black,Gray,...,2.5L V-6 Gas,Automatic,4WD,Gas,[],False,0,Personal or Rental Use,Clean,4
239,2005,Dodge,Caravan,Grand SXT LWB,Chicago,IL,183245,1500,Stone White,Unknown,...,3.8L V-6 Gas,Automatic,FWD,Gas,['Cruise Control'],False,1,Fleet or Mixed Use,Clean,4
4964,2000,Ford,Escort,ZX2 Coupe,Everett,WA,287225,1590,Silver Frost Metallic,Dark Charcoal,...,2.0L Inline-4 Gas,Manual,FWD,Gas,[],False,0,Personal or Rental Use,Clean,3
4788,1998,Chevrolet,Monte,LS,Kent,WA,188000,1600,Torch Red,Black,...,3.1L V-6 Gas,Automatic,FWD,Gas,[],False,0,Personal or Rental Use,Clean,6
6733,2001,Honda,Odyssey,EX 7-Passenger,Bloomfield Hills,MI,233255,1695,Dark Emerald Pearl,Ivory,...,3.5L V-6 Gas,Automatic,FWD,Gas,['Cruise Control'],False,1,Personal or Rental Use,Clean,1
4917,2005,Hyundai,Elantra,GLS Sedan Automatic,Orlando,FL,191215,1799,Red,Unknown,...,2.0L Inline-4 Gas,Automatic,FWD,Gas,[],False,2,Personal or Rental Use,Clean,1
4381,1998,Dodge,Caravan,Base 3-door FWD SWB,Boaz,AL,215798,1800,Blue,Unknown,...,3.0L V-6 Gas,Automatic,FWD,Gas,[],False,0,Personal or Rental Use,Clean,3
1376,2003,Buick,Regal,LS,Lakewood,NJ,128198,1800,Sterling Silver Metallic,Medium Gray,...,3.8L V-6 Gas,Automatic,FWD,Gas,['Cruise Control'],False,0,Personal or Rental Use,Clean,4
2611,2002,Honda,Odyssey,EX,Philadelphia,PA,215001,1850,Gray,Gray,...,3.5L V-6 Gas,Automatic,FWD,Gas,['Cruise Control'],False,0,Personal or Rental Use,Clean,3
351,2004,Hyundai,Santa,GLS 2.7L V6 FWD Automatic,East Landsdowne,PA,195000,1875,Black Obsidian,Tan,...,2.7L V-6 Gas,Automatic,FWD,Gas,['Cruise Control'],False,0,Personal or Rental Use,Clean,3


In [467]:
df.to_csv('usedCarListingCleaned.csv', encoding = 'utf-8')

After the feature engineering(creating new features and dropping features), dealing with missing values, dropping duplicated rows, there are totally 9702 records and 22 features left.