# Exploratory Data Analysis

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv("autotrader_prices.csv", parse_dates=["year"])

In [24]:
df

Unnamed: 0,BHP,ULEZ,body,engine,fuel,link,mileage,name,owners,price,transmission,year
0,108.0,ULEZ,Hatchback,1.6L,Diesel,https://www.autotrader.co.uk/car-details/20200...,82850,Audi A3 1.6 TDI SE 5d 109 BHP Nav Prep - Bluet...,,"£7,991",Manual,2015 (15 reg)
1,148.0,ULEZ,Hatchback,2.0L,Diesel,https://www.autotrader.co.uk/car-details/20201...,38627,"Audi A3 2.0 TDI SE 5d 148 BHP, £20 YEAR ROAD T...",,"£10,980",Manual,2015 (65 reg)
2,108.0,ULEZ,Hatchback,1.6L,Diesel,https://www.autotrader.co.uk/car-details/20201...,84812,Audi A3 1.6 TDI 110 Sport 5dr Satellite Naviga...,,"£8,350",Manual,2015 (15 reg)
3,182.0,ULEZ,Hatchback,2.0L,Diesel,https://www.autotrader.co.uk/car-details/20200...,74413,Audi A3 2.0 TDI Sport Sportback S Tronic quatt...,,"£9,699",Automatic,2015 (15 reg)
4,108.0,ULEZ,Hatchback,1.6L,Diesel,https://www.autotrader.co.uk/car-details/20201...,59152,Audi A3 1.6 TDI SE Sportback 5dr,,"£8,990",Manual,2015 (15 reg)
5,108.0,ULEZ,Hatchback,1.6L,Diesel,https://www.autotrader.co.uk/car-details/20201...,78876,Audi A3 1.6 TDI Ultra 110 SE 5dr Bluetooth,,"£8,800",Manual,2015 (65 reg)
6,108.0,ULEZ,Saloon,1.6L,Diesel,https://www.autotrader.co.uk/car-details/20201...,55591,Audi A3 1.6 TDI S LINE 4d 109 BHP,,"£12,799",Manual,2015 (15 reg)
7,108.0,ULEZ,Hatchback,1.6L,Diesel,https://www.autotrader.co.uk/car-details/20201...,47606,Audi A3 1.6 TDI SE Sportback 5dr,,"£9,490",Manual,2015 (15 reg)
8,148.0,ULEZ,Hatchback,2.0L,Diesel,https://www.autotrader.co.uk/car-details/20201...,35439,Audi A3 2.0 TDI S Line 5dr Satellite Navigation,,"£14,000",Manual,2015 (15 reg)
9,148.0,ULEZ,Hatchback,1.4L,Petrol,https://www.autotrader.co.uk/car-details/20201...,54831,Audi A3 1.4 TFSI 150 S Line 3dr Bluetooth,,"£11,750",Manual,2015 (65 reg)


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 626 entries, 0 to 625
Data columns (total 12 columns):
BHP             469 non-null float64
ULEZ            617 non-null object
body            626 non-null object
engine          626 non-null object
fuel            625 non-null object
link            626 non-null object
mileage         626 non-null int64
name            626 non-null object
owners          34 non-null float64
price           626 non-null object
transmission    626 non-null object
year            626 non-null object
dtypes: float64(2), int64(1), object(9)
memory usage: 58.8+ KB


In [28]:
df.describe()

Unnamed: 0,BHP,mileage,owners
count,469.0,626.0,34.0
mean,157.720682,27150.118211,2.264706
std,63.677983,18308.22244,0.567227
min,68.0,10.0,2.0
25%,116.0,14663.25,2.0
50%,148.0,25153.5,2.0
75%,190.0,35429.5,2.0
max,340.0,124395.0,4.0


In [70]:
df.isna().sum()

BHP             157
ULEZ              9
body              0
engine            0
fuel              1
link              0
mileage           0
name              0
owners          592
price             0
transmission      0
year              0
dtype: int64

## Data cleaning

### Filling missing BHP
Tried to first replace with median value, however, need to use **a more sensible way of filling up this feature**

In [121]:
df_clean = df.copy()
df_clean["BHP"].fillna(df.BHP.median(), inplace=True)

### Filling ULEZ compliance

In [122]:
df_clean["ULEZ"].fillna("Non-ULEZ", inplace=True)

### Engine size parsing
Get rid of the Liter "L" and convert to float

In [123]:
df_clean["engine"] = df_clean["engine"].apply(lambda x: float(x[:-1]))

### Remove link

In [124]:
df_clean.drop(["link"], axis=1, inplace=True)

### Parsing name into multiple extra features

In [125]:
df_clean["make"] = df_clean["name"].apply(lambda x:x.split()[0])
df_clean["model"] = df_clean["name"].apply(lambda x:x.split()[1])

In [126]:
# diffentiate from "ECO", SE", "Sport", "M Sport", "Lux", Nav/Satellite", "Bluetooth",
# "Leather", "3-door", 
df_clean["se"] = df_clean["name"].apply(lambda x: 1 if "se" in x.lower() else 0)
df_clean["se"].value_counts()

1    384
0    242
Name: se, dtype: int64

In [127]:
# maybe too small/biased to be relevant
df_clean["eco"] = df_clean["name"].apply(lambda x: 1 if "eco" in x.lower() else 0)
df_clean["eco"].value_counts()

0    622
1      4
Name: eco, dtype: int64

In [128]:
df_clean["sport"] = df_clean["name"].apply(lambda x: 1 if "sport" in x.lower() else 0)
df_clean["sport"].value_counts()

1    380
0    246
Name: sport, dtype: int64

In [129]:
df_clean["m_sport"] = df_clean["name"].apply(lambda x: 1 if "m sport" in x.lower() else 0)
df_clean["m_sport"].value_counts()

0    443
1    183
Name: m_sport, dtype: int64

In [130]:
# maybe too small/biased to be relevant
df_clean["lux"] = df_clean["name"].apply(lambda x: 1 if "lux" in x.lower() else 0)
df_clean["lux"].value_counts()

0    620
1      6
Name: lux, dtype: int64

In [131]:
df_clean["sat_nav"] = df_clean["name"].apply(lambda x: 1 if "sat" in x.lower() or "nav" in x.lower() else 0)
df_clean["sat_nav"].value_counts()

0    522
1    104
Name: sat_nav, dtype: int64

In [132]:
df_clean["bluetooth"] = df_clean["name"].apply(lambda x: 1 if "bluetooth" in x.lower() else 0)
df_clean["bluetooth"].value_counts()

0    605
1     21
Name: bluetooth, dtype: int64

In [133]:
df_clean["leather"] = df_clean["name"].apply(lambda x: 1 if "leather" in x.lower() else 0)
df_clean["leather"].value_counts()

0    605
1     21
Name: leather, dtype: int64

In [134]:
df_clean["three_door"] = df_clean["name"].apply(lambda x: 1 if "3-door" in x.lower() else 0)
df_clean["three_door"].value_counts()

0    621
1      5
Name: three_door, dtype: int64

### Filling missing owners
Fill missing values with median value

In [135]:
df_clean["owners"].fillna(df.owners.median(), inplace=True)

### Parsing years

In [136]:
df_clean["year_reg"] = df_clean["year"].apply(lambda x: x.split("(")[1].replace(")",""))
df_clean["year"] = df_clean["year"].apply(lambda x: int(x.split("(")[0]))

### Parsing price

In [137]:
df_clean["price"] = df_clean["price"].apply(lambda x: int(x.replace("£","").replace(",","")))

### Parsing fuel

In [155]:
df_fuel = df_clean[df_clean.fuel.isna()]
df_fuel.make

618    Fiat
Name: make, dtype: object

In [158]:
df_fuel = df_clean[df_clean.make == "Fiat"]
df_fuel.fuel.value_counts()

Petrol    65
Name: fuel, dtype: int64

In [159]:
df_clean["fuel"].fillna("Petrol", inplace=True)

### Check and save

In [163]:
df_clean.drop(["name"], axis=1, inplace=True)
df_clean.head()

Unnamed: 0,BHP,ULEZ,body,engine,fuel,mileage,owners,price,transmission,year,...,se,eco,sport,m_sport,lux,sat_nav,bluetooth,leather,three_door,year_reg
0,108.0,ULEZ,Hatchback,1.6,Diesel,82850,2.0,7991,Manual,2015,...,1,0,0,0,0,1,1,0,0,15 reg
1,148.0,ULEZ,Hatchback,2.0,Diesel,38627,2.0,10980,Manual,2015,...,1,0,0,0,0,1,0,0,0,65 reg
2,108.0,ULEZ,Hatchback,1.6,Diesel,84812,2.0,8350,Manual,2015,...,0,0,1,0,0,1,0,0,0,15 reg
3,182.0,ULEZ,Hatchback,2.0,Diesel,74413,2.0,9699,Automatic,2015,...,0,0,1,0,0,0,0,0,0,15 reg
4,108.0,ULEZ,Hatchback,1.6,Diesel,59152,2.0,8990,Manual,2015,...,1,0,1,0,0,0,0,0,0,15 reg


In [164]:
df_clean.to_csv("autotrader_prices_cleaned.csv", index=False)