# Data Cleaning
Clean the raw data collected from the scrapper and prepare the data ready for exploratory data analysis

In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv("autotrader_prices_raw.csv", parse_dates=["year"])

In [6]:
df.head()

Unnamed: 0,BHP,ULEZ,body,engine,fuel,link,mileage,name,owners,price,transmission,year
0,123.0,ULEZ,Hatchback,1.4L,Petrol,https://www.autotrader.co.uk/car-details/20201...,34414,"Audi A1 1.4 TFSI SPORT 3d 123 BHP, SPORTS SEAT...",,"£9,980",Manual,2017 (17 reg)
1,,ULEZ,Hatchback,1.4L,Petrol,https://www.autotrader.co.uk/car-details/20201...,16773,Audi A1 TFSI Sport 3dr 1.4,,"£11,500",Manual,2017 (67 reg)
2,114.0,ULEZ,Hatchback,1.6L,Diesel,https://www.autotrader.co.uk/car-details/20200...,76922,Audi A1 1.6 TDI SPORT 3d 114 BHP PARKING SENSO...,,"£8,599",Manual,2017 (17 reg)
3,,ULEZ,Hatchback,1.4L,Petrol,https://www.autotrader.co.uk/car-details/20201...,10950,Audi A1 1.4 TFSI Sport 5dr,,"£12,174",Manual,2017 (67 reg)
4,,ULEZ,Hatchback,1.6L,Diesel,https://www.autotrader.co.uk/car-details/20201...,58278,Audi A1 1.6 TDI Sport 3dr,,"£9,172",Manual,2017 (67 reg)


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1233 entries, 0 to 1232
Data columns (total 12 columns):
BHP             773 non-null float64
ULEZ            1223 non-null object
body            1233 non-null object
engine          1227 non-null object
fuel            1232 non-null object
link            1233 non-null object
mileage         1233 non-null int64
name            1233 non-null object
owners          223 non-null object
price           1233 non-null object
transmission    1232 non-null object
year            1233 non-null object
dtypes: float64(1), int64(1), object(10)
memory usage: 115.7+ KB


In [8]:
df.describe()

Unnamed: 0,BHP,mileage
count,773.0,1233.0
mean,154.737387,19593.0
std,58.945633,13701.411563
min,68.0,10.0
25%,114.0,10665.0
50%,138.0,16805.0
75%,181.0,26179.0
max,469.0,124395.0


In [9]:
df.isna().sum()

BHP              460
ULEZ              10
body               0
engine             6
fuel               1
link               0
mileage            0
name               0
owners          1010
price              0
transmission       1
year               0
dtype: int64

## Data cleaning

### Filling missing BHP
* Find all samples which has "BHP" and "PS" in the "name" column
* Extract numbers in the "name" column
* Take the highest number as "BHP" value

In [96]:
import re

df["name_BHP"] = df.name.apply(lambda x: np.max(list(map(int, re.findall(r'\d+', x)))) if "bhp" in x.lower() else np.nan)
df["name_PS"] = df.name.apply(lambda x: np.max(list(map(int, re.findall(r'\d+', x)))) if "ps" in x.lower() else np.nan)

In [97]:
df

Unnamed: 0,BHP,ULEZ,body,engine,fuel,link,mileage,name,owners,price,transmission,year,name_BHP,name_PS
0,123.0,ULEZ,Hatchback,1.4L,Petrol,https://www.autotrader.co.uk/car-details/20201...,34414,"Audi A1 1.4 TFSI SPORT 3d 123 BHP, SPORTS SEAT...",,"£9,980",Manual,2017 (17 reg),123.0,
1,,ULEZ,Hatchback,1.4L,Petrol,https://www.autotrader.co.uk/car-details/20201...,16773,Audi A1 TFSI Sport 3dr 1.4,,"£11,500",Manual,2017 (67 reg),,
2,114.0,ULEZ,Hatchback,1.6L,Diesel,https://www.autotrader.co.uk/car-details/20200...,76922,Audi A1 1.6 TDI SPORT 3d 114 BHP PARKING SENSO...,,"£8,599",Manual,2017 (17 reg),114.0,
3,,ULEZ,Hatchback,1.4L,Petrol,https://www.autotrader.co.uk/car-details/20201...,10950,Audi A1 1.4 TFSI Sport 5dr,,"£12,174",Manual,2017 (67 reg),,
4,,ULEZ,Hatchback,1.6L,Diesel,https://www.autotrader.co.uk/car-details/20201...,58278,Audi A1 1.6 TDI Sport 3dr,,"£9,172",Manual,2017 (67 reg),,
5,,ULEZ,Hatchback,1.0L,Petrol,https://www.autotrader.co.uk/car-details/20201...,58000,Audi 1.0 SPORTBACK TFSI SPORT 5d 93 BHP STUNNI...,,"£10,995",Manual,2017 (17 reg),93.0,
6,,ULEZ,Hatchback,1.6L,Diesel,https://www.autotrader.co.uk/car-details/20201...,27200,Audi A1 1.6 TDI Sport Sportback (s/s) 5dr,,"£11,590",Manual,2017 (67 reg),,
7,123.0,ULEZ,Hatchback,1.4L,Petrol,https://www.autotrader.co.uk/car-details/20201...,26727,Audi A1 1.4 TFSI S LINE 3d 123 BHP AIRCON - AL...,,"£12,000",Manual,2017 (66 reg),123.0,
8,94.0,ULEZ,Hatchback,1.0L,Petrol,https://www.autotrader.co.uk/car-details/20201...,53242,Audi A1 1L Sport TFSI 3dr,,"£10,825",Automatic,2017 (67 reg),,
9,,ULEZ,Hatchback,1.6L,Diesel,https://www.autotrader.co.uk/car-details/20201...,21488,Audi A1 1.6 TDI Black Edition 5dr Special Edit...,,"£14,999",Manual,2017 (67 reg),,


In [98]:
# update from name_BHP
index_noBHP = df[df.BHP.isna()].index
df.at[index_noBHP, "BHP"] = df["name_BHP"]

In [99]:
# update from name_PS
index_noBHP = df[df.BHP.isna()].index
df.at[index_noBHP, "BHP"] = df["name_PS"]*0.98

In [100]:
df

Unnamed: 0,BHP,ULEZ,body,engine,fuel,link,mileage,name,owners,price,transmission,year,name_BHP,name_PS
0,123.0000,ULEZ,Hatchback,1.4L,Petrol,https://www.autotrader.co.uk/car-details/20201...,34414,"Audi A1 1.4 TFSI SPORT 3d 123 BHP, SPORTS SEAT...",,"£9,980",Manual,2017 (17 reg),123.0,
1,,ULEZ,Hatchback,1.4L,Petrol,https://www.autotrader.co.uk/car-details/20201...,16773,Audi A1 TFSI Sport 3dr 1.4,,"£11,500",Manual,2017 (67 reg),,
2,114.0000,ULEZ,Hatchback,1.6L,Diesel,https://www.autotrader.co.uk/car-details/20200...,76922,Audi A1 1.6 TDI SPORT 3d 114 BHP PARKING SENSO...,,"£8,599",Manual,2017 (17 reg),114.0,
3,,ULEZ,Hatchback,1.4L,Petrol,https://www.autotrader.co.uk/car-details/20201...,10950,Audi A1 1.4 TFSI Sport 5dr,,"£12,174",Manual,2017 (67 reg),,
4,,ULEZ,Hatchback,1.6L,Diesel,https://www.autotrader.co.uk/car-details/20201...,58278,Audi A1 1.6 TDI Sport 3dr,,"£9,172",Manual,2017 (67 reg),,
5,93.0000,ULEZ,Hatchback,1.0L,Petrol,https://www.autotrader.co.uk/car-details/20201...,58000,Audi 1.0 SPORTBACK TFSI SPORT 5d 93 BHP STUNNI...,,"£10,995",Manual,2017 (17 reg),93.0,
6,,ULEZ,Hatchback,1.6L,Diesel,https://www.autotrader.co.uk/car-details/20201...,27200,Audi A1 1.6 TDI Sport Sportback (s/s) 5dr,,"£11,590",Manual,2017 (67 reg),,
7,123.0000,ULEZ,Hatchback,1.4L,Petrol,https://www.autotrader.co.uk/car-details/20201...,26727,Audi A1 1.4 TFSI S LINE 3d 123 BHP AIRCON - AL...,,"£12,000",Manual,2017 (66 reg),123.0,
8,94.0000,ULEZ,Hatchback,1.0L,Petrol,https://www.autotrader.co.uk/car-details/20201...,53242,Audi A1 1L Sport TFSI 3dr,,"£10,825",Automatic,2017 (67 reg),,
9,,ULEZ,Hatchback,1.6L,Diesel,https://www.autotrader.co.uk/car-details/20201...,21488,Audi A1 1.6 TDI Black Edition 5dr Special Edit...,,"£14,999",Manual,2017 (67 reg),,


In [10]:
df_clean = df.copy()
df_clean["BHP"].fillna(df.BHP.median(), inplace=True)

### Filling ULEZ compliance

In [11]:
df_clean["ULEZ"].fillna("Non-ULEZ", inplace=True)

### Engine size parsing
Get rid of the Liter "L" and convert to float

In [12]:
df_clean["engine"] = df_clean["engine"].apply(lambda x: float(x[:-1]))

### Remove link

In [13]:
df_clean.drop(["link"], axis=1, inplace=True)

### Parsing name into multiple extra features

In [14]:
df_clean["make"] = df_clean["name"].apply(lambda x:x.split()[0])
df_clean["model"] = df_clean["name"].apply(lambda x:x.split()[1])

In [15]:
# diffentiate from "ECO", SE", "Sport", "M Sport", "Lux", Nav/Satellite", "Bluetooth",
# "Leather", "3-door", 
df_clean["se"] = df_clean["name"].apply(lambda x: 1 if "se" in x.lower() else 0)
df_clean["se"].value_counts()

1    384
0    242
Name: se, dtype: int64

In [16]:
# maybe too small/biased to be relevant
df_clean["eco"] = df_clean["name"].apply(lambda x: 1 if "eco" in x.lower() else 0)
df_clean["eco"].value_counts()

0    622
1      4
Name: eco, dtype: int64

In [17]:
df_clean["sport"] = df_clean["name"].apply(lambda x: 1 if "sport" in x.lower() else 0)
df_clean["sport"].value_counts()

1    380
0    246
Name: sport, dtype: int64

In [18]:
df_clean["m_sport"] = df_clean["name"].apply(lambda x: 1 if "m sport" in x.lower() else 0)
df_clean["m_sport"].value_counts()

0    443
1    183
Name: m_sport, dtype: int64

In [19]:
# maybe too small/biased to be relevant
df_clean["lux"] = df_clean["name"].apply(lambda x: 1 if "lux" in x.lower() else 0)
df_clean["lux"].value_counts()

0    620
1      6
Name: lux, dtype: int64

In [20]:
df_clean["sat_nav"] = df_clean["name"].apply(lambda x: 1 if "sat" in x.lower() or "nav" in x.lower() else 0)
df_clean["sat_nav"].value_counts()

0    522
1    104
Name: sat_nav, dtype: int64

In [21]:
df_clean["bluetooth"] = df_clean["name"].apply(lambda x: 1 if "bluetooth" in x.lower() else 0)
df_clean["bluetooth"].value_counts()

0    605
1     21
Name: bluetooth, dtype: int64

In [22]:
df_clean["leather"] = df_clean["name"].apply(lambda x: 1 if "leather" in x.lower() else 0)
df_clean["leather"].value_counts()

0    605
1     21
Name: leather, dtype: int64

In [23]:
df_clean["three_door"] = df_clean["name"].apply(lambda x: 1 if "3-door" in x.lower() else 0)
df_clean["three_door"].value_counts()

0    621
1      5
Name: three_door, dtype: int64

### Filling missing owners
Fill missing values with median value

In [24]:
df_clean["owners"].fillna(df.owners.median(), inplace=True)

### Parsing years

In [25]:
df_clean["year_reg"] = df_clean["year"].apply(lambda x: x.split("(")[1].replace(")",""))
df_clean["year"] = df_clean["year"].apply(lambda x: int(x.split("(")[0]))

### Parsing price

In [26]:
df_clean["price"] = df_clean["price"].apply(lambda x: int(x.replace("£","").replace(",","")))

### Parsing fuel

In [27]:
df_fuel = df_clean[df_clean.fuel.isna()]
df_fuel.make

618    Fiat
Name: make, dtype: object

In [28]:
df_fuel = df_clean[df_clean.make == "Fiat"]
df_fuel.fuel.value_counts()

Petrol    65
Name: fuel, dtype: int64

In [29]:
df_clean["fuel"].fillna("Petrol", inplace=True)

### Check and save

In [30]:
df_clean.head()

Unnamed: 0,BHP,ULEZ,body,engine,fuel,mileage,name,owners,price,transmission,...,se,eco,sport,m_sport,lux,sat_nav,bluetooth,leather,three_door,year_reg
0,108.0,ULEZ,Hatchback,1.6,Diesel,82850,Audi A3 1.6 TDI SE 5d 109 BHP Nav Prep - Bluet...,2.0,7991,Manual,...,1,0,0,0,0,1,1,0,0,15 reg
1,148.0,ULEZ,Hatchback,2.0,Diesel,38627,"Audi A3 2.0 TDI SE 5d 148 BHP, £20 YEAR ROAD T...",2.0,10980,Manual,...,1,0,0,0,0,1,0,0,0,65 reg
2,108.0,ULEZ,Hatchback,1.6,Diesel,84812,Audi A3 1.6 TDI 110 Sport 5dr Satellite Naviga...,2.0,8350,Manual,...,0,0,1,0,0,1,0,0,0,15 reg
3,182.0,ULEZ,Hatchback,2.0,Diesel,74413,Audi A3 2.0 TDI Sport Sportback S Tronic quatt...,2.0,9699,Automatic,...,0,0,1,0,0,0,0,0,0,15 reg
4,108.0,ULEZ,Hatchback,1.6,Diesel,59152,Audi A3 1.6 TDI SE Sportback 5dr,2.0,8990,Manual,...,1,0,1,0,0,0,0,0,0,15 reg


In [32]:
df_clean.to_csv("autotrader_prices_cleaned.csv", index=False)