# Craigslist Dataset - Clean the data


This notebook applies all the **filters** that we explored in **`Craigslist_Car_Listings_Scraper.ipynb`**.

## Import the dataset

In [3]:
import pandas as pd

# Location of the input CSV, relative to the notebook's working directory.
RAW_LISTINGS_CSV = "../data/all-vehicles-model__enriched.csv"

# Load the listings with pandas' default parsing options.
df = pd.read_csv(RAW_LISTINGS_CSV)

## Remove duplicates

In [5]:
# Columns left out of the second comparison: the URL plus the place and
# date fields. Every other column must match for two rows to be duplicates.
ignored_for_comparison = ['url', 'region','location', 'state','date','Day','posted_at']
comparison_columns = [name for name in df.columns if name not in ignored_for_comparison]

# Pass 1: keep a single row per listing URL.
df = df.drop_duplicates(subset='url')

# Pass 2: order rows by 'posted_at' descending, then keep the first row of
# every group that agrees on all comparison columns.
newest_first = df.sort_values('posted_at', ascending=False)
df = newest_first.drop_duplicates(subset=comparison_columns, keep='first')

## Dealer / Owner

In [7]:
# Classify each listing from the URL path segment that precedes "/d/":
# "cto" is mapped to 'owner' and "ctd" to 'dealer'. Any other segment, or a
# URL without that pattern, yields a missing value.
seller = df['url'].str.extract(r'/([a-z]+)/d/')[0].map({'cto': 'owner', 'ctd': 'dealer'})

# DataFrame.insert raises ValueError when the column already exists, so a
# previous 'seller' column is dropped first. This keeps the cell re-runnable.
if 'seller' in df.columns:
    df = df.drop(columns='seller')

# Place the new column at position 2 (third column) of the frame.
df.insert(2, 'seller', seller)

## Date and Day of publication

In [9]:
# Parse the publication date; errors='coerce' turns values that cannot be
# parsed with the given format into NaT instead of raising.
parsed_dates = pd.to_datetime(df['posted_at'], format="%Y-%m-%d", errors='coerce')
df['posted_at'] = parsed_dates

# Weekday name of the publication date (missing where the date is NaT).
df['Day'] = parsed_dates.dt.day_name()

## Numerical Values

In [11]:
# Thresholds used in this cell, named once so the age calculation and the
# year filter cannot drift apart.
REFERENCE_YEAR = 2025                      # year that 'age' is measured from
MIN_PRICE, MAX_PRICE = 2500, 1000000       # keep MIN_PRICE <= price < MAX_PRICE
MIN_YEAR, MAX_YEAR = 1985, REFERENCE_YEAR  # keep MIN_YEAR <= year <= MAX_YEAR
MIN_ODOMETER, MAX_ODOMETER = 5, 350000     # keep MIN_ODOMETER <= odometer <= MAX_ODOMETER


def _to_nullable_int(raw, strip_pattern):
    """Return ``raw`` as a nullable ``Int64`` Series.

    Parameters
    ----------
    raw : pandas.Series
        Column to convert. Text values have ``strip_pattern`` removed first.
        A column that is already numeric (for example when this cell is run
        a second time) skips the text clean-up, because the ``.str`` accessor
        raises AttributeError on non-string data.
    strip_pattern : str
        Regular expression matching the characters to delete before parsing.

    Returns
    -------
    pandas.Series
        ``Int64`` values; entries that cannot be parsed become ``<NA>``.
        Fractional values are rounded, because casting a non-integer float
        straight to ``Int64`` raises TypeError.
    """
    if not pd.api.types.is_numeric_dtype(raw):
        # Same clean-up as before: strip formatting characters, then treat
        # strings that end up empty as missing.
        raw = raw.str.replace(strip_pattern, '', regex=True).replace('', pd.NA)
    return pd.to_numeric(raw, errors='coerce').round().astype('Int64')


df['price'] = _to_nullable_int(df['price'], r'[\$,]')
df['odometer'] = _to_nullable_int(df['odometer'], r'[\,]')
df['age'] = REFERENCE_YEAR - df['year']

# Range filters. A missing price, year or odometer does not satisfy the
# comparison, so those rows are removed as well.
df = df[(df['price'] >= MIN_PRICE) & (df['price'] < MAX_PRICE)]
df = df[(df['year'] >= MIN_YEAR) & (df['year'] <= MAX_YEAR)]
df = df[(df['odometer'] >= MIN_ODOMETER) & (df['odometer'] <= MAX_ODOMETER)]

## Unknown models

In [13]:
# Keep only the rows whose 'model' value is present.
has_model = df['model'].notna()
df = df[has_model]

## Log Price

In [16]:
import numpy as np

# Natural log of (price + 1), stored in a new 'log_price' column.
price_plus_one = df['price'] + 1
df['log_price'] = np.log(price_plus_one)

## Save

In [19]:
# Write the cleaned frame to an Excel workbook; index=False leaves the
# DataFrame index out of the file.
cleaned_xlsx_path = "../data/df_cleaned.xlsx"
df.to_excel(cleaned_xlsx_path, index=False)