In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
def clean_mileage(value):
    if pd.isna(value):
        return None
    value = str(value).lower().replace("km", "").replace(",", "").strip()
    digits = re.sub(r"\D", "", value)
    return int(digits) if digits else None

In [3]:
def clean_engine(value):
    if pd.isna(value):
        return None

    value = str(value).lower()
    value = value.replace("cc", "").replace(",", "").strip()
    digits = re.sub(r"\D", "", value)
    return int(digits) if digits else None


In [4]:
def clean_price(value):
    if pd.isna(value):
        return None
    digits = re.sub(r"\D", "", str(value))
    return int(digits) if digits else None

In [5]:
def normalize_text(value):
    if pd.isna(value):
        return None
    return value.strip().title()

In [6]:
def clean_year(value):
    if pd.isna(value):
        return None
    match = re.search(r"(19|20)\d{2}", str(value))
    return int(match.group()) if match else None

In [7]:

jiji = pd.read_csv("../01_data_collection/raw_data/jiji_raw.csv")
autorec = pd.read_csv("../01_data_collection/raw_data/autorec_raw.csv")
bef = pd.read_csv("../01_data_collection/raw_data/beforward_raw.csv")

In [8]:
print("Loaded datasets:")
print("Jiji:", jiji.shape)
print("Autorec:", autorec.shape)
print("BeForward:", bef.shape)

Loaded datasets:
Jiji: (120, 9)
Autorec: (50, 9)
BeForward: (2446, 16)


In [None]:
#Leave out Jiji, doesn't have mileage in the tree view. Would need me to go into form view

# jiji["price"] = jiji["price"].apply(clean_price)
# jiji["year"] = jiji["year"].apply(clean_year)
# jiji["make"] = jiji["make"].apply(normalize_text)
# jiji["model"] = jiji["model"].apply(normalize_text)

# jiji_clean = jiji.copy()
# jiji_clean.to_csv("clean_data/jiji_clean.csv", index=False)


In [14]:
autorec["mileage"] = autorec["mileage"].apply(clean_mileage)
autorec["engine_cc"] = autorec["engine_cc"].apply(clean_engine)
autorec["year"] = autorec["year"].apply(clean_year)
autorec["price_usd"] = autorec["price_usd"].apply(clean_price)
autorec["price_ugx"] = autorec["price_ugx"].apply(clean_price)
autorec["make"] = autorec["make"].apply(normalize_text)
autorec["model"] = autorec["model"].apply(normalize_text)

autorec_clean = autorec.copy()
autorec_clean.to_csv("clean_data/autorec_clean.csv", index=False)

In [15]:
bef["mileage"] = bef["mileage"].apply(clean_mileage)
bef["engine_cc"] = bef["engine_cc"].apply(clean_engine)
bef["year"] = bef["year"].apply(clean_year)
bef["price_usd"] = bef["price_usd"].apply(clean_price)
bef["total_price_usd"] = bef["total_price_usd"].apply(clean_price)
bef["price_ugx"] = bef["price_ugx"].apply(clean_price)
bef["total_price_ugx"] = bef["total_price_ugx"].apply(clean_price)
bef["make"] = bef["make"].apply(normalize_text)
bef["model"] = bef["model"].apply(normalize_text)

bef_clean = bef.copy()
bef_clean.to_csv("clean_data/beforward_clean.csv", index=False)

In [16]:
print("\nCLEANING COMPLETE!")
# print(jiji_clean.head())
print(autorec_clean.head())
print(bef_clean.head())


CLEANING COMPLETE!
                              title    make     model  year  engine_cc  \
0                 2013 Mazda Verisa   Mazda    Verisa  2013    15000.0   
1                 2011 Mazda Biante   Mazda    Biante  2011    20000.0   
2              2008 Toyota Vellfire  Toyota  Vellfire  2008    23900.0   
3  2012 Subaru Legacy Touring Wagon  Subaru    Legacy  2012    24900.0   
4               2012 Subaru Impreza  Subaru   Impreza  2012    15900.0   

     mileage  price_usd  price_ugx  \
0   932000.0       4500   16200000   
1  1005000.0       4500   16200000   
2  1027000.0       5200   18720000   
3  1024000.0       4900   17640000   
4   669000.0       4700   16920000   

                                                 url  
0  /cars-list/hatchback/2013-mazda-verisa-7204?se...  
1  /cars-list/mini-van/2011-mazda-biante-7203?sel...  
2  /cars-list/mini-van/2008-toyota-vellfire-7200?...  
3  /cars-list/wagon/2012-subaru-legacy-touring-wa...  
4  /cars-list/hatchback/2012-su