In [49]:
import pandas as pd
import numpy as np
from sklearn.utils import resample

In [50]:
# jiji = pd.read_csv("jiji_clean.csv")
autorec = pd.read_csv("clean_data/autorec_clean.csv")
bef = pd.read_csv("clean_data/beforward_clean.csv")

In [51]:
print("Loaded:")
# print("Jiji:", jiji.shape)
print("Autorec:", autorec.shape)
print("BeForward:", bef.shape)

Loaded:
Autorec: (50, 9)
BeForward: (2446, 16)


In [52]:
autorec["source"] = "Autorec"
bef["source"] = "BeForward"

In [53]:
master_columns = [
    "ref_no",
    "title",
    "make",
    "model",
    "year",
    "mileage",
    "engine_cc",
    "transmission",
    "fuel",
    "seats",
    "location",
    "price_usd",
    "total_price_usd",
    "price_ugx",
    "total_price_ugx",
    "url",
    "source"
]

In [19]:
# jiji_merged = pd.DataFrame({
#     "source": jiji["source"],
#     "make": jiji["make"],
#     "model": jiji["model"],
#     "year": jiji["year"],
#     "mileage": jiji["mileage"],
#     "engine_cc": None,
#     "transmission": jiji["transmission"],
#     "fuel": jiji["fuel"],
#     "location": jiji.get("location", None),
#     "price_ugx": jiji["price"],
#     "url": jiji["url"]
# })

In [54]:
bef_mapped = bef.copy()
bef_mapped["source"] = "BeForward"

# Make sure all master columns exist
for col in master_columns:
    if col not in bef_mapped.columns:
        bef_mapped[col] = None

In [55]:
autorec_mapped = pd.DataFrame(columns=master_columns)

autorec_mapped["title"] = autorec["title"]
autorec_mapped["make"] = autorec["make"]
autorec_mapped["model"] = autorec["model"]
autorec_mapped["year"] = autorec["year"]
autorec_mapped["mileage"] = autorec["mileage"]
autorec_mapped["engine_cc"] = autorec["engine_cc"]

# Prices
autorec_mapped["price_usd"] = autorec["price_usd"]
autorec_mapped["price_ugx"] = autorec["price_ugx"]

# Missing columns for autorec
autorec_mapped["ref_no"] = None
autorec_mapped["transmission"] = autorec.get("transmission", None)
autorec_mapped["fuel"] = None
autorec_mapped["seats"] = None
autorec_mapped["location"] = None

# No total prices in autorec
autorec_mapped["total_price_usd"] = None
autorec_mapped["total_price_ugx"] = None

autorec_mapped["url"] = autorec["url"]
autorec_mapped["source"] = "Autorec"

In [28]:
# When I oversampled, it affected the accuracy of my model

# oversample_factor = max(1, int(len(bef_mapped) / len(autorec_mapped)))

# autorec_oversampled = resample(
#     autorec_mapped,
#     replace=True,
#     n_samples=len(autorec_mapped) * oversample_factor,
#     random_state=42
# )

# print("Autorec oversampled size:", autorec_oversampled.shape)

In [66]:
autorec_filled = autorec_mapped.copy()

# If Autorec has no total_price_usd, use price_usd
autorec_filled["total_price_usd"] = autorec_filled["total_price_usd"].fillna(
    autorec_filled["price_usd"]
)

autorec_filled["total_price_ugx"] = autorec_filled["total_price_ugx"].fillna(
    autorec_filled["price_ugx"]
)

for col in ["engine_cc", "mileage", "year", "price_usd", "price_ugx"]:
    if col in autorec_filled:
        autorec_filled[col] = autorec_filled[col].fillna(autorec_filled[col].median())

autorec_filled["fuel"] = autorec_filled["fuel"].fillna("Petrol")
autorec_filled["transmission"] = autorec_filled["transmission"].fillna("AT")
autorec_filled["location"] = autorec_filled["location"].fillna("Unknown")

# Seats: fill with 5 (safe general default)
autorec_filled["seats"] = autorec_filled["seats"].fillna(5)

print("Autorec cleaned rows:")
print(autorec_filled.head())


Autorec cleaned rows:
  ref_no                             title    make     model  year    mileage  \
0   None                 2013 Mazda Verisa   Mazda    Verisa  2013   932000.0   
1   None                 2011 Mazda Biante   Mazda    Biante  2011  1005000.0   
2   None              2008 Toyota Vellfire  Toyota  Vellfire  2008  1027000.0   
3   None  2012 Subaru Legacy Touring Wagon  Subaru    Legacy  2012  1024000.0   
4   None               2012 Subaru Impreza  Subaru   Impreza  2012   669000.0   

   engine_cc transmission    fuel  seats location  price_usd  total_price_usd  \
0    15000.0           AT  Petrol      5  Unknown       4500             4500   
1    20000.0           AT  Petrol      5  Unknown       4500             4500   
2    23900.0           AT  Petrol      5  Unknown       5200             5200   
3    24900.0           AT  Petrol      5  Unknown       4900             4900   
4    15900.0           AT  Petrol      5  Unknown       4700             4700   

   p

  autorec_filled["total_price_usd"] = autorec_filled["total_price_usd"].fillna(
  autorec_filled["total_price_ugx"] = autorec_filled["total_price_ugx"].fillna(
  autorec_filled["seats"] = autorec_filled["seats"].fillna(5)


In [67]:
# merged = pd.concat([bef_mapped, autorec_oversampled], ignore_index=True) # Oversampling affected my model
merged = pd.concat([bef_mapped, autorec_filled], ignore_index=True) # Let me use the fill insteas

In [68]:
# merged = pd.concat([jiji_merged, autorec_merged, bef_merged], ignore_index=True)
# merged = pd.concat([autorec_merged, bef_merged], ignore_index=True)
# print("\nMerged shape before cleaning:", merged.shape)

In [69]:
merged = merged[master_columns]
print("Merged dataset shape:", merged.shape)

Merged dataset shape: (2496, 17)


In [70]:
merged.to_csv("clean_data/cars_merged.csv", index=False)

print("\nSaved unified dataset → cars_merged.csv")
merged.head(10)


Saved unified dataset → cars_merged.csv


Unnamed: 0,ref_no,title,make,model,year,mileage,engine_cc,transmission,fuel,seats,location,price_usd,total_price_usd,price_ugx,total_price_ugx,url,source
0,BY759022,2006 HONDA ACTY TRUCK,Honda,Acty,2006,111250.0,6500.0,MT,Petrol,2,Location,2370,397300.0,8532000,1430280000.0,/honda/acty-truck/by759022/id/11910020/,BeForward
1,BY759024,2013 MITSUBISHI CANTER,Mitsubishi,Canter,2013,312401.0,29900.0,Semi AT,Diesel,3,Location,3920,774700.0,14112000,2788920000.0,/mitsubishi/canter/by759024/id/11910065/,BeForward
2,CA462307,2014 TOYOTA HIACE VAN\n ...,Toyota,Hiace,2014,161352.0,29800.0,AT,Diesel,3,Location,5960,849200.0,21456000,3057120000.0,/toyota/hiace-van/ca462307/id/12565207/,BeForward
3,CA740447,2011 TOYOTA WISH\n ...,Toyota,Wish,2011,154001.0,17900.0,AT,Petrol,7,Location,2430,454400.0,8748000,1635840000.0,/toyota/wish/ca740447/id/12831915/,BeForward
4,CB026605,2017 HONDA FIT HYBRID\n ...,Honda,Fit,2017,111072.0,14900.0,AT,Hybrid(Petrol),5,Location,4790,663300.0,17244000,2387880000.0,/honda/fit-hybrid/cb026605/id/13106276/,BeForward
5,CB205955,2022 MAZDA MAZDA2\n ...,Mazda,Mazda2,2022,74400.0,12990.0,AT,Petrol,5,Location,12190,1353900.0,43884000,4874040000.0,/mazda/mazda2/cb205955/id/13263374/,BeForward
6,CB205954,2016 TOYOTA HILUX\n ...,Toyota,Hilux,2016,13000.0,23930.0,MT,Diesel,4,Location,12920,1499400.0,46512000,5397840000.0,/toyota/hilux/cb205954/id/13263373/,BeForward
7,CB205953,2019 TOYOTA HILUX\n ...,Toyota,Hilux,2019,48000.0,23930.0,AT,Diesel,5,Location,19670,2174400.0,70812000,7827840000.0,/toyota/hilux/cb205953/id/13263372/,BeForward
8,CB205952,2017 FORD RANGER\n ...,Ford,Ranger,2017,50000.0,21980.0,AT,Diesel,5,Location,15140,1725400.0,54504000,6211440000.0,/ford/ranger/cb205952/id/13263371/,BeForward
9,CB205951,2012 FORD RANGER\n ...,Ford,Ranger,2012,65000.0,21980.0,AT,Diesel,5,Location,11560,1367400.0,41616000,4922640000.0,/ford/ranger/cb205951/id/13263370/,BeForward
