In [14]:
import pandas as pd
import numpy as np
from sklearn.utils import resample

In [16]:
# jiji = pd.read_csv("jiji_clean.csv")
autorec = pd.read_csv("clean_data/autorec_clean.csv")
bef = pd.read_csv("clean_data/beforward_clean.csv")

In [None]:
print("Loaded:")
# print("Jiji:", jiji.shape)
print("Autorec:", autorec.shape)
print("BeForward:", bef.shape)

Loaded:
Autorec: (50, 9)
BeForward: (2500, 16)


In [5]:
autorec["source"] = "Autorec"
bef["source"] = "BeForward"

In [18]:
master_columns = [
    "ref_no",
    "title",
    "make",
    "model",
    "year",
    "mileage",
    "engine_cc",
    "transmission",
    "fuel",
    "seats",
    "location",
    "price_usd",
    "total_price_usd",
    "price_ugx",
    "total_price_ugx",
    "url",
    "source"
]

In [None]:
# jiji_merged = pd.DataFrame({
#     "source": jiji["source"],
#     "make": jiji["make"],
#     "model": jiji["model"],
#     "year": jiji["year"],
#     "mileage": jiji["mileage"],
#     "engine_cc": None,
#     "transmission": jiji["transmission"],
#     "fuel": jiji["fuel"],
#     "location": jiji.get("location", None),
#     "price_ugx": jiji["price"],
#     "url": jiji["url"]
# })

In [19]:
bef_mapped = bef.copy()
bef_mapped["source"] = "BeForward"

# Make sure all master columns exist
for col in master_columns:
    if col not in bef_mapped.columns:
        bef_mapped[col] = None

In [20]:
autorec_mapped = pd.DataFrame(columns=master_columns)

autorec_mapped["title"] = autorec["title"]
autorec_mapped["make"] = autorec["make"]
autorec_mapped["model"] = autorec["model"]
autorec_mapped["year"] = autorec["year"]
autorec_mapped["mileage"] = autorec["mileage"]
autorec_mapped["engine_cc"] = autorec["engine_cc"]

# Prices
autorec_mapped["price_usd"] = autorec["price_usd"]
autorec_mapped["price_ugx"] = autorec["price_ugx"]

# Missing columns for autorec
autorec_mapped["ref_no"] = None
autorec_mapped["transmission"] = autorec.get("transmission", None)
autorec_mapped["fuel"] = None
autorec_mapped["seats"] = None
autorec_mapped["location"] = None

# No total prices in autorec
autorec_mapped["total_price_usd"] = None
autorec_mapped["total_price_ugx"] = None

autorec_mapped["url"] = autorec["url"]
autorec_mapped["source"] = "Autorec"

In [21]:
oversample_factor = max(1, int(len(bef_mapped) / len(autorec_mapped)))

autorec_oversampled = resample(
    autorec_mapped,
    replace=True,
    n_samples=len(autorec_mapped) * oversample_factor,
    random_state=42
)

print("Autorec oversampled size:", autorec_oversampled.shape)

Autorec oversampled size: (2500, 17)


In [22]:
merged = pd.concat([bef_mapped, autorec_oversampled], ignore_index=True)

In [9]:
# merged = pd.concat([jiji_merged, autorec_merged, bef_merged], ignore_index=True)
merged = pd.concat([autorec_merged, bef_merged], ignore_index=True)
print("\nMerged shape before cleaning:", merged.shape)


Merged shape before cleaning: (2550, 11)


In [24]:
merged = merged[master_columns]
print("Merged dataset shape:", merged.shape)

Merged dataset shape: (5000, 17)


In [25]:
merged.to_csv("clean_data/cars_merged.csv", index=False)

print("\nSaved unified dataset → cars_merged.csv")
merged.head(10)


Saved unified dataset → cars_merged.csv


Unnamed: 0,ref_no,title,make,model,year,mileage,engine_cc,transmission,fuel,seats,location,price_usd,total_price_usd,price_ugx,total_price_ugx,url,source
0,BW726274,2015 TOYOTA SIENTA\n ...,Toyota,Sienta,2015,36939.0,1490.0,AT,Petrol,7,Location,,,,,/toyota/sienta/bw726274/id/9989384/,BeForward
1,BY750253,2015 MERCEDES-BENZ M-CLASS\n ...,Mercedes-Benz,M-Class,2015,52427.0,3490.0,AT,Petrol,5,Location,,,,,/mercedes-benz/m-class/by750253/id/11901853/,BeForward
2,BY759022,2006 HONDA ACTY TRUCK,Honda,Acty,2006,111250.0,650.0,MT,Petrol,2,Location,,,,,/honda/acty-truck/by759022/id/11910020/,BeForward
3,BY759024,2013 MITSUBISHI CANTER,Mitsubishi,Canter,2013,312401.0,2990.0,Semi AT,Diesel,3,Location,,,,,/mitsubishi/canter/by759024/id/11910065/,BeForward
4,CA462307,2014 TOYOTA HIACE VAN\n ...,Toyota,Hiace,2014,161352.0,2980.0,AT,Diesel,3,Location,,,,,/toyota/hiace-van/ca462307/id/12565207/,BeForward
5,CA740447,2011 TOYOTA WISH\n ...,Toyota,Wish,2011,154001.0,1790.0,AT,Petrol,7,Location,,,,,/toyota/wish/ca740447/id/12831915/,BeForward
6,CB026605,2017 HONDA FIT HYBRID\n ...,Honda,Fit,2017,111072.0,1490.0,AT,Hybrid(Petrol),5,Location,,,,,/honda/fit-hybrid/cb026605/id/13106276/,BeForward
7,CA848029,2023 SUZUKI JIMNY SIERRA\n ...,Suzuki,Jimny,2023,18932.0,1460.0,AT,Petrol,4,Location,,,,,/suzuki/jimny-sierra/ca848029/id/12936910/,BeForward
8,CA797413,2022 TOYOTA ALPHARD\n ...,Toyota,Alphard,2022,23020.0,2493.0,CVT,Petrol,7,Location,,,,,/toyota/alphard/ca797413/id/12886566/,BeForward
9,CA809714,2022 HONDA N BOX\n ...,Honda,N,2022,17769.0,658.0,CVT,Petrol,4,Location,,,,,/honda/n-box/ca809714/id/12898876/,BeForward
