In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

import joblib
import re

sns.set(style="whitegrid")


In [2]:
df = pd.read_csv("../02_data_cleaning/clean_data/cars_merged.csv")
print("Dataset shape:", df.shape)
df.head()

Dataset shape: (4846, 17)


Unnamed: 0,ref_no,title,make,model,year,mileage,engine_cc,transmission,fuel,seats,location,price_usd,total_price_usd,price_ugx,total_price_ugx,url,source
0,BY759022,2006 HONDA ACTY TRUCK,Honda,Acty,2006,111250.0,650.0,MT,Petrol,2,Location,2370,39730.0,8532000,143028000.0,/honda/acty-truck/by759022/id/11910020/,BeForward
1,BY759024,2013 MITSUBISHI CANTER,Mitsubishi,Canter,2013,312401.0,2990.0,Semi AT,Diesel,3,Location,3920,77470.0,14112000,278892000.0,/mitsubishi/canter/by759024/id/11910065/,BeForward
2,CA462307,2014 TOYOTA HIACE VAN DX,Toyota,Hiace,2014,161352.0,2980.0,AT,Diesel,3,Location,5960,84920.0,21456000,305712000.0,/toyota/hiace-van/ca462307/id/12565207/,BeForward
3,CA740447,2011 TOYOTA WISH\n ...,Toyota,Wish,2011,154001.0,1790.0,AT,Petrol,7,Location,2430,45440.0,8748000,163584000.0,/toyota/wish/ca740447/id/12831915/,BeForward
4,CB026605,2017 HONDA FIT HYBRID\n ...,Honda,Fit,2017,111072.0,1490.0,AT,Hybrid(Petrol),5,Location,4790,66330.0,17244000,238788000.0,/honda/fit-hybrid/cb026605/id/13106276/,BeForward


In [3]:
# Keep relevant columns
df = df[[
    "make","model","year","mileage","engine_cc","seats",
    "transmission","fuel","source",
    "price_ugx"
]].copy()

# Remove missing prices
df = df[df["price_ugx"].notnull()]


In [4]:
def force_numeric(val):
    if pd.isna(val):
        return np.nan
    s = str(val).lower().strip()
    if s in ["ask", "-", "--", "", "nan", "unknown"]:
        return np.nan
    digits = re.sub(r"\D", "", s)
    return float(digits) if digits else np.nan

for col in ["mileage", "engine_cc", "year", "seats"]:
    df[col] = df[col].apply(force_numeric)

# Fill missing
df["mileage"].fillna(df["mileage"].median(), inplace=True)
df["engine_cc"].fillna(df["engine_cc"].median(), inplace=True)
df["year"].fillna(df["year"].median(), inplace=True)
df["seats"].fillna(4, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["mileage"].fillna(df["mileage"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["engine_cc"].fillna(df["engine_cc"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on

In [5]:
# Compute median price per make+model+year
group_key = ["make","model","year"]
median_prices = df.groupby(group_key)["price_ugx"].median().reset_index()
median_prices.rename(columns={"price_ugx":"fair_price"}, inplace=True)

df = df.merge(median_prices, on=group_key, how="left")

# Define labels
def classify(row):
    if row["price_ugx"] > row["fair_price"] * 1.1:
        return "overpriced"
    elif row["price_ugx"] < row["fair_price"] * 0.9:
        return "underpriced"
    else:
        return "fair"

df["price_status"] = df.apply(classify, axis=1)

df["price_status"].value_counts()


price_status
fair           4025
underpriced     417
overpriced      404
Name: count, dtype: int64

In [6]:
features = [
    "make","model","year","mileage","engine_cc","seats",
    "transmission","fuel","source"
]

target = "price_status"

X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

X_train.shape, X_test.shape


((3634, 9), (1212, 9))

In [None]:
numeric = ["year","mileage","engine_cc","seats"]
categorical = ["make","model","transmission","fuel","source"]

preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical)
    ]
)

In [8]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=500),
    "DecisionTree": DecisionTreeClassifier(),
    "RandomForest": RandomForestClassifier(n_estimators=200),
    "GradientBoosting": GradientBoostingClassifier()
}

results = {}

In [9]:
for name, model in models.items():
    print(f"\nTraining: {name}")

    clf = Pipeline(steps=[
        ("preprocess", preprocess),
        ("model", model)
    ])

    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)

    acc = accuracy_score(y_test, preds)
    print("Accuracy:", acc)
    print(classification_report(y_test, preds))

    results[name] = acc



Training: LogisticRegression
Accuracy: 0.8283828382838284
              precision    recall  f1-score   support

        fair       0.85      0.97      0.91      1007
  overpriced       0.33      0.07      0.11       101
 underpriced       0.49      0.16      0.24       104

    accuracy                           0.83      1212
   macro avg       0.56      0.40      0.42      1212
weighted avg       0.77      0.83      0.78      1212


Training: DecisionTree
Accuracy: 0.7929042904290429
              precision    recall  f1-score   support

        fair       0.89      0.90      0.89      1007
  overpriced       0.26      0.25      0.25       101
 underpriced       0.31      0.30      0.31       104

    accuracy                           0.79      1212
   macro avg       0.49      0.48      0.48      1212
weighted avg       0.79      0.79      0.79      1212


Training: RandomForest
Accuracy: 0.8333333333333334
              precision    recall  f1-score   support

        fair      

In [None]:
best_model_name = max(results, key=results.get)
best_model_name


'RandomForest'

In [11]:
best_model = models[best_model_name]

final_clf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", best_model)
])

final_clf.fit(X, y)

joblib.dump(final_clf, "models/overpricing_model.pkl")
print("Saved → overpricing_model.pkl")


Saved → overpricing_model.pkl
