In [1]:
import numpy as np
from numpy.linalg import norm
from pymongo import MongoClient
from dotenv import load_dotenv
import os
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load variables from a local .env file (if one exists) into the process
# environment, then read the MongoDB connection string from DB_URL.
load_dotenv()
mongo_url = os.getenv("DB_URL")
if not mongo_url:
    # os.getenv returns None when the variable is missing. Passing None on to
    # MongoClient would make the driver use its default host instead of the
    # intended database, so stop here with an actionable message.
    raise RuntimeError(
        "DB_URL is not set; define it in .env or in the environment before running this notebook"
    )

In [6]:
# Shared MongoDB client used by every cell below.
# All *MS options are expressed in milliseconds.
mdb_client = MongoClient(
    mongo_url,
    # --- timeouts ---
    connectTimeoutMS=10000,
    socketTimeoutMS=30000,
    serverSelectionTimeoutMS=5000,
    # --- pooling / write behaviour ---
    maxPoolSize=50,
    retryWrites=True,
)

# Database handle that the listing helpers read from and write to.
db = mdb_client["Suzuki_cars"]

In [7]:
# Sentence-transformers encoder shared by embed(); loaded once per kernel.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [8]:
# ---------------- HELPER FUNCTIONS ----------------
def embed(text):
    """Encode ``text`` with the module-level sentence-transformers ``model``.

    Args:
        text: Input handed unchanged to ``model.encode``.

    Returns:
        Whatever ``model.encode`` returns for ``text`` (the embedding).
    """
    vector = model.encode(text)
    return vector

In [9]:
def avg(vectors):
    """Return the element-wise mean of ``vectors`` along the first axis.

    Args:
        vectors: Sequence (or array) of equally sized vectors.

    Returns:
        The result of ``np.mean(vectors, axis=0)``.
    """
    centroid = np.mean(vectors, axis=0)
    return centroid

In [10]:
def cos(a, b):
    """Cosine similarity between vectors ``a`` and ``b``.

    Args:
        a: First vector (anything ``np.dot`` / ``norm`` accept).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``. When either vector has zero norm the
        similarity is undefined; ``0.0`` is returned in that case instead of
        dividing by zero (which would yield NaN and a RuntimeWarning).
    """
    denominator = norm(a) * norm(b)
    if denominator == 0:
        # A zero vector has no direction; report "no similarity" rather than NaN.
        return 0.0
    return np.dot(a, b) / denominator

In [11]:
# ---------------- VARIABLES ----------------
# Reference phrases for a favourable listing. description_embedder() embeds
# each phrase and averages the embeddings into the "good" vector.
# Order is preserved from the original list.
good_refs = [
    # English
    "excellent condition", "well maintained", "genuine car",
    "non accidental", "original paint", "bumper to bumper genuine",
    "engine in perfect condition", "smooth drive", "just buy and drive",
    "family used car", "first owner", "low mileage",
    "documents complete", "original file and smart card",
    "soundless engine", "clean interior", "neat condition",

    # Pakistani English / Urdu mix
    "total genuine", "100% genuine", "scratchless",
    "no touching", "no work required", "koi kaam nahi",
    "engine 100%", "suspension smooth", "Alhamdulillah",
    "bilkul theek", "new condition", "lush condition",
    "water drop engine", "biometric on the spot", "book file complete",
]

In [12]:
# Reference phrases for an unfavourable listing. description_embedder() embeds
# each phrase and averages the embeddings into the "bad" vector.
# Order is preserved from the original list.
bad_refs = [
    # English
    "accident car", "accident damaged", "engine problem",
    "gear problem", "suspension issue", "body work required",
    "paint work", "dent and scratch", "major repair needed",
    "mechanical issue", "poor condition", "rust", "chassis damage",

    # Pakistani phrasing
    "touching", "shower", "patch", "half paint", "1.5 piece",
    "alignment work needed", "engine kharab", "gear kharab",
    "accident hai", "kaam hai", "work required",
    "meter reversed", "document issue", "file missing",
]

In [14]:
# ---------------- RATING FUNCTIONS ----------------
def get_rating_of_a_car(car_description, og_numeric_rating, good_vector, bad_vector, margin=0.05):
    """Classify a listing into one of five condition labels.

    The seller-supplied numeric rating decides the extremes; everything else
    is decided by comparing the description's embedding with the "good" and
    "bad" reference vectors.

    Args:
        car_description: Free-text description of the car (non-empty).
        og_numeric_rating: Original rating; anything ``float()`` can parse.
            Unparseable or missing values fall back to the semantic check.
        good_vector: Reference embedding for favourable descriptions.
        bad_vector: Reference embedding for unfavourable descriptions.
        margin: Minimum cosine-similarity gap required before one side is
            considered to clearly dominate. Defaults to the previously
            hard-coded 0.05.

    Returns:
        One of ``"Excellent"``, ``"Above Average"``, ``"Average"``,
        ``"Below Average"`` or ``"Bad"``.
    """
    try:
        numeric_rating = float(og_numeric_rating)
    except (TypeError, ValueError):
        numeric_rating = None

    # Extreme numeric ratings are trusted outright. Returning here also skips
    # the (comparatively expensive) embedding call, which is not needed.
    if numeric_rating is not None:
        if numeric_rating > 8:
            return "Excellent"
        if numeric_rating < 2:
            return "Bad"

    # Mid-range or missing numeric rating: let the description decide.
    car_vec = embed(car_description)
    good_score = cos(car_vec, good_vector)
    bad_score = cos(car_vec, bad_vector)

    if good_score > bad_score + margin:
        return "Above Average"
    if bad_score > good_score + margin:
        # Fix: a clearly negative description used to be labelled "Average"
        # while an inconclusive one was labelled "Below Average"; the two
        # labels were swapped relative to the rating scale.
        return "Below Average"
    # Neither side wins by at least `margin`: treat as neutral.
    return "Average"

In [15]:
def check_description(car_description, og_numeric_rating, good_vector, bad_vector):
    """Rate a listing if it has a usable description.

    Args:
        car_description: Description field from the listing document.
        og_numeric_rating: Original rating, forwarded to
            ``get_rating_of_a_car``.
        good_vector: Reference embedding for favourable descriptions.
        bad_vector: Reference embedding for unfavourable descriptions.

    Returns:
        ``(has_description, rating)``. ``has_description`` is ``1`` and
        ``rating`` is the label from ``get_rating_of_a_car`` when the
        description is a non-blank string; otherwise ``(0, "Null")``.
    """
    # Non-string values (None, numbers, lists, ...) have no usable text.
    # Previously a truthy non-string raised AttributeError on .strip(),
    # which the caller's broad except turned into an aborted run.
    if not isinstance(car_description, str) or not car_description.strip():
        return 0, "Null"

    rating = get_rating_of_a_car(car_description, og_numeric_rating, good_vector, bad_vector)
    return 1, rating

In [16]:
def get_car_listings(collection_name):
    """Fetch every document of ``collection_name`` from the module-level ``db``.

    Args:
        collection_name: Name of the collection to read.

    Returns:
        A list with all documents matched by an empty filter. The cursor is
        configured with a batch size of 100 before it is consumed.
    """
    cursor = db[collection_name].find({})
    return [document for document in cursor.batch_size(100)]

In [17]:
def write_rating_back_to_db(doc_id, rating, collection):
    """Store ``rating`` on a document as five one-hot flag fields.

    Sets ``has_description`` to 1 and, for each of the five labels, writes 1
    to the field matching ``rating`` and 0 to the others. The update is
    attempted up to three times; failures are reported with ``print`` and
    never raised.

    Args:
        doc_id: ``_id`` of the document to update.
        rating: Label produced by the rating step.
        collection: Object exposing ``update_one(filter, update)``.
    """
    labels = ("Excellent", "Above Average", "Average", "Below Average", "Bad")

    fields = {"has_description": 1}
    for label in labels:
        fields[label] = 1 if rating == label else 0

    selector = {"_id": doc_id}
    update = {"$set": fields}

    max_retries = 3
    for attempt_no in range(1, max_retries + 1):
        try:
            collection.update_one(selector, update)
        except Exception as err:
            if attempt_no == max_retries:
                # Out of attempts: report and give up (best effort).
                print(f"Failed to update doc {doc_id}: {err}")
            else:
                print(f"Retry {attempt_no} for doc {doc_id}")
        else:
            return

In [18]:
def write_null_rating_back_to_db(doc_id, collection):
    """Mark a document as having no description and clear its rating flags.

    Sets ``has_description`` to 0 and each of the five label fields to
    ``None``. The update is attempted up to three times; failures are
    reported with ``print`` and never raised.

    Args:
        doc_id: ``_id`` of the document to update.
        collection: Object exposing ``update_one(filter, update)``.
    """
    labels = ("Excellent", "Above Average", "Average", "Below Average", "Bad")
    update = {"$set": {"has_description": 0, **dict.fromkeys(labels)}}
    selector = {"_id": doc_id}

    max_retries = 3
    for attempt_no in range(1, max_retries + 1):
        try:
            collection.update_one(selector, update)
        except Exception as err:
            if attempt_no == max_retries:
                # Out of attempts: report and give up (best effort).
                print(f"Failed to update doc {doc_id}: {err}")
            else:
                print(f"Retry {attempt_no} for doc {doc_id}")
        else:
            return

In [19]:
# ---------------- MAIN FUNCTION ----------------
def description_embedder(collection_name):
    """Rate every listing in ``collection_name`` and write the result back.

    Steps: ping the server, load all documents, build the "good" and "bad"
    reference vectors from ``good_refs`` / ``bad_refs``, then for each
    document read ``description`` and ``rating``, classify it and persist the
    outcome. Progress is printed per document. Any exception is printed
    rather than raised.

    The module-level ``mdb_client`` is closed in the ``finally`` clause.
    NOTE(review): depending on the installed PyMongo version a closed client
    may not be reusable, so re-running this in the same kernel may require
    re-creating the client first -- confirm.

    Args:
        collection_name: Name of the collection inside the module-level ``db``.
    """
    try:
        # Fail early if the server cannot be reached.
        mdb_client.admin.command('ping')
        print("MongoDB connection successful")

        listings = get_car_listings(collection_name)

        def centroid(phrases):
            # Mean of the stacked phrase embeddings.
            return np.mean(np.vstack([embed(phrase) for phrase in phrases]), axis=0)

        good_vector = centroid(good_refs)
        bad_vector = centroid(bad_refs)

        target = db[collection_name]
        total = len(listings)

        for position, listing in enumerate(listings, start=1):
            has_description, rating = check_description(
                listing.get("description", ""),
                listing.get("rating", ""),
                good_vector,
                bad_vector,
            )
            if not has_description:
                write_null_rating_back_to_db(listing["_id"], target)
            else:
                write_rating_back_to_db(listing["_id"], rating, target)
            print(f"Embedded {position}/{total}")

    except Exception as e:
        print(f"Error: {e}")
    finally:
        mdb_client.close()
        print("MongoDB connection closed")

In [20]:
# ---------------- RUN ----------------
# Rate and update every document in the "listings" collection.
description_embedder(collection_name="listings")

MongoDB connection successful
Embedded 1/2167
Embedded 2/2167
Embedded 3/2167
Embedded 4/2167
Embedded 5/2167
Embedded 6/2167
Embedded 7/2167
Embedded 8/2167
Embedded 9/2167
Embedded 10/2167
Embedded 11/2167
Embedded 12/2167
Embedded 13/2167
Embedded 14/2167
Embedded 15/2167
Embedded 16/2167
Embedded 17/2167
Embedded 18/2167
Embedded 19/2167
Embedded 20/2167
Embedded 21/2167
Embedded 22/2167
Embedded 23/2167
Embedded 24/2167
Embedded 25/2167
Embedded 26/2167
Embedded 27/2167
Embedded 28/2167
Embedded 29/2167
Embedded 30/2167
Embedded 31/2167
Embedded 32/2167
Embedded 33/2167
Embedded 34/2167
Embedded 35/2167
Embedded 36/2167
Embedded 37/2167
Embedded 38/2167
Embedded 39/2167
Embedded 40/2167
Embedded 41/2167
Embedded 42/2167
Embedded 43/2167
Embedded 44/2167
Embedded 45/2167
Embedded 46/2167
Embedded 47/2167
Embedded 48/2167
Embedded 49/2167
Embedded 50/2167
Embedded 51/2167
Embedded 52/2167
Embedded 53/2167
Embedded 54/2167
Embedded 55/2167
Embedded 56/2167
Embedded 57/2167
Embedded 5