In [None]:
import numpy as np
from numpy.linalg import norm
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure, OperationFailure
from dotenv import load_dotenv
import os
from sentence_transformers import SentenceTransformer
import logging
from datetime import datetime
import sys

In [None]:
# Setup logging
# logging.basicConfig() does nothing once the root logger already has handlers,
# but the handler objects passed to it are still constructed first. Building
# them unconditionally means every re-run of this cell opens a new, empty log
# file whose FileHandler is never attached or closed. Only build the handlers
# when they are actually going to be installed.
if not logging.getLogger().handlers:
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            # One log file per configuration, named with the current timestamp.
            logging.FileHandler(f'embedding_log_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
            # Mirror every record to stdout as well.
            logging.StreamHandler(sys.stdout)
        ]
    )
# Module-level logger used by every function below.
logger = logging.getLogger(__name__)

In [None]:
# Load .env
# Run load_dotenv() first, then look up the MongoDB connection string in the
# process environment; the lookup yields None when MONGO_URL is not set.
load_dotenv()
mongo_url = os.environ.get("MONGO_URL")

In [None]:
# Validate environment variables
# Stop right away (exit status 1) when the connection string is unset or empty.
if mongo_url is None or len(mongo_url) == 0:
    logger.error("MONGO_URL not found in environment variables")
    sys.exit(1)

In [None]:
# MongoDB client
# Creates the module-level `mdb_client` and `db` handles used by the functions
# below, or logs the problem and exits with status 1.
try:
    # Give up on server selection after 5 seconds.
    mdb_client = MongoClient(mongo_url, serverSelectionTimeoutMS=5000)
    # Issue a ping immediately so an unreachable or rejecting server is
    # reported here instead of on the first real query.
    mdb_client.admin.command('ping')
    logger.info("Successfully connected to MongoDB")
    db = mdb_client["Honda_cars"]  # Database name
except (ConnectionFailure, OperationFailure) as e:
    # OperationFailure is included because the server can reject the ping
    # itself (for example when authentication fails); previously that escaped
    # this handler as an unlogged traceback.
    logger.error(f"Failed to connect to MongoDB: {e}")
    sys.exit(1)

In [None]:
# Load sentence-transformers model
# `model` is the module-level encoder that embed() calls; any failure while
# constructing it is logged and ends the run with exit status 1.
try:
    model = SentenceTransformer('all-MiniLM-L6-v2')
except Exception as e:
    logger.error(f"Failed to load model: {e}")
    sys.exit(1)
else:
    logger.info("Successfully loaded SentenceTransformer model")

In [None]:
# ---------------- HELPER FUNCTIONS ----------------
def embed(text):
    """Encode ``text`` with the module-level ``model``.

    Args:
        text: The string to encode.

    Returns:
        The result of ``model.encode(text)`` for a non-empty string.
        ``None`` when ``text`` is empty or not a ``str`` (a warning is logged),
        or when anything in the process raises (an error is logged).
    """
    try:
        # Happy path first: a truthy value that is really a string.
        if text and isinstance(text, str):
            return model.encode(text)
        logger.warning(f"Invalid text input: {text}")
        return None
    except Exception as e:
        logger.error(f"Error embedding text: {e}")
        return None

In [None]:
def avg(vectors):
    """Return the element-wise mean of the non-``None`` entries of ``vectors``.

    Args:
        vectors: Iterable of vectors; ``None`` entries are ignored.

    Returns:
        ``np.mean`` over axis 0 of the remaining vectors, or ``None`` (with an
        error logged) when no entry is left after dropping the ``None`` ones.
    """
    usable = list(filter(lambda vec: vec is not None, vectors))
    if len(usable) == 0:
        logger.error("No valid vectors to average")
        return None
    return np.mean(usable, axis=0)

In [None]:
def cos(a, b):
    """Calculate cosine similarity between two vectors.

    Args:
        a: First vector.
        b: Second vector.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``. Returns 0 when the similarity is
        undefined (either vector has zero norm, or the product of the norms is
        not finite) and when the computation raises (an error is logged).
    """
    try:
        denominator = norm(a) * norm(b)
        # Dividing NumPy floats by zero produces nan/inf plus a RuntimeWarning
        # rather than an exception, so the except clause below never sees it.
        # Handle the undefined case explicitly instead of returning nan.
        if denominator == 0 or not np.isfinite(denominator):
            return 0.0
        return np.dot(a, b) / denominator
    except Exception as e:
        logger.error(f"Error calculating cosine similarity: {e}")
        return 0

In [None]:
# ---------------- REFERENCE VECTORS ----------------
# Example phrases describing a car in good condition. description_embedder()
# embeds every entry and averages the results into the "good" reference vector.
good_refs = [
    # English phrasings
    "well maintained car with low mileage and no accidents",
    "clean car with full service history",
    "excellent condition vehicle",
    "car is in pristine condition, no dents or scratches",
    "smooth engine performance and recently serviced",
    "interior and exterior are very clean and intact",
    "all original parts and properly maintained",
    "low usage, tires and brakes in excellent condition",
    "owner is careful and car drives like new",
    "perfect running condition with no mechanical issues",
    "minor cosmetic wear only, fully functional",
    "reliable car with detailed maintenance records",
    # Romanized Urdu/Hindi phrasings (mixed with English terms); they appear to
    # mirror the English entries above one-for-one, except for the last entry.
    "Gari achi condition me hai, low mileage aur koi accident nahi",
    "Saaf suthri gari, full service history ke sath",
    "Perfect condition me gari",
    "Gari bilkul nayi jaisi, koi dents ya scratches nahi",
    "Engine smooth hai, recently serviced",
    "Interior aur exterior bilkul saaf aur intact hai",
    "Saare original parts hain aur properly maintain hui hai",
    "Kam use hui, tires aur brakes perfect condition me",
    "Owner careful hai aur gari bilkul nayi jaisi chalti hai",
    "Perfect running condition, koi mechanical issues nahi",
    "Minor cosmetic wear hai, fully functional",
    "Reliable gari, maintenance records available",
    # NOTE(review): has no English counterpart above and does not mention the
    # car; presumably means "no work will be needed" - confirm it belongs here.
    "Koi kam nahi hone wala"
]

In [None]:
# Example phrases describing a car in bad condition. description_embedder()
# embeds every entry and averages the results into the "bad" reference vector.
bad_refs = [
    # English phrasings
    "accident damaged car",
    "engine problems and rust",
    "poor condition vehicle with issues",
    "car has major dents and paint peeling",
    "frequent mechanical failures and service needed",
    "interior and exterior badly worn out",
    "brakes and suspension need replacement",
    "high mileage and poorly maintained",
    "significant engine noise and transmission issues",
    "rust on chassis and underbody",
    "owner reports multiple breakdowns",
    "unreliable car with missing parts",
    # Romanized Urdu/Hindi phrasings (mixed with English terms); they appear to
    # mirror the English entries above one-for-one.
    "Accident damaged gari",
    "Engine me problems aur rust hai",
    "Poor condition, kai issues hain",
    "Gari me bohot dents aur paint peeling hai",
    "Mechanical failures frequent, service required",
    "Interior aur exterior badly worn out",
    "Brakes aur suspension replace karne ki zarurat hai",
    "High mileage aur poorly maintained",
    "Engine me noise aur transmission problems",
    "Chassis aur underbody me rust hai",
    "Owner ne multiple breakdowns report kiye",
    "Unreliable gari, kuch parts missing hain"
]

In [None]:
# ---------------- RATING FUNCTIONS ----------------
def get_rating_of_a_car(car_description, good_vector, bad_vector, margin=0.05):
    """Rate a description by comparing it to the good and bad reference vectors.

    The description is embedded with ``embed`` and its cosine similarity to
    each reference vector is computed with ``cos``. One side only wins when
    its score beats the other by more than ``margin``.

    Args:
        car_description: Free-text description of the car.
        good_vector: Reference vector for good-condition descriptions.
        bad_vector: Reference vector for bad-condition descriptions.
        margin: How far one similarity score must exceed the other before the
            rating leaves "Normal". Defaults to 0.05, the value that was
            previously hard-coded.

    Returns:
        "Good", "Bad" or "Normal". "Normal" is also the fallback when the
        description is empty, a reference vector is ``None``, or the
        description cannot be embedded.
    """
    if not car_description or good_vector is None or bad_vector is None:
        logger.warning("Invalid input for rating calculation")
        return "Normal"

    car_vec = embed(car_description)
    if car_vec is None:
        # embed() has already logged why the description could not be encoded.
        return "Normal"

    good_score = cos(car_vec, good_vector)
    bad_score = cos(car_vec, bad_vector)

    # Scores closer together than `margin` are treated as a tie.
    if good_score > bad_score + margin:
        rating = "Good"
    elif bad_score > good_score + margin:
        rating = "Bad"
    else:
        rating = "Normal"

    logger.debug(f"Good score: {good_score:.3f}, Bad score: {bad_score:.3f}, Rating: {rating}")
    return rating

In [None]:
def get_car_listings(collection_name):
    """Load every document of ``db[collection_name]`` into a list.

    Args:
        collection_name: Name of the collection inside the module-level ``db``.

    Returns:
        A list with all documents of the collection, or an empty list when the
        query raises ``OperationFailure`` (the failure is logged).
    """
    try:
        # An empty filter matches every document in the collection.
        docs = [doc for doc in db[collection_name].find({})]
    except OperationFailure as e:
        logger.error(f"Failed to fetch documents from {collection_name}: {e}")
        return []
    logger.info(f"Retrieved {len(docs)} documents from {collection_name}")
    return docs

In [None]:
def write_rating_back_to_db(doc_id, rating, collection):
    """Store a rating, its one-hot flags and a timestamp on one document.

    Args:
        doc_id: Value of the target document's ``_id``.
        rating: Rating label; "Good", "Normal" or "Bad" set the matching flag
            to 1, any other value leaves all three flags at 0.
        collection: Collection object whose ``update_one`` is called.

    Returns:
        True when a document with ``doc_id`` was found and updated, False when
        none matched or when the update raised (both cases are logged).
    """
    # Local import so this function does not depend on the import cell changing.
    from datetime import timezone

    try:
        good_state = 1 if rating == "Good" else 0
        bad_state = 1 if rating == "Bad" else 0
        normal_state = 1 if rating == "Normal" else 0

        result = collection.update_one(
            {"_id": doc_id},
            {"$set": {
                "rating": rating,
                "Good": good_state,
                "Normal": normal_state,
                "Bad": bad_state,
                # Timezone-aware UTC timestamp: a naive local datetime would be
                # stored as if it were UTC and end up shifted by the UTC offset.
                "rated_at": datetime.now(timezone.utc)
            }}
        )

        # matched_count says whether the document exists; modified_count can
        # be 0 for a document that was found but already held these values.
        if result.matched_count == 0:
            logger.warning(f"No document updated for ID: {doc_id}")
            return False

        return True
    except Exception as e:
        logger.error(f"Failed to update document {doc_id}: {e}")
        return False

In [None]:
# ---------------- MAIN FUNCTION ----------------
def description_embedder(collection_name, skip_processed=True, batch_size=100):
    """
    Process car descriptions and add ratings to database.

    Steps: verify the collection exists, load all of its documents, build the
    good/bad reference vectors from ``good_refs`` / ``bad_refs``, then rate
    each document's ``description`` field and write the rating back. A summary
    of the counters is logged at the end. Returns None in every case.

    Args:
        collection_name: Name of the MongoDB collection
        skip_processed: Skip documents that already have a rating
        batch_size: Log a progress line every ``batch_size`` documents
            (skipped and failed documents count too). A value that is not a
            positive integer disables progress logging.
    """
    logger.info(f"Starting embedding process for collection: {collection_name}")

    # Check if collection exists
    if collection_name not in db.list_collection_names():
        logger.error(f"Collection '{collection_name}' does not exist")
        return

    # Fetch documents
    docs = get_car_listings(collection_name)
    if not docs:
        logger.warning("No documents found to process")
        return

    # Precompute Good/Bad vectors
    logger.info("Computing reference vectors...")
    good_embeddings = [embed(x) for x in good_refs]
    bad_embeddings = [embed(x) for x in bad_refs]

    # avg() ignores phrases that failed to embed and returns None if none are left.
    good_vector = avg(good_embeddings)
    bad_vector = avg(bad_embeddings)

    if good_vector is None or bad_vector is None:
        logger.error("Failed to compute reference vectors")
        return

    logger.info("Reference vectors computed successfully")

    # Validate the progress interval once, up front. Previously batch_size=0
    # raised ZeroDivisionError inside the per-document try block, after the
    # document had already been counted, so it was counted as failed as well.
    progress_every = batch_size if isinstance(batch_size, int) and batch_size > 0 else 0

    # Process documents
    collection = db[collection_name]
    processed = 0
    skipped = 0
    failed = 0

    for i, car in enumerate(docs):
        try:
            if skip_processed and "rating" in car:
                # Skip if already processed
                skipped += 1
            else:
                description = car.get("description", "")
                if not description:
                    logger.warning(f"Document {car.get('_id')} has no description")
                    # Normalise None / other falsy values to an empty string;
                    # the rating function answers "Normal" for it.
                    description = ""

                rating = get_rating_of_a_car(description, good_vector, bad_vector)
                success = write_rating_back_to_db(car["_id"], rating, collection)

                if success:
                    processed += 1
                else:
                    failed += 1
        except Exception as e:
            logger.error(f"Error processing document {i+1}: {e}")
            failed += 1

        # Log progress. Kept outside the try block and after the skip branch
        # so a skipped or failed document cannot suppress a milestone.
        if progress_every and (i + 1) % progress_every == 0:
            logger.info(f"Progress: {i+1}/{len(docs)} | Processed: {processed} | Skipped: {skipped} | Failed: {failed}")

    # Final summary
    logger.info("="*60)
    logger.info("EMBEDDING PROCESS COMPLETED")
    logger.info(f"Total documents: {len(docs)}")
    logger.info(f"Successfully processed: {processed}")
    logger.info(f"Skipped (already rated): {skipped}")
    logger.info(f"Failed: {failed}")
    logger.info("="*60)

In [None]:
# ---------------- RUN ----------------
if __name__ == "__main__":
    # Tunable run parameters.
    COLLECTION_NAME = "listings"  # Collection inside the Honda_cars database
    SKIP_ALREADY_PROCESSED = True  # False re-rates documents that already have a rating

    try:
        description_embedder(
            collection_name=COLLECTION_NAME, skip_processed=SKIP_ALREADY_PROCESSED,
            batch_size=100,
        )
    except KeyboardInterrupt:
        # A manual interrupt is reported as information, not as an error.
        logger.info("Process interrupted by user")
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
    finally:
        # The client is closed whether the run succeeded, failed or was interrupted.
        mdb_client.close()
        logger.info("MongoDB connection closed")