In [224]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import hstack

def fetch_realtime_data():
    MONGO_URI = "mongodb://localhost:27017"
    client = MongoClient(MONGO_URI)
    db = client["My-Shop"]
    return (
        pd.DataFrame(list(db.users.find())),
        pd.DataFrame(list(db.interactions.find())),
        pd.DataFrame(list(db.products.find()))
    )

def convert_mongo_types(df, id_cols=[], date_cols=[]):
    for col in id_cols:
        if col in df.columns:
            df[col] = df[col].apply(lambda x: str(x) if pd.notnull(x) else x)
    for col in date_cols:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col])
    return df

def preprocess_data():
    users, interactions, products = fetch_realtime_data()
    users = convert_mongo_types(users, id_cols=['_id'], date_cols=[])
    interactions = convert_mongo_types(interactions, id_cols=['productId'], date_cols=['interactionDate'])
    products = convert_mongo_types(products, id_cols=['_id'], date_cols=[])
    products.drop(columns=['dimensions','reviews','minimumOrderQuantity','image','warrantyInformation','weight','availabilityStatus'], inplace=True, errors='ignore')
    users.drop(columns=['image','role','password','email','phone','__v'], inplace=True, errors='ignore')

    def flatten_user_products(users_df, list_type='cartProducts'):
        flattened = []
        for user in users_df.to_dict('records'):
            product_list = user.get(list_type, [])
            if not isinstance(product_list, list):
                continue
            for product in product_list:
                if not isinstance(product, dict):
                    continue
                flattened.append({
                    'user_id': user.get('_id'),
                    'username': user.get('username'),
                    'productId': product.get('_id'),
                    'quantity': product.get('quantity'),
                    'size': product.get('size'),
                    'type': 'cart' if list_type == 'cartProducts' else 'wishlist'
                })
        return pd.DataFrame(flattened)

    cart_flat = flatten_user_products(users, 'cartProducts')
    wish_flat = flatten_user_products(users, 'wishListProducts')
    user_products_df = pd.concat([cart_flat, wish_flat], ignore_index=True)
    interactions.rename(columns={'userId': 'user_id'}, inplace=True)
    merged_interactions = pd.merge(user_products_df, interactions, how='outer', on=['user_id', 'productId'])
    merged_interactions['type'] = merged_interactions['type_x'].combine_first(merged_interactions['type_y'])
    merged_interactions.drop(columns=['type_x', 'type_y'], inplace=True)
    merged_interactions['type'] = merged_interactions['type'].fillna('view')
    merged_interactions['productId'] = merged_interactions['productId'].astype(str)
    products['_id'] = products['_id'].astype(str)
    filtered_products = products[products['_id'].isin(merged_interactions['productId'])].copy()
    final_df = pd.merge(merged_interactions, filtered_products, how='left', left_on='productId', right_on='_id')
    final_df.drop(columns=[col for col in ['_id', '_id_x', '_id_y'] if col in final_df.columns], inplace=True)
    final_df.sort_values(by=['user_id', 'productId'], inplace=True)
    final_df.reset_index(drop=True, inplace=True)
    final_df['combined_text'] = create_text_features(final_df)
    final_df.fillna({'username': 'user', 'size': 'M', 'quantity': 1, 'interactionDate': pd.Timestamp('2024-10-20 15:04:49')}, inplace=True)
    return final_df

final_df


Unnamed: 0,user_id,username,productId,quantity,size,interactionDate,type,title,description,category,subcategory,oldPrice,price,discountPercentage,rating,stock,tags,brand,sku,combined_text
0,67dd3200a0b496548d06657b,user,1f4ddaf9-bb39-4c86-874b-ce7326ad0708,1,M,2024-11-12 21:38:30,cart,Acana Wild Prairie Dog Food 45kg,Acana Wild Prairie Dog Food is made with freer...,Groceries,Pet Food,3999.0,3599.0,10.0,4.6,10.0,dog food grainfree high protein natural ingred...,Acana,ACANAWILD45KG,Acana Wild Prairie Dog Food 45kg Acana Wild Pr...
1,67dd3200a0b496548d06657b,user,26b16f1f-f1dc-4593-891c-42ec7e42964d,1,M,2024-10-20 15:04:49,payment_failed,NatureFresh Star Fruit Carambola 4 PCs,NatureFresh star fruit or carambola is a tropi...,Groceries,Fresh Produce,200.0,170.0,15.0,4.1,8.0,exotic fruit vitamin C tropical lowcalorie,NatureFresh,NATUREFRESHSTAR4PC,NatureFresh Star Fruit Carambola 4 PCs NatureF...
2,67dd3200a0b496548d06657b,user,512a79ac-79e5-4843-8f80-346dcdcfdb3e,1,M,2025-11-11 01:34:20,search,LOreal Paris Revitalift Crystal MicroEssence,Lightweight essence with salicylic acid and hy...,Beauty Care,Skincare,899.0,764.0,15.0,4.7,70.0,microessence brightening hyaluronic acid salic...,LOreal Paris,LOREALREVITALIFT,LOreal Paris Revitalift Crystal MicroEssence L...
3,67dd3200a0b496548d06657b,user,c2c1c4d7-88c8-4627-b78e-2e712953aac8,1,M,2024-02-08 08:43:21,order,Green Soul Ergonomic Study Chair Mesh Black,This ergonomic study chair features breathable...,Home Living,Furniture,6999.0,5949.0,15.0,4.4,20.0,Study Chair Ergonomic Mesh Comfort,Green Soul,GSCHAIRMESHBLACK,Green Soul Ergonomic Study Chair Mesh Black Th...
4,67dd3200a0b496548d06657b,user,c2c1c4d7-88c8-4627-b78e-2e712953aac8,1,M,2025-06-28 01:07:43,remove_from_cart,Green Soul Ergonomic Study Chair Mesh Black,This ergonomic study chair features breathable...,Home Living,Furniture,6999.0,5949.0,15.0,4.4,20.0,Study Chair Ergonomic Mesh Comfort,Green Soul,GSCHAIRMESHBLACK,Green Soul Ergonomic Study Chair Mesh Black Th...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
550,67f94ec6fc13ae13ef67061e,Magda Borrell,26b16f1f-f1dc-4593-891c-42ec7e42964d,5,S,2024-10-20 15:04:49,cart,NatureFresh Star Fruit Carambola 4 PCs,NatureFresh star fruit or carambola is a tropi...,Groceries,Fresh Produce,200.0,170.0,15.0,4.1,8.0,exotic fruit vitamin C tropical lowcalorie,NatureFresh,NATUREFRESHSTAR4PC,NatureFresh Star Fruit Carambola 4 PCs NatureF...
551,67f94ec6fc13ae13ef67061e,Magda Borrell,39a77353-4a3f-4c7f-9db7-9df87252163e,1,M,2024-10-20 15:04:49,wishlist,Bata Mens Sandals Brown,Comfortable brown sandals with adjustable stra...,Footwear,Mens Footwear,1499.0,1199.0,20.0,4.3,40.0,sandals casual comfortable men,Bata,BATASANDALS23,Bata Mens Sandals Brown Comfortable brown sand...
552,67f94ec6fc13ae13ef67061e,Magda Borrell,512a79ac-79e5-4843-8f80-346dcdcfdb3e,5,M,2024-10-20 15:04:49,cart,LOreal Paris Revitalift Crystal MicroEssence,Lightweight essence with salicylic acid and hy...,Beauty Care,Skincare,899.0,764.0,15.0,4.7,70.0,microessence brightening hyaluronic acid salic...,LOreal Paris,LOREALREVITALIFT,LOreal Paris Revitalift Crystal MicroEssence L...
553,67f94ec6fc13ae13ef67061e,Magda Borrell,910622ef-85a4-4638-a6df-1536021467ab,5,,2024-10-20 15:04:49,cart,Organic Harvest Methi Leaves Fresh 100g,Organic Harvest methi leaves are aromatic and ...,Groceries,Fresh Produce,30.0,25.0,16.7,4.3,22.0,herbs ironrich fresh produce cooking,Organic Harvest,ORGHARMETHI100G,Organic Harvest Methi Leaves Fresh 100g Organi...


In [230]:
def create_text_features(df):
    text_components = []
    for col in [
        "subcategory",
        "tags",
        "brand",
        "title",
        "description",
        "sku",
        "category",
    ]:
        if col == "tags":
            df[col] = df[col].apply(
                lambda x: ", ".join(map(str, x)) if isinstance(x, list) else ""
            )
        df[col] = df[col].fillna("").astype(str).str.replace(r"[^\w\s]", "", regex=True)
        text_components.append(df[col])
    return text_components[0].str.cat(text_components[1:], sep=" ")


def safe_feature_scaling(df, features, prefix="scaled"):
    valid_features = [f for f in features if f in df.columns]
    df_filled = df[valid_features].copy()
    for col in valid_features:
        df_filled[col] = pd.to_numeric(df_filled[col], errors="coerce").fillna(0)

    scaler = MinMaxScaler()
    scaled_array = scaler.fit_transform(df_filled)
    scaled_df = pd.DataFrame(scaled_array, columns=valid_features)

    # Apply custom weights for tuning
    weight_map = {"price": 0.5, "discountPercentage": 2.0, "rating": 3.0, "stock": 1.0}

    for col in scaled_df.columns:
        if col in weight_map:
            scaled_df[col] *= weight_map[col]

    return scaled_df.add_prefix(f"{prefix}_")


def realtime_similarity_engine(df, text_col="combined_text", num_cols=None):
    tfidf = TfidfVectorizer(
        stop_words="english", max_features=8000, ngram_range=(1, 2), min_df=2
    )
    tfidf_matrix = tfidf.fit_transform(df[text_col])

    num_scaled = safe_feature_scaling(df, num_cols)
    text_weight = 1.2
    combined_features = hstack([tfidf_matrix * text_weight, num_scaled]).tocsr()

    n_neighbors = min(30, len(df) - 1)
    nn = NearestNeighbors(
        n_neighbors=n_neighbors + 1, metric="cosine", algorithm="brute"
    )
    nn.fit(combined_features)

    return nn, combined_features


def realtime_recommendations(product_id, model, features_matrix, df, top_n=10):
    try:
        product_idx = df[df["productId"] == product_id].index[0]

        max_neighbors = min(
            top_n + 20, features_matrix.shape[0]
        )  # Safely limit n_neighbors
        distances, indices = model.kneighbors(
            features_matrix[product_idx], n_neighbors=max_neighbors
        )

        results = df.iloc[indices[0]].copy()
        results = results[results["productId"] != product_id]
        results = results.drop_duplicates(subset="productId")
        results = results[results["stock"] > 0].head(top_n)

        return results["productId"].tolist()
    except (IndexError, KeyError):
        return get_fallback_recommendations(df)


def get_fallback_recommendations(df):
    return df[df["stock"] > 0]["productId"].drop_duplicates().head(10).tolist()


if __name__ == "__main__":
    full_df = preprocess_data()
    full_df["combined_text"] = create_text_features(full_df)

    unique_products_df = full_df.drop_duplicates(subset="productId").reset_index(
        drop=True
    )

    print("Final Data Shape:", unique_products_df.shape)
    print("Unique Products:", unique_products_df["productId"].nunique())

    model, feature_matrix = realtime_similarity_engine(
        unique_products_df, num_cols=["price", "discountPercentage", "rating", "stock"]
    )

    sample_id = unique_products_df["productId"].iloc[11
    ]
    print("Sample Product:", unique_products_df.loc[11
    , "title"])

    recommendations = realtime_recommendations(
        sample_id, model, feature_matrix, unique_products_df
    )
    print("Recommended product IDs:", sample_id, recommendations)

Final Data Shape: (16, 20)
Unique Products: 16
Sample Product: Organic Harvest Methi Leaves Fresh 100g
Recommended product IDs: 910622ef-85a4-4638-a6df-1536021467ab ['26b16f1f-f1dc-4593-891c-42ec7e42964d', '799db516-680e-4454-824b-4845e0bf19e0', '1f4ddaf9-bb39-4c86-874b-ce7326ad0708', '39a77353-4a3f-4c7f-9db7-9df87252163e', 'c2c1c4d7-88c8-4627-b78e-2e712953aac8', 'da0567f7-2ddb-4653-bfbc-03aaa58e50ff', 'd556bb73-84d5-4252-8134-6b6e865837c7', 'a45e9eb4-b713-430f-95e8-1ff3f1f8b703', '20354465-8e77-4e9c-b736-5372a00d6a0e', 'ff935b33-0142-4564-95c1-30232db68840']
