In [43]:
events_data["userId"] = events_data["userId"].astype(str)
print(events_data["userId"].unique())  # See all unique user IDs in the dataset
print(events_data[events_data["userId"] == "67b0f429ea5564202b96a19e"])


['65f3c55249132e4527d465f7' '67b04f3cdc12c6e7d07cff48'
 '67b0587a928d6fd8bf045fdf' '67b0bc586ffde1a21eaf0e1f'
 '67b04dcf928d6fd8bf045fc9' '67b0f429ea5564202b96a19e']
                         _id                    userId  \
54  67b0f4e1ea5564202b96a1a1  67b0f429ea5564202b96a19e   
55  67b0f4e2ea5564202b96a1a4  67b0f429ea5564202b96a19e   
56  67b0f4e3ea5564202b96a1a7  67b0f429ea5564202b96a19e   
57  67b0f4ecea5564202b96a1aa  67b0f429ea5564202b96a19e   
58  67b0f4ecea5564202b96a1ad  67b0f429ea5564202b96a19e   
59  67b0f4edea5564202b96a1b0  67b0f429ea5564202b96a19e   
60  67b0f4eeea5564202b96a1b3  67b0f429ea5564202b96a19e   
61  67b0f4eeea5564202b96a1b6  67b0f429ea5564202b96a19e   
62  67b0f4efea5564202b96a1b9  67b0f429ea5564202b96a19e   
63  67b0f580ea5564202b96a1c1  67b0f429ea5564202b96a19e   
64  67b0f580ea5564202b96a1c3  67b0f429ea5564202b96a19e   
65  67b0f580ea5564202b96a1c5  67b0f429ea5564202b96a19e   
66  67b0f580ea5564202b96a1c7  67b0f429ea5564202b96a19e   

                   pr

In [51]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pymongo import MongoClient

# ✅ Connect to MongoDB
client = MongoClient("mongodb+srv://anashenrya:attackanas17@onlineecommerce-cluster.0ckq4.mongodb.net/?retryWrites=true&w=majority&appName=OnlineEcommerce-Cluster/test")
db = client["test"]

# ✅ Load product data
products_data = pd.DataFrame(list(db.products.find({}, {"_id": 1, "title": 1, "description": 1})))

# ✅ Load event data (user interactions)
events_data = pd.DataFrame(list(db.events.find({}, {"userId": 1, "productId": 1, "eventType": 1})))

# ✅ Debug: Print the first few rows to check if user data exists
# print("First few rows of events_data:")
# print(events_data.head())

# ✅ Assign event weights
event_weights = {"view": 1,"search":2, "add_to_cart": 3, "purchase": 5}
events_data["weight"] = events_data["eventType"].map(event_weights)

# ✅ Fix FutureWarning (avoid chained assignment)
products_data = products_data.copy()
products_data["title"] = products_data["title"].fillna("")
products_data["description"] = products_data["description"].fillna("")

# ✅ Combine title and description for content filtering
products_data["combined_features"] = products_data["title"] + " " + products_data["description"]

# ✅ Compute TF-IDF matrix
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(products_data["combined_features"])

# ✅ Compute cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


# ✅ Function: Content-Based Filtering
def content_based_recommendations(product_id, top_n=5):
    if product_id not in products_data["_id"].values:
        print(f"⚠ No product found for ID: {product_id}")
        return []

    idx = products_data.index[products_data["_id"] == product_id][0]
    similarity_scores = list(enumerate(cosine_sim[idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    recommended_indices = [i[0] for i in similarity_scores[1:top_n+1]]
    recommended_products = products_data.iloc[recommended_indices]["_id"].tolist()
    
    return recommended_products


# ✅ Function: Collaborative Filtering
def collaborative_filtering(user_id, top_n=5):
    user_events = events_data[events_data["userId"] == str(user_id)]

    if user_events.empty:
        print(f"⚠ No interactions found for user: {user_id}")
        return []

    print(f"✅ User {user_id} found with {len(user_events)} interactions!")


    # Aggregate interactions by user-product pairs
    user_interactions = events_data.groupby(["userId", "productId"])["weight"].sum().reset_index()

    # Create a pivot table
    user_item_matrix = user_interactions.pivot(index="userId", columns="productId", values="weight").fillna(0)

    if user_id not in user_item_matrix.index:
        print(f"⚠ User {user_id} has no recorded interactions.")
        return []

    # Compute similarity scores
    user_vector = user_item_matrix.loc[user_id].values.reshape(1, -1)
    similarity_scores = cosine_similarity(user_vector, user_item_matrix)[0]

    # Rank users by similarity
    similar_users = list(user_item_matrix.index[np.argsort(similarity_scores)[::-1]][1:])

    recommended_products = set()
    for similar_user in similar_users:
        recommended_products.update(user_interactions[user_interactions["userId"] == similar_user]["productId"].tolist())

        if len(recommended_products) >= top_n:
            break

    return list(recommended_products)[:top_n]


# ✅ Function: Hybrid Recommendation (Content + Collaborative)
def get_recommendations_for_user(user_id, top_n=5):
    if not events_data[events_data["userId"] == str(user_id)].empty:
        print(f"✅ User {user_id} found in events_data!")
    else:
        print(f"⚠ No interactions found for user: {user_id}")


    # ✅ Get the most interacted product for this user
    user_products = events_data[events_data["userId"] == user_id]["productId"]
    
    if user_products.empty:
        print(f"⚠ No product interactions found for user {user_id}")
        return []

    first_product_id = user_products.iloc[0]

    # ✅ Get recommendations
    content_recommendations = content_based_recommendations(first_product_id, top_n)
    collaborative_recommendations = collaborative_filtering(user_id, top_n)

    # ✅ Hybrid approach: combine and remove duplicates
    hybrid_recommendations = list(set(content_recommendations + collaborative_recommendations))

    # ✅ Print results
    print(f"📌 Content-Based Recommendations: {content_recommendations}")
    print(f"📌 Collaborative Filtering Recommendations: {collaborative_recommendations}")
    print(f"📌 Hybrid Recommendations: {hybrid_recommendations}")

    return hybrid_recommendations


# ✅ Example: Call the function with a specific user ID
user_id_to_test = "67b0f429ea5564202b96a19e"  # Change this to the user ID you want
get_recommendations_for_user(user_id_to_test)


⚠ No product interactions found for user 67b0f429ea5564202b96a19e


[]

In [49]:
events_data["userId"] = events_data["userId"].astype(str)
print(events_data["userId"].unique())  # See all unique user IDs in the dataset
print(events_data[events_data["userId"] == "67b04f3cdc12c6e7d07cff48"])


['65f3c55249132e4527d465f7' '67b04f3cdc12c6e7d07cff48'
 '67b0587a928d6fd8bf045fdf' '67b0bc586ffde1a21eaf0e1f'
 '67b04dcf928d6fd8bf045fc9' '67b0f429ea5564202b96a19e']
                         _id                    userId  \
10  67b0a5f2b87d10638a0e1ce1  67b04f3cdc12c6e7d07cff48   
11  67b0a636b87d10638a0e1ce5  67b04f3cdc12c6e7d07cff48   
12  67b0a663b87d10638a0e1cef  67b04f3cdc12c6e7d07cff48   
13  67b0a663b87d10638a0e1cf1  67b04f3cdc12c6e7d07cff48   
14  67b0a663b87d10638a0e1cf3  67b04f3cdc12c6e7d07cff48   
15  67b0a663b87d10638a0e1cf5  67b04f3cdc12c6e7d07cff48   
16  67b0a663b87d10638a0e1cf8  67b04f3cdc12c6e7d07cff48   
17  67b0a664b87d10638a0e1cfa  67b04f3cdc12c6e7d07cff48   
18  67b0a664b87d10638a0e1cfc  67b04f3cdc12c6e7d07cff48   
19  67b0b839b87d10638a0e1d00  67b04f3cdc12c6e7d07cff48   
20  67b0b972b87d10638a0e1d05  67b04f3cdc12c6e7d07cff48   
21  67b0bc11b87d10638a0e1d18  67b04f3cdc12c6e7d07cff48   
22  67b0bc3db87d10638a0e1d1a  67b04f3cdc12c6e7d07cff48   
68  67b104679b698ac759

In [27]:
print(events_data.dtypes)


_id          object
userId       object
productId    object
eventType    object
weight        int64
dtype: object


In [53]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pymongo import MongoClient

# ✅ Connect to MongoDB
client = MongoClient("mongodb+srv://anashenrya:attackanas17@onlineecommerce-cluster.0ckq4.mongodb.net/?retryWrites=true&w=majority&appName=OnlineEcommerce-Cluster/test")
db = client["test"]

# ✅ Load product data
products_data = pd.DataFrame(list(db.products.find({}, {"_id": 1, "title": 1, "description": 1})))

# ✅ Load event data (user interactions)
events_data = pd.DataFrame(list(db.events.find({}, {"userId": 1, "productId": 1, "eventType": 1})))

# Potential fixes: Convert _id to string and strip whitespace
products_data["_id"] = products_data["_id"].astype(str)
events_data["productId"] = events_data["productId"].astype(str)  # Ensure productId is a string too!
products_data["_id"] = products_data["_id"].str.strip()
events_data["productId"] = events_data["productId"].str.strip()

# Debugging data types and missing values
print("Data Types of events_data:")
print(events_data.dtypes)
print("\nMissing values in events_data:")
print(events_data.isnull().sum())


# Added debugging prints
print("First few rows of events_data:")
print(events_data.head())
print("\nUnique user IDs in events_data:")
print(events_data["userId"].unique())

# Debug prints for product IDs
print("Unique product IDs in events_data:")
print(events_data["productId"].unique())
print("\nUnique _id values in products_data:")
print(products_data["_id"].unique())


# ✅ Assign event weights
event_weights = {"view": 1,"search":2, "add_to_cart": 3, "purchase": 5}
events_data["weight"] = events_data["eventType"].map(event_weights)

# ✅ Fix FutureWarning (avoid chained assignment)
products_data = products_data.copy()
products_data["title"] = products_data["title"].fillna("")
products_data["description"] = products_data["description"].fillna("")

# ✅ Combine title and description for content filtering
products_data["combined_features"] = products_data["title"] + " " + products_data["description"]

# ✅ Compute TF-IDF matrix
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(products_data["combined_features"])

# ✅ Compute cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


# ✅ Function: Content-Based Filtering
def content_based_recommendations(product_id, top_n=5):
    if product_id not in products_data["_id"].values:
        print(f"⚠ No product found for ID: {product_id}")
        return []

    idx = products_data.index[products_data["_id"] == product_id][0]
    similarity_scores = list(enumerate(cosine_sim[idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    recommended_indices = [i[0] for i in similarity_scores[1:top_n+1]]
    recommended_products = products_data.iloc[recommended_indices]["_id"].tolist()
    
    return recommended_products


# ✅ Function: Collaborative Filtering
def collaborative_filtering(user_id, top_n=5):
    user_events = events_data[events_data["userId"] == str(user_id)]

    if user_events.empty:
        print(f"⚠ No interactions found for user: {user_id}")
        return []

    print(f"✅ User {user_id} found with {len(user_events)} interactions!")


    # Aggregate interactions by user-product pairs
    user_interactions = events_data.groupby(["userId", "productId"])["weight"].sum().reset_index()

    # Create a pivot table
    user_item_matrix = user_interactions.pivot(index="userId", columns="productId", values="weight").fillna(0)

    if user_id not in user_item_matrix.index:
        print(f"⚠ User {user_id} has no recorded interactions.")
        return []

    # Compute similarity scores
    user_vector = user_item_matrix.loc[user_id].values.reshape(1, -1)
    similarity_scores = cosine_similarity(user_vector, user_item_matrix)[0]

    # Rank users by similarity
    similar_users = list(user_item_matrix.index[np.argsort(similarity_scores)[::-1]][1:])

    recommended_products = set()
    for similar_user in similar_users:
        recommended_products.update(user_interactions[user_interactions["userId"] == similar_user]["productId"].tolist())

        if len(recommended_products) >= top_n:
            break

    return list(recommended_products)[:top_n]


# ✅ Function: Hybrid Recommendation (Content + Collaborative)
def get_recommendations_for_user(user_id, top_n=5):
    print(f"Entering get_recommendations_for_user for user: {user_id}") # Debug
    user_events = events_data[events_data["userId"] == str(user_id)]

    if user_events.empty:
        print(f"⚠ No interactions found for user: {user_id}")
        return []

    print(f"✅ User {user_id} found with {len(user_events)} interactions!")
    print(f"User events data: \n{user_events}") # Debug

    # ✅ Get the most interacted product for this user
    user_products = events_data[events_data["userId"] == str(user_id)]["productId"]

    print(f"User products (before checking if empty): \n{user_products}") # Debug

    if user_products.empty:
        print(f"⚠ No product interactions found for user {user_id}")
        return []

    print(f"User products (after checking if empty): \n{user_products}") # Debug
    first_product_id = user_products.iloc[0]
    print(f"First product ID: {first_product_id}") # Debug

    # ✅ Get recommendations
    content_recommendations = content_based_recommendations(first_product_id, top_n)
    collaborative_recommendations = collaborative_filtering(user_id, top_n)

    # ✅ Hybrid approach: combine and remove duplicates
    hybrid_recommendations = list(set(content_recommendations + collaborative_recommendations))

    # ✅ Print results
    print(f"📌 Content-Based Recommendations: {content_recommendations}")
    print(f"📌 Collaborative Filtering Recommendations: {collaborative_recommendations}")
    print(f"📌 Hybrid Recommendations: {hybrid_recommendations}")

    return hybrid_recommendations


# ✅ Example: Call the function with a specific user ID
user_id_to_test = "67b0f429ea5564202b96a19e"  # Change this to the user ID you want
get_recommendations_for_user(user_id_to_test)


Data Types of events_data:
_id          object
userId       object
productId    object
eventType    object
dtype: object

Missing values in events_data:
_id          0
userId       0
productId    0
eventType    0
dtype: int64
First few rows of events_data:
                        _id                    userId  \
0  67b04f1ddc12c6e7d07cff38  65f3c55249132e4527d465f7   
1  67b04f22dc12c6e7d07cff3c  65f3c55249132e4527d465f7   
2  67b04f24dc12c6e7d07cff40  65f3c55249132e4527d465f7   
3  67b04f28dc12c6e7d07cff44  65f3c55249132e4527d465f7   
4  67b04f91dc12c6e7d07cff4e  65f3c55249132e4527d465f7   

                  productId eventType  
0  67b04d562784417bd80f966b      view  
1  67b04d562784417bd80f966a      view  
2  67b04d562784417bd80f9669      view  
3  67b04d562784417bd80f9668      view  
4  67b04d562784417bd80f9661      view  

Unique user IDs in events_data:
[ObjectId('65f3c55249132e4527d465f7') ObjectId('67b04f3cdc12c6e7d07cff48')
 ObjectId('67b0587a928d6fd8bf045fdf') ObjectId('67b0

[]