In [1]:
import pandas as pd
import numpy as np
import random
from faker import Faker

In [2]:
fake = Faker()

#Parameters
NUM_USERS = 1587
NUM_PRODUCTS = 27
MAX_INTERACTIONS = 10159 #total interactions to simulate

In [3]:
# generate users
def generate_user(n):
    return pd.DataFrame({
        'user_id': range(1, n + 1),
        'name': [fake.name() for _ in range(n)],
        'gender': [random.choice(['Male', 'Female']) for _ in range(n)],
        'age': np.random.randint(18, 65, size=n),
        'location': [fake.city() for _ in range(n)]
    })

In [4]:
# generate products
def generate_products(n):
    categories = ['Electronic', 'Fashion', 'Home', 'Beauty', 'Sports', 'Books']
    return pd.DataFrame({
        'product_id': range(1, n + 1),
        'product_name': [fake.word().capitalize() + " " + fake.word().capitalize() for _ in range(n)],
        'category': [random.choice(categories) for _ in range(n)],
        'price': np.round(random.uniform(5, 3000),2),
        'description': [fake.sentence() for _ in range(n)],
        'premium': np.random.choice([0,1], size=n, p=[0.9, 0.1]) #10% on premium products
    })

In [5]:
# generate user interaction
def generate_interactions(n, users, products):
    interactions = []
    actions = ['view', 'like', 'share', 'comment', 'purchase']
    user_ids = users['user_id'].values.tolist()
    product_ids = products['product_id'].values.tolist()
    for _ in range(n):
        user = random.choice(user_ids) if user_ids else None
        product = random.choice(product_ids) if product_ids else None
        if user is not None and product is not None:
            action = random.choices(actions, weights=[0.5, 0.2, 0.15, 0.1, 0.05])[0]
            interactions.append((user, product, action))

    return pd.DataFrame(interactions, columns=['user_id', 'product_id', 'action'])

In [6]:
#Generate Data
users_df = generate_user(NUM_USERS)
products_df = generate_products(NUM_PRODUCTS)
interactions_df = generate_interactions(MAX_INTERACTIONS, users_df, products_df)

In [7]:
df_u = users_df.copy()
df_p = products_df.copy()
df_i = interactions_df.copy()

In [8]:
# Display Samples
#df_u.head()
df_p.head()
#df_i.head()

Unnamed: 0,product_id,product_name,category,price,description,premium
0,1,Us Marriage,Beauty,516.9,Bill stop Congress difference.,0
1,2,Cost Month,Books,516.9,Strategy site really industry.,0
2,3,Other These,Fashion,516.9,Office middle apply deep prove.,0
3,4,Fear Better,Beauty,516.9,Agent land toward easy.,0
4,5,Show Force,Sports,516.9,Whole cup edge front few.,0


In [9]:
# Data Preprocessing
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [10]:
df_u#Encoding categorical variables
encoder = LabelEncoder()
df_u['gender'] = encoder.fit_transform(df_u['gender'])
df_u['location'] = encoder.fit_transform(df_u['location'])
df_p['category'] = encoder.fit_transform(df_p['category'])

In [11]:
df_p#Normalize Price
scaler = MinMaxScaler()
df_p['price'] = scaler.fit_transform(df_p[['price']])

In [12]:
# One-Hot Encoding for Actions in Interactions
action_dummies = pd.get_dummies(df_i['action'])
df_i = pd.concat([df_i.drop('action', axis=1), action_dummies], axis=1)

In [13]:
# Aggregate User-Product Interactions
df_in = df_i.groupby(['user_id', 'product_id']).sum().reset_index()

In [46]:
df_in.tail()

Unnamed: 0,user_id,product_id,comment,like,purchase,share,view
9099,1586,26,0,0,1,0,1
9100,1587,4,0,0,0,0,1
9101,1587,10,0,1,0,0,0
9102,1587,20,0,0,0,1,0
9103,1587,27,0,0,0,0,1


In [15]:
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

In [16]:
#Create a user interaction matrix
interaction_matrix = df_in.pivot(index = 'user_id', columns = 'product_id', values='purchase').fillna(0)

In [17]:
#convert to sparse matrix
sparse_matrix = csr_matrix(interaction_matrix.values)

In [18]:
# Apply Singular Value Decomposition (SVD)
svd = TruncatedSVD(n_components=27)  # Reduce dimensionality to 50 latent factors
user_factors = svd.fit_transform(sparse_matrix)
product_factors = svd.components_.T

In [19]:
def predict_ratings(user_id):
    if user_id not in interaction_matrix.index:
        return pd.DataFrame(columns=['product_id', 'product_name', 'category', 'price'])
    
    user_index = interaction_matrix.index.get_loc(user_id)
    user_ratings = np.dot(user_factors[user_index], product_factors.T)
    recommendations = sorted(enumerate(user_ratings), key=lambda x: x[1], reverse=True)
    recommended_products = products_df.iloc[[i[0] for i in recommendations[:10]]]
    return recommended_products[['product_id', 'product_name', 'category', 'price']].reset_index(drop=True)

In [47]:
# Example Recommendation
example_user = 9099
recommended_products = predict_ratings(example_user)
print(f"Recommended products for User {example_user}: {recommended_products}")

Recommended products for User 9099: Empty DataFrame
Columns: [product_id, product_name, category, price]
Index: []


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
# Use TF-IDF for Text-Based Similarity
vectorizer = TfidfVectorizer(stop_words='english')
product_desc_matrix = vectorizer.fit_transform(df_p['description'].fillna(""))

In [23]:
# Compute Similarity Matrix
similarity_matrix = cosine_similarity(product_desc_matrix)

In [24]:
def recommend_similar_products(product_id, top_n=5):
    product_index_list = products_df.index[products_df['product_id'] == product_id].tolist()
    if not product_index_list:
        return pd.DataFrame(columns=['product_id', 'product_name', 'category', 'price'])
    product_index = product_index_list[0]
    similarity_scores = sorted(enumerate(similarity_matrix[product_index]), key=lambda x: x[1], reverse=True)
    similar_products = products_df.iloc[[i[0] for i in similarity_scores[1:top_n+1]]]
    return similar_products[['product_id', 'product_name', 'category', 'price']].reset_index(drop=True)

In [25]:
# Example Recommendation
example_product = df_p['product_id'].iloc[0]
similar_products = recommend_similar_products(example_product)
print(f"Products similar to {example_product}: {similar_products}")

Products similar to 1:    product_id   product_name category  price
0           2     Cost Month    Books  516.9
1           3    Other These  Fashion  516.9
2           4    Fear Better   Beauty  516.9
3           5     Show Force   Sports  516.9
4           6  Season Method  Fashion  516.9


In [26]:
# Hybrid Recommendation System
def hybrid_recommendation(user_id, alpha=0.5, top_n=10):
    collab_recs = predict_ratings(user_id)
    if isinstance(collab_recs, list) or collab_recs.empty:
        return pd.DataFrame(columns=['product_id', 'product_name', 'category', 'price'])
    
    collab_product_ids = collab_recs['product_id'].tolist()
    hybrid_scores = {}
    for product_id in collab_product_ids:
        similar_products = recommend_similar_products(product_id, top_n=3)
        for similar_product_id in similar_products['product_id'].tolist():
            hybrid_scores[similar_product_id] = hybrid_scores.get(similar_product_id, 0) + alpha
    
    for i, product_id in enumerate(collab_product_ids):
        hybrid_scores[product_id] = hybrid_scores.get(product_id, 0) + (1 - alpha) * (top_n - i)
    
    sorted_hybrid = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)
    recommended_product_ids = [product[0] for product in sorted_hybrid[:top_n]]
    return products_df[products_df['product_id'].isin(recommended_product_ids)][['product_id', 'product_name', 'category', 'price']].reset_index(drop=True)

In [50]:
# Example Hybrid Recommendation
example_user = 24
hybrid_recs = hybrid_recommendation(example_user)
print(f"Hybrid recommendations for User {example_user}:")
print(hybrid_recs)

Hybrid recommendations for User 24:
   product_id      product_name category  price
0           1       Us Marriage   Beauty  516.9
1           2        Cost Month    Books  516.9
2           3       Other These  Fashion  516.9
3           4       Fear Better   Beauty  516.9
4           5        Show Force   Sports  516.9
5           6     Season Method  Fashion  516.9
6           7          This New   Beauty  516.9
7           8  Scientist Father     Home  516.9
8           9      Appear Whose   Beauty  516.9
9          15        First Fear    Books  516.9


In [28]:
# Evaluation Metrics
def precision_at_k(recommended, relevant, k=10):
    recommended_at_k = recommended[:k]
    relevant_set = set(relevant)
    return len(set(recommended_at_k) & relevant_set) / k

def recall_at_k(recommended, relevant, k=10):
    recommended_at_k = recommended[:k]
    relevant_set = set(relevant)
    return len(set(recommended_at_k) & relevant_set) / len(relevant) if relevant else 0

def ndcg_at_k(recommended, relevant, k=10):
    def dcg(scores):
        return sum(rel / np.log2(idx + 2) for idx, rel in enumerate(scores))
    
    recommended_at_k = recommended[:k]
    relevant_set = set(relevant)
    relevance_scores = [1 if item in relevant_set else 0 for item in recommended_at_k]
    idcg_scores = sorted(relevance_scores, reverse=True)
    
    return dcg(relevance_scores) / dcg(idcg_scores) if dcg(idcg_scores) > 0 else 0

In [45]:
# Example Evaluation
example_user = 201
recommended_products = hybrid_recommendation(example_user)['product_id'].tolist()
actual_purchases = interactions_df[(df_i['user_id'] == example_user) & (df_i['purchase'] == 1)]['product_id'].tolist()

precision = precision_at_k(recommended_products, actual_purchases)
recall = recall_at_k(recommended_products, actual_purchases)
ndcg = ndcg_at_k(recommended_products, actual_purchases)

print(f"Evaluation for User {example_user}:")
print(f"Precision@10: {precision:.4f}")
print(f"Recall@10: {recall:.4f}")
print(f"NDCG@10: {ndcg:.4f}")

Evaluation for User 201:
Precision@10: 0.0000
Recall@10: 0.0000
NDCG@10: 0.0000


In [34]:
print("Recommended Products:", recommended_products)
print("Actual Purchases:", actual_purchases)

Recommended Products: [1, 2, 3, 8, 9, 10, 11, 15, 20, 22]
Actual Purchases: [11, 2]


In [37]:
def evaluate_recommendation_system(users, top_n=10):
    precision_scores = []
    recall_scores = []
    ndcg_scores = []

    for user_id in users:
        recommended_products = hybrid_recommendation(user_id, top_n=top_n)['product_id'].tolist()
        actual_purchases = df_i[(df_i['user_id'] == user_id) & 
                                           (df_i['purchase'] == 1)]['product_id'].tolist()

        if not actual_purchases:  # Skip users with no purchases
            continue

        # Compute Precision@K
        hits = sum(1 for product in recommended_products if product in actual_purchases)
        precision_at_k = hits / top_n
        precision_scores.append(precision_at_k)

        # Compute Recall@K
        recall_at_k = hits / len(actual_purchases)
        recall_scores.append(recall_at_k)

        # Compute NDCG@K
        dcg = sum(1 / np.log2(i + 2) for i, product in enumerate(recommended_products) if product in actual_purchases)
        idcg = sum(1 / np.log2(i + 2) for i in range(min(len(actual_purchases), top_n)))
        ndcg_at_k = dcg / idcg if idcg > 0 else 0
        ndcg_scores.append(ndcg_at_k)

    # Compute averages
    avg_precision = np.mean(precision_scores) if precision_scores else 0
    avg_recall = np.mean(recall_scores) if recall_scores else 0
    avg_ndcg = np.mean(ndcg_scores) if ndcg_scores else 0

    print(f"Average Precision@{top_n}: {avg_precision:.4f}")
    print(f"Average Recall@{top_n}: {avg_recall:.4f}")
    print(f"Average NDCG@{top_n}: {avg_ndcg:.4f}")

# Get a list of users who have made purchases
purchasing_users = df_i[df_i['purchase'] == 1]['user_id'].unique()

# Evaluate on multiple users
evaluate_recommendation_system(purchasing_users, top_n=10)


Average Precision@10: 0.1152
Average Recall@10: 0.9989
Average NDCG@10: 0.3870
