In [None]:
import pandas as pd
import numpy as np

In [None]:
# Catalogue tables: article metadata and customer metadata
articles_df = pd.read_csv('./dataset/articles.csv')
customers_df = pd.read_csv('./dataset/customers.csv')

# Fold-0 transaction split: the train file fits the model, the test file is held out for validation
training_transactions, validation_transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/split/fold_0/train.csv', './dataset/split/fold_0/test.csv')
)

# Quick schema check: print the column index of each loaded table
for label, frame in (
    ("articles_df: ", articles_df),
    ("customers_df: ", customers_df),
    ("transactions_df: ", training_transactions),
):
    print(label, frame.columns)

In [None]:
# Number of training transactions per article, most frequently purchased first
article_purchase_counts = training_transactions["article_id"].value_counts()
article_purchase_counts

In [None]:
# Step 2: Positive samples
# One row per customer; the 'article_id' cell holds the list of every article they purchased
purchases_by_customer = training_transactions.groupby('customer_id')['article_id']
positive_samples = purchases_by_customer.agg(list).reset_index()

# Every article id known to the catalogue, stored as strings
# NOTE(review): the transaction ids above are NOT cast to str here - confirm both sides share a dtype
all_articles = set(articles_df['article_id'].astype(str))

# Step 3: Generate Negative Samples
def generate_negatives(row, negatives_per_positive=1):
    """Sample articles the customer did not purchase, to be used as negatives.

    Reads the module-level set ``all_articles`` as the candidate catalogue.

    Args:
        row: A row of ``positive_samples``; ``row['article_id']`` is the list of
            purchased article ids.
        negatives_per_positive: How many negatives to draw per purchased article.
            Defaults to 1, which is the ratio the code has always used (the old
            comments claimed 1:5, which the code never did).

    Returns:
        A list of distinct article ids drawn without replacement from the
        catalogue minus the customer's purchases.
    """
    positives = set(row['article_id'])
    # NOTE(review): all_articles holds str ids; if row['article_id'] holds ints the
    # difference below removes nothing - confirm both sides use the same dtype.
    possible_negatives = list(all_articles - positives)
    # Never ask for more samples than there are candidates (sampling is without replacement)
    num_negatives = min(len(row['article_id']) * negatives_per_positive, len(possible_negatives))
    return np.random.choice(possible_negatives, num_negatives, replace=False).tolist()

# Apply function to generate negative samples
# positive_samples['negative_samples'] = positive_samples.apply(generate_negatives, axis=1)

# Step 4: Compile Training Data
# Each customer contributes one "sentence": the list of article ids they purchased.
# Negative sentences stay disabled together with the sampling step above.
# Series.tolist() hands back the very same per-customer list objects the old
# iterrows() loop collected, without building a Series for every row.
training_data = positive_samples['article_id'].tolist()

# Now training_data can be used for embedding training


In [None]:
# Distinct articles and customers in the training split
unique_items_count = training_transactions['article_id'].nunique()
print(f"Number of unique items in the training transactions: {unique_items_count}")

unique_users_count = training_transactions['customer_id'].nunique()
print(f"Number of unique users in the training transactions: {unique_users_count}")

# Distinct articles and customers in the validation split.
# Kept under their own names so the training counts above are not overwritten.
validation_unique_items_count = validation_transactions['article_id'].nunique()
print(f"Number of unique items in the validation transactions: {validation_unique_items_count}")

validation_unique_users_count = validation_transactions['customer_id'].nunique()
print(f"Number of unique users in the validation transactions: {validation_unique_users_count}")

In [None]:
# The warning filters are installed BEFORE gensim is imported so that they are
# already active while the import runs; keep this statement order.
import warnings
# Silence UserWarnings raised from modules matching 'gensim'
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
# Silence warnings whose message starts with the numpy binary-compatibility notices
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
import gensim
# Abort early unless gensim reports FAST_VERSION > -1
# NOTE(review): presumably this guards against the slow non-compiled training path - confirm in the gensim docs
assert gensim.models.word2vec.FAST_VERSION > -1
from collections import Counter
import random

# Randomise the article order inside every customer's sentence (shuffles the lists in place)
for sentence in training_data:
    random.shuffle(sentence)

print(len(training_data))

# Sentence length = number of purchases recorded for that customer
lengths = list(map(len, training_data))

# length -> how many sentences have exactly that length
length_distribution = Counter(lengths)

# Report the ten smallest lengths together with their frequencies
print("Length distribution of sublists:")
ten_shortest = sorted(length_distribution.items())[:10]
for length, count in ten_shortest:
    print(f"Length {length}: {count} times")

In [None]:
from gensim.models import Word2Vec
import time

start = time.time()

# Item2vec: every customer's purchase list is treated as one "sentence" of article ids.
model = Word2Vec(sentences=training_data,  # one list of purchased article ids per customer
                 epochs=15,                # number of iterations over the corpus
                 min_count=10,             # drop articles that occur fewer than 10 times
                 vector_size=128,          # embedding vector size
                 workers=8,                # number of threads to use for training
                 sg=1,                     # using skip-gram algorithm
                 hs=0,                     # hierarchical softmax not used, we use negative sampling instead
                 negative=5,               # number of negative samples
                 window=9999,              # window far larger than any basket: all items of a customer are context
                 compute_loss=True)        # without this no loss is tracked and the call below reports nothing useful

duration = time.time() - start
print("Time passed: {:.2f} seconds".format(duration))
# To save the model for later use
model.save('item2vec.model')
# Training loss recorded by gensim (only meaningful because compute_loss=True above)
model.get_latest_training_loss()

In [None]:
from gensim.models import Word2Vec

# Load the model
# model = Word2Vec.load('item2vec.model')

# Now you can use the model
# For example, to find vectors or perform operations like finding similar items
# print(model.wv['some_item'])  # Replace 'some_item' with a valid item name from your training data


In [None]:
import numpy as np

# Inputs: `training_transactions` (columns 'customer_id' and 'article_id' are used)
# and, further down, the trained Word2Vec `model`.

# Series indexed by customer_id; each value is the list of article ids that customer bought
purchases_by_customer = training_transactions.groupby('customer_id')['article_id']
user_items = purchases_by_customer.apply(list)

# Customer profile = mean embedding of the customer's in-vocabulary items
def calculate_user_profile(item_ids, model):
    """Average the embedding vectors of the given items.

    Args:
        item_ids: Iterable of item keys purchased by one customer.
        model: Object exposing ``wv.key_to_index``, ``wv[key]`` and ``vector_size``
            (here: the trained Word2Vec model).

    Returns:
        The element-wise mean of the vectors of all items found in the model's
        vocabulary, or a zero vector of length ``model.vector_size`` when none
        of the items is in the vocabulary.
    """
    vocabulary = model.wv.key_to_index
    # Items absent from the vocabulary are skipped
    known_vectors = [model.wv[item_id] for item_id in item_ids if item_id in vocabulary]
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# Turn every customer's purchase list into a single profile vector
user_profiles = user_items.apply(lambda purchased_ids: calculate_user_profile(purchased_ids, model))

# Now `user_profiles` contains the vector representation (profile) for each user

In [None]:
# Sanity checks on `user_profiles`, which was built in the previous cell

# How many profiles there are
print("Shape of user_profiles:", user_profiles.shape)

# Look at one profile to confirm its container type and vector length
if len(user_profiles):
    first_element = user_profiles.iloc[0]
    print("Data type of user profiles:", type(first_element))
    print("Length of a single user profile vector:", len(first_element))

# True when at least one entry is null
null_data = user_profiles.isnull().any()
print("Any null user profiles?:", null_data)

# Eyeball the first few entries
print("Sample user profiles:")
print(user_profiles.head())


In [None]:
import torch
from torch import nn
from tqdm import tqdm
# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Requires `model` and `user_profiles` from the earlier cells.
# Item embedding matrix: row i is the vector of item_ids[i].
# The vectors are packed into ONE ndarray first; handing torch.tensor a Python
# list of ndarrays takes its very slow element-by-element path.
item_ids = list(model.wv.index_to_key)
item_vectors = torch.tensor(np.array([model.wv[item] for item in item_ids]),
                            dtype=torch.float, device=device)
# Unit-length rows, so a dot product with another unit vector is the cosine similarity
item_vectors_norm = item_vectors / item_vectors.norm(dim=1, keepdim=True)

# User profile matrix: row i is the profile of user_ids[i]
user_ids = list(user_profiles.keys())
user_vectors = torch.tensor(np.array(list(user_profiles.values)),
                            dtype=torch.float, device=device)

# Number of users scored per matrix multiplication
batch_size = 4096

# Function to process batches and get recommendations
def process_batch(start_idx, end_idx, top_k=100):
    """Recommend the most similar items for users ``start_idx`` .. ``end_idx - 1``.

    Reads the module-level ``user_vectors``, ``item_vectors_norm``, ``user_ids``
    and ``item_ids``; row ``i`` of ``user_vectors`` must belong to ``user_ids[i]``.

    Args:
        start_idx: Index of the first user row in the batch (inclusive).
        end_idx: Index one past the last user row in the batch.
        top_k: Number of items to return per user (default 100, as before).
            Capped at the number of available items.

    Returns:
        dict mapping user id -> list of item ids ordered by decreasing cosine
        similarity. Users whose profile is the all-zero vector have similarity 0
        to every item, so their list carries no ranking information.
    """
    batch_user_vectors = user_vectors[start_idx:end_idx]
    # An all-zero profile has norm 0; clamping keeps the division from producing NaN rows
    batch_norms = batch_user_vectors.norm(dim=1, keepdim=True).clamp_min(1e-12)
    batch_user_vectors_norm = batch_user_vectors / batch_norms

    # Both sides are unit length, so this matrix product is the cosine similarity
    similarities = torch.mm(batch_user_vectors_norm, item_vectors_norm.t())

    # torch.topk raises when k exceeds the number of columns (items)
    k = min(top_k, similarities.shape[1])
    top_indices = torch.topk(similarities, k, dim=1).indices.cpu().tolist()

    # Map row positions back to user ids and column indices back to item ids
    return {user_ids[start_idx + row]: [item_ids[idx] for idx in row_indices]
            for row, row_indices in enumerate(top_indices)}

# Score the users batch by batch and merge the per-batch results into one dict
user_recommendations = {}
total_users = len(user_vectors)
for start_idx in tqdm(range(0, total_users, batch_size)):
    # The last batch may be shorter than batch_size
    end_idx = min(start_idx + batch_size, total_users)
    batch_recommendations = process_batch(start_idx, end_idx)
    user_recommendations.update(batch_recommendations)

# # Print or use these recommendations
# for user_id, recommendations in user_recommendations.items():
#     print(f"Recommendations for User {user_id}: {recommendations}")


In [None]:
# Step 1: ground truth - the list of articles each customer bought in the validation split
validation_by_customer = validation_transactions.groupby('customer_id')['article_id']
actual_purchases = validation_by_customer.agg(list)

# Step 3: Compute MAP@K (this function scores ONE user; the caller averages)
def compute_map_k(actual, predicted, k=12):
    """Average precision at ``k`` for a single ranked recommendation list.

    Args:
        actual: Items the user really purchased; duplicates are ignored.
        predicted: Recommended items, best first.
        k: Only the first ``k`` predictions are considered.

    Returns:
        AP@k in [0, 1]; 0.0 when ``actual`` is empty.
    """
    relevant = set(actual)
    if not relevant:
        return 0.0
    credited = set()
    precision_sum = 0.0
    for rank, item in enumerate(predicted[:k], start=1):
        # A repeated prediction is credited once only; otherwise AP could exceed 1
        if item in relevant and item not in credited:
            credited.add(item)
            precision_sum += len(credited) / rank
    return precision_sum / min(len(relevant), k)

# Score only customers that have both ground truth and recommendations.
# NOTE(review): this rebinds `user_ids`, the name process_batch reads - re-run the
# tensor setup cell before calling process_batch again.
user_ids = actual_purchases.index.intersection(user_recommendations.keys())
actual_lists, predicted_lists = [], []
for user_id in user_ids:
    actual_lists.append(actual_purchases.loc[user_id])
    predicted_lists.append(user_recommendations[user_id])

# Per-user AP@12, then the mean over users
map_scores = []
for actual, predicted in zip(actual_lists, predicted_lists):
    map_scores.append(compute_map_k(actual, predicted, 12))
mean_average_precision = np.mean(map_scores)

print(f"MAP@12: {mean_average_precision}")

In [None]:
import numpy as np

# AP@N for ONE user's ranked recommendations (the caller averages over users)
def calculate_map_at_n(predictions, ground_truth, top_n=12):
    """Average precision at ``top_n`` for a single ranked recommendation list.

    Args:
        predictions: Recommended items, best first.
        ground_truth: Items the user really purchased; duplicates are ignored,
            so buying the same article twice no longer inflates the denominator.
        top_n: Only the first ``top_n`` predictions are considered.

    Returns:
        AP@top_n in [0, 1]; 0.0 when ``ground_truth`` is empty.
    """
    relevant = set(ground_truth)
    if not relevant:
        return 0.0
    credited = set()
    precision_sum = 0.0
    for rank, item in enumerate(predictions[:top_n], start=1):
        # Credit each relevant item once, at the rank where it first appears
        if item in relevant and item not in credited:
            credited.add(item)
            precision_sum += len(credited) / rank
    return precision_sum / min(len(relevant), top_n)

# Score only customers that have both ground truth and recommendations
user_ids = actual_purchases.index.intersection(user_recommendations.keys())
actual_lists, predicted_lists = [], []
for user_id in user_ids:
    actual_lists.append(actual_purchases.loc[user_id])
    predicted_lists.append(user_recommendations[user_id])

# Per-user AP@12 (note the argument order: predictions first), then the mean over users
map_scores = []
for actual, predicted in zip(actual_lists, predicted_lists):
    map_scores.append(calculate_map_at_n(predicted, actual, top_n=12))
mean_average_precision = np.mean(map_scores)

print(f"MAP@12: {mean_average_precision}")


In [None]:
import numpy as np

# Method 1: AP@N for ONE user's ranked recommendations
def calculate_map_at_n(predictions, ground_truth, top_n=12):
    """Average precision at ``top_n`` for a single ranked recommendation list.

    Args:
        predictions: Recommended items, best first.
        ground_truth: Items the user really purchased; duplicates are ignored,
            so a repeated purchase does not inflate the denominator.
        top_n: Only the first ``top_n`` predictions are considered.

    Returns:
        AP@top_n in [0, 1]; 0.0 when ``ground_truth`` is empty.
    """
    relevant = set(ground_truth)
    if not relevant:
        return 0.0
    credited = set()
    precision_sum = 0.0
    for rank, item in enumerate(predictions[:top_n], start=1):
        # Credit each relevant item once, at the rank where it first appears
        if item in relevant and item not in credited:
            credited.add(item)
            precision_sum += len(credited) / rank
    return precision_sum / min(len(relevant), top_n)

# Ground truth: the list of articles each customer bought in the validation split
validation_by_customer = validation_transactions.groupby('customer_id')['article_id']
actual_purchases = validation_by_customer.agg(list)

# Method 2: AP@K for ONE user's ranked recommendations
def compute_map_k(actual, predicted, k=12):
    """Average precision at ``k`` for a single ranked recommendation list.

    Args:
        actual: Items the user really purchased; duplicates are ignored.
        predicted: Recommended items, best first.
        k: Only the first ``k`` predictions are considered.

    Returns:
        AP@k in [0, 1]; 0.0 when ``actual`` is empty.
    """
    relevant = set(actual)
    if not relevant:
        return 0.0
    credited = set()
    precision_sum = 0.0
    for rank, item in enumerate(predicted[:k], start=1):
        # A repeated prediction is credited once only; otherwise AP could exceed 1
        if item in relevant and item not in credited:
            credited.add(item)
            precision_sum += len(credited) / rank
    return precision_sum / min(len(relevant), k)

# Score only customers that have both ground truth and recommendations
user_ids = actual_purchases.index.intersection(user_recommendations.keys())
actual_lists, predicted_lists = [], []
for user_id in user_ids:
    actual_lists.append(actual_purchases.loc[user_id])
    predicted_lists.append(user_recommendations[user_id])

# Per-user AP@12 from both implementations (they take their arguments in opposite order)
map_scores_method_1, map_scores_method_2 = [], []
for actual, predicted in zip(actual_lists, predicted_lists):
    map_scores_method_1.append(calculate_map_at_n(predicted, actual, top_n=12))
    map_scores_method_2.append(compute_map_k(actual, predicted, 12))

# List every customer on whom the two implementations disagree
for user_id, (score1, score2) in zip(user_ids, zip(map_scores_method_1, map_scores_method_2)):
    if score1 != score2:
        print(f"User ID: {user_id}, Method 1 MAP@12: {score1}, Method 2 MAP@12: {score2}")

mean_average_precision_1 = np.mean(map_scores_method_1)
mean_average_precision_2 = np.mean(map_scores_method_2)

print(f"Mean MAP@12 Method 1: {mean_average_precision_1}")
print(f"Mean MAP@12 Method 2: {mean_average_precision_2}")


In [None]:
# Customer whose ground truth and recommendations are printed side by side
user_id_to_inspect = 'b7332c96a15893a068fc0a3efdea80d905b8fe577650272bd7f5f23678afe5c9'

# Validation purchases of that customer (raises KeyError if the id is not in the validation split)
actual_purchases_user = actual_purchases.loc[user_id_to_inspect]

# The 12 highest-ranked recommendations for the same customer
full_ranking = user_recommendations[user_id_to_inspect]
predicted_recommendations_user = full_ranking[:12]

print(f"Actual Purchases for User {user_id_to_inspect}: {actual_purchases_user}")
print(f"Predicted Recommendations for User {user_id_to_inspect}: {predicted_recommendations_user}")


In [None]:
import numpy as np

# Function 1: calculate_map_at_n
def calculate_map_at_n(predictions, ground_truth, top_n=12):
    """Average precision at ``top_n`` for a single ranked recommendation list.

    Args:
        predictions: Recommended items, best first.
        ground_truth: Items the user really purchased; duplicates are ignored,
            so a repeated purchase does not inflate the denominator.
        top_n: Only the first ``top_n`` predictions are considered.

    Returns:
        AP@top_n in [0, 1]; 0.0 when ``ground_truth`` is empty.
    """
    relevant = set(ground_truth)
    if not relevant:
        return 0.0
    credited = set()
    precision_sum = 0.0
    for rank, item in enumerate(predictions[:top_n], start=1):
        # Credit each relevant item once, at the rank where it first appears
        if item in relevant and item not in credited:
            credited.add(item)
            precision_sum += len(credited) / rank
    return precision_sum / min(len(relevant), top_n)

# Function 2: compute_map_k
def compute_map_k(actual, predicted, k=12):
    """Average precision at ``k`` for a single ranked recommendation list.

    Args:
        actual: Items the user really purchased; duplicates are ignored.
        predicted: Recommended items, best first.
        k: Only the first ``k`` predictions are considered.

    Returns:
        AP@k in [0, 1]; 0.0 when ``actual`` is empty.
    """
    relevant = set(actual)
    if not relevant:
        return 0.0
    credited = set()
    precision_sum = 0.0
    for rank, item in enumerate(predicted[:k], start=1):
        # A repeated prediction is credited once only; otherwise AP could exceed 1
        if item in relevant and item not in credited:
            credited.add(item)
            precision_sum += len(credited) / rank
    return precision_sum / min(len(relevant), k)

# Worked example: the ground truth contains the same article twice,
# and that article is the top-ranked prediction
actual_items = [648256007] * 2
predicted_items = [
    648256007, 684588007, 504154020, 648256001,
    626813006, 504154021, 504154019, 684588008,
    684588001, 745829002, 684588002, 712425003,
]
top_n = 3

# Score the same input with both implementations (note the opposite argument order)
map_k_result = compute_map_k(actual_items, predicted_items, top_n)
map_at_n_result = calculate_map_at_n(predicted_items, actual_items, top_n)

print(f"MAP@{top_n} using calculate_map_at_n: {map_at_n_result}")
print(f"MAP@{top_n} using compute_map_k: {map_k_result}")


In [None]:
# `user_profiles` is a pandas Series: customer_id -> profile vector
print(user_profiles.head())  # Displays the first few entries
# info() writes its summary itself and returns None, so it must not be wrapped
# in print() - doing so appended a stray "None" line to the output
user_profiles.info()


In [None]:
ll

In [None]:
# from annoy import AnnoyIndex
# import numpy as np

# # Assume user_profiles is a DataFrame with user profiles as vectors
# f = 32  # Length of item vectors that we are indexing, here 32 dimensions
# t = AnnoyIndex(f, 'angular')  # Use 'angular' for cosine distance

# # Adding all user vectors to the Annoy index
# for i, vector in enumerate(user_profiles):
#     t.add_item(i, vector)

# # Building the index - more trees give higher precision when querying
# t.build(10)  # 10 trees

# # Getting the 10,000 nearest neighbors for each user
# user_nearest_neighbors = {}
# for i in range(len(user_profiles)):
#     # The second parameter is the number of neighbors you want
#     # it includes the item itself, so ask for 10001
#     nearest_neighbors = t.get_nns_by_item(i, 1001, include_distances=False)[1:]  # exclude the item itself
#     user_nearest_neighbors[user_profiles.index[i]] = [user_profiles.index[nn] for nn in nearest_neighbors]

# # Now you have a dictionary with user IDs as keys and lists of the 10,000 nearest neighbors' user IDs as values


In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Step 1: Identify Last Week's Transactions
# The CSV was read without date parsing, so 't_dat' must be converted before any
# date arithmetic (subtracting a timedelta from a plain string raises TypeError).
transaction_dates = pd.to_datetime(training_transactions['t_dat'])
current_date = transaction_dates.max()
last_week_start = current_date - timedelta(days=7)
in_last_week = (transaction_dates > last_week_start) & (transaction_dates <= current_date)
# .copy() gives an independent frame, so adding a column below does not write
# into a slice of training_transactions (SettingWithCopyWarning)
last_week_transactions = training_transactions[in_last_week].copy()

# Step 2: Aggregate Purchases for the Nearest Customers
# Helper column of ones; summing it per article yields that article's purchase count
last_week_transactions['count'] = 1

def get_top_300_items(user_id, neighbors_dict, transactions):
    """Return the articles most purchased by a user's neighbours.

    Args:
        user_id: Key into ``neighbors_dict``.
        neighbors_dict: Mapping user id -> collection of neighbouring customer ids.
        transactions: DataFrame with 'customer_id', 'article_id' and a numeric
            'count' column that is summed per article.

    Returns:
        Up to 300 article ids, ordered by decreasing summed 'count'.
    """
    # Keep only the rows whose customer is one of this user's neighbours
    neighbour_mask = transactions['customer_id'].isin(neighbors_dict[user_id])
    ranked_articles = (
        transactions[neighbour_mask]
        .groupby('article_id')
        .agg({'count': 'sum'})
        .reset_index()
        .sort_values(by='count', ascending=False)
    )
    return ranked_articles.head(300)['article_id'].tolist()

# Requires `user_nearest_neighbors` (user id -> neighbour ids).
# NOTE(review): in the visible notebook that dict is only built inside the
# commented-out Annoy cell, so this cell fails on a fresh kernel unless it is supplied.
user_top_300_items = {}
for user_id in user_nearest_neighbors.keys():
    user_top_300_items[user_id] = get_top_300_items(
        user_id, user_nearest_neighbors, last_week_transactions
    )

# Preview: first 10 recommended items per user
for user_id, top_items in user_top_300_items.items():
    print(f"User {user_id} top 300 recommended items: {top_items[:10]}")  # Show only the top 10 for brevity


In [None]:
import numpy as np

# Step 1: ground truth - the list of articles each customer bought in the validation split
validation_by_customer = validation_transactions.groupby('customer_id')['article_id']
actual_purchases = validation_by_customer.agg(list)

# Step 2: keep only the 12 highest-ranked neighbour-based recommendations per user
predicted_top_12 = {}
for user_id, recommendations in user_top_300_items.items():
    predicted_top_12[user_id] = recommendations[:12]

# Step 3: Compute MAP@12
def average_precision_at_k(actual, predicted, k=12):
    """Average precision at ``k`` for a single ranked recommendation list.

    Args:
        actual: Items the user really purchased; duplicates are ignored, so a
            repeated purchase does not inflate the denominator.
        predicted: Recommended items, best first.
        k: Only the first ``k`` predictions are considered.

    Returns:
        AP@k in [0, 1]; 0.0 when ``actual`` is empty.
    """
    relevant = set(actual)
    if not relevant:
        return 0.0
    credited = set()
    score = 0.0
    for rank, item in enumerate(predicted[:k], start=1):
        # Each relevant item counts once, at the rank where it first appears
        if item in relevant and item not in credited:
            credited.add(item)
            score += len(credited) / rank
    return score / min(len(relevant), k)

def map_at_k(actual, predicted, k=12):
    """Mean of average_precision_at_k over pairwise-aligned lists.

    ``actual[i]`` and ``predicted[i]`` must describe the same user.
    """
    per_user_scores = []
    for user_actual, user_predicted in zip(actual, predicted):
        per_user_scores.append(average_precision_at_k(user_actual, user_predicted, k))
    return np.mean(per_user_scores)

# Users that have both ground truth and neighbour-based predictions
user_ids = set(actual_purchases.keys()).intersection(set(predicted_top_12.keys()))
actual_lists, predicted_lists = [], []
for user_id in user_ids:
    actual_lists.append(actual_purchases[user_id])
    predicted_lists.append(predicted_top_12[user_id])

# MAP@12 over those users
map_score = map_at_k(actual_lists, predicted_lists, 12)
print(f"MAP@12: {map_score}")


In [None]:
import pandas as pd
import numpy as np

# Step 1: the 12 most frequently purchased articles in the training split
item_popularity = training_transactions['article_id'].value_counts().head(12)
most_popular_items = item_popularity.index.tolist()

# Step 2: every validation customer gets that same list
# NOTE: this rebinds `user_recommendations`, replacing the recommendations built earlier
validation_users = validation_transactions['customer_id'].unique()
user_recommendations = dict.fromkeys(validation_users, most_popular_items)

# Step 3: ground truth for MAP@12 - the articles each customer bought in the validation split
validation_by_customer = validation_transactions.groupby('customer_id')['article_id']
actual_purchases = validation_by_customer.agg(list)

def average_precision_at_k(actual, predicted, k=12):
    """Average precision at ``k`` for a single ranked recommendation list.

    Args:
        actual: Items the user really purchased; duplicates are ignored, so a
            repeated purchase does not inflate the denominator.
        predicted: Recommended items, best first.
        k: Only the first ``k`` predictions are considered.

    Returns:
        AP@k in [0, 1]; 0.0 when ``actual`` is empty.
    """
    relevant = set(actual)
    if not relevant:
        return 0.0
    credited = set()
    score = 0.0
    for rank, item in enumerate(predicted[:k], start=1):
        # Each relevant item counts once, at the rank where it first appears
        if item in relevant and item not in credited:
            credited.add(item)
            score += len(credited) / rank
    return score / min(len(relevant), k)

def map_at_k(actual, predicted, k=12):
    """Mean AP@k over the users present in both mappings.

    Args:
        actual: Mapping (anything with ``.keys()`` and ``[user]``) user -> purchased items.
        predicted: Mapping user -> ranked recommendations.
        k: Cut-off passed on to average_precision_at_k.
    """
    # Users without either ground truth or predictions are left out of the mean
    common_users = set(actual.keys()).intersection(predicted.keys())
    per_user_scores = []
    for user in common_users:
        per_user_scores.append(average_precision_at_k(actual[user], predicted[user], k))
    return np.mean(per_user_scores)

# MAP@12 of the popularity baseline over the validation customers
map_score = map_at_k(actual=actual_purchases, predicted=user_recommendations, k=12)
print(f"MAP@12 for most popular item recommendations: {map_score}")


In [None]:
import pandas as pd

# One row per user: the user id followed by one column per neighbour
# NOTE(review): `user_nearest_neighbors` is only built in the commented-out Annoy cell of the visible notebook
neighbors_df = pd.DataFrame.from_dict(user_nearest_neighbors, orient='index').reset_index()
neighbor_columns = [f'neighbor_{i+1}' for i in range(neighbors_df.shape[1]-1)]
neighbors_df.columns = ['customer_id'] + neighbor_columns

# Persist the table so the neighbour search does not have to be repeated
neighbors_df.to_csv('user_nearest_neighbors.csv', index=False)
print("Saved user_nearest_neighbors to CSV.")


In [None]:

# Query the embedding space for the 10 nearest neighbours of one article
item_id = "851010002"
similar_items = model.wv.most_similar(item_id, topn=10)

# One line per neighbour: its id and its similarity to the query article
print("Top 10 similar items to {}:".format(item_id))
for neighbour_id, score in similar_items:
    print("Item: {}, Similarity: {:.4f}".format(neighbour_id, score))


In [None]:
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

def get_image_path(item_id):
    """Build the path of an item's .jpg image, print it and return it.

    The id is prefixed with '0'; the first three characters of the prefixed id
    name the sub-folder of ./images that holds '<prefixed id>.jpg'.

    Args:
        item_id: Item id as a string (without the leading zero).

    Returns:
        The image path as a string.
    """
    padded_id = '0' + item_id
    folder = r'./images/{}'.format(padded_id[:3])
    image_path = os.path.join(folder, '{}.jpg'.format(padded_id))
    print(image_path)
    return image_path

# True only for a path that exists and is a regular file
def is_valid_path(path):
    """Return whether ``path`` points to an existing regular file."""
    # isfile() is already False for a missing path, so it covers the existence check too
    return os.path.isfile(path)

# One row of ten panels, one per similar item
fig, axes = plt.subplots(1, 10, figsize=(20, 2))  # Adjust the size as needed
for i, (item, _) in enumerate(similar_items):
    panel = axes[i]
    img_path = get_image_path(item)
    if not is_valid_path(img_path):
        # No picture on disk for this item: show a placeholder label instead
        panel.text(0.5, 0.5, 'No image', fontsize=12, ha='center')
    else:
        img = mpimg.imread(img_path)
        panel.imshow(img)
        panel.set_title(item)
    panel.axis('off')  # Hide axes in both cases

plt.show()
