In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load your transactions data
articles_df = pd.read_csv('./dataset/articles.csv')
customers_df = pd.read_csv('./dataset/customers.csv')
training_transactions = pd.read_csv('./dataset/split/fold_0/train.csv')
validation_transactions = pd.read_csv('./dataset/split/fold_0/test.csv')
# Load your transactions data

print("articles_df: ", articles_df.columns)
print("customers_df: ", customers_df.columns)
print("transactions_df: ", training_transactions.columns)

articles_df:  Index(['article_id', 'product_code', 'prod_name', 'product_type_no',
       'product_type_name', 'product_group_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',
       'perceived_colour_value_id', 'perceived_colour_value_name',
       'perceived_colour_master_id', 'perceived_colour_master_name',
       'department_no', 'department_name', 'index_code', 'index_name',
       'index_group_no', 'index_group_name', 'section_no', 'section_name',
       'garment_group_no', 'garment_group_name', 'detail_desc'],
      dtype='object')
customers_df:  Index(['customer_id', 'FN', 'Active', 'club_member_status',
       'fashion_news_frequency', 'age', 'postal_code'],
      dtype='object')
transactions_df:  Index(['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id'], dtype='object')


In [3]:
training_transactions["article_id"].value_counts()

article_id
689109001    9297
706016001    7728
692930001    7558
706016002    6422
689109003    6187
             ... 
636698001       1
613347001       1
664337001       1
602152001       1
739502002       1
Name: count, Length: 42793, dtype: int64

In [4]:
# Step 2: Positive Samples
# Group by customer and list all articles they have purchased
positive_samples = training_transactions.groupby('customer_id')['article_id'].agg(list).reset_index()

# All unique articles
all_articles = set(articles_df['article_id'].astype(str))

# Step 3: Generate Negative Samples
# Function to generate negative samples with a ratio of 1:5
def generate_negatives(row):
    num_positives = len(row['article_id'])
    num_negatives = num_positives * 1  # Ratio of 1 positive to 5 negatives
    positives = set(row['article_id'])
    possible_negatives = list(all_articles - positives)
    num_negatives = min(num_negatives, len(possible_negatives))  # Avoid trying to sample more items than available
    return np.random.choice(possible_negatives, num_negatives, replace=False).tolist()

# Apply function to generate negative samples
# positive_samples['negative_samples'] = positive_samples.apply(generate_negatives, axis=1)

# Step 4: Compile Training Data
# Format data as sentences: each user has two lists, one for positives, one for negatives
training_data = []
for _, row in positive_samples.iterrows():
    training_data.append(row['article_id'])  # Positive sentence
    # training_data.append(row['negative_samples'])  # Negative sentence

# Now training_data can be used for embedding training


In [5]:
# Calculate the number of unique items in the filtered training transactions
unique_items_count = training_transactions['article_id'].nunique()
print(f"Number of unique items in the training transactions: {unique_items_count}")

# Calculate the number of unique users in the filtered training transactions
unique_users_count = training_transactions['customer_id'].nunique()
print(f"Number of unique users in the training transactions: {unique_users_count}")

# Calculate the number of unique users in the filtered training transactions
unique_users_count = validation_transactions['article_id'].nunique()
print(f"Number of unique items in the validation transactions: {unique_users_count}")

# Calculate the number of unique users in the filtered training transactions
unique_users_count = validation_transactions['customer_id'].nunique()
print(f"Number of unique users in the validation transactions: {unique_users_count}")

Number of unique items in the training transactions: 42793
Number of unique users in the training transactions: 493897
Number of unique items in the validation transactions: 22190
Number of unique users in the validation transactions: 85437


In [6]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
import gensim
assert gensim.models.word2vec.FAST_VERSION > -1
from collections import Counter
import random

for purchase in training_data:
    random.shuffle(purchase)
    
print(len(training_data))

# Calculate the length of each sublist
lengths = [len(sublist) for sublist in training_data]

# Count the frequency of each length
length_distribution = Counter(lengths)

# Print the length distribution
print("Length distribution of sublists:")
for length, count in sorted(length_distribution.items())[:10]:
    print(f"Length {length}: {count} times")

493897
Length distribution of sublists:
Length 1: 70791 times
Length 2: 71460 times
Length 3: 56225 times
Length 4: 47092 times
Length 5: 36812 times
Length 6: 30577 times
Length 7: 24671 times
Length 8: 20877 times
Length 9: 17264 times
Length 10: 14612 times


In [7]:
from gensim.models import Word2Vec
import time

start = time.time()

model = Word2Vec(sentences=training_data,  # pre-processed list of movie lists
                 epochs=10,                # number of iterations over the corpus
                 min_count=10,               # item need to appear more than 10 times
                 vector_size=128,         # embedding vector size
                 workers=8,               # number of threads to use for training
                 sg=1,                    # using skip-gram algorithm
                 hs=0,                    # hierarchical softmax not used, we use negative sampling instead
                 negative=5,              # number of negative samples
                 window=9999)               # context window size

duration = time.time() - start
print("Time passed: {:.2f} seconds".format(duration))
# To save the model for later use
model.save('item2vec.model')
model.get_latest_training_loss()

Time passed: 68.40 seconds


0.0

In [8]:
from gensim.models import Word2Vec

# Load the model
model = Word2Vec.load('item2vec.model')

# Now you can use the model
# For example, to find vectors or perform operations like finding similar items
# print(model.wv['some_item'])  # Replace 'some_item' with a valid item name from your training data


In [9]:
import numpy as np

# Assuming `transactions_df` is your transaction dataset with 'customer_id' and 'article_id'
# and `model` is your trained Word2Vec model

# Group transactions by user and collect all item IDs per user
user_items = training_transactions.groupby('customer_id')['article_id'].apply(list)

# Define a function to calculate the average vector of items per user
def calculate_user_profile(item_ids, model):
    vectors = []
    for item_id in item_ids:
        if item_id in model.wv.key_to_index:  # Check if the item is in the model's vocabulary
            vectors.append(model.wv[item_id])
    if vectors:  # If there are any vectors, calculate the average
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)  # Return a zero vector if no items are in the vocabulary

# Calculate user profiles
user_profiles = user_items.apply(calculate_user_profile, model=model)

# Now `user_profiles` contains the vector representation (profile) for each user

In [10]:
# Assuming user_profiles is already defined as per the previous instructions

# Print the shape of the user_profiles DataFrame
print("Shape of user_profiles:", user_profiles.shape)

# Check the data type of the first item to ensure consistency
if len(user_profiles) > 0:
    first_element = user_profiles.iloc[0]
    print("Data type of user profiles:", type(first_element))
    print("Length of a single user profile vector:", len(first_element))

# Check for null values
null_data = user_profiles.isnull().any()
print("Any null user profiles?:", null_data)

# Print the first few user profiles to check
print("Sample user profiles:")
print(user_profiles.head())


Shape of user_profiles: (493897,)
Data type of user profiles: <class 'numpy.ndarray'>
Length of a single user profile vector: 128
Any null user profiles?: False
Sample user profiles:
customer_id
0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa    [0.29293284, -0.30096897, 0.0062587056, 0.3340...
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318    [-0.2699006, 0.3842866, 0.017176077, -0.281941...
00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a    [-0.3082838, 0.106087945, -0.44609258, -0.5450...
00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2    [-0.11548444, 0.06403653, 0.022126459, -0.2546...
00009d946eec3ea54add5ba56d5210ea898def4b46c68570cf0096d962cacc75    [-0.18852253, 0.048781775, 0.103449255, -0.143...
Name: article_id, dtype: object


In [11]:
import torch
from torch import nn
from tqdm import tqdm
# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Assuming `model` and `user_profiles` are already defined
# Convert item vectors to a tensor and move to the device
item_ids = list(model.wv.index_to_key)
item_vectors = torch.tensor([model.wv[item] for item in item_ids], dtype=torch.float, device=device)
item_vectors_norm = item_vectors / item_vectors.norm(dim=1, keepdim=True)

# Convert user profiles to a tensor
user_ids = list(user_profiles.keys())
user_vectors = torch.tensor(list(user_profiles.values), dtype=torch.float, device=device)

# Define batch size
batch_size = 4096

# Function to process batches and get recommendations
def process_batch(start_idx, end_idx):
    # Slice the batch
    batch_user_vectors = user_vectors[start_idx:end_idx]
    batch_user_vectors_norm = batch_user_vectors / batch_user_vectors.norm(dim=1, keepdim=True)
    
    # Compute cosine similarity
    similarities = torch.mm(batch_user_vectors_norm, item_vectors_norm.t())
    
    # Get top 12 recommendations for each user in the batch
    top_indices = torch.topk(similarities, 100, dim=1).indices
    
    # Map indices to item IDs
    return {user_ids[i]: [item_ids[idx] for idx in top_indices[row_index].cpu().tolist()]
            for row_index, i in enumerate(range(start_idx, end_idx))}

# Process all batches and collect recommendations
user_recommendations = {}
for start_idx in tqdm(range(0, len(user_vectors), batch_size)):
    end_idx = min(start_idx + batch_size, len(user_vectors))
    user_recommendations.update(process_batch(start_idx, end_idx))

# # Print or use these recommendations
# for user_id, recommendations in user_recommendations.items():
#     print(f"Recommendations for User {user_id}: {recommendations}")


Using device: cuda


  item_vectors = torch.tensor([model.wv[item] for item in item_ids], dtype=torch.float, device=device)
100%|██████████| 121/121 [00:27<00:00,  4.47it/s]


In [12]:
# Step 1: Prepare actual purchases data
actual_purchases = validation_transactions.groupby('customer_id')['article_id'].agg(list)

# Step 3: Compute MAP@K
def compute_map_k(actual, predicted, k=12):
    actual_set = set(actual)
    predicted = predicted[:k]  # consider only the top k predictions
    hits = [1 if item in actual_set else 0 for item in predicted]
    cumulative_hits = np.cumsum(hits)
    precision_at_k = [hits[i] * cumulative_hits[i] / (i + 1) for i in range(len(hits))]
    if actual_set:
        return sum(precision_at_k) / min(len(actual_set), k)
    return 0

# Collect actual and predicted lists
user_ids = actual_purchases.index.intersection(user_recommendations.keys())
actual_lists = [actual_purchases.loc[user_id] for user_id in user_ids]
predicted_lists = [user_recommendations[user_id] for user_id in user_ids]

# Calculate MAP@12
map_scores = [compute_map_k(actual, predicted, 12) for actual, predicted in zip(actual_lists, predicted_lists)]
mean_average_precision = np.mean(map_scores)

print(f"MAP@12: {mean_average_precision}")

MAP@12: 0.02060477670642518


In [13]:
# Assuming `user_profiles` is a pandas DataFrame
print(user_profiles.head())  # Displays the first few rows
print(user_profiles.info())  # Provides a concise summary of the DataFrame


customer_id
0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa    [0.29293284, -0.30096897, 0.0062587056, 0.3340...
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318    [-0.2699006, 0.3842866, 0.017176077, -0.281941...
00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a    [-0.3082838, 0.106087945, -0.44609258, -0.5450...
00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2    [-0.11548444, 0.06403653, 0.022126459, -0.2546...
00009d946eec3ea54add5ba56d5210ea898def4b46c68570cf0096d962cacc75    [-0.18852253, 0.048781775, 0.103449255, -0.143...
Name: article_id, dtype: object
<class 'pandas.core.series.Series'>
Index: 493897 entries, 0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa to ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1778d0116cffd259264
Series name: article_id
Non-Null Count   Dtype 
--------------   ----- 
493897 non-null  object
dtypes: object(1)
memory usage: 7.5+ MB
None


In [58]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
import faiss

# Assuming `user_profiles` is a pandas Series where index is user_id and value is the average embedding vector.
# First, ensure that all vectors have the same length (this step might be redundant if you're sure about the uniformity).
if all(len(x) == len(user_profiles.iloc[0]) for x in user_profiles):
    # Convert Series of lists or arrays to a 2D NumPy array
    user_profiles_array = np.stack(user_profiles.values).astype('float32')

    # Normalize the vectors
    user_profiles_array = normalize(user_profiles_array)

    # Create the Faiss index (using L2 distance here)
    index = faiss.IndexFlatL2(user_profiles_array.shape[1])

    # Use GPU for Faiss (ensure you have initialized GPU resources)
    gpu_resources = faiss.StandardGpuResources()  # Using standard GPU resources
    gpu_index = faiss.index_cpu_to_gpu(gpu_resources, 0, index)  # Move index to GPU 0
    gpu_index.add(user_profiles_array)  # Add vectors to the index

    # Search for the nearest neighbors
    k = 200  # Number of nearest neighbors
    distances, indices = gpu_index.search(user_profiles_array, k)

    print("Indices of Nearest Neighbors:\n", indices)
    print("Distances to Nearest Neighbors:\n", distances)
else:
    print("Error: Not all vectors have the same length.")


Indices of Nearest Neighbors:
 [[     0 207708 279399 ... 130743 114100 114725]
 [185146      1 320948 ... 350678 441397 112082]
 [     2  54962  69753 ... 329052 247147 160619]
 ...
 [493894 466129 418769 ... 405148 106578 436438]
 [493895 320099 377801 ... 481992 367136 278758]
 [493896 163935  77990 ...  25654 209144 310742]]
Distances to Nearest Neighbors:
 [[0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 1.0000000e+00
  1.0000000e+00 1.0000000e+00]
 [3.5762787e-07 3.5762787e-07 3.5762787e-07 ... 8.3011955e-01
  8.3015001e-01 8.3027041e-01]
 [2.3841858e-07 2.3841858e-07 2.3841858e-07 ... 4.3050027e-01
  4.3163669e-01 4.3184638e-01]
 ...
 [2.3841858e-07 2.3841858e-07 1.0207671e-01 ... 6.0582107e-01
  6.0639137e-01 6.0759407e-01]
 [1.1920929e-07 2.9190326e-01 3.2647753e-01 ... 5.1127470e-01
  5.1132840e-01 5.1170492e-01]
 [0.0000000e+00 3.0488384e-01 3.0615306e-01 ... 4.3315709e-01
  4.3430209e-01 4.3452853e-01]]


In [75]:
from datetime import timedelta

# Step 1: Filter for last week's transactions
# Convert 't_dat' to datetime if it's not already
training_transactions['t_dat'] = pd.to_datetime(training_transactions['t_dat'])

# Now, compute the current date and last week's start date
current_date = training_transactions['t_dat'].max()
last_week_start = current_date - timedelta(days=14)
last_week_transactions = training_transactions[(training_transactions['t_dat'] > last_week_start) & (training_transactions['t_dat'] <= current_date)]

# Group transactions by customer_id and aggregate article counts
grouped_transactions = last_week_transactions.groupby('customer_id')['article_id'].agg(lambda x: x.value_counts().to_dict()).to_dict()

# Step 2: Create a mapping from indices to user IDs
user_id_list = user_profiles.index.tolist()  # List of user IDs corresponding to indices in user_profiles_array
neighbors_dict = {user_id: [user_id_list[idx] for idx in indices[i]] for i, user_id in enumerate(user_id_list)}

In [76]:
import pandas as pd

# Step 3: Function to find top items
def get_top_items_for_user(user_id, neighbors_dict, grouped_transactions, top_k=12):
    # Dictionary to hold aggregated article counts from all similar users
    article_counts = {}

    # Aggregate counts from all similar users
    for neighbor_id in neighbors_dict[user_id]:
        if neighbor_id in grouped_transactions:
            neighbor_articles = grouped_transactions[neighbor_id]
            for article_id, count in neighbor_articles.items():
                if article_id in article_counts:
                    article_counts[article_id] += count
                else:
                    article_counts[article_id] = count

    # Sort articles by their aggregated counts and select the top_k articles
    top_items = sorted(article_counts.items(), key=lambda item: item[1], reverse=True)[:top_k]
    return [item[0] for item in top_items]  # Return only the article IDs

# Step 4: Generate recommendations for each user
# Generate recommendations for each user with a progress bar
recommendations = {user_id: get_top_items_for_user(user_id, neighbors_dict, grouped_transactions, 12) for user_id in tqdm(user_id_list, desc="Generating recommendations")}

# Display or process the recommendations
# for user_id, recommended_items in recommendations.items():
#     print(f"User {user_id} recommended items: {recommended_items}")

Generating recommendations: 100%|██████████| 493897/493897 [00:58<00:00, 8472.95it/s]


In [77]:
import pandas as pd
import numpy as np

# Step 1: Prepare actual purchases data
actual_purchases = validation_transactions.groupby('customer_id')['article_id'].agg(list)

# Function to compute MAP@k
def compute_map_k(actual, predicted, k=12):
    actual_set = set(actual)
    predicted = predicted[:k]  # consider only the top k predictions
    hits = [1 if item in actual_set else 0 for item in predicted]
    cumulative_hits = np.cumsum(hits)
    precision_at_k = [cumulative_hits[i] / (i + 1) if hits[i] else 0 for i in range(len(hits))]
    if actual_set:
        return np.sum(precision_at_k) / min(len(actual_set), k)
    return 0

# Step 2: Ensure user_recommendations is defined somewhere in your workflow
# This should be a dictionary mapping user IDs to lists of recommended article IDs

# Step 3: Collect actual and predicted lists
user_ids = actual_purchases.index.intersection(recommendations.keys())
actual_lists = [actual_purchases.loc[user_id] for user_id in user_ids]
predicted_lists = [recommendations[user_id] for user_id in user_ids]

# Calculate MAP@12
map_scores = [compute_map_k(actual, predicted, 12) for actual, predicted in zip(actual_lists, predicted_lists)]
mean_average_precision = np.mean(map_scores)

print(f"MAP@12: {mean_average_precision}")


MAP@12: 0.022427705001835836


In [78]:
def calculate_apk(actual, predicted, k=12):
    """Calculate average precision at k for a single user."""
    if not actual:
        return 0
    score = 0.0
    hits = 0
    for i, p in enumerate(predicted[:k]):
        if p in actual and p not in predicted[:i]:  # Ensure uniqueness in predictions considered for scoring
            hits += 1
            score += hits / (i + 1)
    return score / min(len(actual), k)

user_ids = set(actual_purchases.keys()).intersection(recommendations.keys())
ap_scores = []

for user_id in user_ids:
    actual_items = actual_purchases[user_id]
    predicted_items = recommendations[user_id]
    ap = calculate_apk(actual_items, predicted_items, k=12)
    ap_scores.append(ap)

# Calculate the mean of the average precision scores for all users
mean_average_precision = np.mean(ap_scores)
print(f"MAP@12: {mean_average_precision}")

MAP@12: 0.02025199129545246


In [79]:
aa

NameError: name 'aa' is not defined

In [None]:

# Find the top 10 most similar items
item_id = "851010002"
similar_items = model.wv.most_similar(item_id, topn=10)

print("Top 10 similar items to {}:".format(item_id))
for item, similarity in similar_items:
    print("Item: {}, Similarity: {:.4f}".format(item, similarity))


In [None]:
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

def get_image_path(item_id):
    item_id = '0'+item_id
    folder_number = item_id[:3]  # Assuming the first three digits of the item ID correspond to folder names
    image_directory = r'./images/{}'.format(folder_number)
    image_path = os.path.join(image_directory, '{}.jpg'.format(item_id))  # Assuming images are saved as PNG
    print(image_path)
    return image_path

# Check if file exists and is file
def is_valid_path(path):
    return os.path.exists(path) and os.path.isfile(path)

fig, axes = plt.subplots(1, 10, figsize=(20, 2))  # Adjust the size as needed
for i, (item, _) in enumerate(similar_items):
    img_path = get_image_path(item)
    if is_valid_path(img_path):
        img = mpimg.imread(img_path)
        axes[i].imshow(img)
        axes[i].set_title(item)
        axes[i].axis('off')  # Hide axes
    else:
        axes[i].text(0.5, 0.5, 'No image', fontsize=12, ha='center')
        axes[i].axis('off')

plt.show()
