In [24]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━[0m [32m184.3/227.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (

In [25]:
# importing requirements
import pandas as pd
from fastai.collab import CollabDataLoaders, Learner
from fastai.losses import MSELossFlat
import torch.nn as nn
import torch
from sentence_transformers import SentenceTransformer, util

  from tqdm.autonotebook import tqdm, trange


In [9]:
# Load the data with header handling
customer = pd.read_csv("/content/drive/MyDrive/Colab_Notebooks/orders.csv", names=['order_id', 'customer_id'], usecols=[0, 1], skiprows=1)
print("         CUSTOMER        ")
customer.head(10)

         CUSTOMER        


Unnamed: 0,order_id,customer_id
0,1,64
1,2,473
2,3,774
3,4,433
4,5,441
5,6,800
6,7,626
7,8,58
8,9,852
9,10,659


In [10]:
product = pd.read_csv("/content/drive/MyDrive/Colab_Notebooks/sales.csv", names=['order_id', 'product_id'], usecols=[1, 2], skiprows=1)
print("         PRODUCT        ")
product.head(10)

         PRODUCT        


Unnamed: 0,order_id,product_id
0,1,218
1,1,481
2,1,2
3,1,1002
4,1,691
5,1,981
6,2,915
7,2,686
8,2,1091
9,2,1196


In [11]:
product_details = pd.read_csv("/content/drive/MyDrive/Colab_Notebooks/products.csv", names=['product_id', 'description'], usecols=[0, 7], skiprows=1)
print("         PRODUCT DETAILS        ")
product_details.head(10)

         PRODUCT DETAILS        


Unnamed: 0,product_id,description
0,0,"A red coloured, XS sized, Oxford Cloth Shirt"
1,1,"A red coloured, S sized, Oxford Cloth Shirt"
2,2,"A red coloured, M sized, Oxford Cloth Shirt"
3,3,"A red coloured, L sized, Oxford Cloth Shirt"
4,4,"A red coloured, XL sized, Oxford Cloth Shirt"
5,5,"A orange coloured, XS sized, Oxford Cloth Shirt"
6,6,"A orange coloured, S sized, Oxford Cloth Shirt"
7,7,"A orange coloured, M sized, Oxford Cloth Shirt"
8,8,"A orange coloured, L sized, Oxford Cloth Shirt"
9,9,"A orange coloured, XL sized, Oxford Cloth Shirt"


In [13]:
# Merge the datasets on 'order_id'
merged_df = product.merge(customer, on='order_id')
print("               SALES            ")
merged_df.head(10)

               SALES            


Unnamed: 0,order_id,product_id,customer_id
0,1,218,64
1,1,481,64
2,1,2,64
3,1,1002,64
4,1,691,64
5,1,981,64
6,2,915,473
7,2,686,473
8,2,1091,473
9,2,1196,473


In [14]:
# Ensure no missing values
merged_df.dropna(inplace=True)

# Remove non-numeric rows
merged_df['product_id'] = pd.to_numeric(merged_df['product_id'], errors='coerce')
merged_df['customer_id'] = pd.to_numeric(merged_df['customer_id'], errors='coerce')
merged_df = merged_df.dropna()

# Convert columns to integers
merged_df['product_id'] = merged_df['product_id'].astype(int)
merged_df['customer_id'] = merged_df['customer_id'].astype(int)

# Ensure that the DataFrame has only the required columns
merged_df = merged_df[['customer_id', 'product_id']]

# Add a dummy rating column
merged_df['rating'] = 1
merged_with_details = merged_df.merge(product_details, on='product_id', how='left')

In [15]:
# Processed DataFrame
print("           PROCESSED SALES DATA            ")
merged_with_details.head(10)

           PROCESSED SALES DATA            


Unnamed: 0,customer_id,product_id,rating,description
0,64,218,1,"A orange coloured, L sized, Chambray Shirt"
1,64,481,1,"A indigo coloured, S sized, Puffer Jacket"
2,64,2,1,"A red coloured, M sized, Oxford Cloth Shirt"
3,64,1002,1,"A blue coloured, M sized, Wool Trousers"
4,64,691,1,"A indigo coloured, S sized, Parka Jacket"
5,64,981,1,"A red coloured, S sized, Wool Trousers"
6,473,915,1,"A orange coloured, XS sized, Drawstring Trousers"
7,473,686,1,"A blue coloured, S sized, Parka Jacket"
8,473,1091,1,"A orange coloured, S sized, Cropped Trousers"
9,473,1196,1,"A orange coloured, S sized, Pleated Trousers"


In [17]:
# Create data loaders
dls = CollabDataLoaders.from_df(merged_with_details, item_name='description', user_name='customer_id', rating_name='rating', bs=64)

# Display a batch of data
dls.show_batch()

# Number of products and customers
n_products = len(dls.classes['description'])
n_customers = len(dls.classes['customer_id'])
print(f"Number of products: {n_products}")
print(f"Number of customers: {n_customers}")
print("DataFrame shape:", merged_df.shape)

Unnamed: 0,customer_id,description,rating
0,629,"A red coloured, M sized, Trench Coat Jacket",1
1,234,"A violet coloured, S sized, Chinos Trousers",1
2,765,"A blue coloured, M sized, Chambray Shirt",1
3,574,"A yellow coloured, L sized, Relaxed Leg Trousers",1
4,602,"A yellow coloured, S sized, Windbreaker Jacket",1
5,75,"A yellow coloured, XL sized, Dress Shirt",1
6,578,"A indigo coloured, XL sized, Wool Trousers",1
7,107,"A indigo coloured, S sized, Cropped Trousers",1
8,807,"A indigo coloured, S sized, Pleated Trousers",1
9,980,"A green coloured, XS sized, Tracksuit Bottoms Trousers",1


Number of products: 1234
Number of customers: 614
DataFrame shape: (5000, 3)


In [18]:
# Define the model
class DotProductBias(nn.Module):
    def __init__(self, n_products, n_customers, n_factors, y_range=(0, 5.5)):
        super().__init__()
        self.product_factors = nn.Embedding(n_products, n_factors)
        self.product_bias = nn.Embedding(n_products, 1)
        self.customer_factors = nn.Embedding(n_customers, n_factors)
        self.customer_bias = nn.Embedding(n_customers, 1)
        self.y_range = y_range

    def forward(self, x):
        product_indices = x[:, 1]
        customer_indices = x[:, 0]

        products = self.product_factors(product_indices)
        customers = self.customer_factors(customer_indices)

        dot_product = (products * customers).sum(1)
        bias = self.product_bias(product_indices).squeeze() + self.customer_bias(customer_indices).squeeze()

        return torch.sigmoid(dot_product + bias) * (self.y_range[1] - self.y_range[0]) + self.y_range[0]

In [19]:
# Initialize model
model = DotProductBias(n_products, n_customers, 50)

# Create Learner
learn = Learner(dls, model, loss_func=MSELossFlat())

# Try running on CPU first
learn.model = learn.model.cpu()

# Fit the model
learn.fit_one_cycle(5, 5e-3, wd=0.1)


epoch,train_loss,valid_loss,time
0,9.033795,8.988402,00:00
1,7.659019,8.800696,00:00
2,6.632639,8.642551,00:00
3,5.842573,8.593298,00:00
4,5.484343,8.585732,00:00


In [20]:
# Extract product biases
product_bias = learn.model.product_bias.weight.squeeze()
idxs = product_bias.argsort()[:5]
lowest_bias_products = [dls.classes['description'][i] for i in idxs]

print("Products with the lowest biases:", lowest_bias_products)

Products with the lowest biases: ['A violet coloured, XL sized, Pleated Trousers', 'A blue coloured, XL sized, Pullover Jacket', 'A yellow coloured, L sized, Linen Shirt', 'A yellow coloured, XS sized, Cargo Pants Trousers', 'A yellow coloured, M sized, Pleated Trousers']


In [22]:
# Extract product biases in descending order
idxs = product_bias.argsort(descending=True)[:5]
highest_bias_products = [dls.classes['description'][i] for i in idxs]

print("Products with the highest biases:", highest_bias_products)

Products with the highest biases: ['A red coloured, XL sized, Joggers Trousers', 'A yellow coloured, XS sized, Cardigan Jacket', 'A yellow coloured, L sized, Cardigan Jacket', 'A yellow coloured, L sized, Wool Trousers', 'A orange coloured, XL sized, Drawstring Trousers']


In [27]:
# Load SentenceTransformer model for description embeddings
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Compute embeddings for all product descriptions
descriptions = dls.classes['description']
description_embeddings = embedder.encode(descriptions, convert_to_tensor=True)

def find_similar_products(input_description, top_n=5):
    # Encode the input description
    input_embedding = embedder.encode(input_description, convert_to_tensor=True)

    # Compute cosine similarities
    cosine_scores = util.pytorch_cos_sim(input_embedding, description_embeddings)[0]

    # Find the top_n most similar descriptions
    top_results = torch.topk(cosine_scores, k=top_n)

    similar_products = []
    for score, idx in zip(top_results[0], top_results[1]):
        similar_products.append({
            'description': descriptions[idx],
            'score': score.item()
        })

    return similar_products

# Example usage
input_description = "A yellow coloured, S sized, Windbreaker Jacket"
similar_products = find_similar_products(input_description)
print("Similar products:", similar_products)

Similar products: [{'description': 'A yellow coloured, S sized, Windbreaker Jacket', 'score': 1.0}, {'description': 'A yellow coloured, XL sized, Windbreaker Jacket', 'score': 0.9763168096542358}, {'description': 'A yellow coloured, L sized, Windbreaker Jacket', 'score': 0.9710155725479126}, {'description': 'A yellow coloured, M sized, Windbreaker Jacket', 'score': 0.9706171154975891}, {'description': 'A orange coloured, S sized, Windbreaker Jacket', 'score': 0.9256076812744141}]
