In [1]:
# importing requirements

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
ds = pd.read_csv("/content/drive/MyDrive/Colab_Notebooks/products.csv")

In [2]:
# data visualization

ds.head(10)

Unnamed: 0,product_id,product_type,product_name,size,colour,price,quantity,description
0,0,Shirt,Oxford Cloth,XS,red,114,66,"A red coloured, XS sized, Oxford Cloth Shirt"
1,1,Shirt,Oxford Cloth,S,red,114,53,"A red coloured, S sized, Oxford Cloth Shirt"
2,2,Shirt,Oxford Cloth,M,red,114,54,"A red coloured, M sized, Oxford Cloth Shirt"
3,3,Shirt,Oxford Cloth,L,red,114,69,"A red coloured, L sized, Oxford Cloth Shirt"
4,4,Shirt,Oxford Cloth,XL,red,114,47,"A red coloured, XL sized, Oxford Cloth Shirt"
5,5,Shirt,Oxford Cloth,XS,orange,114,45,"A orange coloured, XS sized, Oxford Cloth Shirt"
6,6,Shirt,Oxford Cloth,S,orange,114,72,"A orange coloured, S sized, Oxford Cloth Shirt"
7,7,Shirt,Oxford Cloth,M,orange,114,77,"A orange coloured, M sized, Oxford Cloth Shirt"
8,8,Shirt,Oxford Cloth,L,orange,114,48,"A orange coloured, L sized, Oxford Cloth Shirt"
9,9,Shirt,Oxford Cloth,XL,orange,114,43,"A orange coloured, XL sized, Oxford Cloth Shirt"


# TF-IDF vectorizer based recommendation(ML)

In [3]:
# model process

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(ds['description'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

results = {}

for idx, row in ds.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
    similar_items = [(cosine_similarities[idx][i], ds['product_id'][i]) for i in similar_indices]

    results[row['product_id']] = similar_items[1:]

def item(id):
    return ds.loc[ds['product_id'] == id]['description'].tolist()[0].split(' - ')[0]

# Just reads the results out of the dictionary.
def recommend(item_id, num):
    print("Recommending " + str(num) + " products similar to " + item(item_id) + "...")
    print("-------")
    recs = results[item_id][:num]
    for rec in recs:
        print("Recommended: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")

recommend(item_id=11, num=5)

Recommending 5 products similar to A yellow coloured, S sized, Oxford Cloth Shirt...
-------
Recommended: A yellow coloured, M sized, Oxford Cloth Shirt (score:0.9999999999999998)
Recommended: A yellow coloured, S sized, Oxford Cloth Shirt (score:0.9999999999999998)
Recommended: A red coloured, L sized, Oxford Cloth Shirt (score:0.8601042175783741)
Recommended: A red coloured, M sized, Oxford Cloth Shirt (score:0.8601042175783741)
Recommended: A red coloured, S sized, Oxford Cloth Shirt (score:0.8601042175783741)


# Neural network based recommendation(DL)

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
ds = pd.read_csv("/content/drive/MyDrive/Colab_Notebooks/products.csv")

# Data preprocessing
descriptions = ds['description'].astype(str).tolist()

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(descriptions)
sequences = tokenizer.texts_to_sequences(descriptions)
word_index = tokenizer.word_index

# Pad sequences to ensure uniform input length
max_length = 100
data = pad_sequences(sequences, maxlen=max_length)

# Define model parameters
vocab_size = len(word_index) + 1
embedding_dim = 50

# Define the model
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    GlobalAveragePooling1D(),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, np.ones(len(data)), test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

# Generate embeddings for each product description
embeddings = model.predict(data)

# Compute cosine similarities
cosine_similarities = cosine_similarity(embeddings)

# Create a dictionary to store the results
results = {}
for idx, row in ds.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
    similar_items = [(cosine_similarities[idx][i], ds['product_id'][i]) for i in similar_indices]
    results[row['product_id']] = similar_items[1:]

def item(id):
    return ds.loc[ds['product_id'] == id]['description'].tolist()[0].split(' - ')[0]

# Function to recommend similar products
def recommend(item_id, num):
    print("Recommending " + str(num) + " products similar to " + item(item_id) + "...")
    print("-------")
    recs = results[item_id][:num]
    for rec in recs:
        print("Recommended: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")

recommend(item_id=11, num=5)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Recommending 5 products similar to A yellow coloured, S sized, Oxford Cloth Shirt...
-------
Recommended: A violet coloured, XL sized, Tracksuit Bottoms Trousers (score:1.0)
Recommended: A violet coloured, L sized, Tracksuit Bottoms Trousers (score:1.0)
Recommended: A violet coloured, M sized, Tracksuit Bottoms Trousers (score:1.0)
Recommended: A violet coloured, S sized, Tracksuit Bottoms Trousers (score:1.0)
Recommended: A violet coloured, XS sized, Tracksuit Bottoms Trousers (score:1.0)


# Transformer based recommendation(DL)

In [5]:
# Importing requirements
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from transformers import DistilBertTokenizer, DistilBertModel
import torch

# Load dataset
ds = pd.read_csv("/content/drive/MyDrive/Colab_Notebooks/products.csv")

# Data visualization
ds.head(10)

# Load pre-trained DistilBERT model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Function to compute sentence embeddings
def get_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

# Compute embeddings for all product descriptions
embeddings = []
for description in ds['description']:
    embeddings.append(get_embedding(description, tokenizer, model))

embeddings = np.vstack(embeddings)

# Compute cosine similarities
cosine_similarities = cosine_similarity(embeddings, embeddings)

# Create a dictionary to store the results
results = {}

for idx, row in ds.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
    similar_items = [(cosine_similarities[idx][i], ds['product_id'][i]) for i in similar_indices]

    results[row['product_id']] = similar_items[1:]

# Function to get product description
def item(id):
    return ds.loc[ds['product_id'] == id]['description'].tolist()[0].split(' - ')[0]

# Function to recommend products
def recommend(item_id, num):
    print("Recommending " + str(num) + " products similar to " + item(item_id) + "...")
    print("-------")
    recs = results[item_id][:num]
    for rec in recs:
        print("Recommended: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")

recommend(item_id=11, num=5)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Recommending 5 products similar to A yellow coloured, S sized, Oxford Cloth Shirt...
-------
Recommended: A green coloured, S sized, Oxford Cloth Shirt (score:0.9965025)
Recommended: A blue coloured, S sized, Oxford Cloth Shirt (score:0.9964535)
Recommended: A red coloured, S sized, Oxford Cloth Shirt (score:0.9960631)
Recommended: A orange coloured, S sized, Oxford Cloth Shirt (score:0.9958135)
Recommended: A yellow coloured, L sized, Oxford Cloth Shirt (score:0.9930552)
