In [15]:
# Import necessary libraries
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity


In [16]:
# Read final.csv into a DataFrame and show it as the cell's output
data = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/final.csv')
data

Unnamed: 0,user_id,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName,filename,link,ratings,review,Month,Price (USD)
0,93810,8493,Women,Apparel,Bottomwear,Skirts,Black,Fall,2011,Casual,Forever New Women Black Skirts,8493.jpg,http://assets.myntassets.com/v1/images/style/p...,5,Amazing quality! Definitely recommend. Would r...,November,46
1,24592,30757,Men,Apparel,Topwear,Kurtas,Grey,Summer,2012,Ethnic,Fabindia Men Grey Mangalgiri Kurta,30757.jpg,http://assets.myntassets.com/v1/images/style/p...,4,"Perfect for any occasion, great buy..",September,29
2,13278,14881,Women,Accessories,Bags,Handbags,Black,Summer,2011,Casual,United Colors of Benetton Women Solid Black Ha...,14881.jpg,http://assets.myntassets.com/v1/images/style/p...,3,"It's okay, nothing special.!",October,39
3,46048,48449,Men,Apparel,Topwear,Tshirts,Blue,Summer,2012,Casual,French Connection Men Blue T-shirt,48449.jpg,http://assets.myntassets.com/v1/images/style/p...,5,"Comfortable and stylish, worth the price. and ...",August,39
4,42098,4697,Unisex,Accessories,Watches,Watches,Black,Winter,2016,Sports,ADIDAS Unisex Digital Duramo Black Watch,4697.jpg,http://assets.myntassets.com/v1/images/style/p...,4,"Perfect for any occasion, great buy.!",March,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,76543,42833,Women,Accessories,Bags,Handbags,Black,Summer,2012,Casual,French Connection Women Black Croc Handbag,42833.jpg,http://assets.myntassets.com/v1/images/style/p...,5,"High quality fabric, very satisfied.!",February,57
9995,49651,8106,Men,Accessories,Watches,Watches,Brown,Winter,2016,Casual,Fastrack Men Sports Analog Brown Watch,8106.jpg,http://assets.myntassets.com/v1/images/style/p...,3,Satisfactory for the price. especially for thi...,January,12
9996,32280,25041,Men,Apparel,Topwear,Tshirts,Yellow,Fall,2011,Sports,Nike Men Yellow Jersey,25041.jpg,http://assets.myntassets.com/v1/images/style/p...,4,"Perfect for any occasion, great buy. especiall...",May,36
9997,91620,49670,Men,Accessories,Eyewear,Sunglasses,White,Winter,2016,Casual,United Colors of Benetton Men Sunglass,49670.jpg,http://assets.myntassets.com/v1/images/style/p...,5,"Exceeded expectations, love the fit! but could...",August,40


In [17]:
# Combine relevant text fields to create a single field for embedding extraction.
# Each column is filled and coerced to str first: concatenating with a missing
# value would make the whole combined_text NaN for that row, and a NaN (float)
# cannot be passed to the tokenizer.
data['combined_text'] = (
    data['productDisplayName'].fillna('').astype(str)
    + " " + data['review'].fillna('').astype(str)
    + " " + data['articleType'].fillna('').astype(str)
)

In [18]:
# Instantiate the 'bert-base-uncased' checkpoint: tokenizer first, then the encoder
tokenizer, model = (
    BertTokenizer.from_pretrained('bert-base-uncased'),
    BertModel.from_pretrained('bert-base-uncased'),
)

In [19]:
# Function to encode text using BERT and obtain embeddings
def get_bert_embedding(text):
    """Embed text with BERT by mean-pooling the last hidden layer.

    Parameters
    ----------
    text : str or list of str
        Text to embed. Inputs are truncated to at most 128 tokens.

    Returns
    -------
    numpy.ndarray
        The pooled embedding with singleton dimensions squeezed out: a 1-D
        vector for a single string, one row per text for a list of several
        texts.
    """
    # Tokenize and encode text (padding only has an effect for batched input)
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        # Get hidden states from BERT
        outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # Average only over real tokens. A plain .mean(dim=1) would also average
        # the padding positions that padding=True adds to the shorter texts of a
        # batch; for a single string the mask is all ones, so this is the same mean.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        embedding = (hidden * mask).sum(dim=1) / mask.sum(dim=1)
    return embedding.squeeze().numpy()

In [20]:
# Compute one BERT embedding per row by mapping the encoder over combined_text
data['embedding'] = data['combined_text'].map(get_bert_embedding)

In [21]:
# Function to recommend similar items based on cosine similarity
def get_recommendations(item_index, num_recommendations=5):
    """Return the items whose embeddings are most similar to a given item.

    Parameters
    ----------
    item_index : index label
        Label of the query row in ``data`` (as accepted by ``data.loc``).
    num_recommendations : int, default 5
        Maximum number of similar items to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``data`` ordered from most to least similar, limited to the
        columns ``productDisplayName``, ``review``, ``articleType`` and
        ``combined_text``. The query row itself is never included.
    """
    # Calculate cosine similarity between the selected item and all items
    item_embedding = data.loc[item_index, 'embedding'].reshape(1, -1)
    all_embeddings = np.stack(data['embedding'].values)
    similarities = cosine_similarity(item_embedding, all_embeddings).flatten()

    # Row positions ordered from most to least similar
    ranked_positions = similarities.argsort()[::-1]

    # Remove the query row explicitly instead of assuming it is ranked first:
    # a row with an identical embedding ties at similarity 1.0 and may sort
    # ahead of it, in which case slicing off the top entry would discard that
    # neighbour and return the query item as its own recommendation.
    # item_index is a label, argsort yields positions, so translate first.
    item_position = data.index.get_loc(item_index)
    ranked_positions = ranked_positions[ranked_positions != item_position]

    # Get the indices of the most similar items
    similar_indices = ranked_positions[:max(num_recommendations, 0)]

    # Return recommended items
    return data.iloc[similar_indices][['productDisplayName', 'review', 'articleType', 'combined_text']]

# Demo: look up the default number of recommendations for the row labelled 0 and print them
recommendations = get_recommendations(item_index=0)
print(recommendations)


                                 productDisplayName  \
2573                      Femella Women Black Skirt   
8026                Inc. 5 Women Casual Black Heels   
6748                          HM Women Golden Heels   
7856                       Catwalk Women Gold Heels   
1648  Doodle Girl Denim skirt with belt Blue Skirts   

                                                 review articleType  \
2573            Amazing quality! Definitely recommend.!      Skirts   
8026  Amazing quality! Definitely recommend. Would r...       Heels   
6748  Amazing quality! Definitely recommend. Would r...       Heels   
7856  Amazing quality! Definitely recommend. Would r...       Heels   
1648            Amazing quality! Definitely recommend..      Skirts   

                                          combined_text  
2573  Femella Women Black Skirt Amazing quality! Def...  
8026  Inc. 5 Women Casual Black Heels Amazing qualit...  
6748  HM Women Golden Heels Amazing quality! Definit...  
7856  Catw