In [None]:
!pip install transformers torch spacy pandas
!python -m spacy download en_core_web_sm

import numpy as np
import pandas as pd
import spacy
import torch
from transformers import BertTokenizer, BertModel

# Small English spaCy pipeline, used for lemmatization and stop-word detection
nlp = spacy.load("en_core_web_sm")

# Read the product catalogue from disk
df = pd.read_csv('all_products.csv')
df.head()

# Pull each catalogue column out as a plain Python list; all lists share the
# DataFrame's row order, so one index refers to the same product in each.
brands, products, descriptions, prices, categories, concerns = (
    df[column].tolist()
    for column in ('brand', 'product_name', 'description', 'price', 'category', 'concern')
)

# Preprocessing function
def preprocess_text(text):
    """Lemmatize ``text`` and drop stop words and punctuation.

    Args:
        text: Raw text to clean. Anything that is not a ``str`` (for example
            ``None`` or the float NaN that pandas produces for an empty CSV
            cell) is treated as empty text.

    Returns:
        The lemmas of the remaining tokens joined by single spaces, or ``""``
        when ``text`` is not a string.
    """
    # The spaCy pipeline expects a string; without this guard a single missing
    # description would abort preprocessing of the whole catalogue.
    if not isinstance(text, str):
        return ""
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Clean every product description once, up front
processed_descriptions = list(map(preprocess_text, descriptions))

# Tokenizer matching the pretrained uncased BERT checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Pretrained BERT encoder; eval() returns the module itself after switching
# it to evaluation mode
model = BertModel.from_pretrained('bert-base-uncased').eval()

# Function to get BERT embeddings
def get_bert_embedding(text):
    """Embed ``text`` with the module-level BERT model.

    The text is tokenized (with truncation and padding enabled), run through
    the model without gradient tracking, and the hidden state at the first
    token position of each sequence is returned.

    Args:
        text: Input passed straight to the tokenizer.

    Returns:
        A NumPy array holding the first-position hidden state, one row per
        input sequence.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Position 0 along the sequence axis is the first token of each sequence.
    return hidden_states[:, 0, :].numpy()

# Embed each cleaned description and stack the per-description arrays
# row-wise into a single matrix (row order follows processed_descriptions)
doc_embeddings = np.vstack(list(map(get_bert_embedding, processed_descriptions)))

# Function to perform semantic search and return product information
def recommend_products(query, k=3):
    """Return the catalogue products whose descriptions best match ``query``.

    The query is cleaned with ``preprocess_text`` and embedded with
    ``get_bert_embedding``, then compared against every row of
    ``doc_embeddings`` by cosine similarity.

    Args:
        query: Free-text search string.
        k: Maximum number of products to return. Values below 1 yield an
            empty list; values above the catalogue size return every product.

    Returns:
        A list of dicts with the keys ``brand``, ``product``, ``price``,
        ``category`` and ``concern``, ordered from most to least similar.
        Products with equal similarity keep their catalogue order.
    """
    # Preprocess the query the same way the descriptions were preprocessed
    query_preprocessed = preprocess_text(query)
    query_vec = np.asarray(get_bert_embedding(query_preprocessed)).reshape(-1)
    doc_matrix = np.asarray(doc_embeddings)

    # Cosine similarity against all documents in one matrix-vector product
    # instead of one call per document.
    denom = np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query_vec)
    # A zero-length vector has no direction; score it 0 instead of NaN.
    denom[denom == 0] = 1.0
    similarities = (doc_matrix @ query_vec) / denom

    # Stable sort on the negated scores gives descending order while keeping
    # catalogue order among ties; max() stops a negative k from slicing
    # from the end of the ranking.
    top_k_indices = np.argsort(-similarities, kind="stable")[:max(k, 0)].tolist()

    # Collect product information for the selected rows
    return [
        {
            "brand": brands[idx],
            "product": products[idx],
            "price": prices[idx],
            "category": categories[idx],
            "concern": concerns[idx],
        }
        for idx in top_k_indices
    ]

# Example free-text query used to demonstrate the recommender
query = "organic skincare products"

# Ask for the five products whose descriptions are most similar to the query
recommendations = recommend_products(query, k=5)

# Print one summary line per recommended product, most similar first
for rec in recommendations:
    print(f"Brand: {rec['brand']}, Product: {rec['product']}, Price: ${rec['price']}, Category: {rec['category']}, Concern: {rec['concern']}")