In [16]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Attention-mask-aware mean pooling: masked-out tokens contribute nothing to the average
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dim 1, weighting each token by its mask value.

    Args:
        model_output: Indexable model result; element 0 is used as the token
            embeddings.
        attention_mask: Mask tensor; a trailing dimension is added and it is
            expanded to the size of the token embeddings before weighting.

    Returns:
        The mask-weighted sum of the embeddings over dim 1 divided by the mask
        sum over dim 1. The divisor is clamped to at least 1e-9 so an all-zero
        mask yields zeros instead of a division by zero.
    """
    embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    weighted_sum = (embeddings * mask).sum(1)
    mask_total = mask.sum(1).clamp(min=1e-9)
    return weighted_sum / mask_total

# Read the dataset and keep only the rows that have a "Sub-class" value.
# `subset` is given as a one-element list: a list of labels is accepted by every
# pandas version, whereas a bare string is only supported by newer releases.
# Loading and filtering in a single expression keeps the cell idempotent on re-run.
df_nic = pd.read_excel('NIC.xlsx').dropna(subset=["Sub-class"])


In [18]:
# Configuration for the semantic search below.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
BATCH_SIZE = 64  # number of texts sent through the model per forward pass
TOP_K = 3        # how many best-matching descriptions to keep

# Query text to match against the "Description" column.
# Alternative example query:
# user_input = "Develop processes for health care solutions, sustainable solutions, Affordable solutions, enabling the deskilled, Inclusive solutions FOR ALL strata of humanity"
user_input = "tax legal information services"


def embed_texts(texts, tokenizer, model, batch_size=64):
    """Return L2-normalised, mean-pooled embeddings for a list of strings.

    The texts are processed in chunks of `batch_size` so that memory use per
    forward pass is bounded by the chunk size rather than by len(texts).

    Args:
        texts: Non-empty list of strings.
        tokenizer: Tokenizer called with padding/truncation and
            return_tensors='pt'.
        model: Model called with the tokenizer output; its result is passed to
            `mean_pooling`.
        batch_size: Maximum number of texts per forward pass.

    Returns:
        A tensor with one row per input text, in input order.

    Raises:
        ValueError: If `texts` is empty.
    """
    if len(texts) == 0:
        raise ValueError("texts must contain at least one string")

    chunks = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        encoded = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():  # inference only, no gradients needed
            output = model(**encoded)
        pooled = mean_pooling(output, encoded['attention_mask'])
        chunks.append(F.normalize(pooled, p=2, dim=1))
    return torch.cat(chunks, dim=0)


# Load the tokenizer and model once and reuse them for query and descriptions.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()  # make sure dropout is disabled for inference

# Embed the user input (wrapped in a list so the result has shape (1, dim)).
user_embedding = embed_texts([user_input], tokenizer, model)

# Embed every row of the "Description" column. Missing values are replaced by
# an empty string and everything is cast to str, because the tokenizer only
# accepts strings; rows are not dropped so positions stay aligned with df_nic.
descriptions = df_nic['Description'].fillna('').astype(str).tolist()
description_embeddings = embed_texts(descriptions, tokenizer, model, batch_size=BATCH_SIZE)

# Cosine similarity between the user embedding and each description embedding
# (the single query row is broadcast against all description rows).
similarity_scores = F.cosine_similarity(user_embedding, description_embeddings)

# Indices of the TOP_K most similar descriptions, best first.
# torch.topk is used because reversing a tensor with [::-1] is not supported.
# The indices are row *positions*, so look rows up with df_nic.iloc[...].
top_k = min(TOP_K, similarity_scores.numel())
best_match_scores, best_match_indices = torch.topk(similarity_scores, k=top_k)


ImportError: Unable to convert output to PyTorch tensors format, PyTorch is not installed.

In [19]:
import torch