In [4]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings, counting only positions enabled by the mask.

    Args:
        model_output: Indexable model result whose element 0 holds the
            per-token embeddings (assumed shape: batch x tokens x hidden).
        attention_mask: Mask with one entry per token (assumed batch x tokens);
            it is broadcast to the embeddings' shape and used as a weight.

    Returns:
        Tensor obtained by summing the weighted embeddings over dimension 1
        and dividing by the per-row mask total.
    """
    hidden_states = model_output[0]
    # Stretch the mask over the hidden dimension so it multiplies element-wise.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * mask).sum(dim=1)
    # Clamp the divisor so a fully masked row yields zeros instead of NaN.
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / mask_total

# Read the dataset into a DataFrame
NIC_XLSX_PATH = '/kaggle/input/nic-code/NIC.xlsx'

# Keep only rows usable for matching: they need a Sub-class code to report and
# a Description to embed. `subset` is given as a list because a bare string
# label is only accepted by newer pandas releases.
df_nic = pd.read_excel(NIC_XLSX_PATH).dropna(subset=["Sub-class", "Description"])


In [5]:
# Query text to match against the NIC descriptions
# user_input = "Develop processes for health care solutions, sustainable solutions, Affordable solutions, enabling the deskilled, Inclusive solutions FOR ALL strata of humanity"
user_input="cleaning and washing services"

MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
EMBED_BATCH_SIZE = 64  # descriptions per forward pass; lower it if memory is tight

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()  # inference only: make sure dropout is disabled


def embed_texts(texts, batch_size=EMBED_BATCH_SIZE):
    """Return one L2-normalised, mean-pooled embedding per input text.

    Texts are processed in chunks of `batch_size` so the whole corpus never
    has to go through the model in a single forward pass.

    Args:
        texts: List of strings to embed.
        batch_size: Maximum number of texts per forward pass.

    Returns:
        Tensor with one row per text, in the same order as `texts`. An empty
        list gives a tensor with zero rows.
    """
    if not texts:
        return torch.empty(0, model.config.hidden_size)

    pooled_batches = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        encoded = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            output = model(**encoded)
        pooled = mean_pooling(output, encoded['attention_mask'])
        pooled_batches.append(F.normalize(pooled, p=2, dim=1))
    return torch.cat(pooled_batches, dim=0)


# Embed the query (a batch of one) and every row of the "Description" column.
user_embedding = embed_texts([user_input])

# Coerce to str so a stray missing/non-text cell cannot break tokenisation;
# rows are not dropped here, so positions stay aligned with df_nic.
description_texts = df_nic['Description'].fillna('').astype(str).tolist()
description_embeddings = embed_texts(description_texts)

# Compute cosine similarity between the user embedding and each description embedding
# (the single query row is broadcast against all description rows).
similarity_scores = F.cosine_similarity(user_embedding, description_embeddings)


In [6]:
# Order all description positions by similarity, highest first, and keep the first 7.
ranked_positions = similarity_scores.argsort(descending=True)
best_match_indices = ranked_positions[:7].tolist()

# Look up each surviving position once; positions beyond the frame are skipped.
row_count = len(df_nic)
matched_rows = [df_nic.iloc[position] for position in best_match_indices if position < row_count]
top_nic_codes = [row['Sub-class'] for row in matched_rows]
top_nic_descriptions = [row['Description'] for row in matched_rows]

# Print the top matching NIC codes and descriptions
for nic_code, nic_description in zip(top_nic_codes, top_nic_descriptions):
    print("NIC Code:", nic_code)
    print("Description:", nic_description)
    print()


NIC Code: 81292.0
Description: Cleaning of industrial machinery

NIC Code: 81299.0
Description: Other building and industrial cleaning activities

NIC Code: 95221.0
Description:  Repair and servicing of household appliances ( refrigerators, stoves, washing

NIC Code: 96010.0
Description:  Washing and (dry-) cleaning of textile and fur products

NIC Code: 81210.0
Description: General cleaning of buildings

NIC Code: 96908.0
Description:  General household maintenance activities like grooming of the floor, dusting,

NIC Code: 81291.0
Description: Cleaning of trains buses, planes etc.

