In [1]:
import pandas as pd 
from tqdm import tqdm 
from scipy.spatial.distance import cosine
import ast 

In [2]:
# Load the reference records (the frame that test rows are later matched against).
df = pd.read_excel(
    "dataset/preprocessed_updated_20_search_and_cf_data-2.xlsx",
    engine="openpyxl",
)

In [3]:
# Preview the first five rows of the reference frame.
df.head(n=5)

Unnamed: 0,veh_model,veh year,veh_loc,veh_mile,cust_complaint,repr_comments,cmpnt_cat_desc,cmpnt_code,cmpnt_symp_txt,TREAD_cat
0,Audi,2024,NC,30945,the airbag warning light in my audi wont turn off,identified as a switch problem the issue has b...,Airbag,253,warning light on delayed deployment,SUSPENSION
1,Audi,2016,TX,30316,the airbag is making a buzzing noise very anno...,fixed a loose wiring of the airbag assembly,Airbag,261,annoying noise,MEASURE_EQUIPMENTS
2,Audi,2010,IL,37217,my audis fuel gauge is stuck showing full even...,the fuel gauge sticking issue was due to a fau...,Fuel Gauge,281,incorrect reading fluctuates erratically,MEASURE_EQUIPMENTS
3,Audi,2013,WA,13308,the airbag warning light in my audi wont turn off,identified as a switch problem the issue has b...,Airbag,217,warning light on delayed deployment,AIRBAG
4,Audi,2022,NY,10949,the air suspension system of my audi seems to ...,air suspension fault traced to a leaky air str...,Air Suspension,102,vehicle sagging rough ride,AIRBAG


In [4]:
# Dataset cleaning helpers.
# Characters stripped from text. The backslash is written as an explicit `\\`
# escape: the bare `\,` form is an invalid escape sequence that newer Python
# versions warn about, while the resulting set of characters is the same.
punctuations = '''!()-[]{};:'"\\,<>./?@#$%^&*_~।|'''


def punctuations_remover(text):
    """Return `text` lower-cased with punctuation and numeric characters removed.

    Every character that appears in `punctuations`, or for which
    `str.isnumeric()` is true, is dropped. The remaining characters are joined,
    lower-cased, and stripped of leading/trailing whitespace. Inner whitespace
    is left untouched, so removing a token may leave consecutive spaces.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text (an empty string when nothing survives).
    """
    kept_chars = [
        char
        for char in text
        if char not in punctuations and not char.isnumeric()
    ]
    # Joining once avoids rebuilding the string for every kept character.
    return "".join(kept_chars).lower().strip()

In [5]:
# Load the records that need to be matched against the reference frame.
test_data = pd.read_excel(
    "dataset/preprocessed_without ghij-2.xlsx",
    engine="openpyxl",
)

In [6]:
# Pool every customer complaint and repair comment from both frames into a
# single list named `corpus` (test-data texts first, then reference-data texts).
# NOTE(review): `corpus` is not referenced again in the visible cells — confirm
# whether it is still needed.
corpus = [
    text
    for column in (
        test_data.cust_complaint,
        test_data.repr_comments,
        df.cust_complaint,
        df.repr_comments,
    )
    for text in column.to_list()
]

In [7]:
import torch
from transformers import BertModel
from transformers import AutoTokenizer

# Pick CUDA when a GPU is visible to PyTorch, otherwise fall back to the CPU.
gpu_available = torch.cuda.is_available()
device = torch.device("cuda" if gpu_available else "cpu")

# Pretrained BERT weights, moved onto the selected device.
model = BertModel.from_pretrained("bert-base-uncased")
model.to(device)

# Tokenizer matching the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    
def vector_embedding_converter(sentence):
    """Encode a whole sentence into a single BERT vector.

    The previous implementation looked up the static word embedding of the
    FIRST word-piece only, so every sentence starting with the same token
    received an identical vector, and an empty sentence raised IndexError.
    This version feeds the full (truncated) sentence through the BERT encoder
    and averages the final hidden states over all token positions.

    Parameters
    ----------
    sentence : str
        Text to encode. An empty string is accepted (only the special tokens
        added by the tokenizer are encoded in that case).

    Returns
    -------
    torch.Tensor
        Tensor of shape (1, hidden_size) on `device`, i.e. the same layout the
        previous implementation returned, so it can be compared with
        `torch.nn.CosineSimilarity(dim=1)`.
    """
    # Tokenize the full sentence (special tokens included); truncation keeps
    # long texts within the model's maximum input length.
    encoded = tokenizer(sentence, return_tensors="pt", truncation=True).to(device)
    # Inference only: no gradients are needed for similarity search.
    with torch.no_grad():
        token_states = model(**encoded).last_hidden_state  # (1, seq_len, hidden_size)
    # A single un-padded sentence is encoded, so a plain mean over the token
    # axis is the mean over real tokens.
    sentence_embedding = token_states.mean(dim=1)
    return sentence_embedding


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# For the reference frame and then the test frame: join complaint and repair
# comment into one text column, and store the vector that
# vector_embedding_converter produces for that text.
for frame in (df, test_data):
    frame['combined_cust_repr'] = frame.cust_complaint + " " + frame.repr_comments
    frame['combined_cust_repr_vector'] = frame.combined_cust_repr.apply(
        vector_embedding_converter
    )

In [9]:
# Reference vectors to search through, pulled out of the DataFrame as a plain list.
cust_repr_vectors = df['combined_cust_repr_vector'].tolist()
# PyTorch cosine-similarity module computed along dimension 1.
cos = torch.nn.CosineSimilarity(dim=1)
# Collects one similarity score per call to best_matching_finder.
best_cos_score = []

def best_matching_finder(test_df_vector):
    """Find the reference vector most similar to `test_df_vector`.

    Compares `test_df_vector` against every entry of the module-level
    `cust_repr_vectors` list using `cos`, and records the winning similarity
    in the module-level `best_cos_score` list.

    The previous version appended the similarity of the LAST comparison
    instead of the best one, so the recorded scores did not correspond to the
    returned index; the best score is appended now.

    Parameters
    ----------
    test_df_vector : torch.Tensor
        Vector to look up; must be compatible with the entries of
        `cust_repr_vectors` under `cos` (dimension 1 is reduced).

    Returns
    -------
    int
        Position in `cust_repr_vectors` of the most similar vector. Ties keep
        the earliest position. Returns 0 (and records -1) when
        `cust_repr_vectors` is empty.
    """
    # Start at -1 so that the first computed similarity normally replaces it.
    best_score = -1
    best_index = 0
    for index, df_vector in enumerate(cust_repr_vectors):
        # First element of the similarity tensor, as a Python float.
        score = cos(df_vector, test_df_vector)[0].tolist()
        # Strict comparison: an equal score never displaces an earlier match.
        if score > best_score:
            best_score = score
            best_index = index
    # Record the score that belongs to the returned index.
    best_cos_score.append(best_score)
    return best_index


In [10]:
# Start from an empty score list so that re-running this cell does not stack
# new scores on top of those from a previous run (that would make the score
# list longer than test_data and break the later `similarity` assignment).
best_cos_score.clear()

# Row position in df of the best reference match for every test row, in
# test_data row order.
best_matching_indices = [
    best_matching_finder(com_repr_vector)
    for com_repr_vector in test_data.combined_cust_repr_vector.to_list()
]

# Copy the matched reference labels into test_data. `.to_numpy()` makes the
# assignment positional, so it does not depend on the index labels of either
# frame (the row-by-row `.at[index, ...]` writes assumed labels 0..n-1).
test_data['Matched cmpnt_cat_desc'] = df['cmpnt_cat_desc'].iloc[best_matching_indices].to_numpy()
test_data['Matched cmpnt_symp_txt'] = df['cmpnt_symp_txt'].iloc[best_matching_indices].to_numpy()
test_data['Matched cmpnt_code'] = df['cmpnt_code'].iloc[best_matching_indices].to_numpy()

In [11]:
# NOTE(review): the `combined_cust_repr_vector` column is still part of
# test_data when it is exported below — confirm whether it should be dropped
# before writing the workbook.
# Store each recorded cosine score multiplied by 100.
test_data['similarity'] = [score * 100 for score in best_cos_score]
# Write the matched results to an Excel workbook without the index column.
test_data.to_excel(
    'Bert Global pretrained output direct compalint and repr matching.xlsx',
    index=False,
)