In [1]:
import pandas as pd
import numpy as np
import re
import os
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import nltk
from typing import Literal, Union

# Download each required NLTK resource only when NLTK itself cannot locate it.
# Probing with nltk.data.find respects every directory on NLTK's search path
# (e.g. %APPDATA%\nltk_data on Windows), whereas checking for "~/nltk_data"
# misses those locations and re-triggers the download on every run.
for resource_path, package_name in (
    ("corpora/stopwords", "stopwords"),
    ("tokenizers/punkt_tab", "punkt_tab"),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # resource is missing from every search directory -> fetch it once
        nltk.download(package_name, quiet=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\harri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Dense Retrieval

Vector Database vs. SVD

In [2]:
# Parent of the notebook's working directory; os.path.dirname works with any
# path separator, unlike splitting on a literal backslash (Windows-only).
PARENT_DIR = os.path.dirname(os.getcwd())
# Folder that holds the CSV datasets and the cached artefacts
DATASET_DIR = os.path.join(PARENT_DIR, "Datasets")

In [3]:
# Load the combined complaints + recalls table used throughout the notebook
df = pd.read_csv(os.path.join(DATASET_DIR, "complaints_and_recalls.csv"))

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37778 entries, 0 to 37777
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          37778 non-null  int64  
 1   CDESCR         37777 non-null  object 
 2   COMPDESC       37778 non-null  object 
 3   MMYTXT         37778 non-null  object 
 4   RECORDID       37778 non-null  object 
 5   NUMCOMPLAINTS  11624 non-null  float64
 6   IS_COMPLAINT   37778 non-null  bool   
dtypes: bool(1), float64(1), int64(1), object(4)
memory usage: 1.8+ MB


In [5]:
df.dtypes

index              int64
CDESCR            object
COMPDESC          object
MMYTXT            object
RECORDID          object
NUMCOMPLAINTS    float64
IS_COMPLAINT        bool
dtype: object

In [90]:
df["IS_COMPLAINT"].value_counts()

IS_COMPLAINT
False    26154
True     11624
Name: count, dtype: int64

# SVD Retrieval

1. Convert Documents into TF-IDF vector
2. Train Scikit-Learn's SVD algorithm on train set
3. Retrieve relevant documents given query
4. Show relevant topics

In [6]:
# TextClassifier: TF-IDF + SVD retrieval class, intended to back a Streamlit app that lets users input a text query and get the top 5 most similar documents from the corpus
import os
import pickle

class TextClassifier:
    """
    TF-IDF + truncated-SVD (LSA) retriever over one text column of a dataset.

    Typical use::

        clf = TextClassifier(csv_path, "CDESCR")
        clf.run_training_pipeline()
        rows, scores = clf.find_similar_complaints("battery dies", top=5)

    Intermediate artefacts (cleaned dataframe, vectorizer, TF-IDF matrices, SVD
    model and reduced matrices) are cached as pickle files so repeated runs are cheap.

    NOTE: the caches are read with ``pickle.load``; only use cache folders you trust,
    because unpickling untrusted files can execute arbitrary code.
    """

    # Built lazily on first use and shared by every process_text call, so the
    # stop-word list is not re-read from disk for each row of the dataframe.
    _stop_words = None
    _stemmer = None

    def __init__(self, dataset_path, column_name:str):
        """
        Args:
            dataset_path: path to the csv file (an already-loaded DataFrame is also accepted).
            column_name: name of the text column to index, e.g. "CDESCR".
        """
        # set the random state for reproducibility
        self.random_state = 42
        # create a TfidfVectorizer object
        self.vectorizer = TfidfVectorizer()
        # set the number of dimensions to reduce the vectorized data to
        self.num_dimensions = 100
        self.dataset_path = dataset_path
        self.column_name = column_name
        # name of the derived column holding the token lists (must not be reset to None)
        self.column_name_cleaned = column_name + "_CLEANED"
        # create variables to store the training, test, and validation sets for class functions
        self.df = None
        self.df_train = None
        self.df_test = None
        self.df_validation = None
        self.x_train_vect = None
        self.x_test_vect = None
        self.x_validation_vect = None
        self.lsa = None
        # SVD-reduced matrices actually used by the retrieval methods
        self.complaints_vectorized_train = None
        self.complaints_vectorized_test = None
        self.complaints_vectorized_validation = None
        # legacy placeholders kept so existing attribute reads do not break
        self.vectorized_train = None
        self.vectorized_test = None
        self.vectorized_validation = None

    def read_dataset(self):
        """Load the dataset into ``self.df`` (from csv, or copy a DataFrame passed as ``dataset_path``)."""
        if isinstance(self.dataset_path, pd.DataFrame):
            self.df = self.dataset_path.copy()
        else:
            self.df = pd.read_csv(self.dataset_path)

    @staticmethod
    def process_text(text):
        """
        Process text by tokenizing, removing stop words, and stemming

        Args:
            text: raw text; empty, NaN/None or non-string values yield an empty list.

        Returns:
            list of stemmed, alphanumeric tokens.
        """
        # make sure the text is not empty and or nan
        if not isinstance(text, str) or not text:
            return []

        if TextClassifier._stop_words is None:
            TextClassifier._stop_words = frozenset(stopwords.words("english"))
            TextClassifier._stemmer = PorterStemmer()
        stop_words = TextClassifier._stop_words
        stemmer = TextClassifier._stemmer

        # tokenize the text and remove stop words and stem the words
        stems = [stemmer.stem(word) for word in word_tokenize(text.lower()) if word not in stop_words]

        # remove the punctuation from the stemmed tokens
        return [stem for stem in stems if stem.isalnum()]

    @staticmethod
    def load_pickle_if_paths_exist(BASE_PATH, basenames_to_check : list):
        """Unpickle every ``BASE_PATH/<basename>`` that exists and return the objects in order (missing files are skipped)."""
        pickles = []
        for basename_to_check in basenames_to_check:
            candidate = os.path.join(BASE_PATH, basename_to_check)
            if os.path.exists(candidate):
                pickles.append(TextClassifier._load_pickle(candidate))

        return pickles

    @staticmethod
    def _load_pickle(path):
        """Read one pickled object from ``path``."""
        with open(path, "rb") as f:
            return pickle.load(f)

    @staticmethod
    def _dump_pickle(obj, path):
        """Write ``obj`` to ``path`` as a pickle."""
        with open(path, "wb") as f:
            pickle.dump(obj, f)

    @staticmethod
    def _top_k_indices(similarities, top):
        """Return the positions of the ``top`` largest scores, best first (ties keep original order)."""
        scores = np.asarray(similarities).ravel()
        k = max(0, min(int(top), scores.size))
        return np.argsort(-scores, kind="stable")[:k]

    def _joined_tokens(self, frame):
        """Turn the token lists of the cleaned column back into space-joined strings for the vectorizer."""
        return frame[self.column_name_cleaned].apply(" ".join)

    def process_dataframe(self, train_size=0.7, test_size=0.2, validation_size=0.1):
        """
        Clean the text column, split the data, fit TF-IDF and SVD, and cache every artefact.

        The cleaned text is stored in the ``<column_name>_CLEANED`` column.

        Args:
            train_size: fraction of rows used for training.
            test_size: kept for backward compatibility; the test share is whatever
                remains after the train and validation shares are taken.
            validation_size: fraction of rows used for validation.

        Returns:
            (df, df_train, df_test, df_validation, lsa, vectorizer)
        """
        # pickles live in the cwd with "Example" swapped for "Datasets"
        cache_dir = os.getcwd().replace("Example", "Datasets")
        os.makedirs(cache_dir, exist_ok=True)

        def cache_file(stem):
            return os.path.join(cache_dir, f"{self.column_name}_{stem}.pkl")

        # ---- 1. cleaned dataframe -------------------------------------------------
        df_cache = cache_file("df")
        df_cache_found = os.path.exists(df_cache)
        if df_cache_found:
            self.df = self._load_pickle(df_cache)
        elif getattr(self, "df", None) is None:
            # process_dataframe() was called directly, without read_dataset()
            self.read_dataset()

        # a cache written before the cleaned-column name was fixed lacks this column
        needs_cleaning = self.column_name_cleaned not in self.df.columns
        if needs_cleaning:
            self.df[self.column_name_cleaned] = self.df[self.column_name].apply(TextClassifier.process_text)
        if needs_cleaning or not df_cache_found:
            self._dump_pickle(self.df, df_cache)

        # ---- 2. train / test / validation split (70/20/10 by default) -------------
        holdout_fraction = 1 - train_size
        self.df_train, holdout = train_test_split(
            self.df, test_size=holdout_fraction, random_state=self.random_state
        )
        self.df_test, self.df_validation = train_test_split(
            holdout, test_size=(validation_size / holdout_fraction), random_state=self.random_state
        )

        # ---- 3. TF-IDF ------------------------------------------------------------
        vector_attrs = ("vectorizer", "x_train_vect", "x_test_vect", "x_validation_vect")
        vectors_from_cache = all(os.path.exists(cache_file(name)) for name in vector_attrs)
        if vectors_from_cache:
            for name in vector_attrs:
                setattr(self, name, self._load_pickle(cache_file(name)))
        else:
            # fit on the training split only, then project the other splits
            self.x_train_vect = self.vectorizer.fit_transform(self._joined_tokens(self.df_train))
            self.x_test_vect = self.vectorizer.transform(self._joined_tokens(self.df_test))
            self.x_validation_vect = self.vectorizer.transform(self._joined_tokens(self.df_validation))
            for name in vector_attrs:
                self._dump_pickle(getattr(self, name), cache_file(name))

        # ---- 4. LSA (truncated SVD) -----------------------------------------------
        lsa_attrs = (
            "lsa",
            "complaints_vectorized_train",
            "complaints_vectorized_test",
            "complaints_vectorized_validation",
        )
        # a cached SVD is only valid for the cached TF-IDF vocabulary; refit if TF-IDF was refit
        lsa_from_cache = vectors_from_cache and all(os.path.exists(cache_file(name)) for name in lsa_attrs)
        if lsa_from_cache:
            for name in lsa_attrs:
                setattr(self, name, self._load_pickle(cache_file(name)))
        else:
            # perform LSA on the vectorized data to reduce the dimensionality
            self.lsa = TruncatedSVD(n_components=self.num_dimensions, random_state=self.random_state)
            self.complaints_vectorized_train = self.lsa.fit_transform(self.x_train_vect)
            self.complaints_vectorized_test = self.lsa.transform(self.x_test_vect)
            self.complaints_vectorized_validation = self.lsa.transform(self.x_validation_vect)
            for name in lsa_attrs:
                self._dump_pickle(getattr(self, name), cache_file(name))

        return self.df, self.df_train, self.df_test, self.df_validation, self.lsa, self.vectorizer

    def _embed_query(self, query_text):
        """Clean, TF-IDF-vectorize and SVD-reduce a query string; returns a (1, num_dimensions) array."""
        if getattr(self, "lsa", None) is None:
            raise RuntimeError("Run process_dataframe() or run_training_pipeline() before querying.")
        query_tokens = TextClassifier.process_text(query_text)
        query_tfidf = self.vectorizer.transform([" ".join(query_tokens)])
        return self.lsa.transform(query_tfidf)

    # Functionalize the process of finding the single most similar complaint to a query text
    def find_similar_complaint(self, query_text:str):
        """
        Find the most similar complaint to a query text in the training set

        Returns:
            the row (pandas Series) of ``df_train`` with the highest cosine similarity.
        """
        query_lsa = self._embed_query(query_text)
        # cosine similarity between the query vector and every training complaint
        similarities = cosine_similarity(query_lsa, self.complaints_vectorized_train)
        # single query row, so the flat argmax is the training-row position
        best_position = similarities.argmax()
        return self.df_train.iloc[best_position]

    def find_similar_complaints(self, query_text:str, 
                                    dataset_to_check : Literal[
                                        "train", 
                                        "validation",
                                        "test"    
                                    ] = "train",
                                    top=5):
        """
        Find the ``top`` most similar complaints to a query text in the chosen split

        Args:
            query_text: free-text query.
            dataset_to_check: which split to search ("train", "validation" or "test").
            top: number of results to return (capped at the size of the split).

        Returns:
            (rows, similarities): the matching rows of the chosen split, best match
            first, and a 1-D array with their cosine similarities in the same order.

        Raises:
            ValueError: if ``dataset_to_check`` is not one of the three split names.
            RuntimeError: if the pipeline has not been run yet.
        """
        if dataset_to_check not in ("train", "validation", "test"):
            raise ValueError(
                f"dataset_to_check must be 'train', 'validation' or 'test', got {dataset_to_check!r}"
            )

        query_lsa = self._embed_query(query_text)

        # rows and reduced vectors must come from the SAME split
        split_frame = getattr(self, "df_" + dataset_to_check)
        split_matrix = getattr(self, "complaints_vectorized_" + dataset_to_check)

        # cosine_similarity returns shape (1, n_rows); keep the single query row
        similarities = cosine_similarity(query_lsa, split_matrix)[0]
        best_positions = self._top_k_indices(similarities, top)

        return split_frame.iloc[best_positions], similarities[best_positions]

    def run_training_pipeline(self):
        """Load the dataset, then clean, split, vectorize and reduce it."""
        # Load dataset in self.df (read_dataset takes no arguments; it uses self.dataset_path)
        self.read_dataset()

        # Process DataFrame and run SVD
        self.process_dataframe()

# run the below code if main script
# if __name__ == "__main__":
#     # https://www.nhtsa.gov/nhtsa-datasets-and-apis#recalls
#     # read in C:\Repo\SIADs_Audio_Text_SRS\Example\COMPLAINTS_RECEIVED_2025-2025.txt into a pandas dataframe, where the columns are RCL
#     df_complaints = pd.read_csv("C:\\Repo\\SIADs_Audio_Text_SRS\\Datasets\\COMPLAINTS_RECEIVED_2025-2025.txt", sep='\t', header=None, index_col=0)
#     df_complaints.columns = ['ODINO', 'MFR_NAME', 'MAKETXT', 'MODELTXT', 'YEARTXT', 'CRASH', 'FAILDATE', 'FIRE', 'INJURED', 'DEATHS', 'COMPDESC', 'CITY', 'STATE', 'VIN', 'DATEA', 'LDATE', 'MILES', 'OCCURENCES', 'CDESCR', 'CMPL_TYPE', 'POLICE_RPT_YN', 'PURCH_DT', 'ORIG_OWNER_YN', 'ANTI_BRAKES_YN', 'CRUISE_CONT_YN', 'NUM_CYLS', 'DRIVE_TRAIN', 'FUEL_SYS', 'FUEL_TYPE',
#                 'TRANS_TYPE', 'VEH_SPEED', 'DOT', 'TIRE_SIZE', 'LOC_OF_TIRE', 'TIRE_FAIL_TYPE', 'ORIG_EQUIP_YN', 'MANUF_DT', 'SEAT_TYPE', 'RESTRAINT_TYPE', 'DEALER_NAME', 'DEALER_TEL', 'DEALER_CITY', 'DEALER_STATE', 'DEALER_ZIP', 'PROD_TYPE', 'REPAIRED_YN', 'MEDICAL_ATTN', 'VEHICLES_TOWED_YN']

#     # create a list of unique manufacturers in the "MFR_NAME" column
#     list_of_manufacturers = df_complaints["MFR_NAME"].unique()

#     # testing out the functionalized code

#     # call the TextClassifier class and create an instance of it as text_classifier
#     # pass in the df_complaints dataframe and the "CDESCR" column
#     text_classifier = TextClassifier(df_complaints, "CDESCR")
#     # process the text in the "CDESCR" column
#     text_classifier.process_dataframe()

#     # use one of the complaints in the test set as a query to find the most similar complaint in the training set
#     #complaint_test_query = text_classifier.df_test["CDESCR"].iloc[5]
#     #complaint_test_query = text_classifier.df_test["CDESCR"].iloc[4]
#     #complaint_test_query = "Car won't start and makes a clicking noise"
#     complaint_test_query = "Battery dies after a few days of not driving the car"

#     print(complaint_test_query)
#     # find the most similar complaint to the complaint test
#     most_similar_complaint = text_classifier.find_similar_complaint(complaint_test_query)
#     # print the most similar complaint with the below columns
#     print(most_similar_complaint[["ODINO", "MFR_NAME", "MAKETXT", "MODELTXT", "YEARTXT", "CDESCR"]])
#     print(most_similar_complaint["CDESCR"])

# Dense Embedding Retrieval

1. Convert documents into vector embeddings (e.g. all-MiniLM for general-purpose embeddings, or RoBERTa for more semantic embeddings)
2. Split documents into chunks and store in vector index/database. 
3. Send query to vector database

In [7]:
from sentence_transformers import SentenceTransformer
import torch

In [72]:
class HuggingFaceEmbeddings:
    """
    Dense retriever: embeds one text column with a sentence-transformers model
    and answers queries by similarity against the stored embeddings.

    Embeddings are cached at
    ``<dataset_path>/huggingface/<column_name>/<model_name>.pkl``.

    NOTE: the cache is read with ``pickle.load``; only use cache folders you trust,
    because unpickling untrusted files can execute arbitrary code.
    """

    def __init__(self, dataset_path, model_name, column_name):
        """
        Args:
            dataset_path: folder containing ``complaints_and_recalls.csv``.
            model_name: sentence-transformers model identifier.
            column_name: text column to embed.
        """
        self.df = pd.read_csv(os.path.join(dataset_path, "complaints_and_recalls.csv"))
        self.model = SentenceTransformer(model_name)
        self.column_name = column_name
        self.model_name = model_name
        self.dataset_path = dataset_path
        # populated by encode_embeddings()
        self.embeddings = None

    @staticmethod
    def split_text_by_period(text, max_length=200):
        """
        Split ``text`` into chunks made of whole sentences.

        Sentences are accumulated until adding one would reach ``max_length``
        characters; that sentence still goes into the chunk, which is then closed
        (so a chunk may exceed ``max_length`` by its final sentence).

        Args:
            text: raw text; non-string values (e.g. NaN) yield an empty list.
            max_length: soft character limit per chunk.

        Returns:
            list of chunk strings.
        """
        if not isinstance(text, str):
            return []

        chunks = []
        current_chunk = ""

        for piece in re.split(r'\.\s+', text):  # Split by period + space
            bare_sentence = piece.strip()
            if not bare_sentence:
                # trailing "period + whitespace" leaves an empty piece; skip it
                continue
            # the split removes the period, except on the final sentence
            if bare_sentence.endswith((".", "!", "?")):
                sentence = bare_sentence
            else:
                sentence = bare_sentence + "."

            if len(current_chunk) + len(bare_sentence) < max_length:
                current_chunk += sentence + " "
            else:
                chunks.append((current_chunk + sentence).strip())
                current_chunk = ""  # Reset current chunk

        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks

    def dataset_processing(self):
        """
        Replace ``self.df`` with one row per text chunk (columns: "index" and
        ``<column_name>_CLEANED``).

        NOTE(review): after this call ``self.df`` no longer contains
        ``column_name``, which encode_embeddings() reads; run_training_pipeline()
        does not call this method.
        """
        cleaned_column = self.column_name + "_CLEANED"
        self.df[cleaned_column] = self.df[self.column_name].apply(self.split_text_by_period)
        self.df = self.df[["index", cleaned_column]].explode(cleaned_column)

    def encode_embeddings(self):
        '''
        Compute the embeddings for ``column_name`` or load them from the cache.

        TODO: Consider GPU acceleration
        '''
        embedding_path = os.path.join(self.dataset_path, "huggingface", self.column_name)
        embedding_file = os.path.join(embedding_path, f"{self.model_name}.pkl")

        # check the model-specific file, not just the folder, so several models
        # can be cached side by side for the same column
        if os.path.exists(embedding_file):
            print("Embeddings found. Loading embeddings.")
            with open(embedding_file, "rb") as f:
                self.embeddings = pickle.load(f)
            return

        # model names may contain "/" (org/model), hence dirname of the file
        os.makedirs(os.path.dirname(embedding_file), exist_ok=True)
        # missing descriptions become empty strings so row order stays aligned with self.df
        texts = self.df[self.column_name].fillna("").astype(str).tolist()
        self.embeddings = self.model.encode(texts, show_progress_bar=True)
        with open(embedding_file, "wb") as f:
            pickle.dump(self.embeddings, f)

    def query_embeddings(self, query, top=20):
        """
        Return the ``top`` rows most similar to ``query``.

        Args:
            query: free-text query.
            top: number of results (capped at the number of stored embeddings).

        Returns:
            (rows, values): matching rows of ``self.df``, best first, and the
            similarity tensor returned by ``torch.topk``.

        Raises:
            RuntimeError: if encode_embeddings() has not been run.
        """
        if getattr(self, "embeddings", None) is None:
            raise RuntimeError("Run encode_embeddings() or run_training_pipeline() before querying.")

        query_embedding = self.model.encode(query)
        model_similarities = self.model.similarity(
            query_embedding,
            self.embeddings
        )
        # topk raises if k is larger than the number of candidates
        k = min(top, model_similarities.shape[1])
        values, indices = torch.topk(model_similarities, largest=True, k=k, dim=1)
        return self.df.iloc[indices[0].tolist()], values

    def run_training_pipeline(self):
        '''
            Technically no training is done.
            Strictly speaking, this is simply a processing pipeline
        '''
        self.encode_embeddings()



In [57]:
# Build the dense retriever: reads complaints_and_recalls.csv from DATASET_DIR
# and loads the all-MiniLM-L6-v2 sentence-transformers model for the CDESCR column
hg = HuggingFaceEmbeddings(
    dataset_path=DATASET_DIR,
    model_name="all-MiniLM-L6-v2",
    column_name="CDESCR"
)

In [58]:
# Encode the CDESCR column, or load previously pickled embeddings from disk
hg.run_training_pipeline()

Embeddings found. Loading embeddings.


In [77]:
returned_docs, sims = hg.query_embeddings(
    query="Car won't start and makes a clicking noise",
    top=50
)

In [78]:
sims

tensor([[0.6604, 0.6346, 0.6278, 0.6207, 0.6035, 0.5744, 0.5667, 0.5662, 0.5593,
         0.5474, 0.5468, 0.5438, 0.5316, 0.5225, 0.5172, 0.5157, 0.5112, 0.5072,
         0.5061, 0.5052, 0.5038, 0.5028, 0.4983, 0.4963, 0.4943, 0.4907, 0.4894,
         0.4758, 0.4758, 0.4757, 0.4735, 0.4734, 0.4730, 0.4717, 0.4698, 0.4697,
         0.4694, 0.4664, 0.4649, 0.4620, 0.4613, 0.4607, 0.4590, 0.4569, 0.4545,
         0.4530, 0.4505, 0.4488, 0.4465, 0.4454]])

In [82]:
# Widen text columns so long complaint descriptions are readable in the output
pd.set_option("display.max_colwidth", 400)

In [None]:
returned_docs, sims = hg.query_embeddings(
    query="Car won't start and makes a clicking noise",
    top=50
)

In [84]:
returned_docs.head(5)

Unnamed: 0,index,CDESCR,COMPDESC,MMYTXT,RECORDID,NUMCOMPLAINTS,IS_COMPLAINT
1213,1213,Clicking noise when turning,"POWER TRAIN, SUSPENSION",HONDA PROLOGUE 2024,11638680,2.0,True
10951,10951,When turning the wheel there is Clicking sounds. Something is definitely wrong with this car.,"SUSPENSION, UNKNOWN OR OTHER, STEERING",HONDA PROLOGUE 2024,11637054,3.0,True
5952,5952,"The car makes a clicking noise when I make sharp turns, most co.monly to the left. It is brand new with only 2320 miles on it. The noise started within the last few weeks. The rear breaking system locked up the brakes and turns the emergency break on, when backing up, even if nothing is behind me. Once it was rain drops dripping off of the rear window. Another time, nothing was there. I had to...","FORWARD COLLISION AVOIDANCE: AUTOMATIC EMERGENCY BRAKING, BACK OVER PREVENTION: REARVIEW SYSTEM BRAKING, SUSPENSION",HONDA PROLOGUE 2024,11642088,3.0,True
5953,5953,"The car makes a clicking noise when I turn. It is brand new with only 1100 miles on it. The noise started within the last few weeks. While the dealer claimed the car is safe to drive, I am concerned a part may fail while I am on the road.","SUSPENSION, STEERING",HONDA PROLOGUE 2024,11640808,2.0,True
421,421,"A few months after I purchased the car in July, I started hearing a clicking noise when turning from a full stop. In December I was told it was axel issue. It’s not yet been resolved. I’m still driving the car.",UNKNOWN OR OTHER,HONDA PROLOGUE 2024,11637802,1.0,True


In [81]:
returned_docs["IS_COMPLAINT"].value_counts()

IS_COMPLAINT
True    50
Name: count, dtype: int64

In [76]:
returned_docs["CDESCR"].iloc[4]

'A few months after I purchased the car in July, I started hearing a clicking noise when turning from a full stop. In December I was told it was axel issue. It’s not yet been resolved. I’m still driving the car. '

In [None]:
returned_docs, sims = hg.query_embeddings(
    query="Car won't start and makes a clicking noise",
    top=50
)

In [85]:
returned_docs, sims = hg.query_embeddings(
    query="Bad smell overheating",
    top=50
)

In [88]:
sims

tensor([[0.5880, 0.5559, 0.5389, 0.5370, 0.5323, 0.5210, 0.5096, 0.4913, 0.4862,
         0.4762, 0.4754, 0.4722, 0.4673, 0.4656, 0.4648, 0.4638, 0.4607, 0.4595,
         0.4584, 0.4577, 0.4542, 0.4540, 0.4478, 0.4463, 0.4390, 0.4371, 0.4361,
         0.4347, 0.4339, 0.4301, 0.4287, 0.4286, 0.4279, 0.4247, 0.4242, 0.4230,
         0.4216, 0.4207, 0.4191, 0.4187, 0.4170, 0.4132, 0.4111, 0.4100, 0.4100,
         0.4089, 0.4074, 0.4073, 0.4067, 0.4064]])

In [87]:
returned_docs.head(5)

Unnamed: 0,index,CDESCR,COMPDESC,MMYTXT,RECORDID,NUMCOMPLAINTS,IS_COMPLAINT
2057,2057,Front heater fan smelling of smoke,ELECTRICAL SYSTEM,LION LIONC 2024,11642408,1.0,True
4000,4000,"My Jeep eats up thermostats too often. I have had it for 3 years and had it replaced 4 times. The last time , one year ago, it took the repair shop a week to finally install one that would hold. It needs it again. I also smell something burning when I accelerate onto the freeway since owning it. My engines fan runs high aIl the time also. I read many complaints online about people having overh...",ENGINE,JEEP GRAND CHEROKEE 2014,11635444,1.0,True
1314,1314,Coolant is leaking and causing engine to overheat.,ENGINE,CHEVROLET TRAX 2017,11640557,1.0,True
2848,2848,"I just bought this used car and just drove around town. It has 101,724 miles and it smells like something burning when I start it up and when I drive it. I know it's coming from somewhere under the hood. I'm paying 9500.00 for this car and already have burnt smell.",UNKNOWN OR OTHER,JEEP CHEROKEE 2016,11640183,1.0,True
10873,10873,When my heat is on there is a burning smell when the heat on and my right side passages door will not allow me to open the door from the inside or out sides,UNKNOWN OR OTHER,HYUNDAI SONATA 2013,11637091,1.0,True


In [None]:
returned_docs, sims = hg.query_embeddings(
    query="Bad smell overheating",
    top=50
)

In [91]:
returned_docs, sims = hg.query_embeddings(
    query="Door opens even when child safety mode is activated",
    top=50
)

In [92]:
returned_docs.head(5)

Unnamed: 0,index,CDESCR,COMPDESC,MMYTXT,RECORDID,NUMCOMPLAINTS,IS_COMPLAINT
10937,10937,"When turning on the child safety locks, the doors repeat a clicking sound for a few minutes sounding as if the doors are locking. After the noise stops, the dashboard shows the message ""child safety lock failure"". My small child has been winding the window down since I cannot lock the windows and doors properly. I am afraid she will get a body part caught in the window or open the door while d...",ELECTRICAL SYSTEM,HYUNDAI PALISADE 2021,11642064,1.0,True
1206,1206,"Child safety locks failed and clicked attempting to engage until turned off.With the locks unable to engage, children able to utilize windows and open doors which put their safety at risk. Replaced under warranty.",ELECTRICAL SYSTEM,HYUNDAI PALISADE 2021,11643828,1.0,True
1205,1205,"Child Safety Lock Failure notice - both parked or driving, I will hear a clicking noise coming from the doors. Sometimes I will receive a Child Safety Lock Failure error message on the dash. This would sporadically continue to happen while driving and even if my vehicle is parked and off and I happen to do something like open the doors. The vehicle has been serviced numerous times by Hyundai d...",ELECTRICAL SYSTEM,HYUNDAI PALISADE 2021,11634797,1.0,True
3733,3733,"Loaded children into second row of vehicle, confirmed all doors closed. Began driving, passenger rear door swung open. Child DID NOT engage door handle. Pulled over to attempt to close door. Door latch would not engage, could not close or lock door. Still unable to close and lock door.",LATCHES/LOCKS/LINKAGES:DOORS:LATCH,VOLVO XC90 2016,11640401,1.0,True
5846,5846,The back doors will no longer open when unlocked. This poses a safety risk. I was unable to get my child out of his car seat when the locks or locking mechanism went out. I had to climb into the back seat from the front. The door do not open either from the interior or exterior of the vehicle. A mechanic stated that this could be from faulty solenoids. I was unable to get a print out of a quot...,"UNKNOWN OR OTHER, STRUCTURE:BODY",HYUNDAI SONATA 2013,11634770,2.0,True


In [94]:
returned_docs, sims = hg.query_embeddings(
    query="Mercedes Benz issue with fuel leakage",
    top=50
)

In [95]:
returned_docs.head(5)

Unnamed: 0,index,CDESCR,COMPDESC,MMYTXT,RECORDID,NUMCOMPLAINTS,IS_COMPLAINT
2110,2110,"Fuel tank is leaking at connection between fill tube and tank. Multiple recalls have been issued apparently for similar problems with cars of other years. With less than 1/2 tank, the fuel leak is not as pronounced. With the tank full, the tank drips fuel. Fire department in Riverside was called and confirmed issue. Upon bringing vehicle to Clearwater Mercedes and other mechanics, the problem ...",FUEL/PROPULSION SYSTEM,MERCEDES BENZ C300 2013,11643602,1.0,True
2089,2089,Fuel leaking from engine,"ENGINE, FUEL/PROPULSION SYSTEM",CHEVROLET MALIBU 2017,11641448,2.0,True
9310,9310,"The seal that connects the fuel level sensor to the fuel tank has failed, allowing fuel to leak onto the top of the tank when it’s filled. This creates a serious risk of fire or explosion. Unfortunately, Mercedes does not offer a remedy unless you pay a prohibitively high fee.",FUEL/PROPULSION SYSTEM,MERCEDES BENZ E550 2008,11638060,1.0,True
27123,15499,"ON CERTAIN SPORT UTILITY VEHICLES, THE FUEL RETURN HOSE BETWEEN FUEL TANK AND RETURN LINE CAN BE PERMEATED WITH FUEL. THIS SITUATION RESULTS FROM THE EXPANSION AND CONTRACTION OF THE HOSE UNDER PRESSURE CAUSING MICROCRACKS IN THE HOSE MATERIAL.\n\n THESE MICROCRACKS MAY CAUSE FUEL LEAKAGE. FUEL LEAKAGE IN THE PRESENCE OF AN IGNITION SOURCE COULD RESULT IN A FIRE.\n\n DEALERS WILL INSPECT AND R...","FUEL SYSTEM, GASOLINE:DELIVERY:HOSES, LINES/PIPING, AND FITTINGS","MERCEDES BENZ G55 2004, MERCEDES BENZ G55 2005","55276, 55275",,False
18446,6822,"FUEL LEAKAGE MAY OCCUR IN THE ENGINE BAY AT FUEL LINE CONNECTIONS OR AT THE FUEL PUMP.\n\n \n\n THE DEALER WILL INSPECT THE FUEL LINES AND FUEL PUMP AND WILL REPLACE THEM, IF NECESSARY, WITHOUT CHARGE.","FUEL SYSTEM, GASOLINE:DELIVERY","TRIUMPH SPITFIRE 1974, TRIUMPH SPITFIRE 1971, TRIUMPH SPITFIRE 1976, TRIUMPH SPITFIRE 1972, TRIUMPH SPITFIRE 1973, TRIUMPH SPITFIRE 1975","40637, 40634, 40639, 40638, 40636, 40635",,False


# Compare Embedding Results

Run Training Pipeline for all Models