In [None]:
# CONSUMER COMPLAINTS

# The COMPLAINTS file contains all safety-related defect complaints received 
# by NHTSA since January 1, 1995.

# File characteristics:

# -  All the records are TAB delimited
# -  All dates are in YYYYMMDD format

# -  Maximum record length: 2824

# Change log:
# 1. Fields 21 - 46 were added on Sept. 14, 2007
# 2. Changed flat file extension from .lst to .txt on Sept. 14, 2007
# 3. Field 47 was added on Oct. 15, 2007
# 4. Field 34 was changed from CHAR(20) to CHAR(30) on Aug. 08, 2008
# 5. Field 18 was changed from NUMBER(6) to NUMBER(7) on Jun. 18, 2010
# 6. Complaint Type 'MIVQ' was added to Field 21 list on Mar. 21, 2013
# 7. Complaint Type 'MAVQ' was added to Field 21 list on Jan. 17, 2014
# 8. Field 48 was added on Apr. 24, 2014
# 9. Field 49 was added on Sept. 29, 2015
# 10. Flat file content changes May 17 - June 17, 2021
# 11. Information message updated on June 28, 2021

# =======
# May 17 - June 17, 2021 - Flat file content changes.
# * Previously blank Y/N fields (such as crash or fire) will now show as N.
# * Previously blank values for numeric fields will now show as zero.
# * Manufacturer name, make, model and component name of the product(s) in a complaint may have changed over time and the new flat file will now reflect them.
# * If a complaint had multiple dealer contacts, only a single dealer's contact information will now show.
# * Additional minor differences due to data cleanup in a relatively small number of records in the flat file.
# =======
# June 28, 2021 - Flat file content changes.
# The NHTSA system that generates the complaints file underwent an update during the weekend of May 16-17, 2021. The update has caused discrepancies between the contents of the complaints file between the version posted on Friday, May 15, 2021 (before the system update) and the versions posted daily since Monday, May 17, 2021 (after the system update) and continuing to date. Lookup of complaints on the NHTSA public website are unaffected.
# We are working to resolve the issue. In the meantime, we will continue to publish data daily as it will contain newly received complaints by NHTSA. The complaint data is included in the single FLAT_CMPL.zip and the COMPLAINTS_RECEIVED_YYYY-YYYY.zip files, which contain the same data broken down into 5-year chunks by received date. Note that the format of the complaint files has not and will not change. Once resolved, you may still see minor data differences between the latest version and that from prior to the system update.
# We will continue to update this message as needed until the issue is resolved.
# =======

# Last updated: June 28, 2021

# FIELDS:
# =======

# Field#  Name              Type/Size     Description
# ------  ---------         ---------     --------------------------------------
# 1       CMPLID            CHAR(9)       NHTSA'S INTERNAL UNIQUE SEQUENCE NUMBER.
#                                         IS AN UPDATEABLE FIELD,THUS DATA FOR A
#                                         GIVEN RECORD POTENTIALLY COULD CHANGE FROM
#                                         ONE DATA OUTPUT FILE TO THE NEXT.
# 2       ODINO             CHAR(9)       NHTSA'S INTERNAL REFERENCE NUMBER.
#                                         THIS NUMBER MAY BE REPEATED FOR
#                                         MULTIPLE COMPONENTS.
#                                         ALSO, IF LDATE IS PRIOR TO DEC 15, 2002,
#                                         THIS NUMBER MAY BE REPEATED FOR MULTIPLE
#                                         PRODUCTS OWNED BY THE SAME COMPLAINANT.
# 3       MFR_NAME          CHAR(40)      MANUFACTURER'S NAME
# 4       MAKETXT           CHAR(25)      VEHICLE/EQUIPMENT MAKE
# 5       MODELTXT          CHAR(256)     VEHICLE/EQUIPMENT MODEL
# 6       YEARTXT           CHAR(4)       MODEL YEAR, 9999 IF UNKNOWN or N/A
# 7       CRASH             CHAR(1)       WAS VEHICLE INVOLVED IN A CRASH, 'Y' OR 'N'
# 8       FAILDATE          CHAR(8)       DATE OF INCIDENT (YYYYMMDD)
# 9       FIRE              CHAR(1)       WAS VEHICLE INVOLVED IN A FIRE 'Y' OR 'N'
# 10      INJURED           NUMBER(2)     NUMBER OF PERSONS INJURED
# 11      DEATHS            NUMBER(2)     NUMBER OF FATALITIES
# 12      COMPDESC          CHAR(128)     SPECIFIC COMPONENT'S DESCRIPTION
# 13      CITY              CHAR(30)      CONSUMER'S CITY
# 14      STATE             CHAR(2)       CONSUMER'S STATE CODE
# 15      VIN               CHAR(11)      VEHICLE'S VIN#
# 16      DATEA             CHAR(8)       DATE ADDED TO FILE (YYYYMMDD)
# 17      LDATE             CHAR(8)       DATE COMPLAINT RECEIVED BY NHTSA (YYYYMMDD)
# 18      MILES             NUMBER(7)     VEHICLE MILEAGE AT FAILURE
# 19      OCCURENCES        NUMBER(4)     NUMBER OF OCCURRENCES
# 20      CDESCR            CHAR(2048)    DESCRIPTION OF THE COMPLAINT
# 21      CMPL_TYPE         CHAR(4)       SOURCE OF COMPLAINT CODE:
#                                           CAG  =CONSUMER ACTION GROUP
#                                           CON  =FORWARDED FROM A CONGRESSIONAL OFFICE
#                                           DP   =DEFECT PETITION,RESULT OF A DEFECT PETITION
#                                           EVOQ =HOTLINE VOQ
#                                           EWR  =EARLY WARNING REPORTING
#                                           INS  =INSURANCE COMPANY
#                                           IVOQ =NHTSA WEB SITE
#                                           LETR =CONSUMER LETTER
#                                           MAVQ =NHTSA MOBILE APP
#                                           MIVQ =NHTSA MOBILE APP
#                                           MVOQ =OPTICAL MARKED VOQ
#                                           RC   =RECALL COMPLAINT,RESULT OF A RECALL INVESTIGATION
#                                           RP   =RECALL PETITION,RESULT OF A RECALL PETITION
#                                           SVOQ =PORTABLE SAFETY COMPLAINT FORM (PDF)
#                                           VOQ  =NHTSA VEHICLE OWNERS QUESTIONNAIRE
# 22      POLICE_RPT_YN     CHAR(1)       WAS INCIDENT REPORTED TO POLICE 'Y' OR 'N'
# 23      PURCH_DT          CHAR(8)       DATE PURCHASED (YYYYMMDD)
# 24      ORIG_OWNER_YN     CHAR(1)       WAS ORIGINAL OWNER 'Y' OR 'N'
# 25      ANTI_BRAKES_YN    CHAR(1)       ANTI-LOCK BRAKES 'Y' OR 'N'
# 26      CRUISE_CONT_YN    CHAR(1)       CRUISE CONTROL 'Y' OR 'N'
# 27      NUM_CYLS          NUMBER(2)     NUMBER OF CYLINDERS
# 28      DRIVE_TRAIN       CHAR(4)       DRIVE TRAIN TYPE [AWD,4WD,FWD,RWD]
# 29      FUEL_SYS          CHAR(4)       FUEL SYSTEM CODE:
#                                            FI =FUEL INJECTION
#                                            TB =TURBO
# 30      FUEL_TYPE         CHAR(4)       FUEL TYPE CODE:
#                                            BF =BIFUEL
#                                            CN =CNG/LPG
#                                            DS =DIESEL
#                                            GS =GAS
#                                            HE =HYBRID ELECTRIC
# 31      TRANS_TYPE        CHAR(4)       VEHICLE TRANSMISSION TYPE [AUTO, MAN]
# 32      VEH_SPEED         NUMBER(3)     VEHICLE SPEED
# 33      DOT               CHAR(20)      DEPARTMENT OF TRANSPORTATION TIRE IDENTIFIER
# 34      TIRE_SIZE         CHAR(30)      TIRE SIZE
# 35      LOC_OF_TIRE       CHAR(4)       LOCATION OF TIRE CODE:
#                                            FSW =DRIVER SIDE FRONT
#                                            DSR =DRIVER SIDE REAR
#                                            FTR =PASSENGER SIDE FRONT
#                                            PSR =PASSENGER SIDE REAR
#                                            SPR =SPARE
# 36      TIRE_FAIL_TYPE    CHAR(4)       TYPE OF TIRE FAILURE CODE:
#                                            BST =BLISTER
#                                            BLW =BLOWOUT
#                                            TTL =CRACK
#                                            OFR =OUT OF ROUND
#                                            TSW =PUNCTURE
#                                            TTR =ROAD HAZARD
#                                            TSP =TREAD SEPARATION
# 37      ORIG_EQUIP_YN     CHAR(1)       WAS PART ORIGINAL EQUIPMENT 'Y' OR 'N'
# 38      MANUF_DT          CHAR(8)       DATE OF MANUFACTURE (YYYYMMDD)
# 39      SEAT_TYPE         CHAR(4)       TYPE OF CHILD SEAT CODE:
#                                            B  =BOOSTER
#                                            C  =CONVERTIBLE
#                                            I  =INFANT
#                                            IN =INTEGRATED
#                                            TD =TODDLER
# 40     RESTRAINT_TYPE     CHAR(4)       INSTALLATION SYSTEM CODE;
#                                            A =VEHICLE SAFETY BELT
#                                            B =LATCH SYSTEM
# 41     DEALER_NAME        CHAR(40)      DEALER'S NAME
# 42     DEALER_TEL         CHAR(20)      DEALER'S TELEPHONE NUMBER
# 43     DEALER_CITY        CHAR(30)      DEALER'S CITY
# 44     DEALER_STATE       CHAR(2)       DEALER'S STATE CODE
# 45     DEALER_ZIP         CHAR(10)      DEALER'S ZIPCODE
# 46     PROD_TYPE          CHAR(4)       PRODUCT TYPE CODE:
#                                            V =VEHICLE
#                                            T =TIRES
#                                            E =EQUIPMENT
#                                            C =CHILD RESTRAINT
# 47     REPAIRED_YN        CHAR(1)       WAS DEFECTIVE TIRE REPAIRED 'Y' OR 'N'
# 48     MEDICAL_ATTN       CHAR(1)       WAS MEDICAL ATTENTION REQUIRED 'Y' OR 'N'
# 49     VEHICLES_TOWED_YN  CHAR(1)       WAS VEHICLE TOWED 'Y' OR 'N'

import pandas as pd

# Data source: https://www.nhtsa.gov/nhtsa-datasets-and-apis#recalls
filename = "C:\\Repo\\SIADs_Audio_Text_SRS\\Datasets\\COMPLAINTS_RECEIVED_2025-2025.txt"

# The flat file is tab-delimited and has no header row. Field 1 (CMPLID in the
# spec above) becomes the index, so only fields 2-49 are named here.
df_complaints = pd.read_csv(filename, sep="\t", header=None, index_col=0)
df_complaints.columns = [
    "ODINO", "MFR_NAME", "MAKETXT", "MODELTXT", "YEARTXT", "CRASH",
    "FAILDATE", "FIRE", "INJURED", "DEATHS", "COMPDESC", "CITY",
    "STATE", "VIN", "DATEA", "LDATE", "MILES", "OCCURENCES",
    "CDESCR", "CMPL_TYPE", "POLICE_RPT_YN", "PURCH_DT", "ORIG_OWNER_YN",
    "ANTI_BRAKES_YN", "CRUISE_CONT_YN", "NUM_CYLS", "DRIVE_TRAIN",
    "FUEL_SYS", "FUEL_TYPE", "TRANS_TYPE", "VEH_SPEED", "DOT",
    "TIRE_SIZE", "LOC_OF_TIRE", "TIRE_FAIL_TYPE", "ORIG_EQUIP_YN",
    "MANUF_DT", "SEAT_TYPE", "RESTRAINT_TYPE", "DEALER_NAME",
    "DEALER_TEL", "DEALER_CITY", "DEALER_STATE", "DEALER_ZIP",
    "PROD_TYPE", "REPAIRED_YN", "MEDICAL_ATTN", "VEHICLES_TOWED_YN",
]

# Summary statistics for the columns describe() covers by default
summary = df_complaints.describe()
display(summary)

# Complaints that report at least one fatality
deadly_complaints = df_complaints.loc[df_complaints["DEATHS"] > 0]

# Key identifying columns plus the narrative for every fatal complaint
display(
    deadly_complaints[
        ["ODINO", "MFR_NAME", "MAKETXT", "MODELTXT", "YEARTXT", "DEATHS", "CDESCR"]
    ]
)
# Narratives of the fatal complaints filed against Tesla
print(
    deadly_complaints.loc[
        deadly_complaints["MFR_NAME"] == "Tesla, Inc.", "CDESCR"
    ].values
)

# Narrow view used for eyeballing the complaint text
sub_df = df_complaints[
    ["ODINO", "MFR_NAME", "MAKETXT", "MODELTXT", "YEARTXT", "CDESCR"]
]
display(sub_df)

# Distinct component descriptions present in the file
COMPDESC_list = df_complaints["COMPDESC"].unique()
print(COMPDESC_list)


In [None]:
# RECALLS

# The RECALL file contains all NHTSA safety-related defect and compliance 
# campaigns since 1967.

# File characteristics:

# -  All the records are TAB delimited
# -  All dates are in YYYYMMDD format

# -  Maximum Record length: 9109

# Change log:
# 1.Field# 23 added as of Sept. 14, 2007
# 2.Changed flat file extension from .lst to .txt as of Sept. 14, 2007
# 3.Field# 24 added as of March 14, 2008
# 4.Field#s 25,26,27 added as of March 23, 2020

# Last Updated March 23, 2020


# FIELDS:
# =======

# Field#   Name                Type/Size   Description                      
# ------   ---------           ---------   --------------------------------------
# 1        RECORD_ID           NUMBER(9)   RUNNING SEQUENCE NUMBER, 
#                                           WHICH UNIQUELY IDENTIFIES THE RECORD.
# 2        CAMPNO              CHAR(12)    NHTSA CAMPAIGN NUMBER
# 3        MAKETXT             CHAR(25)    VEHICLE/EQUIPMENT MAKE
# 4        MODELTXT            CHAR(256)   VEHICLE/EQUIPMENT MODEL
# 5        YEARTXT             CHAR(4)     MODEL YEAR, 9999 IF UNKNOWN or N/A
# 6        MFGCAMPNO           CHAR(20)    MFR CAMPAIGN NUMBER
# 7        COMPNAME            CHAR(256)   COMPONENT DESCRIPTION
# 8        MFGNAME             CHAR(40)    MANUFACTURER THAT FILED DEFECT/NONCOMPLIANCE REPORT
# 9        BGMAN               CHAR(8)     BEGIN DATE OF MANUFACTURING
# 10       ENDMAN              CHAR(8)     END DATE OF MANUFACTURING
# 11       RCLTYPECD           CHAR(4)     VEHICLE, EQUIPMENT OR TIRE REPORT
# 12       POTAFF              NUMBER(9)   POTENTIAL NUMBER OF UNITS AFFECTED               
# 13       ODATE               CHAR(8)     DATE OWNER NOTIFIED BY MFR
# 14       INFLUENCED_BY       CHAR(4)     RECALL INITIATOR (MFR/OVSC/ODI)
# 15       MFGTXT              CHAR(40)    MANUFACTURERS OF RECALLED VEHICLES/PRODUCTS
# 16       RCDATE              CHAR(8)     REPORT RECEIVED DATE
# 17       DATEA               CHAR(8)     RECORD CREATION DATE
# 18       RPNO                CHAR(3)     REGULATION PART NUMBER
# 19       FMVSS               CHAR(10)    FEDERAL MOTOR VEHICLE SAFETY STANDARD NUMBER
# 20       DESC_DEFECT         CHAR(2000)  DEFECT SUMMARY
# 21       CONEQUENCE_DEFECT   CHAR(2000)  CONSEQUENCE SUMMARY	
# 22       CORRECTIVE_ACTION   CHAR(2000)  CORRECTIVE SUMMARY
# 23       NOTES               CHAR(2000)  RECALL NOTES
# 24       RCL_CMPT_ID         CHAR(27)    NUMBER THAT UNIQUELY IDENTIFIES A RECALLED COMPONENT.
# 25       MFR_COMP_NAME       CHAR(50)    MANUFACTURER-SUPPLIED COMPONENT NAME
# 26       MFR_COMP_DESC       CHAR(200)   MANUFACTURER-SUPPLIED COMPONENT DESCRIPTION
# 27       MFR_COMP_PTNO       CHAR(100)   MANUFACTURER-SUPPLIED COMPONENT PART NUMBER

# Load the tab-delimited NHTSA recall flat file (FLAT_RCL.txt); it has no header row.
# Data source: https://www.nhtsa.gov/nhtsa-datasets-and-apis#recalls
# NOTE: on_bad_lines='skip' drops malformed rows (too many fields) without
# raising, so the frame can hold fewer records than the file.
df_recall = pd.read_csv("C:\\Repo\\SIADs_Audio_Text_SRS\\Datasets\\FLAT_RCL.txt", sep='\t', header=None, on_bad_lines='skip')
# Name the 27 fields in the order given by the spec above. Misspellings such as
# CONEQUENCE_DEFECT are NHTSA's own field names and are kept verbatim.
df_recall.columns = ['RECORD_ID', 'CAMPNO', 'MAKETXT', 'MODELTXT', 'YEARTXT', 'MFGCAMPNO', 'COMPNAME', 'MFGNAME', 'BGMAN', 'ENDMAN', 'RCLTYPECD', 'POTAFF', 'ODATE', 'INFLUENCED_BY', 'MFGTXT', 'RCDATE', 'DATEA', 'RPNO', 'FMVSS', 'DESC_DEFECT', 'CONEQUENCE_DEFECT', 'CORRECTIVE_ACTION', 'NOTES', 'RCL_CMPT_ID', 'MFR_COMP_NAME', 'MFR_COMP_DESC', 'MFR_COMP_PTNO']
display(df_recall)

# print all the unique COMPNAME values and how many there are
COMPNAME_list = df_recall["COMPNAME"].unique()
print(COMPNAME_list)
print(len(COMPNAME_list))

# Integer-encode COMPNAME into COMPNAME_StateEncoded.
# The builtin hash() is salted per interpreter process for strings, so it would
# yield different codes after every kernel restart. pd.factorize assigns stable
# codes 0..n-1 in order of first appearance; missing values are encoded as -1.
df_recall["COMPNAME_StateEncoded"] = pd.factorize(df_recall["COMPNAME"])[0]


In [None]:
import os
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def _text_tools():
    """
    Return the shared ``(stop_words, stemmer)`` pair, building it on first use.

    process_text is applied once per DataFrame row, so the English stop-word
    set and the Porter stemmer are created a single time and reused instead of
    being rebuilt for every row.
    """
    if not hasattr(_text_tools, "cache"):
        _text_tools.cache = (frozenset(stopwords.words("english")), PorterStemmer())
    return _text_tools.cache


# Intended to be passed to Series.apply on a free-text column such as "CDESCR"
def process_text(text):
    """
    Process text by tokenizing, removing stop words, and stemming

    Parameters:
        text: the raw text of one record. Anything that is not a non-empty
            string (None, NaN, pd.NA, numbers, "") is treated as "no text".

    Returns:
        list of str: lower-cased, stemmed tokens with English stop words and
        non-alphanumeric tokens (punctuation) removed; [] when there is no text.
    """
    # Missing values arrive from pandas as float NaN / None, not as strings
    if not isinstance(text, str) or not text:
        return []

    stop_words, stemmer = _text_tools()

    # Stop words are filtered on the raw lower-cased token, before stemming
    stemmed = [stemmer.stem(token) for token in word_tokenize(text.lower()) if token not in stop_words]

    # Punctuation tokens (and anything else non-alphanumeric) are dropped last
    return [token for token in stemmed if token.isalnum()]

# Add a "CDESCR_CLEANED" column holding the token list that process_text
# produces for each complaint narrative, then show raw and cleaned side by side
df_complaints["CDESCR_CLEANED"] = df_complaints["CDESCR"].apply(process_text)
display(df_complaints.loc[:, ["CDESCR", "CDESCR_CLEANED"]])

In [None]:
# find the most common words in the "CDESCR_CLEANED" column
from collections import Counter

# count every token across all rows of the "CDESCR_CLEANED" column
word_counter = Counter(word for words in df_complaints["CDESCR_CLEANED"] for word in words)

# get the 10 most common words
most_common_words = word_counter.most_common(10)
print(most_common_words)

# get the 10 least common words, rarest first.
# The slice's stop bound is exclusive, so -11 (not -10) is needed for 10 items.
least_common_words = word_counter.most_common()[:-11:-1]
print(least_common_words)

# get the 10 most common words for complaints that led to a death.
# The filter is re-applied here so the subset carries the "CDESCR_CLEANED" column.
deadly_complaints = df_complaints[df_complaints["DEATHS"] > 0]
word_counter_deadly = Counter(word for words in deadly_complaints["CDESCR_CLEANED"] for word in words)
most_common_words_deadly = word_counter_deadly.most_common(10)
print(most_common_words_deadly)

In [None]:
# Keep only complaints that actually have a narrative; str.contains would
# otherwise produce NaN for the missing ones and break boolean indexing
df_complaints_no_nan = df_complaints.dropna(subset=["CDESCR"])
# Case-insensitive search for "diagnostic" or "DTC" (Diagnostic Trouble Code,
# the automotive industry's identifier for a detected vehicle fault)
diagnostic_complaints = df_complaints_no_nan.loc[
    lambda frame: frame["CDESCR"].str.contains("diagnostic|DTC", case=False)
]
display(
    diagnostic_complaints[
        ["ODINO", "MFR_NAME", "MAKETXT", "MODELTXT", "YEARTXT", "CDESCR", "CDESCR_CLEANED"]
    ]
)

In [None]:
# split the df_complaints dataframe into a test, train, and validation set with a 70/20/10 split
from sklearn.model_selection import train_test_split

random_state = 42
train_size, test_size, validation_size = 0.7, 0.2, 0.1

# Two-stage split: first hold out everything that is not training data, then
# carve the validation share out of that hold-out; what remains is the test set.
train, test = train_test_split(
    df_complaints,
    test_size=(1 - train_size),
    random_state=random_state,
)
test, validation = train_test_split(
    test,
    test_size=(validation_size / (1 - train_size)),
    random_state=random_state,
)
print(df_complaints.shape)
print(train.shape, test.shape, validation.shape)

In [None]:
# vectorize the "CDESCR_CLEANED" column using the TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF model; its vocabulary is fitted on the training split only
vectorizer = TfidfVectorizer()
# number of LSA components kept after dimensionality reduction
num_dimensions = 100


# "CDESCR_CLEANED" holds token lists, so each list is re-joined into a single
# space-separated string before it is handed to the vectorizer
X_train = vectorizer.fit_transform(train["CDESCR_CLEANED"].apply(" ".join))
X_test = vectorizer.transform(test["CDESCR_CLEANED"].apply(" ".join))
X_validation = vectorizer.transform(validation["CDESCR_CLEANED"].apply(" ".join))
print(X_train.shape, X_test.shape, X_validation.shape)

# LSA: truncated SVD fitted on the training TF-IDF matrix, then applied to the
# test and validation matrices
lsa = TruncatedSVD(n_components=num_dimensions, random_state=random_state)
complaints_vectorized_train = lsa.fit_transform(X_train)
vectorized_test = lsa.transform(X_test)
complaints_vectorized_validation = lsa.transform(X_validation)
print(complaints_vectorized_train.shape, vectorized_test.shape, complaints_vectorized_validation.shape)
# TODO: print out the words that correspond to the first 10 dimensions of the LSA

# unique manufacturers in the "MFR_NAME" column
list_of_manufacturers = df_complaints["MFR_NAME"].unique()

# Nearest-neighbour sanity check: take the test complaint at position 5 and
# find the training complaint with the highest cosine similarity to it
complaint_test = vectorized_test[5].reshape(1, -1)
cosine_similarities = cosine_similarity(complaint_test, complaints_vectorized_train)
# the result has a single row, so the flat argmax is the training-row position
most_similar_index = cosine_similarities.argmax()
print(most_similar_index)
# best-matching training complaint
most_similar_complaint_train = train.iloc[most_similar_index]
print(most_similar_complaint_train[["ODINO", "MFR_NAME", "MAKETXT", "MODELTXT", "YEARTXT", "CDESCR"]])
print(most_similar_complaint_train["CDESCR"])
# the test complaint that was used as the query (same position 5 as above)
most_similar_complaint_test = test.iloc[5]
print(most_similar_complaint_test[["ODINO", "MFR_NAME", "MAKETXT", "MODELTXT", "YEARTXT", "CDESCR"]])
print(most_similar_complaint_test["CDESCR"])


In [None]:
# Free-text query for which the closest training complaint is looked up
query_text = "EV not charging"

# Run the query through the same cleaning used for the complaint narratives
query_text_cleaned = process_text(query_text)
# TF-IDF vector for the query (tokens re-joined into one document string)
query_vectorized = vectorizer.transform([" ".join(query_text_cleaned)])
# Project the query with the fitted LSA model
query_vectorized_lsa = lsa.transform(query_vectorized)
# Cosine similarity of the query against every training complaint
cosine_similarities_query = cosine_similarity(
    query_vectorized_lsa, complaints_vectorized_train
)
# Position of the best match within the training split
most_similar_index_query = cosine_similarities_query.argmax()
# Look up and print that training complaint
most_similar_complaint_train_query = train.iloc[most_similar_index_query]
print(
    most_similar_complaint_train_query[
        ["ODINO", "MFR_NAME", "MAKETXT", "MODELTXT", "YEARTXT", "CDESCR"]
    ]
)
print(most_similar_complaint_train_query["CDESCR"])

In [None]:
# functionalize the process of finding similar complaints to a query text
def find_similar_complaint(query_text:str, vectorizer, lsa, train, vectorized_train=None):
    """
    Find the most similar complaint to a query text in the training set

    Parameters:
        query_text: free text to search for.
        vectorizer: fitted vectorizer; its ``transform`` is applied to the cleaned query.
        lsa: fitted dimensionality-reduction model; its ``transform`` is applied
            to the vectorized query.
        train: DataFrame of training complaints. Its row order must match the
            rows of ``vectorized_train``, because the best match is looked up
            by position with ``iloc``.
        vectorized_train: LSA-space matrix of the training complaints, one row
            per row of ``train``. When omitted, the notebook-level
            ``complaints_vectorized_train`` is used, which keeps existing
            four-argument calls working.

    Returns:
        The row of ``train`` with the highest cosine similarity to the query.
    """
    if vectorized_train is None:
        # Backward-compatible fallback to the matrix built earlier in the notebook
        vectorized_train = complaints_vectorized_train

    # clean the query exactly like the complaint narratives were cleaned
    query_tokens = process_text(query_text)
    # the vectorizer expects one string per document
    query_tfidf = vectorizer.transform([" ".join(query_tokens)])
    query_lsa = lsa.transform(query_tfidf)
    # one query row vs. every training row -> flat argmax is the row position
    similarities = cosine_similarity(query_lsa, vectorized_train)
    best_position = similarities.argmax()
    return train.iloc[best_position]

In [None]:
# Example query run through the helper defined above
find_similar_stuff = "rattling noise when driving slowly"
most_similar_complaint = find_similar_complaint(
    query_text=find_similar_stuff,
    vectorizer=vectorizer,
    lsa=lsa,
    train=train,
)
print(
    most_similar_complaint[
        ["ODINO", "MFR_NAME", "MAKETXT", "MODELTXT", "YEARTXT", "CDESCR"]
    ]
)
print(most_similar_complaint["CDESCR"])

In [None]:
import os
import pickle
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split


class TextClassifier:
    """
    TF-IDF + LSA nearest-neighbour lookup over one free-text DataFrame column.

    ``process_dataframe`` cleans the text, splits the frame into
    train/test/validation, fits (or loads from pickle caches) the vectorizer
    and LSA model, and projects each split. ``find_similar_complaint`` then
    returns the training row closest to a query string.
    """

    # NLTK resources shared by every process_text call; built on first use
    _stop_words = None
    _stemmer = None

    def __init__(self, df, column_name:str):
        """
        Parameters:
            df: DataFrame containing the text column. It is modified in place
                by process_dataframe (a "<column_name>_CLEANED" column is added).
            column_name: name of the free-text column, e.g. "CDESCR".
        """
        # set the random state for reproducibility
        self.random_state = 42
        # create a TfidfVectorizer object
        self.vectorizer = TfidfVectorizer()
        # set the number of dimensions to reduce the vectorized data to
        self.num_dimensions = 100
        self.df = df
        self.column_name = column_name
        # name of the column that holds the cleaned token lists
        self.column_name_cleaned = column_name + "_CLEANED"
        # train/test/validation frames, filled in by process_dataframe
        self.df_train = None
        self.df_test = None
        self.df_validation = None
        # TF-IDF matrices per split (computed only when they are needed)
        self.x_train_vect = None
        self.x_test_vect = None
        self.x_validation_vect = None
        self.lsa = None
        # LSA-space matrices per split
        self.complaints_vectorized_train = None
        self.vectorized_test = None
        self.complaints_vectorized_validation = None
        # older attribute names, kept pointing at the same data as the two
        # "complaints_" attributes above
        self.vectorized_train = None
        self.vectorized_validation = None

    @staticmethod
    def process_text(text):
        """
        Process text by tokenizing, removing stop words, and stemming

        Anything that is not a non-empty string (None, NaN, numbers, "")
        yields []. Callable as ``TextClassifier.process_text(x)`` or on an
        instance.
        """
        # missing values arrive from pandas as float NaN / None, not as strings
        if not isinstance(text, str) or not text:
            return []

        # build the stop-word set and stemmer once instead of once per row
        if TextClassifier._stop_words is None:
            TextClassifier._stop_words = frozenset(stopwords.words("english"))
            TextClassifier._stemmer = PorterStemmer()
        stop_words = TextClassifier._stop_words
        stemmer = TextClassifier._stemmer

        # tokenize the text and remove stop words and stem the words
        content_cleaned = [stemmer.stem(word) for word in word_tokenize(text.lower()) if word not in stop_words]

        # remove the punctuation from the content_cleaned list
        return [word for word in content_cleaned if word.isalnum()]

    @staticmethod
    def _load_pickle(path):
        """Load and return the object pickled at ``path``."""
        # SECURITY: pickle.load can execute arbitrary code. Only cache files
        # written by this class on this machine should ever be loaded here.
        with open(path, "rb") as f:
            return pickle.load(f)

    @staticmethod
    def _dump_pickle(obj, path):
        """Pickle ``obj`` to ``path``, overwriting any existing file."""
        with open(path, "wb") as f:
            pickle.dump(obj, f)

    def _cache_file(self, cache_dir, suffix):
        """Return the cache path "<cache_dir>/<column_name>_<suffix>.pkl"."""
        return os.path.join(cache_dir, self.column_name + "_" + suffix + ".pkl")

    def _tfidf_matrix(self, attr_name, frame):
        """Return the TF-IDF matrix stored in ``attr_name``, computing it from ``frame`` on first use."""
        matrix = getattr(self, attr_name)
        if matrix is None:
            # the vectorizer expects one string per document, not a token list
            documents = frame[self.column_name_cleaned].apply(" ".join)
            matrix = self.vectorizer.transform(documents)
            setattr(self, attr_name, matrix)
        return matrix

    def _load_or_project(self, path, attr_name, frame, force):
        """Return the LSA projection of ``frame``: from the cache at ``path`` unless ``force``, else computed and cached."""
        if not force and os.path.exists(path):
            return self._load_pickle(path)
        projected = self.lsa.transform(self._tfidf_matrix(attr_name, frame))
        self._dump_pickle(projected, path)
        return projected

    def process_dataframe(self, train_size=0.7, test_size=0.2, validation_size=0.1):
        """
        Clean the text column, split the data, and build the TF-IDF/LSA representations.

        Every intermediate artefact is cached as a pickle in the current
        working directory with "Example" replaced by "Datasets". Caches are
        keyed by column name only: they do not record the split sizes or the
        input data, so delete the .pkl files when either changes. If a cached
        DataFrame exists it REPLACES the DataFrame passed to the constructor.

        Parameters:
            train_size: fraction of rows used for training.
            test_size: unused; accepted for interface compatibility. The test
                split is whatever remains after train and validation.
            validation_size: fraction of rows used for validation.

        Returns:
            tuple: (df, train vectors, test vectors, validation vectors,
            df_train, df_test, df_validation)
        """
        # cache folder: the cwd with "Example" swapped for "Datasets"
        cache_dir = os.getcwd().replace("Example", "Datasets")
        os.makedirs(cache_dir, exist_ok=True)

        # TF-IDF matrices from a previous call may belong to a different split
        self.x_train_vect = None
        self.x_test_vect = None
        self.x_validation_vect = None

        # --- cleaned DataFrame ---
        df_path = self._cache_file(cache_dir, "df")
        if os.path.exists(df_path):
            self.df = self._load_pickle(df_path)
            # a cached frame may lack the cleaned column (e.g. one written
            # while column_name_cleaned was None); rebuild it in that case
            needs_cleaning = self.column_name_cleaned not in self.df.columns
        else:
            needs_cleaning = True
        if needs_cleaning:
            self.df[self.column_name_cleaned] = self.df[self.column_name].apply(TextClassifier.process_text)
            self._dump_pickle(self.df, df_path)

        # --- train / test / validation split ---
        # first hold out everything that is not training data, then carve the
        # validation share out of that hold-out
        self.df_train, self.df_test = train_test_split(self.df, test_size=(1-train_size), random_state=self.random_state)
        self.df_test, self.df_validation = train_test_split(self.df_test, test_size=(validation_size/(1-train_size)), random_state=self.random_state)

        # Once one stage is rebuilt, every later stage's cache is stale and
        # must be rebuilt too; `rebuilt` carries that through the stages below.
        rebuilt = False

        # --- TF-IDF vectorizer ---
        vectorizer_path = self._cache_file(cache_dir, "vectorizer")
        if os.path.exists(vectorizer_path):
            self.vectorizer = self._load_pickle(vectorizer_path)
        else:
            train_documents = self.df_train[self.column_name_cleaned].apply(" ".join)
            self.x_train_vect = self.vectorizer.fit_transform(train_documents)
            self._dump_pickle(self.vectorizer, vectorizer_path)
            rebuilt = True

        # --- LSA model and training projection ---
        lsa_path = self._cache_file(cache_dir, "lsa")
        train_path = self._cache_file(cache_dir, "vectorized_train")
        if not rebuilt and os.path.exists(lsa_path):
            self.lsa = self._load_pickle(lsa_path)
            self.complaints_vectorized_train = self._load_or_project(train_path, "x_train_vect", self.df_train, force=False)
        else:
            # fit the LSA and persist it: an unfitted model cannot transform
            # queries when the projections are later loaded from cache
            self.lsa = TruncatedSVD(n_components=self.num_dimensions, random_state=self.random_state)
            self.complaints_vectorized_train = self.lsa.fit_transform(self._tfidf_matrix("x_train_vect", self.df_train))
            self._dump_pickle(self.lsa, lsa_path)
            self._dump_pickle(self.complaints_vectorized_train, train_path)
            rebuilt = True

        # --- test and validation projections ---
        self.vectorized_test = self._load_or_project(
            self._cache_file(cache_dir, "vectorized_test"), "x_test_vect", self.df_test, force=rebuilt)
        self.complaints_vectorized_validation = self._load_or_project(
            self._cache_file(cache_dir, "vectorized_validation"), "x_validation_vect", self.df_validation, force=rebuilt)

        # keep the older attribute names in sync
        self.vectorized_train = self.complaints_vectorized_train
        self.vectorized_validation = self.complaints_vectorized_validation

        return self.df, self.complaints_vectorized_train, self.vectorized_test, self.complaints_vectorized_validation, self.df_train, self.df_test, self.df_validation

    def find_similar_complaint(self, query_text:str):
        """
        Find the most similar complaint to a query text in the training set

        Parameters:
            query_text: free text to search for.

        Returns:
            The row of ``df_train`` with the highest cosine similarity to the query.

        Raises:
            RuntimeError: if process_dataframe() has not been called yet.
        """
        if self.lsa is None or self.df_train is None or self.complaints_vectorized_train is None:
            raise RuntimeError("process_dataframe() must be called before find_similar_complaint()")

        # process the query text
        query_text_cleaned = TextClassifier.process_text(query_text)
        # vectorize the query text
        query_vectorized = self.vectorizer.transform([" ".join(query_text_cleaned)])
        # reduce the dimensionality of the query vector
        query_vectorized_lsa = self.lsa.transform(query_vectorized)
        # cosine similarity between the query and every training complaint
        cosine_similarities_query = cosine_similarity(query_vectorized_lsa, self.complaints_vectorized_train)
        # one query row, so the flat argmax is the training-row position
        most_similar_index_query = cosine_similarities_query.argmax()
        print(most_similar_index_query)
        # get the most similar complaint in the training set
        return self.df_train.iloc[most_similar_index_query]

In [None]:
# Data source: https://www.nhtsa.gov/nhtsa-datasets-and-apis#recalls
# Reload the tab-delimited complaints flat file (no header row); the first
# field becomes the index and the remaining 48 fields are named below
df_complaints = pd.read_csv(
    "C:\\Repo\\SIADs_Audio_Text_SRS\\Datasets\\COMPLAINTS_RECEIVED_2025-2025.txt",
    sep="\t",
    header=None,
    index_col=0,
)
df_complaints.columns = [
    "ODINO", "MFR_NAME", "MAKETXT", "MODELTXT", "YEARTXT", "CRASH",
    "FAILDATE", "FIRE", "INJURED", "DEATHS", "COMPDESC", "CITY",
    "STATE", "VIN", "DATEA", "LDATE", "MILES", "OCCURENCES",
    "CDESCR", "CMPL_TYPE", "POLICE_RPT_YN", "PURCH_DT", "ORIG_OWNER_YN",
    "ANTI_BRAKES_YN", "CRUISE_CONT_YN", "NUM_CYLS", "DRIVE_TRAIN",
    "FUEL_SYS", "FUEL_TYPE", "TRANS_TYPE", "VEH_SPEED", "DOT",
    "TIRE_SIZE", "LOC_OF_TIRE", "TIRE_FAIL_TYPE", "ORIG_EQUIP_YN",
    "MANUF_DT", "SEAT_TYPE", "RESTRAINT_TYPE", "DEALER_NAME",
    "DEALER_TEL", "DEALER_CITY", "DEALER_STATE", "DEALER_ZIP",
    "PROD_TYPE", "REPAIRED_YN", "MEDICAL_ATTN", "VEHICLES_TOWED_YN",
]

# unique manufacturers in the "MFR_NAME" column
list_of_manufacturers = df_complaints["MFR_NAME"].unique()

# Try out the class-based version of the pipeline:
# build a TextClassifier over the "CDESCR" column and run the full processing
text_classifier = TextClassifier(df=df_complaints, column_name="CDESCR")
text_classifier.process_dataframe()

# Use the test-set complaint at position 5 as the query text
complaint_test_query = text_classifier.df_test["CDESCR"].iat[5]
print(complaint_test_query)
# Closest training complaint to that query
most_similar_complaint = text_classifier.find_similar_complaint(
    query_text=complaint_test_query
)
# Identifying columns first, then the full narrative
print(
    most_similar_complaint[
        ["ODINO", "MFR_NAME", "MAKETXT", "MODELTXT", "YEARTXT", "CDESCR"]
    ]
)
print(most_similar_complaint["CDESCR"])

In [None]:
# Closest training complaint to a short free-text query
# (a longer variant to try: "Car won't start and makes a clicking noise")
most_similar_complaint = text_classifier.find_similar_complaint(
    query_text="Car won't start"
)
# Identifying columns first, then the full narrative
print(
    most_similar_complaint[
        ["ODINO", "MFR_NAME", "MAKETXT", "MODELTXT", "YEARTXT", "CDESCR"]
    ]
)
print(most_similar_complaint["CDESCR"])

In [10]:
from Extracted_Text import record_audio
from Extracted_Text import wav_to_text_and_tokenize
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# Where the microphone recording is written and later read back from
filename = "C:\\Repo\\SIADs_Audio_Text_SRS\\Datasets\\audio.wav"

# Record from the microphone via the project helper
print("Recording audio...")
record_audio(filename)
print("Recording finished.")

# Transcribe and tokenize the recording, then report one of four outcomes:
# missing file, processing error (None), nothing recognised (empty), or tokens
if not os.path.exists(filename):
    print(f"File not found: {filename}")
else:
    tokens = wav_to_text_and_tokenize(filename)

    if tokens is None:
        print("An error occurred during processing.")
    elif not tokens:
        print("No text was recognized.")
    else:
        print("Tokens:", tokens)

        # Stop word removal (example)
        stop_words = set(stopwords.words('english'))
        filtered_tokens = [w for w in tokens if w.lower() not in stop_words]
        print("Filtered Tokens (Stop words removed):", filtered_tokens)

Recording audio...


FileNotFoundError: [Errno 2] No such file or directory: 'Output/complaint.wav'