In [4]:
import pandas as pd

In [14]:
import spacy
import re
from spellchecker import SpellChecker

# Shared text-processing resources, created once and reused by clean_question():
# the spaCy "en_core_web_sm" pipeline (lemmas / stop-word flags) and the spell checker.
nlp = spacy.load("en_core_web_sm")
spell = SpellChecker()

# Memo of spell-check results shared across calls: the same tokens recur across
# questions, so each distinct word is sent to the spell checker only once.
# The cache is unbounded; it grows with the number of distinct tokens seen.
_correction_cache = {}


def _correct_word(word):
    """Return the spell-checked form of ``word``, querying the checker at most once per word."""
    if word not in _correction_cache:
        suggestion = spell.correction(word)
        # correction() may yield None (no candidate); keep the original token then.
        _correction_cache[word] = word if suggestion is None else suggestion
    return _correction_cache[word]


def clean_question(text):
    """Normalise a question into a space-joined string of lemmas.

    Steps: lowercase and trim, drop every character that is not ``a-z``,
    ``0-9`` or whitespace, spell-correct each token, then lemmatise with the
    module-level spaCy pipeline and discard stop words.

    Args:
        text: The raw question. ``None`` and float NaN (how pandas represents
            an empty CSV cell) are treated as an empty question; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned question as a single string (``''`` for missing input).
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Lowercase and remove punctuation
    text = text.lower().strip()
    text = re.sub(r'[^a-z0-9\s]', '', text)

    # Correct typos; each distinct token is looked up once via the cache
    text = ' '.join(_correct_word(word) for word in text.split())

    # Lemmatize and remove stopwords
    doc = nlp(text)
    return ' '.join(token.lemma_ for token in doc if not token.is_stop)

In [15]:
def clean_answer(text):
    """Strip simple markup from an answer and trim surrounding whitespace.

    Removes anything that looks like an HTML tag (``<...>``) and the markdown
    bold marker ``**``. Whitespace is trimmed *after* the markup is removed so
    that padding left behind by a stripped tag (e.g. ``"<p> hi </p>"``) does
    not survive in the result.

    Args:
        text: The raw answer. ``None`` and float NaN (how pandas represents an
            empty CSV cell) are treated as an empty answer; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned answer string (``''`` for missing input).
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = re.sub(r'\*\*', '', text)      # Remove bold markers
    # Trim last: removing tags can expose leading/trailing whitespace.
    return text.strip()

In [5]:
# Load the three preprocessed FAQ datasets; paths are relative to the
# notebook's working directory.
D2 = pd.read_csv('data/FAQ Answering/Preprocessed data/D2.csv')
D4 = pd.read_csv('data/FAQ Answering/Preprocessed data/D4.csv')
D5 = pd.read_csv('data/FAQ Answering/Preprocessed data/D5.csv')

In [6]:
# Preview D2 as loaded (columns: query, finalpassage).
D2

Unnamed: 0,query,finalpassage
0,why do children get aggressive,"At the same time, despite claiming the review ..."
1,which credit bureau is used the most for auto ...,Best Answer: both of those answers are wrong. ...
2,what is the minimum healthy calorie intake,Safe Intakes. If you’re not supervised by a me...
3,why is coffee making gain weight,Is coffee making you fat? If you are overweigh...
4,"what county is grand rapids, mi in","Located in Grand Rapids, Michigan, the 61st Di..."
...,...,...
130555,where is sesame seed harvested,Sesame has one of the highest oil contents of ...
130556,requirements to become a real estate agent,Shares & Saves. Save. To become a real estate ...
130557,what are butterflies live?,Monarchs and Swallowtails may live about a mon...
130558,how much does a new fence cost,1 Chain-link fence prices fluctuate with the c...


In [12]:
# Align D2's column names with the question/answer naming that D4 and D5 already use.
D2 = D2.rename(columns={'query':'question', 'finalpassage':'answer'})

In [13]:
# Confirm the columns are now named question / answer.
D2

Unnamed: 0,question,answer
0,why do children get aggressive,"At the same time, despite claiming the review ..."
1,which credit bureau is used the most for auto ...,Best Answer: both of those answers are wrong. ...
2,what is the minimum healthy calorie intake,Safe Intakes. If you’re not supervised by a me...
3,why is coffee making gain weight,Is coffee making you fat? If you are overweigh...
4,"what county is grand rapids, mi in","Located in Grand Rapids, Michigan, the 61st Di..."
...,...,...
130555,where is sesame seed harvested,Sesame has one of the highest oil contents of ...
130556,requirements to become a real estate agent,Shares & Saves. Save. To become a real estate ...
130557,what are butterflies live?,Monarchs and Swallowtails may live about a mon...
130558,how much does a new fence cost,1 Chain-link fence prices fluctuate with the c...


In [30]:
# Clean D2's `question` column (formerly `query`) into a new `cleaned_query` column.
# NOTE: the recorded run of this cell was interrupted (KeyboardInterrupt) before it finished.
D2["cleaned_query"] = D2["question"].apply(clean_question)

KeyboardInterrupt: 

In [None]:
# Strip markup from D2's answers into `cleaned_answer` (cell not executed in the recorded run).
D2["cleaned_answer"] = D2["answer"].apply(clean_answer)

In [7]:
# Preview D4 as loaded (columns: question, answer).
D4

Unnamed: 0,question,answer
0,I have a 9 year old Badger 1 that needs replac...,I replaced my old one with this without a hitch.
1,model number,This may help InSinkErator Model BADGER-1: Bad...
2,can I replace Badger 1 1/3 with a Badger 5 1/2...,Plumbing connections will vary with different ...
3,Does this come with power cord and dishwasher ...,It does not come with a power cord. It does co...
4,loud noise inside when turned on. sounds like ...,Check if you dropped something inside.Usually ...
...,...,...
1396891,Does the adaptor cord for the iPhone 5 work wi...,No
1396892,will it charge a kidle fire?,"Simply answered, yes. It comes with a 3-in-1 a..."
1396893,What are the dimensions of this product?,4 by 1 1/2 inches
1396894,Does this have connector for 5C? I think 5C is...,I was able to charge Gembonics battery with iP...


In [None]:
# Clean D4's `question` and `answer` columns into `cleaned_query` / `cleaned_answer`
# (cell not executed in the recorded run).
D4["cleaned_query"] = D4["question"].apply(clean_question)
D4["cleaned_answer"] = D4["answer"].apply(clean_answer)

In [8]:
# Preview D5 as loaded (columns: question, answer).
D5

Unnamed: 0,question,answer
0,How can I create an account?,"To create an account, click on the 'Sign Up' b..."
1,What payment methods do you accept?,"We accept major credit cards, debit cards, and..."
2,How can I track my order?,You can track your order by logging into your ...
3,What is your return policy?,Our return policy allows you to return product...
4,Can I cancel my order?,You can cancel your order if it has not been s...
...,...,...
195,Do you offer a satisfaction guarantee?,"Yes, we offer a satisfaction guarantee on our ..."
196,How can I apply for a job at your company?,"To apply for a job at our company, visit our C..."
197,What is the warranty on your products?,The warranty on our products varies by item. P...
198,Can I request a refund if the price drops afte...,If the price of a product drops within 7 days ...


In [17]:
# Clean D5's `question` and `answer` columns into `cleaned_query` / `cleaned_answer`.
D5["cleaned_query"] = D5["question"].apply(clean_question)
D5["cleaned_answer"] = D5["answer"].apply(clean_answer)

In [18]:
# Inspect D5 with the new cleaned_query / cleaned_answer columns.
D5

Unnamed: 0,question,answer,cleaned_query,cleaned_answer
0,How can I create an account?,"To create an account, click on the 'Sign Up' b...",create account,"To create an account, click on the 'Sign Up' b..."
1,What payment methods do you accept?,"We accept major credit cards, debit cards, and...",payment method accept,"We accept major credit cards, debit cards, and..."
2,How can I track my order?,You can track your order by logging into your ...,track order,You can track your order by logging into your ...
3,What is your return policy?,Our return policy allows you to return product...,return policy,Our return policy allows you to return product...
4,Can I cancel my order?,You can cancel your order if it has not been s...,cancel order,You can cancel your order if it has not been s...
...,...,...,...,...
195,Do you offer a satisfaction guarantee?,"Yes, we offer a satisfaction guarantee on our ...",offer satisfaction guarantee,"Yes, we offer a satisfaction guarantee on our ..."
196,How can I apply for a job at your company?,"To apply for a job at our company, visit our C...",apply job company,"To apply for a job at our company, visit our C..."
197,What is the warranty on your products?,The warranty on our products varies by item. P...,warranty product,The warranty on our products varies by item. P...
198,Can I request a refund if the price drops afte...,If the price of a product drops within 7 days ...,request refund price drop purchase,If the price of a product drops within 7 days ...


In [20]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained sentence-embedding model (used as-is, no training needed)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for the cleaned queries in one batched encode() call
# instead of one call per row. Given a list of sentences, encode() returns a
# 2-D array with one row per sentence; list(...) splits it back into one
# vector per DataFrame row.
D5["embedding"] = list(model.encode(D5["cleaned_query"].tolist()))

In [21]:
# Inspect D5 with the per-row `embedding` vectors attached.
D5

Unnamed: 0,question,answer,cleaned_query,cleaned_answer,embedding
0,How can I create an account?,"To create an account, click on the 'Sign Up' b...",create account,"To create an account, click on the 'Sign Up' b...","[-0.03933173, -0.073190376, -0.058713168, -0.0..."
1,What payment methods do you accept?,"We accept major credit cards, debit cards, and...",payment method accept,"We accept major credit cards, debit cards, and...","[-0.00738355, 0.06920027, 0.0021899657, -0.034..."
2,How can I track my order?,You can track your order by logging into your ...,track order,You can track your order by logging into your ...,"[-0.024611782, -0.022893803, 0.050966974, -0.0..."
3,What is your return policy?,Our return policy allows you to return product...,return policy,Our return policy allows you to return product...,"[-0.035745326, 0.062710054, 0.04189153, 0.0047..."
4,Can I cancel my order?,You can cancel your order if it has not been s...,cancel order,You can cancel your order if it has not been s...,"[-0.046343867, 0.060871985, 0.100930884, 0.023..."
...,...,...,...,...,...
195,Do you offer a satisfaction guarantee?,"Yes, we offer a satisfaction guarantee on our ...",offer satisfaction guarantee,"Yes, we offer a satisfaction guarantee on our ...","[-0.09863094, 0.052422367, 0.06813188, -0.0070..."
196,How can I apply for a job at your company?,"To apply for a job at our company, visit our C...",apply job company,"To apply for a job at our company, visit our C...","[-0.07541368, -0.05075224, 0.018541757, -0.031..."
197,What is the warranty on your products?,The warranty on our products varies by item. P...,warranty product,The warranty on our products varies by item. P...,"[-0.18379788, 0.04935003, 0.081956245, -0.0053..."
198,Can I request a refund if the price drops afte...,If the price of a product drops within 7 days ...,request refund price drop purchase,If the price of a product drops within 7 days ...,"[-0.09591464, 0.04843379, 0.060017083, -0.0115..."


In [29]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def retrieve_answer(user_query, df, threshold=0.7):
    """Return the stored answer whose question is most similar to ``user_query``.

    The query is cleaned with ``clean_question``, embedded with the
    module-level ``model``, and compared by cosine similarity against every
    vector in ``df["embedding"]``.

    Args:
        user_query: Free-text question from the user.
        df: DataFrame with an ``embedding`` column (one vector per row) and a
            ``cleaned_answer`` column.
        threshold: Minimum cosine similarity the best match must reach.

    Returns:
        The ``cleaned_answer`` of the best-matching row, or a fixed apology
        message when ``df`` is empty or no row reaches ``threshold``.
    """
    fallback = "Sorry, I couldn't find a relevant answer."

    # np.stack cannot stack an empty sequence; with no FAQ rows nothing can match.
    if len(df) == 0:
        return fallback

    # Clean the user query
    cleaned_user_query = clean_question(user_query)
    # Generate embedding for the query (a single-item batch)
    query_embedding = model.encode([cleaned_user_query])
    # Compute similarity with FAQ embeddings. One query row goes in, so the
    # result has a single row; take it to work with a 1-D score vector.
    faq_embeddings = np.stack(df["embedding"].to_list())
    similarities = cosine_similarity(query_embedding, faq_embeddings)[0]
    best_match_idx = int(np.argmax(similarities))

    if similarities[best_match_idx] >= threshold:
        return df.iloc[best_match_idx]["cleaned_answer"]
    return fallback

# Example: query the D5 FAQ with the default threshold (0.7)
user_query = "My order hasn’t arrived yet. This is so frustrating!"
response = retrieve_answer(user_query, D5)
print(response)  # Recorded output: the fallback message (no FAQ entry reached the threshold)

Sorry, I couldn't find a relevant answer.
