In [45]:
import pandas as pd
import spacy

# Read the raw D5 FAQ export into a DataFrame.
df = pd.read_csv(filepath_or_buffer="data/FAQ Answering/D5/Data_2.csv")

In [46]:
# Render the loaded frame; besides question/answer it carries two leftover "Unnamed" index columns.
df

Unnamed: 0.1,Unnamed: 0,question,answer
0,0,How can I create an account?,"To create an account, click on the 'Sign Up' b..."
1,1,What payment methods do you accept?,"We accept major credit cards, debit cards, and..."
2,2,How can I track my order?,You can track your order by logging into your ...
3,3,What is your return policy?,Our return policy allows you to return product...
4,4,Can I cancel my order?,You can cancel your order if it has not been s...
...,...,...,...
195,195,Do you offer a satisfaction guarantee?,"Yes, we offer a satisfaction guarantee on our ..."
196,196,How can I apply for a job at your company?,"To apply for a job at our company, visit our C..."
197,197,What is the warranty on your products?,The warranty on our products varies by item. P...
198,198,Can I request a refund if the price drops afte...,If the price of a product drops within 7 days ...


In [47]:
# Keep only the question/answer pair, leaving the other columns of df behind.
D5 = df.loc[:, ["question", "answer"]]

In [48]:
# Render the two-column (question, answer) table.
D5

Unnamed: 0,question,answer
0,How can I create an account?,"To create an account, click on the 'Sign Up' b..."
1,What payment methods do you accept?,"We accept major credit cards, debit cards, and..."
2,How can I track my order?,You can track your order by logging into your ...
3,What is your return policy?,Our return policy allows you to return product...
4,Can I cancel my order?,You can cancel your order if it has not been s...
...,...,...
195,Do you offer a satisfaction guarantee?,"Yes, we offer a satisfaction guarantee on our ..."
196,How can I apply for a job at your company?,"To apply for a job at our company, visit our C..."
197,What is the warranty on your products?,The warranty on our products varies by item. P...
198,Can I request a refund if the price drops afte...,If the price of a product drops within 7 days ...


In [49]:
# Write the question/answer table to disk; index=False keeps the row labels out of the file.
D5.to_csv(path_or_buf="data/FAQ Answering/Preprocessed data/D5.csv", index=False)

In [35]:
# spaCy pipeline that clean_question() relies on for lemmas and stop-word flags.
nlp = spacy.load(name="en_core_web_sm")

In [36]:

def clean_question(text):
    """Normalise a question into a space-joined string of lemmas.

    The text is lowercased and trimmed, every character that is neither
    alphanumeric nor a space is removed, and the remainder is run through the
    module-level spaCy pipeline ``nlp``. Stop words and whitespace-only tokens
    are discarded; the lemmas of the remaining tokens are joined with single
    spaces.

    Args:
        text: The raw question. Anything that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for a missing cell) is
            treated as an empty question.

    Returns:
        The cleaned question, or ``''`` when nothing is left to keep.
    """
    # Missing cells arrive as None/NaN, which have no .lower(); treat them as empty.
    if not isinstance(text, str):
        return ''

    # Remove punctuation and lowercase
    text = text.lower().strip()
    text = ''.join(c for c in text if c.isalnum() or c == ' ')
    if not text.strip():
        # Nothing but spaces survived, so there is nothing to lemmatize.
        return ''

    # Lemmatize and remove stopwords
    doc = nlp(text)
    tokens = [
        token.lemma_
        for token in doc
        # Dropping punctuation can leave runs of spaces, which spaCy emits as
        # whitespace tokens; skip them so they do not pad the result.
        if not token.is_stop and not token.is_space
    ]

    return ' '.join(tokens)

# Store the normalised form of each question next to the original text.
df["cleaned_question"] = df["question"].map(clean_question)

In [37]:
# Ensure answers are non-empty and trim whitespace
df["answer"] = df["answer"].str.strip()
# A whitespace-only answer becomes "" after stripping and is not NaN, so
# dropna() alone would keep it; filter on both conditions instead.
has_answer = df["answer"].notna() & df["answer"].ne("")
# Renumber the surviving rows 0..n-1 so labels and positions agree.
df = df.loc[has_answer].reset_index(drop=True)

In [17]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for cleaned questions.
# All questions go through a single encode() call so the model can batch them,
# instead of paying the per-call overhead once per row.
question_vectors = model.encode(df["cleaned_question"].tolist())
# Store one vector per row; building the Series on df.index keeps each vector
# attached to the row it was computed from.
df["embedding"] = pd.Series(list(question_vectors), index=df.index)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [18]:
# Render the frame again; it now also has the cleaned_question and embedding columns.
df

Unnamed: 0.1,Unnamed: 0,question,answer,cleaned_question,embedding
0,0,How can I create an account?,"To create an account, click on the 'Sign Up' b...",create account,"[-0.03933173, -0.073190376, -0.058713168, -0.0..."
1,1,What payment methods do you accept?,"We accept major credit cards, debit cards, and...",payment method accept,"[-0.00738355, 0.06920027, 0.0021899657, -0.034..."
2,2,How can I track my order?,You can track your order by logging into your ...,track order,"[-0.024611782, -0.022893803, 0.050966974, -0.0..."
3,3,What is your return policy?,Our return policy allows you to return product...,return policy,"[-0.035745326, 0.062710054, 0.04189153, 0.0047..."
4,4,Can I cancel my order?,You can cancel your order if it has not been s...,cancel order,"[-0.046343867, 0.060871985, 0.100930884, 0.023..."
...,...,...,...,...,...
195,195,Do you offer a satisfaction guarantee?,"Yes, we offer a satisfaction guarantee on our ...",offer satisfaction guarantee,"[-0.09863094, 0.052422367, 0.06813188, -0.0070..."
196,196,How can I apply for a job at your company?,"To apply for a job at our company, visit our C...",apply job company,"[-0.07541368, -0.05075224, 0.018541757, -0.031..."
197,197,What is the warranty on your products?,The warranty on our products varies by item. P...,warranty product,"[-0.18379788, 0.04935003, 0.081956245, -0.0053..."
198,198,Can I request a refund if the price drops afte...,If the price of a product drops within 7 days ...,request refund price drop purchase,"[-0.09591464, 0.04843379, 0.060017083, -0.0115..."


In [19]:
# Save to CSV (without embeddings).
# The embedding column is dropped on a temporary copy only: written to CSV it
# would become the text repr of each array, and the vectors are stored
# separately below anyway.
df.drop(columns=["embedding"]).to_csv("preprocessed_faq.csv", index=False)

# Save embeddings separately (e.g., for FAISS/Pinecone).
# Row i of the array corresponds to row i of the CSV written above.
import numpy as np
embeddings = np.array(df["embedding"].tolist())
np.save("faq_embeddings.npy", embeddings)

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

def get_answer(user_query, df, threshold=0.7):
    """Return the stored answer whose question is most similar to ``user_query``.

    The query is cleaned with ``clean_question``, embedded with the
    module-level ``model``, and compared by cosine similarity against the
    vectors held in ``df["embedding"]``.

    Args:
        user_query: Free-text question from the user.
        df: FAQ table with an ``"answer"`` column and an ``"embedding"``
            column holding one vector per row.
        threshold: Minimum cosine similarity the best match must reach to be
            accepted. Defaults to 0.7.

    Returns:
        The ``"answer"`` value of the best-matching row, or the fixed message
        ``"Sorry, I couldn't find a relevant answer."`` when the table is
        empty, the query cleans to an empty string, or the best score is
        below ``threshold``.
    """
    no_match = "Sorry, I couldn't find a relevant answer."

    # np.argmax raises on an empty array, so bail out when there is nothing to search.
    if len(df) == 0:
        return no_match

    # Clean the user query
    cleaned_query = clean_question(user_query)
    # A query made only of stop words or punctuation cleans to "" and has
    # nothing meaningful to match on.
    if not cleaned_query:
        return no_match

    # Generate embedding for the query
    query_embedding = model.encode([cleaned_query])
    # Compute similarity with FAQ embeddings. The result has one row per query;
    # take row 0 so argmax and the score lookup work on the same 1-D array.
    similarities = cosine_similarity(query_embedding, np.array(df["embedding"].tolist()))[0]
    best_match_idx = int(np.argmax(similarities))
    best_score = similarities[best_match_idx]

    if best_score >= threshold:
        # Positional lookup: best_match_idx is a position in the embedding list.
        return df.iloc[best_match_idx]["answer"]
    return no_match



In [30]:
# Example usage
user_query = "return product"
response = get_answer(user_query, df)
print(response)  # Output: "Yes, you can return a product if you changed your mind. ..."

Yes, you can return a product if you changed your mind. Please ensure the product is in its original condition and packaging, and refer to our return policy for instructions.
