In [1]:
import pandas as pd

In [2]:
# Load the pre-computed embedding tables for the D2, D3 and D5 datasets.
D2_emb, D3_emb, D5_emb = map(
    pd.read_csv,
    (
        'data/FAQ Answering/Preprocessed embedding/D2_emb.csv',
        'data/FAQ Answering/Preprocessed embedding/D3_emb.csv',
        'data/FAQ Answering/Preprocessed embedding/D5_emb.csv',
    ),
)

In [3]:
# Sanity check: (rows, columns) of the D2 embedding table.
D2_emb.shape

(130560, 5)

In [4]:
# Sanity check: (rows, columns) of the D3 embedding table.
D3_emb.shape

(414, 5)

In [5]:
# Sanity check: (rows, columns) of the D5 embedding table.
D5_emb.shape

(200, 5)

In [8]:
# Stack the three embedding tables row-wise into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# source frame keeps its own row labels, so labels repeat across D2/D3/D5
# and label-based lookups (.loc) on the combined frame become ambiguous.
D_emb = pd.concat([D2_emb, D3_emb, D5_emb], axis=0, ignore_index=True)

In [9]:
# Sanity check: the row count should equal the sum of the three source tables.
D_emb.shape

(131174, 5)

In [10]:
# Persist the combined table; the row index is not written to the file.
combined_emb_path = 'data/FAQ Answering/Preprocessed embedding/D_emb.csv'
D_emb.to_csv(combined_emb_path, index=False)

In [12]:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer  # For encoding the query
import re
from spellchecker import SpellChecker
import spacy




In [13]:

# Text-processing helpers used when cleaning questions:
# a spell checker for typo correction and spaCy's small English pipeline
# for lemmatization and stop-word detection.
spell = SpellChecker()
nlp = spacy.load("en_core_web_sm")

In [14]:

# Pre-trained sentence encoder used to embed incoming queries
encoder_name = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(encoder_name)

# Clean question function
def clean_question(text):
    """Normalize a free-text question before it is embedded.

    Steps, in order:
      1. lowercase, trim, and drop every character that is not a-z, 0-9
         or whitespace;
      2. spell-correct each word with the module-level ``spell`` checker,
         keeping the original word when no correction is offered;
      3. lemmatize with the module-level ``nlp`` pipeline and drop stop words.

    Parameters
    ----------
    text : str
        Raw question. Any non-string value (e.g. ``None`` or a NaN coming
        from a DataFrame column) is treated as an empty question.

    Returns
    -------
    str
        Space-joined lemmas of the remaining tokens; ``''`` when nothing
        is left after cleaning.
    """
    # Missing values have no text to clean; avoid AttributeError on .lower().
    if not isinstance(text, str):
        return ''

    # Lowercase and remove punctuation
    text = text.lower().strip()
    text = re.sub(r'[^a-z0-9\s]', '', text)

    words = text.split()
    if not words:
        # Nothing survived normalization, so there is nothing to correct or lemmatize.
        return ''

    # Correct typos. Spell correction is the expensive step, so look each
    # distinct word up exactly once and reuse the result for repeats.
    corrections = {}
    for word in words:
        if word not in corrections:
            suggestion = spell.correction(word)
            # correction() may return None when it has no candidate.
            corrections[word] = word if suggestion is None else suggestion
    text = ' '.join(corrections[word] for word in words)

    # Lemmatize and remove stopwords
    doc = nlp(text)
    return ' '.join(token.lemma_ for token in doc if not token.is_stop)

# Read the combined embedding table back from disk
D_emb = pd.read_csv('data/FAQ Answering/Preprocessed embedding/D_emb.csv')


def _parse_embedding(raw):
    """Parse a bracketed, space-separated string into a 1-D numpy array."""
    return np.fromstring(raw.strip("[]"), sep=" ")


# The CSV holds each embedding as text; turn every cell into a numpy array
D_emb["embedding"] = D_emb["embedding"].apply(_parse_embedding)

# Retrieve answer function
def retrieve_answer(user_query, df, threshold=0.7):
    """Return the stored answer whose embedding best matches ``user_query``.

    The query is cleaned with ``clean_question``, embedded with the
    module-level ``model``, and compared to every row of ``df`` by cosine
    similarity.

    Parameters
    ----------
    user_query : str
        Raw question from the user.
    df : pandas.DataFrame
        Must provide an ``"embedding"`` column (one numpy vector per row)
        and a ``"cleaned_answer"`` column.
    threshold : float, default 0.7
        Minimum cosine similarity the best match must reach.

    Returns
    -------
    The ``"cleaned_answer"`` value of the best-matching row, or the fixed
    fallback message when ``df`` is empty, the cleaned query is empty, or
    the best score is below ``threshold``.
    """
    fallback = "Sorry, I couldn't find a relevant answer."

    # np.stack raises ValueError on an empty sequence; nothing to search anyway.
    if len(df) == 0:
        return fallback

    # Clean the user query
    cleaned_user_query = clean_question(user_query)
    if not cleaned_user_query:
        # The query held only punctuation/stop words: there is no content
        # to match on, so do not return an arbitrary nearest neighbour.
        return fallback

    # Generate embedding for the query
    query_embedding = model.encode([cleaned_user_query])

    # Compute similarity with FAQ embeddings. The result has one row per
    # query; take row 0 so argmax indexes FAQ rows directly instead of
    # relying on flattening of the 2-D matrix.
    # NOTE(review): the embedding matrix is rebuilt on every call; consider
    # stacking it once outside this function if lookups become slow.
    similarities = cosine_similarity(query_embedding, np.stack(df["embedding"]))[0]
    best_match_idx = int(np.argmax(similarities))
    best_score = similarities[best_match_idx]

    if best_score >= threshold:
        return df.iloc[best_match_idx]["cleaned_answer"]
    return fallback

In [24]:

# Example: look up the best-matching stored answer for a sample question
user_query = "How can I reset my password?"
response = retrieve_answer(user_query, D_emb)
print(response)  # Prints the matched answer, or the fallback message if no score reaches the threshold

To reset your password, click on the 'Forgot Password' link on the login page and follow the instructions to reset your password.
