Using with Dataset without BERT

In [14]:
import nltk
import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download required NLTK data
nltk.download('punkt')

# Load HDFC FAQ dataset from the text file
file_path = "/content/HDFC_Faq.txt"  # Adjust the path if needed

with open(file_path, "r", encoding="utf-8") as file:
    lines = file.readlines()

# Preprocess the dataset (assuming questions and answers are separated by ":")
faq_data = {}
for line in lines:
    if ":" in line:
        question, answer = line.split(":", 1)  # Splitting at the first occurrence of ":"
        faq_data[question.strip()] = answer.strip()

# Convert dictionary to DataFrame
faq_df = pd.DataFrame(list(faq_data.items()), columns=["question", "answer"])

# Extract questions and answers
questions = faq_df['question'].astype(str).tolist()
answers = faq_df['answer'].astype(str).tolist()

# Text preprocessing function
def preprocess(text):
    return text.lower().translate(str.maketrans('', '', string.punctuation))

# Vectorize the FAQ questions using TF-IDF
vectorizer = TfidfVectorizer(stop_words="english")
faq_vectors = vectorizer.fit_transform([preprocess(q) for q in questions])

# Chatbot response function
def chatbot_response(user_input):
    user_input = preprocess(user_input)
    user_vector = vectorizer.transform([user_input])

    similarities = cosine_similarity(user_vector, faq_vectors)
    best_match_index = np.argmax(similarities)

    if similarities[0, best_match_index] > 0.2:  # Threshold for a valid match
        return answers[best_match_index]
    else:
        return "I'm sorry, I don't have an answer for that. Please contact customer support."

# Chat loop
print("Chatbot: Hello! Ask me an HDFC-related question or type 'exit' to quit.")
while True:
    user_query = input("You: ")
    if user_query.lower() == "exit":
        print("Chatbot: Goodbye! Have a great day!")
        break
    response = chatbot_response(user_query)
    print("Chatbot:", response)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Chatbot: Hello! Ask me an HDFC-related question or type 'exit' to quit.
You: How do I change my password
Chatbot: I'm sorry, I don't have an answer for that. Please contact customer support.
You: What is RTGS Funds Transfer?
Chatbot: I'm sorry, I don't have an answer for that. Please contact customer support.
You: How to make payment for Insta Loan / Insta Jumbo Loan / SmartEMI?
Chatbot: I'm sorry, I don't have an answer for that. Please contact customer support.
You: exit
Chatbot: Goodbye! Have a great day!


Using without Dataset

In [15]:
import nltk
import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download required NLTK data
nltk.download('punkt')

# Corrected FAQ dataset as a dictionary
faq_data = {
    "What is your name?": "I am a chatbot designed to assist you.",
    "How can I reset my password?": "To reset your password, go to the settings page and click 'Reset Password'.",
    "What is the capital of France?": "The capital of France is Paris.",
    "How do I contact customer support?": "You can contact customer support via email at support@example.com.",
    "What are your working hours?": "Our support is available 24/7.",
    "How do I track my order?": "You can track your order by logging into your account and clicking 'Track Order'.",
    "What payment methods do you accept?": "We accept Visa, MasterCard, PayPal, and Apple Pay.",
    "How do I return a product?": "To return a product, visit our returns page and follow the instructions.",
    "Where are you located?": "We are located in San Francisco, CA.",
    "How do I subscribe to your newsletter?": "You can subscribe by entering your email on our homepage.",
    "What is the refund policy?": "Our refund policy allows returns within 30 days of purchase.",
    "How do I apply for a job?": "You can apply for jobs on our Careers page.",
    "Do you offer discounts for students?": "Yes, we offer special discounts for students with a valid ID.",
    "What is your phone number?": "You can reach us at (123) 456-7890.",
    "How can I update my billing information?": "You can update your billing details in the 'Billing' section of your account settings.",
    "Do you have a mobile app?": "Yes, our mobile app is available for download on iOS and Android.",
    "Can I change my shipping address after placing an order?": "Yes, you can modify your shipping address within 24 hours of placing your order.",
    "Do you offer international shipping?": "Yes, we ship to most countries worldwide.",
    "How do I cancel my subscription?": "To cancel your subscription, go to the 'Subscriptions' section of your account and select 'Cancel'.",
    "Can I speak to a human agent?": "Yes, you can request to speak with a live agent by calling our support number or using the live chat feature."
}

# Convert dictionary to DataFrame correctly
faq_df = pd.DataFrame(list(faq_data.items()), columns=["question", "answer"])

# Preprocess the dataset
questions = faq_df['question'].astype(str).tolist()
answers = faq_df['answer'].astype(str).tolist()

def preprocess(text):
    return text.lower().translate(str.maketrans('', '', string.punctuation))

# Vectorize the FAQ questions
vectorizer = TfidfVectorizer(stop_words="english")
faq_vectors = vectorizer.fit_transform([preprocess(q) for q in questions])

# Chatbot response function
def chatbot_response(user_input):
    user_input = preprocess(user_input)
    user_vector = vectorizer.transform([user_input])

    similarities = cosine_similarity(user_vector, faq_vectors)
    best_match_index = np.argmax(similarities)

    if similarities[0, best_match_index] > 0.2:  # Threshold for matching
        return answers[best_match_index]
    else:
        return "I'm sorry, I don't have an answer for that."

# Chat loop
print("Chatbot: Hello! Ask me a question or type 'exit' to quit.")
while True:
    user_query = input("You: ")
    if user_query.lower() == "exit":
        print("Chatbot: Goodbye!")
        break
    response = chatbot_response(user_query)
    print("Chatbot:", response)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Chatbot: Hello! Ask me a question or type 'exit' to quit.
You: How do I cancel my subscription?
Chatbot: To cancel your subscription, go to the 'Subscriptions' section of your account and select 'Cancel'.
You: What is your phone number?
Chatbot: You can reach us at (123) 456-7890.
You: What is RTGS Funds Transfer?
Chatbot: I'm sorry, I don't have an answer for that.
You: exit
Chatbot: Goodbye!


Using BERT model for ChatBot

In [2]:
import torch
import nltk
import numpy as np
import pandas as pd
import string
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

# Download required NLTK data
nltk.download('punkt')

# Load HDFC FAQ dataset from the text file
file_path = "/content/HDFC_Faq.txt"

# Load FAQ dataset
faq_df = pd.read_json(file_path)

# Extract questions and answers
questions = faq_df['question'].astype(str).tolist()
answers = faq_df['answer'].astype(str).tolist()

# Load pre-trained BERT model & tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

# Function to convert text into BERT embeddings
def get_bert_embedding(text):
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        output = bert_model(**tokens)
    return output.last_hidden_state[:, 0, :].numpy().reshape(1, -1)

# Generate BERT embeddings for all FAQ questions
faq_embeddings = np.vstack([get_bert_embedding(q) for q in questions])

# Chatbot response function
def chatbot_response(user_input):
    user_embedding = get_bert_embedding(user_input)  # Already 2D due to reshaping

    # Compute cosine similarity
    similarities = cosine_similarity(user_embedding, faq_embeddings)
    best_match_index = np.argmax(similarities)

    if similarities[0, best_match_index] > 0.5:  # Higher threshold for BERT
        return answers[best_match_index]
    else:
        return "I'm sorry, I don't have an answer for that. Please contact customer support."

# Chat loop
print("Chatbot (BERT): Hello! Ask me an HDFC-related question or type 'exit' to quit.")
while True:
    user_query = input("You: ")
    if user_query.lower() == "exit":
        print("Chatbot: Goodbye! Have a great day!")
        break
    response = chatbot_response(user_query)
    print("Chatbot:", response)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Chatbot (BERT): Hello! Ask me an HDFC-related question or type 'exit' to quit.
You: How do I change my password?
Chatbot: After you have logged in, you can change your password using the "Change password" option in the top part of the screen. You have to type your current password and the new password you have chosen in their respective boxes.
You: Are there minimum or maximum limits for RTGS transfers?
Chatbot: The RTGS system has been designed for large value transactions.The minimum amount to be remitted through RTGS is Rs. 2 Lakhs. There is no upper ceiling for RTGS transactions.The only change to this is RTGS transactions done through NetBanking, when the maximum amount of funds that can be transferred per Customer ID per day is Rs. 10 Lakhs.
You: exit
Chatbot: Goodbye! Have a great day!
