In [None]:
import sklearn
import nltk
# Fetch the NLTK data packages this notebook uses. Each call logs its progress
# to the cell output; the value of the final bare expression is what the cell displays.
nltk.download('punkt')  # tokenizer data; tokenize_text in the next cell requests the same package
nltk.download('wordnet')  # presumably the corpus behind WordNetLemmatizer used later — confirm
nltk.download('omw-1.4')  # Ensure WordNet is fully functional



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
import json
import torch
import nltk
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer

# Make sure the WordNet data (and its omw-1.4 companion package) is present
# before the lemmatizer is used; the downloads run in the order listed.
for _resource in ('wordnet', 'omw-1.4'):
    nltk.download(_resource)

# Tokenization helper with a whitespace-split fallback (used when NLTK's
# tokenizer data cannot be found even after a download attempt).
def tokenize_text(text):
    """Lower-case ``text`` and split it into tokens.

    NLTK's ``word_tokenize`` is tried first. The tokenizer data is downloaded
    only when that call raises ``LookupError`` (i.e. the data is missing),
    instead of contacting the downloader on every call. If tokenization still
    fails after the download attempt, plain ``str.split`` is used.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lower-cased token strings.
    """
    # Function-scope imports keep this helper usable on its own.
    import nltk
    from nltk.tokenize import word_tokenize

    lowered = text.lower()
    try:
        return word_tokenize(lowered)
    except LookupError:
        # Tokenizer data is missing: fetch it once, quietly, then retry below.
        # NOTE(review): recent NLTK releases appear to look up 'punkt_tab'
        # rather than 'punkt'; both are requested so either layout is covered.
        for resource in ("punkt", "punkt_tab"):
            nltk.download(resource, quiet=True)

    try:
        return word_tokenize(lowered)
    except LookupError:
        return lowered.split()  # Fallback to simple split-based tokenization

class BERTChatbot:
    """Retrieval-style Q&A chatbot built on BERT embeddings.

    Every question found in a JSON file is lemmatized and encoded once at
    construction time. An incoming question is processed the same way and is
    answered with the stored answer whose question has the highest cosine
    similarity, as long as that similarity reaches the confidence threshold.
    """

    # Class-level defaults, also visible on instances created without __init__.
    DEFAULT_MODEL_NAME = "bert-base-uncased"
    DEFAULT_THRESHOLD = 0.5
    FALLBACK_RESPONSE = "I'm not sure about that. Can you rephrase your question?"

    model_name = DEFAULT_MODEL_NAME
    threshold = DEFAULT_THRESHOLD

    def __init__(self, json_file, model_name=DEFAULT_MODEL_NAME, threshold=DEFAULT_THRESHOLD):
        """Initialize chatbot with BERT model and tokenizer.

        Args:
            json_file: Path to a JSON file holding an object that maps each
                question string to its answer.
            model_name: Name or path of the pretrained BERT checkpoint to load.
            threshold: Minimum cosine similarity the best match must reach
                for its answer to be returned.

        Raises:
            ValueError: If the JSON file does not hold a non-empty object.
        """
        self.json_file = json_file
        self.model_name = model_name
        self.threshold = threshold
        self.load_data()
        self.lemmatizer = WordNetLemmatizer()

        # Load pre-trained BERT model and tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.model.eval()  # inference only; make the mode explicit

        # Convert all questions into BERT embeddings
        self.question_embeddings = self.encode_questions()

    def load_data(self):
        """Load Q&A data from a JSON file.

        Populates ``qa_data``, ``questions`` (the keys) and ``answers`` (the
        values, in the same order as the keys).

        Raises:
            ValueError: If the file content is not a non-empty JSON object.
                Failing here gives a clear message instead of the opaque
                error that stacking zero embeddings would produce later.
        """
        with open(self.json_file, 'r', encoding='utf-8') as file:
            qa_data = json.load(file)
        if not isinstance(qa_data, dict) or not qa_data:
            raise ValueError(
                f"{self.json_file!r} must contain a non-empty JSON object "
                "mapping questions to answers."
            )
        self.qa_data = qa_data
        self.questions = list(qa_data.keys())
        self.answers = list(qa_data.values())

    def lemmatize_text(self, text):
        """Tokenize ``text``, lemmatize each token and re-join with spaces."""
        tokens = tokenize_text(text)
        return " ".join(self.lemmatizer.lemmatize(token) for token in tokens)

    def encode_text(self, text):
        """Convert text into a BERT embedding.

        Returns:
            A NumPy array holding the hidden state at the first token
            position ([CLS]) for each input sequence.
        """
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():  # no gradients needed for inference
            outputs = self.model(**inputs)
        return outputs.last_hidden_state[:, 0, :].numpy()  # Extract [CLS] token representation

    def encode_questions(self):
        """Encode every stored question and stack the embeddings row-wise."""
        return np.vstack([self.encode_text(self.lemmatize_text(q)) for q in self.questions])

    def get_best_answer(self, user_question):
        """Find the best-matching question and return the corresponding answer.

        Blank input (``None``, empty or whitespace-only) gets the fallback
        response straight away rather than being matched against the stored
        questions.

        NOTE(review): the recorded notebook run answers "hi" with a
        deep-learning definition, so a 0.5 threshold looks too permissive for
        raw [CLS] vectors; consider a higher ``threshold`` or a different
        pooling strategy.

        Args:
            user_question: The question text typed by the user.

        Returns:
            The stored answer for the most similar question, or the fallback
            response when the input is blank or the best similarity is below
            ``self.threshold``.
        """
        if not user_question or not user_question.strip():
            return self.FALLBACK_RESPONSE

        user_embedding = self.encode_text(self.lemmatize_text(user_question))

        similarities = cosine_similarity(user_embedding, self.question_embeddings).flatten()
        best_match_index = int(np.argmax(similarities))

        if similarities[best_match_index] < self.threshold:  # Confidence threshold
            return self.FALLBACK_RESPONSE
        return self.answers[best_match_index]

# Load chatbot
chatbot = BERTChatbot("qa_data.json")

# Chat loop: read questions until the user types 'exit'/'quit' or the input
# stream ends or is interrupted.
while True:
    try:
        # Strip so stray whitespace (e.g. "exit ") does not defeat the quit check.
        user_input = input("\nYou: ").strip()
    except (EOFError, KeyboardInterrupt):
        # End of input or an interrupt: leave cleanly instead of a traceback.
        print("\nChatbot: Goodbye!")
        break
    if user_input.lower() in ('exit', 'quit'):
        print("Chatbot: Goodbye!")
        break
    if not user_input:
        continue  # nothing was typed; prompt again
    response = chatbot.get_best_answer(user_input)
    print(f"Chatbot: {response}")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[


You: hi


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Chatbot: Deep learning is a subset of AI that uses neural networks to process data.

You: how are you?


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Chatbot: I'm a chatbot, so I don't have feelings, but I'm here to help!

You: what is ai?


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Chatbot: AI stands for Artificial Intelligence, enabling machines to think and learn.

You: exit
Chatbot: Goodbye!
