<a href="https://colab.research.google.com/github/Nawel-Bellil/AI---Deep-Learning--/blob/main/mini_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nltk




In [None]:
import numpy as np
import pandas as pd
import json
import os
import re
import requests
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import random
import io
from io import StringIO
import nltk
# Fetch the NLTK resources used by preprocess_text up front. 'punkt_tab' is
# listed because preprocess_text checks for it before tokenizing; downloading
# it here avoids a lazy download in the middle of the first query.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# DATA PREPARATION

In [None]:
def download_data():
    """Download the FAQ dataset, falling back to a small built-in one.

    The remote CSV is only accepted when the HTTP request succeeded, the
    parsed table has both a 'Question' and an 'Answer' column, and it holds
    at least one row. Any failure (network error, HTTP error status, parse
    error, or an unusable table) is printed and the built-in dataset is
    returned instead, so callers always receive a usable table.

    Returns:
        pandas.DataFrame with 'Question' and 'Answer' columns.
    """
    print("Downloading FAQ dataset...")

    # Remote FAQ CSV (described by the original author as the StackExchange
    # FAQ dataset).
    url = "https://raw.githubusercontent.com/Dibakarroy1997/Chatbot-FAQ/master/data/faq.csv"
    required_columns = ('Question', 'Answer')

    try:
        # Bound the wait so a stalled connection cannot hang the notebook.
        response = requests.get(url, timeout=10)
        # Without this check an error page such as "404: Not Found" is parsed
        # as a header-only CSV and silently produces an empty dataset.
        response.raise_for_status()
        data = pd.read_csv(io.StringIO(response.text))

        missing = [column for column in required_columns if column not in data.columns]
        if missing:
            raise ValueError(f"missing expected column(s): {', '.join(missing)}")
        if data.empty:
            raise ValueError("dataset contains no rows")

        print(f"Dataset loaded with {len(data)} question-answer pairs")
        return data
    except Exception as e:
        # Deliberate best-effort: whatever went wrong, keep the chatbot usable.
        print(f"Error downloading dataset: {e}")
        # Fallback to a small built-in dataset
        return pd.DataFrame({
            'Question': [
                "What is machine learning?",
                "How do neural networks work?",
                "What is natural language processing?",
                "Explain what a chatbot is",
                "What are embeddings in NLP?",
                "How does sentiment analysis work?",
                "What is transfer learning in AI?",
                "Explain the difference between AI and ML",
                "What is deep learning?",
                "How to measure chatbot effectiveness?"
            ],
            'Answer': [
                "Machine learning is a field of AI that enables computers to learn from data without explicit programming.",
                "Neural networks are computing systems inspired by biological neural networks, consisting of nodes (neurons) arranged in layers that process and transform input data to produce output.",
                "Natural Language Processing (NLP) is a field of AI focused on enabling computers to understand, interpret, and generate human language.",
                "A chatbot is a software application that uses AI to conduct conversations with users through text or speech interfaces.",
                "Embeddings in NLP are vector representations of words or sentences that capture semantic meanings, allowing machines to understand relationships between terms.",
                "Sentiment analysis works by using NLP techniques to identify and extract subjective information from text, determining if the expressed opinion is positive, negative, or neutral.",
                "Transfer learning is an ML technique where a model developed for one task is reused as the starting point for another related task, saving time and computational resources.",
                "AI (Artificial Intelligence) is the broader concept of machines being able to carry out tasks intelligently, while ML (Machine Learning) is a specific subset focused on training machines to learn patterns from data.",
                "Deep learning is a subset of machine learning that uses neural networks with many layers (deep neural networks) to analyze various factors with a structure similar to the human brain.",
                "Chatbot effectiveness can be measured through metrics like task completion rate, conversation length, user satisfaction scores, and correct response rate."
            ]
        })

# PREPROCESSING

In [None]:
def preprocess_text(text):
    """Normalize text into a space-separated string of lemmatized tokens.

    Anything that is not a ``str`` yields an empty string. Otherwise the text
    is lowercased, stripped of every character that is not an ASCII letter,
    digit or whitespace, tokenized, filtered against the English stopword
    list and lemmatized with WordNet.
    """
    if not isinstance(text, str):
        return ""

    # (lookup path, download name) for each NLTK resource needed below; any
    # resource that cannot be found locally is downloaded quietly.
    required_resources = (
        ('tokenizers/punkt', 'punkt'),
        ('corpora/stopwords', 'stopwords'),
        ('corpora/wordnet', 'wordnet'),
        ('tokenizers/punkt_tab', 'punkt_tab'),
    )
    for resource_path, package_name in required_resources:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package_name, quiet=True)

    # Lowercase first, then drop punctuation and other special characters.
    lowered = text.lower()
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)

    words = word_tokenize(alnum_only)

    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    kept = []
    for word in words:
        if word in english_stopwords:
            continue
        kept.append(lemmatizer.lemmatize(word))

    return " ".join(kept)

In [None]:
def prepare_knowledge_base(data):
    """Turn a question/answer table into a list of knowledge-base entries.

    Each row of ``data`` is read through its 'Question' and 'Answer' fields.
    Rows where either field is not a non-empty string are skipped. Every kept
    row becomes a dict holding the original question, its preprocessed form
    (via ``preprocess_text``) and the answer.

    Returns:
        list of dicts with keys 'original_question', 'processed_question'
        and 'answer'.
    """
    entries = []

    for _, record in data.iterrows():
        question, answer = record['Question'], record['Answer']

        # Non-string cells (e.g. missing values) are treated as empty text.
        if not isinstance(question, str):
            question = ""
        if not isinstance(answer, str):
            answer = ""

        if not (question and answer):
            continue

        # The preprocessed question is what later gets embedded for matching.
        entries.append({
            'original_question': question,
            'processed_question': preprocess_text(question),
            'answer': answer
        })

    print(f"Knowledge base prepared with {len(entries)} entries")
    return entries

In [None]:
def save_knowledge_base(knowledge_base, filename="knowledge_base.json"):
    """Write the knowledge base to ``filename`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than escaped. An existing
    file at ``filename`` is overwritten.
    """
    with open(filename, 'w', encoding='utf-8') as json_file:
        json.dump(
            knowledge_base,
            json_file,
            ensure_ascii=False,
            indent=2,
        )
    print(f"Knowledge base saved to {filename}")


In [None]:
def load_knowledge_base(filename="knowledge_base.json"):
    """Read a knowledge base back from a UTF-8 JSON file.

    Returns:
        The decoded JSON content, or None when ``filename`` does not exist.
    """
    if not os.path.exists(filename):
        print(f"No knowledge base file found at {filename}")
        return None

    with open(filename, 'r', encoding='utf-8') as json_file:
        entries = json.load(json_file)
    print(f"Knowledge base loaded with {len(entries)} entries")
    return entries

# EMBEDDING MODULE

In [None]:
def initialize_embedding_model():
    """Create and return the sentence-embedding model used by the chatbot.

    Builds a ``SentenceTransformer`` from the 'all-MiniLM-L6-v2' checkpoint
    name after printing a progress message.
    """
    print("Initializing embedding model...")
    return SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
def compute_embeddings(texts, model):
    """Return whatever ``model.encode`` produces for the given texts.

    Args:
        texts: collection of strings to embed.
        model: object exposing an ``encode(texts)`` method.
    """
    vectors = model.encode(texts)
    return vectors

In [None]:
def prepare_embeddings(knowledge_base, model):
    """Attach an embedding to every knowledge-base entry, in place.

    The 'processed_question' of each entry is embedded in one batch through
    ``compute_embeddings``; each resulting vector is stored on its entry under
    'embedding' as a plain list (via ``.tolist()``) so the knowledge base
    stays JSON-serializable.

    Returns:
        The same ``knowledge_base`` list that was passed in.
    """
    processed_questions = []
    for entry in knowledge_base:
        processed_questions.append(entry['processed_question'])

    vectors = compute_embeddings(processed_questions, model)

    # Index-based on purpose: a model returning too few vectors raises here.
    for position in range(len(knowledge_base)):
        knowledge_base[position]['embedding'] = vectors[position].tolist()

    print("Embeddings prepared for knowledge base")
    return knowledge_base

# QUERY HANDLING MODULE

In [None]:
def preprocess_query(query):
    """Normalize a user query with the same pipeline as stored questions.

    Delegates to ``preprocess_text`` so queries and knowledge-base questions
    are cleaned identically.
    """
    cleaned_query = preprocess_text(query)
    return cleaned_query


In [None]:
def find_best_match(query, knowledge_base, model, threshold=0.6):
    """Find the knowledge-base answer whose question best matches the query.

    The query is preprocessed and embedded, then compared by cosine
    similarity against the stored 'embedding' of every entry in a single
    vectorised call. The first entry with the highest similarity wins.

    Args:
        query: raw user text.
        knowledge_base: list of entries carrying 'embedding', 'answer' and
            'original_question'.
        model: object exposing ``encode(list_of_texts)``.
        threshold: minimum similarity required to accept the best match.

    Returns:
        A tuple ``(answer, score, matched_question)``. When the best score
        reaches ``threshold`` the stored answer and its original question are
        returned. Otherwise a randomly chosen fallback sentence is returned
        with ``matched_question`` set to None. For an empty knowledge base the
        score is the sentinel -1.
    """
    fallback_responses = [
        "I'm not sure I understand. Could you rephrase your question?",
        "I don't have enough information to answer that properly.",
        "That's an interesting question, but I'm not confident in my answer.",
        "I'm still learning about that topic. Could you ask something else?"
    ]

    # Nothing to compare against: skip the embedding work entirely and report
    # the -1 sentinel score.
    if not knowledge_base:
        return random.choice(fallback_responses), -1, None

    # Preprocess the query
    processed_query = preprocess_query(query)

    # Compute embedding for the query
    query_embedding = model.encode([processed_query])[0]

    # Stack all stored embeddings (one row per entry) and score them in one
    # call instead of invoking cosine_similarity once per entry.
    kb_embeddings = np.array([item['embedding'] for item in knowledge_base])
    similarities = cosine_similarity([query_embedding], kb_embeddings)[0]

    # argmax returns the first index of the maximum, so ties resolve to the
    # earliest entry.
    best_match_idx = int(np.argmax(similarities))
    best_match_score = float(similarities[best_match_idx])

    # Return the best match if it reaches the threshold
    if best_match_score >= threshold:
        best_entry = knowledge_base[best_match_idx]
        return best_entry['answer'], best_match_score, best_entry['original_question']

    return random.choice(fallback_responses), best_match_score, None

# RESPONSE GENERATION MODULE

In [None]:
def generate_response(query, knowledge_base, model):
    """Answer a query and package the result as a dict.

    Runs ``find_best_match`` with its default threshold and returns a dict
    with the keys 'answer', 'confidence' and 'matched_question'.
    """
    best = find_best_match(query, knowledge_base, model)
    answer, confidence, matched_question = best

    return {
        'answer': answer,
        'confidence': confidence,
        'matched_question': matched_question
    }

In [None]:
def format_response(response_data):
    """Render a response dict as the text shown to the user.

    The answer is always included. When a matched question is present and
    the confidence is at least 0.8, a note naming the matched question is
    appended after a blank line.
    """
    answer = response_data['answer']
    confidence = response_data['confidence']
    matched_question = response_data['matched_question']

    text = f"{answer}"

    # Only reveal the matched question for high-confidence matches.
    if not matched_question or not (confidence >= 0.8):
        return text

    return text + f"\n\n(I matched your question to: '{matched_question}')"

# KNOWLEDGE BASE EXPANSION MODULE

In [None]:
def add_to_knowledge_base(question, answer, knowledge_base, model):
    """Append a new question/answer entry, with its embedding, in place.

    The question is preprocessed with ``preprocess_text`` and embedded with
    ``model.encode``; the vector is stored as a plain list.

    Returns:
        The same ``knowledge_base`` list, now one entry longer.
    """
    cleaned_question = preprocess_text(question)
    vector = model.encode([cleaned_question])[0].tolist()

    knowledge_base.append({
        'original_question': question,
        'processed_question': cleaned_question,
        'answer': answer,
        'embedding': vector
    })

    print(f"Added new knowledge: '{question}'")
    return knowledge_base

# CHAT INTERFACE MODULE

In [None]:
def start_chat():
    """Run the interactive chatbot loop.

    Builds the knowledge base (download, preprocess, embed), saves it, then
    reads user input until the user types 'exit', 'quit' or 'bye', or the
    input stream ends.

    Input handling:
      * blank lines are ignored;
      * ``learn: question | answer`` adds a new entry and saves the knowledge
        base. Only the first '|' separates question from answer, so answers
        may themselves contain '|';
      * a ``learn:`` command without a separator, question or answer prints a
        usage hint instead of being answered as a query;
      * anything else is answered from the knowledge base.
    """
    print("Initializing chatbot...")

    # Step 1: Prepare data
    data = download_data()
    knowledge_base = prepare_knowledge_base(data)

    # Step 2: Initialize embedding model
    model = initialize_embedding_model()

    # Step 3: Prepare embeddings
    knowledge_base = prepare_embeddings(knowledge_base, model)

    # Save knowledge base (optional)
    save_knowledge_base(knowledge_base)

    print("\n=== Modular Chatbot Ready ===")
    print("Type 'exit' to end the conversation")
    print("Use 'learn: your question | your answer' to teach me something new")
    print("============================\n")

    learn_prefix = "learn:"

    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed input stream or an interrupted cell: leave cleanly.
            print("\nChatbot: Goodbye!")
            break

        # Nothing typed: prompt again rather than matching an empty query.
        if not user_input:
            continue

        if user_input.lower() in ['exit', 'quit', 'bye']:
            print("Chatbot: Goodbye!")
            break

        if user_input.lower().startswith(learn_prefix):
            # Format: learn: question | answer
            # Split on the first '|' only so the answer may contain '|'.
            question, separator, answer = user_input[len(learn_prefix):].partition("|")
            question = question.strip()
            answer = answer.strip()

            if separator and question and answer:
                knowledge_base = add_to_knowledge_base(question, answer, knowledge_base, model)
                print("Chatbot: Thanks! I've learned something new.")
                # Save the updated knowledge base
                save_knowledge_base(knowledge_base)
            else:
                print("Chatbot: To teach me, use 'learn: your question | your answer'")
            continue

        # Process query and generate response
        response_data = generate_response(user_input, knowledge_base, model)
        formatted_response = format_response(response_data)

        print(f"Chatbot: {formatted_response}")
        print(f"(Confidence: {response_data['confidence']:.2f})")

# ENTRY POINT

In [None]:
# Start the interactive chat only when this code runs as the main program.
# Running this cell in the notebook takes this branch too, as the captured
# output below the cell shows.
if __name__ == "__main__":
    start_chat()

Initializing chatbot...
Downloading FAQ dataset...
Dataset loaded with 0 question-answer pairs
Knowledge base prepared with 0 entries
Initializing embedding model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Embeddings prepared for knowledge base
Knowledge base saved to knowledge_base.json

=== Modular Chatbot Ready ===
Type 'exit' to end the conversation
Use 'learn: your question | your answer' to teach me something new

Chatbot: I don't have enough information to answer that properly.
(Confidence: -1.00)
Chatbot: I'm not sure I understand. Could you rephrase your question?
(Confidence: -1.00)
Chatbot: That's an interesting question, but I'm not confident in my answer.
(Confidence: -1.00)
Chatbot: I don't have enough information to answer that properly.
(Confidence: -1.00)
