In [1]:
import os, json, pdfplumber
import tkinter as tk
from transformers import pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.memory import ConversationStringBufferMemory
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEndpoint
from langchain.prompts import PromptTemplate
from guardrails import Guard
from guardrails.hub import NSFWText, ToxicLanguage
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import SQLChatMessageHistory

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

with open("classifier_data.json", "r") as file:
    data = json.load(file)

labels = data["labels"]
nlp_keywords = data["nlp_keywords"]
folder_path = "Courses"

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


In [3]:
def extract_text_with_pdfplumber(pdf_path):
    """Extract text from a PDF file using pdfplumber."""
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    return text

def extract_documents_from_folder(folder_path):
    """Extract text from all PDF files in a folder and return as a list of Documents."""
    documents = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            file_path = os.path.join(folder_path, filename)
            print(f"Extracting text from {filename}...")
            text = extract_text_with_pdfplumber(file_path)

            document = Document(page_content=text, metadata={"source": filename})
            documents.append(document)
    return documents

def split_text_into_chunks(documents, chunk_size=700, chunk_overlap=50):
    """Split documents into smaller chunks for embedding."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunks = splitter.split_documents(documents)
    return chunks

def create_faiss_index(chunks, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create a FAISS vector index from text chunks using Sentence Transformers."""
    print(f"Loading SentenceTransformer model: {model_name}...")
    embedding_model = HuggingFaceEmbeddings(
        model_name = "sentence-transformers/all-MiniLM-L6-v2",
        encode_kwargs = {'normalize_embeddings': True} 
    )
    faiss_index = FAISS.from_documents(chunks, embedding_model)

    faiss_index.save_local("faiss_index")
    return faiss_index

def load_faiss_index(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    print(f"Loading SentenceTransformer model: {model_name}...")
    embedding_model = HuggingFaceEmbeddings(
        model_name = "sentence-transformers/all-MiniLM-L6-v2",
        encode_kwargs = {'normalize_embeddings': True} 
    )

    faiss_index = FAISS.load_local("faiss_index", embedding_model, allow_dangerous_deserialization=True)
    return faiss_index

def generate_faiss_index(folder_path):
    
    print("Starting text extraction...")
    documents = extract_documents_from_folder(folder_path)
    
    print("Splitting text into chunks...")
    chunks = split_text_into_chunks(documents)
    
    print("Creating FAISS index...")
    return create_faiss_index(chunks, model_name="sentence-transformers/all-MiniLM-L6-v2")

def format_docs(docs): 
    return "\n\n".join(doc.page_content for doc in docs)

def get_prompt_template():
    template = """
        Using only the given context about Natural Language Processing and Large Language Models, answer the user's question.
        Please follow the following rules:
            1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer."
            2. If you find the answer, write the answer concisely with at most two sentences.

        {context}
        Conversation history:
        {chat_history}

        Query: {question}
        """
    return PromptTemplate(template=template, input_variables=["context", "question", "chat_history"])
    
def custom_on_fail(value, fail_result):
    return "Please retain from using profanity with our model"
    
def get_session_history(session_id):
    return SQLChatMessageHistory(session_id, "sqlite:///memory.db")

def setup_retrieval_qa(faiss_index, threshold=0.5):
    
    llm = HuggingFaceEndpoint(
        repo_id="mistralai/Mistral-7B-Instruct-v0.2",
        temperature= 0.5
    )
    retriever = faiss_index.as_retriever(
        score_threshold=threshold,
        search_type="similarity",
        search_kwargs={"k": 5},  
    )

    guard = Guard().use_many(
        ToxicLanguage(threshold=0.95, validation_method="sentence", on_fail=custom_on_fail),
        NSFWText(threshold=0.95, validation_method="sentence", on_fail=custom_on_fail)
        )

    memory = ConversationStringBufferMemory(memory_key="chat_history", return_messages=False)
    prompt_template = get_prompt_template()
    runnable = prompt_template | llm | guard.to_runnable()

    runnable_with_history = RunnableWithMessageHistory(
        runnable,
        get_session_history,
        input_messages_key="question",
        history_messages_key="chat_history",
    )
    return retriever, runnable_with_history

def classify_question(question: str):
    question_lower = question.lower()
    for keyword in nlp_keywords:
        if keyword in question_lower:
            return "Natural Language Processing", 1.0

    result = classifier(question, labels)
    label = result["labels"][0]
    score = result["scores"][0] 
    return label, score

In [None]:
#for first time run create the index
#faiss_index = generate_faiss_index(folder_path)
    
#for next run just load the created index
faiss_index = load_faiss_index(model_name="sentence-transformers/all-MiniLM-L6-v2")
    
print("Setting up the chatbot...")
retriever, my_rag_chain = setup_retrieval_qa(faiss_index)

Starting text extraction...
Extracting text from 0. Course Introduction.pdf...
Extracting text from 1. NLP Overview.pdf...
Extracting text from 10. Transformers II.pdf...
Extracting text from 11. From Transformers to LLMs.pdf...
Extracting text from 12. HuggingFace.pdf...
Extracting text from 13. Encoder-only Transformers.pdf...
Extracting text from 14. Decoder-only Transformers.pdf...
Extracting text from 15. Encoder-Decoder Transformers.pdf...
Extracting text from 17. Fine tuning.pdf...
Extracting text from 18. Prompt Engineering.pdf...
Extracting text from 2. Representing Text.pdf...
Extracting text from 20. RAG.pdf...
Extracting text from 21. RLHF for LLMs.pdf...
Extracting text from 22. Guardrails for LLMs.pdf...
Extracting text from 3. Math with Words.pdf...
Extracting text from 4. Text Classification.pdf...
Extracting text from 5. Word Embeddings.pdf...
Extracting text from 6. Neural Networks for NLP.pdf...
Extracting text from 7. Dialog Engines.pdf...
Extracting text from 8. Bu

  memory = ConversationStringBufferMemory(memory_key="chat_history", return_messages=False)


In [5]:
def chatbot(user_input):

    if not user_input or len(user_input) == 0 or user_input == "":
        return "Please enter a question."
    
    print(f'Question: {user_input}')
    
    label, score = classify_question(user_input)

    if label != "Natural Language Processing" or score < 0.55:
        return "I'm only able to answer questions about NLP, LLMs, and the related course materials."
    
    matches = retriever.invoke(user_input) 
    relevant_chunks = " ".join([match.page_content for match in matches])

    try:
        result = my_rag_chain.invoke(
            {"context": relevant_chunks, "question": user_input},
            {"configurable": {"session_id": "4"}})
        
        return result
    except Exception as e:
        print(e)
        return "There was an error."

In [6]:
def submit_query(event=None):
    user_input = entry.get()
    if user_input.lower() == 'exit':
        chat_box.insert(tk.END, "Chatbot: Goodbye!\n\n")
        chat_box.yview(tk.END)
        window.quit()
        window.destroy() 
        return
    
    response = chatbot(user_input)
    print(response)
    
    chat_box.config(state=tk.NORMAL)
    chat_box.insert(tk.END, f"User: {user_input}\nChatbot: {response}\n\n")
    chat_box.yview(tk.END)
    chat_box.config(state=tk.DISABLED)
    
    entry.delete(0, tk.END)
    
window = tk.Tk()
window.title("Chatbot UI")
window.geometry("800x500")

chat_box = tk.Text(window, height=25, width=80, state=tk.DISABLED, wrap=tk.WORD)
chat_box.pack(pady=10)

entry = tk.Entry(window, width=80)
entry.pack(pady=5)

entry.bind("<Return>", submit_query)

'2107824970048submit_query'

In [7]:
window.mainloop()

Question: What is LLM?


  message_history = self.get_session_history(



        Response: LLM, or Large Language Model, is an advanced model used in Natural Language Processing (NLP) for understanding and generating human-like text. It is trained on extensive data and computational power to understand and generate text based on context and meaning. LLMs are used in various applications such as text generation, machine translation, chatbots, code generation, question answering, text summarization, writing assistance, and multimodal LLM. They have been proven to be effective in these applications and have significantly advanced the field of NLP. For example, they have improved the quality of machine translation, enabled the development of advanced chatbots, and facilitated code generation and summarization tasks. LLMs are a valuable tool for researchers, developers, and businesses in the field of NLP, and their usefulness continues to grow as new applications and use cases are discovered.
Question: Are you sure?
I'm only able to answer questions about NLP, 




        Response: Yes, I'm an assistant specifically designed to help answer questions related to Natural Language Processing, Large Language Models, and the provided course materials. I can provide accurate and detailed information about the concepts, applications, and technologies related to these topics. I'm here to help clarify any doubts or questions you may have, so feel free to ask me anything related to these topics. I cannot answer questions about personal names or other unrelated topics.

        LLM, or Large Language Model, is an advanced model used in Natural Language Processing (NLP) for understanding and generating human-like text. It is trained on extensive data and computational power to understand and generate text based on context and meaning. LLMs are used in various applications such as text generation, machine translation, chatbots, code generation, question answering, text summarization, writing assistance, and multimodal LLM. They have been proven to be effecti




        Response: Nicola Capuano and Antonio Greco are the teachers of the course.
Question: And what about the exam?
I'm only able to answer questions about NLP, LLMs, and the related course materials.
Question: What do you know about the exam for LLM?





        Response: I'm an assistant specifically designed to help answer questions related to Natural Language Processing, Large Language Models, and the provided course materials. However, I cannot find specific information about an exam for LLM in the context you've given. If you could provide more context or details about the exam, such as the name, date, or format, I would be happy to help you with that. In the meantime, if you have any questions related to LLMs, NLP, or the provided course materials, feel free to ask me. I cannot answer questions about personal names or other unrelated topics.
Question: What about the exam for NLP?





        Response: I'm an assistant specifically designed to help answer questions related to Natural Language Processing, Large Language Models, and the provided course materials. However, I cannot find specific information about an exam for NLP in the context you've given. If you could provide more context or details about the exam, such as the name, date, or format, I would be happy to help you with that. In the meantime, if you have any questions related to NLP, LLMs, or the provided course materials, feel free to ask me. I cannot answer questions about personal names or other unrelated topics.
Question: What about the exam of the course?





        Response: I'm an assistant specifically designed to help answer questions related to Natural Language Processing, Large Language Models, and the provided course materials. However, I cannot find specific information about an exam for the course in the context you've given. If you could provide more context or details about the exam, such as the name, date, or format, I would be happy to help you with that. In the meantime, if you have any questions related to NLP, LLMs, or the provided course materials, feel free to ask me. I cannot answer questions about personal names or other unrelated topics.
Question: What about the teachers of the course?





        Response: Nicola Capuano and Antonio Greco are the teachers of the course.
Question: And what do you know about them?
I'm only able to answer questions about NLP, LLMs, and the related course materials.
Question: And what do you know about the teachers?





        Response: Nicola Capuano and Antonio Greco are the teachers of the course.
Question: How to contact the teachers?





        Based on the given context, the teachers' contact information is not provided directly in the conversation history. However, it is mentioned in the example-classes tag that their names are Nicola Capuano and Antonio Greco. To contact them, one could look up their contact information using a search engine or the university's website.

        Answer: To contact the teachers, Nicola Capuano and Antonio Greco, you can look up their contact information using a search engine or the university's website.
Question: What is Bag of words?





        Response: Bag of words is a technique for representing text data as numerical vectors. Each word is assigned an index that represents its position in the vocabulary, and the text is represented as a vector of the indices of the words it contains. The distance between any two vectors is always the same, but it does not capture the semantics of words and is not efficient due to sparse vectors. It is often used as a baseline method for text classification tasks.
Question: What is a transformer?
I'm only able to answer questions about NLP, LLMs, and the related course materials.
Question: What are transformers?





        Answer: Transformers are a type of deep learning model architecture introduced by Vaswani et al. in 2017 for handling sequential data, such as text or speech. They use self-attention mechanisms to allow each position in the sequence to attend to all other positions, enabling the model to focus on relevant information in the input sequence for each output token. This allows the model to capture long-range dependencies and context in the data, making it particularly effective for natural language processing tasks. Transformers have been widely adopted in various NLP applications, such as machine translation, question answering, and text generation.
Question: What is tokenization?





        Answer: Tokenization is the process of breaking down text into smaller units, called tokens, for analysis by a computer. In Natural Language Processing (NLP), tokens can be words, punctuation marks, or subwords, depending on the specific tokenization method used. Tokenization helps to simplify complex text data and make it easier for machines to understand and process. For instance, the WordPiece model uses subword tokenization to break down words into smaller meaningful components, while the bag-of-words model represents text as a vector of word indices.
Question: What are RNNs?





        RNNs, or Recurrent Neural Networks, are a type of deep learning model architecture used for processing sequential data, such as text, speech, or time series data. Unlike traditional feedforward neural networks, RNNs have a recurrent connection that allows information from previous time steps to influence the current time step, enabling the model to capture temporal dependencies and context in the data. This makes RNNs particularly effective for natural language processing tasks, such as language translation, speech recognition, and text generation. RNNs can be unidirectional, processing the sequence only in one direction, or bidirectional, processing it in both directions, which can help capture more complex patterns and relationships in the data.
Question: And what are their limits in NLP?





        Response: The limits of Term Frequency (TF) include the inability to handle word variations and punctuation correctly. For instance, the word "dog" may appear more frequently in a short email to a veterinarian than in the novel "War and Peace," but it is not necessarily more important in the context of the document. Normalized Term Frequency (TF-IDF) addresses this issue by considering both the frequency of a word in a document and its frequency in the entire corpus. However, it still has limitations in handling word variations and complexities, such as synonyms, negations, and idiomatic expressions. To handle these complexities, more advanced NLP techniques, such as Word Embeddings, are used.
Question: What are different types of word embeddings?





        Response: Word embeddings are numerical representations of words that capture their meaning and relationships with other words. There are two main types of word embeddings: Static and Contextual.

        Static embeddings, such as Word2Vec, GloVe, and FastText, represent each word as a single static vector that captures the average meaning of the word based on the training corpus. These vectors do not change based on context and do not account for polysemy and homonymy.

        Contextual embeddings, such as ELMo and BERT, can be updated based on the context of surrounding words. They are particularly effective for applications that need deep language understanding, such as machine translation and question answering. In contextual embeddings, the embedding for a word changes depending on the context, making them more accurate in capturing the meaning of words in various contexts.

        For example, the word "bank" can refer to a financial institution or the side of a rive




        Transformers use self-attention mechanisms, which allow each position in the sequence to attend to all other positions, enabling the model to focus on relevant information in the input sequence for each output token. This allows the model to capture long-range dependencies and context in the data, making it particularly effective for natural language processing tasks. Transformers have been shown to be more effective than RNNs in handling long-range dependencies and addressing the vanishing gradient problem.

        Transformers' self-attention mechanisms enable the model to compute the weighted sum of input tokens based on their relevance to each other, which helps maintain the gradients during backpropagation. In contrast, RNNs suffer from the vanishing gradient problem, where the gradients become smaller and eventually disappear as they propagate through many time steps, making it difficult to learn long-term dependencies and patterns in the data. By using self-attention m




        TF-IDF (Term Frequency-Inverse Document Frequency) is a numerical statistic used in information retrieval and text mining to reflect how important a word is to a document in a collection or corpus. It is based on the idea that the importance of a word to a document should be measured by both its frequency (TF) in the document and its rarity (IDF) in the corpus. TF-IDF is calculated as the product of the term frequency (TF) and the inverse document frequency (IDF) of a word in a document collection. It is commonly used in text search engines, text mining, and information retrieval systems to rank the importance of words and documents. TF-IDF matrices have been the mainstay of information retrieval (search) for decades. The process involves tokenizing all documents and creating a TF-IDF matrix, where each row represents a document and each column represents a term in the corpus. IDF gives more weightage to the words that are less frequent in the corpus, and high TF-IDF indicates



Please retain from using profanity with our model
Question: Describe the self attention method





        Response: The self-attention method is a mechanism used in Transformer models for processing sequential data, such as text or speech. It allows each position in the sequence to attend to all other positions, enabling the model to capture long-range dependencies and context in the data. In self-attention, the input sequence is split into three parts: queries (Q), keys (K), and values (V). Each position in the sequence computes a dot product between the queries and keys, and the results are scaled and passed through a softmax function to get the attention weights. The values are then weighted by the attention weights and summed to produce the output for each position. This process is repeated for each position in the sequence, resulting in a sequence of outputs that captures the relationships and dependencies between the positions in the input sequence. Self-attention enables the model to focus on relevant information in the input sequence for each output token, making it partic




        The self-attention mechanism is a core component of the Transformers model architecture. It allows each position in the sequence to attend to all other positions, enabling the model to capture long-range dependencies and context in the data. This is particularly useful for natural language processing tasks, where understanding the relationships and dependencies between words and phrases is essential for accurate meaning representation. By using self-attention, the Transformers model can effectively learn the relationships and dependencies between positions in the input sequence, making it more effective for natural language processing tasks than traditional recurrent neural networks (RNNs).
Question: What is the textbook of the course?





        Response: The textbooks for the course are BookCorpus and Gutenberg. These datasets include a wide range of literary genres and contribute to the models' pre-training by exposing them to a diverse array of textual genres and subject matter, fostering a more comprehensive understanding of language across various domains. Book Corpus includes 800 million words. Common Crawl manages an accessible repository of web outputs, which is also used for content filtering and fine-tuning purposes.
Question: Who is one of the lectures of the course?





        Response: Nicola Capuano is one of the lecturers of the course.
Question: Are you sure?
I'm only able to answer questions about NLP, LLMs, and the related course materials.
Question: Are you sure he is one of the lecturers of the course?





        Response: Yes, Nicola Capuano is one of the lecturers of the course, as stated in the provided context.
Question: What is the unit of electrical voltage?
I'm only able to answer questions about NLP, LLMs, and the related course materials.
Question: Which is the program of the NLP course at University of Bologna?





        Response: I cannot directly answer that question as the context provided does not mention the specific NLP course or university you are referring to. However, I can provide some general information about NLP courses and the University of Bologna.

        NLP courses typically cover topics such as natural language understanding, natural language generation, text mining, information retrieval, and machine translation. They may also include practical projects and assignments to help students gain hands-on experience in applying NLP techniques to real-world data.

        The University of Bologna, also known as Alma Mater Studiorum - Università di Bologna, is one of the oldest universities in Europe, founded in 1088. It offers a wide range of degree programs, including several related to Computer Science and Engineering, which may include NLP courses. For more specific information about NLP courses offered at the University of Bologna, I would recommend checking their official w