In [1]:
"""FLAN-T5-based Custom Chatbot for Websites (Colab Version)"""

import os
import sys
import textwrap
from typing import List, Dict
import unittest


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m32.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m533.7 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m17.9 MB/s[0m 

In [None]:
!pip install -q langchain langchain_community pypdf sentence_transformers tiktoken tokenizers faiss-cpu unstructured numpy==1.24.4 nltk==3.9.1 transformers torch tqdm
!pip install -q google-colab  # For Colab-specific utilities

In [14]:
import nltk
from langchain.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.llms import HuggingFacePipeline
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings("ignore")

# Download required NLTK data
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

True

In [15]:
import unittest
import textwrap
from typing import List, Dict
from tqdm import tqdm
from langchain.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQAWithSourcesChain
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

In [22]:
# Constants
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
MODEL_NAME = "google/flan-t5-base"  # You can also try "google/flan-t5-large" if you have more computational resources


In [26]:


def load_data_from_urls(urls: List[str]) -> List[Dict]:
    """Load data from given URLs."""
    loader = UnstructuredURLLoader(urls=urls)
    return loader.load()

def split_text(data: List[Dict]) -> List[Dict]:
    """Split the text data into chunks."""
    text_splitter = CharacterTextSplitter(
        separator='\n',
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )
    return text_splitter.split_documents(data)

def create_embeddings():
    """Create embeddings using HuggingFace."""
    return HuggingFaceEmbeddings()

def create_vector_store(text_chunks: List[Dict], embeddings):
    """Create a vector store from text chunks and embeddings."""
    return FAISS.from_documents(text_chunks, embeddings)

def create_llm():
    """Create a FLAN-T5 language model."""
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    pipe = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=100,
        temperature=0.1,
    )

    return HuggingFacePipeline(pipeline=pipe)

def create_qa_chain(llm, vector_store):
    """Create a question-answering chain."""
    return RetrievalQAWithSourcesChain.from_llm(
        llm=llm,
        retriever=vector_store.as_retriever()
    )



def main():
    urls = [
        'https://blog.gopenai.com/paper-review-llama-2-open-foundation-and-fine-tuned-chat-models-23e539522acb',
        'https://www.mosaicml.com/blog/mpt-7b',
        'https://stability.ai/blog/stability-ai-launches-the-first-of-its-stablelm-suite-of-language-models',
        'https://lmsys.org/blog/2023-03-30-vicuna/'
    ]

    print("Loading data...")
    data = load_data_from_urls(urls)
    print("Splitting text...")
    text_chunks = split_text(data)
    print("Creating embeddings...")
    embeddings = create_embeddings()
    print("Creating vector store...")
    vector_store = create_vector_store(text_chunks, embeddings)
    print("Creating LLM...")
    with tqdm(total=100, desc="Downloading model", ncols=100) as pbar:
        llm = create_llm()
        pbar.update(100)
    print("Creating QA chain...")
    qa_chain = create_qa_chain(llm, vector_store)

    print("\nChatbot is ready! Type 'exit' to quit.")
    while True:
        query = input("\nPrompt: ")
        if query.lower() == 'exit':
            print('Exiting')
            break
        if not query:
            continue

        result = qa_chain({'question': query}, return_only_outputs=True)
        wrapped_answer = textwrap.fill(result['answer'], width=100)
        print(f"\nAnswer: {wrapped_answer}")



In [None]:
if __name__ == "__main__":
        # Run the main function in Colab
        main()


Loading data...
Splitting text...
Creating embeddings...
Creating vector store...
Creating LLM...


Downloading model: 100%|██████████████████████████████████████████| 100/100 [00:01<00:00, 50.28it/s]


Creating QA chain...

Chatbot is ready! Type 'exit' to quit.

Prompt: what is stable lm


Token indices sequence length is longer than the specified maximum sequence length for this model (1655 > 512). Running this sequence through the model will result in indexing errors



Answer: Stable LM is a new open source language model.
