In [None]:
!pip install -q langchain langchain_community pypdf sentence_transformers tiktoken tokenizers faiss-cpu unstructured numpy==1.24.4 nltk==3.9.1 transformers torch tqdm
!pip install -q google-colab  # For Colab-specific utilities

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m38.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m75.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m53.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m58.8 MB/s[0m et

In [None]:
from langchain.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from typing import List, Dict
import textwrap
import warnings
warnings.filterwarnings("ignore")

In [None]:

# Constants
# CHUNK_SIZE and CHUNK_OVERLAP are handed to CharacterTextSplitter in split_text().
# NOTE(review): presumably both are measured in characters -- confirm against the splitter docs.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
# Hugging Face Hub identifier of the causal language model.
MODEL_NAME = "EleutherAI/gpt-neo-1.3B"  # Open model
# MAX_LENGTH = 512  (disabled; nothing in the visible code references it)

In [None]:


def load_data_from_urls(urls: List[str]) -> List[Dict]:
    """Fetch the pages behind ``urls`` through ``UnstructuredURLLoader``.

    Args:
        urls: Web addresses forwarded to the loader.

    Returns:
        Whatever the loader's ``load()`` call yields.
        NOTE(review): the ``List[Dict]`` annotation is kept as written; the
        loader presumably returns LangChain ``Document`` objects -- confirm.
    """
    url_loader = UnstructuredURLLoader(urls=urls)
    documents = url_loader.load()
    return documents

def split_text(data: List[Dict]) -> List[str]:
    """Cut the loaded documents into overlapping, newline-separated chunks.

    Args:
        data: The documents to split (as produced by ``load_data_from_urls``).

    Returns:
        The result of ``CharacterTextSplitter.split_documents``.
        NOTE(review): annotated as ``List[str]`` in the original; the splitter
        presumably returns ``Document`` objects -- confirm.
    """
    # Gather the splitter configuration in one place before building it.
    splitter_options = {
        "separator": '\n',
        "chunk_size": CHUNK_SIZE,
        "chunk_overlap": CHUNK_OVERLAP,
    }
    chunker = CharacterTextSplitter(**splitter_options)
    return chunker.split_documents(data)

def create_embeddings():
    """Build the embedding model used to vectorise text chunks.

    Returns:
        A ``HuggingFaceEmbeddings`` instance constructed without arguments,
        i.e. using the library's defaults.
    """
    embedding_model = HuggingFaceEmbeddings()
    return embedding_model

def create_vector_store(text_chunks: List[str], embeddings):
    """Index the given chunks in a FAISS vector store.

    Args:
        text_chunks: The chunks to index (the output of ``split_text``).
        embeddings: Embedding model used to vectorise each chunk.

    Returns:
        The store built by ``FAISS.from_documents``.
    """
    faiss_index = FAISS.from_documents(text_chunks, embeddings)
    return faiss_index

def create_llm(model_name=None):
    """Create a text-generation LLM wrapped for use with LangChain.

    Args:
        model_name: Hugging Face Hub id of a causal language model. When
            omitted, the module-level ``MODEL_NAME`` constant is used, so the
            model is configured in a single place.

    Returns:
        A ``HuggingFacePipeline`` wrapping a ``text-generation`` pipeline.
    """
    if model_name is None:
        model_name = MODEL_NAME

    model = AutoModelForCausalLM.from_pretrained(model_name)
    # Truncation is an option of the encoding call, not of tokenizer loading,
    # so it is not passed to from_pretrained().
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Create a pipeline for text generation
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=100,  # New tokens to generate
        # Sampling must be enabled for the temperature setting to be honoured;
        # under greedy decoding it is ignored.
        do_sample=True,
        temperature=0.3,
        # Return only the continuation; otherwise the whole prompt template
        # and retrieved context are echoed back as part of the "answer".
        return_full_text=False,
        # Set the padding token explicitly so generate() does not have to
        # fall back to the EOS token (and warn) on every call.
        pad_token_id=tokenizer.eos_token_id,
    )

    return HuggingFacePipeline(pipeline=pipe)


def create_qa_chain(llm, vector_store):
    """Assemble a retrieval-backed question-answering chain.

    Args:
        llm: Language model that produces the final answer.
        vector_store: Store whose ``as_retriever()`` supplies the context.

    Returns:
        The chain built by ``RetrievalQA.from_chain_type`` with the
        ``"stuff"`` chain type.
    """
    doc_retriever = vector_store.as_retriever()
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=doc_retriever,
    )
    return chain

def main():
    """Build the retrieval QA pipeline and run an interactive prompt loop.

    The pages listed in ``urls`` are loaded, split into chunks, embedded and
    indexed; a QA chain is then built on top of the index. Questions are read
    from standard input until the user types ``exit`` (case-insensitive) or
    input ends (EOF / interrupt).
    """
    urls = [
        'https://blog.gopenai.com/paper-review-llama-2-open-foundation-and-fine-tuned-chat-models-23e539522acb',
        'https://www.mosaicml.com/blog/mpt-7b',
        'https://stability.ai/blog/stability-ai-launches-the-first-of-its-stablelm-suite-of-language-models',
        'https://lmsys.org/blog/2023-03-30-vicuna/'
    ]

    print("Loading data...")
    data = load_data_from_urls(urls)
    print("Splitting text...")
    text_chunks = split_text(data)
    if not text_chunks:
        # Nothing to index (e.g. every download failed); stop with a clear
        # message instead of failing later while building the vector store.
        print("No text could be loaded from the given URLs; nothing to index.")
        return
    print("Creating embeddings...")
    embeddings = create_embeddings()
    print("Creating vector store...")
    vector_store = create_vector_store(text_chunks, embeddings)
    print("Creating LLM...")
    llm = create_llm()
    print("Creating QA chain...")
    qa_chain = create_qa_chain(llm, vector_store)

    print("\nChatbot is ready! Type 'exit' to quit.")
    while True:
        try:
            # Strip so that " exit " quits and whitespace-only input is skipped.
            query = input("\nPrompt: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or an interrupt ends the session without a traceback.
            print('\nExiting')
            break
        if not query:
            continue
        if query.lower() == 'exit':
            print('Exiting')
            break

        # invoke() replaces the deprecated direct call on the chain object.
        result = qa_chain.invoke({'query': query})
        wrapped_answer = textwrap.fill(result['result'], width=100)
        print(f"\nAnswer: {wrapped_answer}")




In [None]:
# Start the interactive chatbot only when this code runs as the top-level module.
if __name__ == "__main__":
    main()

Loading data...
Splitting text...
Creating embeddings...
Creating vector store...
Creating LLM...
Creating QA chain...

Chatbot is ready! Type 'exit' to quit.

Prompt: what do you know about llama2


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Answer: Use the following pieces of context to answer the question at the end. If you don't know the answer,
just say that you don't know, don't try to make up an answer.  Pretraining Data The authors utilized
a novel mix of data from publicly accessible sources to train the Llama 2 models, excluding any data
from Meta’s products or services. They made efforts to erase data from certain sites known for
harboring large amounts of personal information about private individuals. They trained the models
on 2 trillion tokens of data, believing this amount provided a beneficial performance-cost balance.
They also up-sampled the most factual sources to boost knowledge and reduce instances of false
information generation or “hallucinations”. Llama 2 Pretrained Model Evaluation Llama 2 models
significantly outperform their Llama 1 counterparts: The 70 billion-parameter Llama 2 model notably
improves results on the MMLU and BBH benchmarks by roughly 5 and 8 points, respectively, when
compared t

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Answer: Use the following pieces of context to answer the question at the end. If you don't know the answer,
just say that you don't know, don't try to make up an answer.  Stability AI Launches the First of
its Stable LM Suite of Language Models Product 19 Apr Written By Guest User Today, Stability AI
released a new open source language model, Stable LM. The Alpha version of the model is available in
3 billion and 7 billion parameters, with 15 billion to 65 billion parameter models to follow.
Developers can freely inspect, use, and adapt our Stable LM base models for commercial or research
purposes, subject to the terms of the CC BY-SA-4.0 license. In 2022, Stability AI drove the public
release of Stable Diffusion, a revolutionary image model representing a transparent, open, and
scalable alternative to proprietary AI. With the launch of the Stable LM suite of models, Stability
AI is continuing to make foundational AI technology accessible to all. Our Stable LM models can
generate tex