In [None]:
# Model ids of the embedding models this notebook can refer to.
# The default config selects EMB_SBERT_MPNET_BASE.
EMB_SBERT_MPNET_BASE = "sentence-transformers/all-mpnet-base-v2"
EMB_INSTRUCTOR_XL = "hkunlp/instructor-xl"

In [None]:
# Model ids of the candidate LLMs, grouped by family.

# FLAN-T5 checkpoints, ordered by the size suffix in their names.
LLM_FLAN_T5_SMALL = "google/flan-t5-small"
LLM_FLAN_T5_BASE = "google/flan-t5-base"
LLM_FLAN_T5_LARGE = "google/flan-t5-large"
LLM_FLAN_T5_XL = "google/flan-t5-xl"
LLM_FLAN_T5_XXL = "google/flan-t5-xxl"

# Other checkpoints.
LLM_FASTCHAT_T5_XL = "lmsys/fastchat-t5-3b-v1.0"
LLM_FALCON_SMALL = "tiiuae/falcon-7b-instruct"

In [None]:
# Run settings read by the cells below: where the vector store persists,
# whether the LLM is loaded in 8-bit, and which embedding model / LLM to use.
config = dict(
    persist_directory=None,
    load_in_8bit=False,
    embedding=EMB_SBERT_MPNET_BASE,
    llm=LLM_FLAN_T5_BASE,
)

In [None]:
#!pip install langchain


In [None]:
# !pip install sentence-transformers

In [None]:
# !pip install transformers


In [None]:
# !pip install accelerate

In [None]:
import torch
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from transformers import AutoTokenizer, pipeline
from accelerate import Accelerator


def create_sbert_mpnet():
    """Build the SBERT MPNet embedding model wrapped for LangChain.

    Returns:
        A ``HuggingFaceEmbeddings`` for ``EMB_SBERT_MPNET_BASE``, placed on
        ``"cuda"`` when ``torch.cuda.is_available()`` is true and on ``"cpu"``
        otherwise.
    """
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    embedding_kwargs = {"device": target_device}
    return HuggingFaceEmbeddings(
        model_name=EMB_SBERT_MPNET_BASE,
        model_kwargs=embedding_kwargs,
    )

def create_flan_t5_base(load_in_8bit=False):
    """Create a ``text2text-generation`` pipeline for FLAN-T5 base.

    The returned Hugging Face pipeline is meant to be wrapped for use with
    LangChain.

    Args:
        load_in_8bit: Forwarded to the pipeline inside ``model_kwargs`` as
            ``load_in_8bit``.

    Returns:
        The ``transformers`` pipeline, created with ``max_new_tokens=100``.
    """
    # Use the shared constant instead of repeating the literal, so this factory
    # cannot drift from the id that ``config["llm"]`` is compared against.
    model = LLM_FLAN_T5_BASE
    tokenizer = AutoTokenizer.from_pretrained(model)
    return pipeline(
        task="text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=100,
        # NOTE(review): "max_length" and "temperature" are generation settings
        # but are passed as model-loading kwargs here; presumably they end up
        # on the model config -- confirm against the installed transformers.
        model_kwargs={
            "device_map": "auto",
            "load_in_8bit": load_in_8bit,
            "max_length": 512,
            "temperature": 0.,
        },
    )

# Build the embedding model and the LLM selected in `config`.
# An unsupported choice fails here with a clear message instead of leaving
# `embedding` / `llm` undefined (or stale from an earlier run) for later cells.
if config["embedding"] == EMB_SBERT_MPNET_BASE:
    embedding = create_sbert_mpnet()
else:
    raise ValueError(
        f"No embedding factory is wired up for config['embedding']={config['embedding']!r}"
    )

load_in_8bit = config["load_in_8bit"]

if config["llm"] == LLM_FLAN_T5_BASE:
    llm = create_flan_t5_base(load_in_8bit=load_in_8bit)
else:
    raise ValueError(
        f"No LLM factory is wired up for config['llm']={config['llm']!r}"
    )


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def create_falcon_instruct_small(load_in_8bit=False):
    """Create a ``text-generation`` pipeline for Falcon-7B-Instruct.

    Args:
        load_in_8bit: Forwarded to the pipeline inside ``model_kwargs`` as
            ``load_in_8bit``.

    Returns:
        The ``transformers`` pipeline, created with ``max_new_tokens=100`` and
        ``torch.bfloat16`` as the requested dtype.
    """
    # Use the shared constant instead of repeating the literal model id.
    model = LLM_FALCON_SMALL

    tokenizer = AutoTokenizer.from_pretrained(model)
    hf_pipeline = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        # Permits running model code shipped with the checkpoint; only
        # appropriate for repositories you trust.
        trust_remote_code=True,
        max_new_tokens=100,
        # NOTE(review): "max_length" and "temperature" are generation settings
        # but are passed as model-loading kwargs here; presumably they end up
        # on the model config -- confirm against the installed transformers.
        model_kwargs={
            "device_map": "auto",
            "load_in_8bit": load_in_8bit,
            "max_length": 512,
            "temperature": 0.01,
            "torch_dtype": torch.bfloat16,
        },
    )
    return hf_pipeline

In [None]:
# !pip install pdfplumber

In [None]:
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig

In [None]:
import pdfplumber

pdf_path = "/content/first_pdf.pdf"
with pdfplumber.open(pdf_path) as pdf:
    # One string per page. `or ""` keeps every entry a str in case
    # `extract_text()` returns None for a page with no extractable text,
    # so later len()/slicing on the page text cannot fail.
    documents = [page.extract_text() or "" for page in pdf.pages]

# Split each page's text into fixed-size character windows.
chunk_size = 100
chunk_overlap = 0

# Consecutive windows start `chunk_size - chunk_overlap` characters apart.
# A non-positive step would never advance (the original while-loop would spin
# forever), so reject such settings up front.
chunk_step = chunk_size - chunk_overlap
if chunk_size <= 0 or chunk_step <= 0:
    raise ValueError("chunk_size must be positive and greater than chunk_overlap")

texts = []
for document in documents:
    if not document:
        # Empty page text (or None): nothing to chunk.
        continue
    texts.extend(
        document[start:start + chunk_size]
        for start in range(0, len(document), chunk_step)
    )



In [None]:
# !pip install chromadb

In [None]:
from langchain.vectorstores.chroma import Chroma

# Location handed to Chroma for persisting the index, read from the notebook
# config. Assumes 'embedding' has already been created by an earlier cell.
persist_directory = config["persist_directory"]

# Minimal document container exposing the 'page_content' and 'metadata'
# attributes that are handed to the vector store.
class Document:
    """Lightweight holder for one piece of text and its metadata.

    Args:
        page_content: The text of the document.
        metadata: Optional metadata mapping. When omitted (or ``None``) a new
            empty dict is stored, so ``metadata`` is always a mapping and is
            never shared between instances.
    """

    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        # Normalise a missing value to a fresh dict rather than storing None.
        self.metadata = {} if metadata is None else metadata

# 'texts' holds the fixed-size chunks cut from the page text. Wrap each chunk
# in a Document carrying placeholder metadata.
# A dedicated name is used so the per-page string list `documents` is not
# overwritten, which keeps the chunking cell safe to re-run.
chunk_documents = [
    Document(page_content=text, metadata={'some_key': 'some_value'})
    for text in texts
]

# Embed the chunk documents and index them in a Chroma vector store.
vectordb = Chroma.from_documents(
    documents=chunk_documents,
    embedding=embedding,
    persist_directory=persist_directory,
)


In [None]:
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate


# Wrap the transformers pipeline for LangChain, then build a "stuff"
# RetrievalQA chain whose retriever is configured with k=4.
hf_llm = HuggingFacePipeline(pipeline=llm)
retriever = vectordb.as_retriever(search_kwargs={"k": 4})
qa = RetrievalQA.from_chain_type(
    llm=hf_llm,
    chain_type="stuff",
    retriever=retriever,
)

# For the small/base/large FLAN-T5 models, replace the chain's prompt with a
# plain context/question/answer template.
if config["llm"] in (LLM_FLAN_T5_SMALL, LLM_FLAN_T5_BASE, LLM_FLAN_T5_LARGE):
    question_t5_template = """
    context: {context}
    question: {question}
    answer:
    """
    QUESTION_T5_PROMPT = PromptTemplate(
        input_variables=["context", "question"],
        template=question_t5_template,
    )
    qa.combine_documents_chain.llm_chain.prompt = QUESTION_T5_PROMPT

In [None]:

import warnings

# Silence every warning from here on.
warnings.filterwarnings('ignore')

# Turn on the combine-documents chain's verbose trace and ask the chain to
# return the retrieved source documents alongside the answer.
qa.combine_documents_chain.verbose = True
qa.return_source_documents = True

# Run one query; the resulting dict is the cell's displayed value.
question = ": I cry"
qa({"query": question})



[1m> Entering new StuffDocumentsChain chain...[0m

[1m> Finished chain.[0m


{'query': ': I cry',
 'result': 'THERAPIST: I cry',
 'source_documents': [Document(page_content="CLIENT 3: Uh well my sister and I don't get a long at all We just sit\nthere and fight and throw insu", metadata={'some_key': 'some_value'}),
  Document(page_content='n\neverybody else\nCLIENT 1: laughs', metadata={'some_key': 'some_value'}),
  Document(page_content='CLIENT 2: laughs', metadata={'some_key': 'some_value'}),
  Document(page_content='le 00 23 12\nTHERAPIST: Why why is there so much uhm bitterness between you and her\nCLIENT 3: I think', metadata={'some_key': 'some_value'})]}

In [None]:
import langchain

# Debugging aid: print the names exposed by the top-level `langchain` package.
print(dir(langchain))




In [None]:
import langchain

# Debugging aid: print the names exposed by the top-level `langchain` package.
# NOTE(review): this cell repeats the inspection done in the preceding cell.
print(dir(langchain))


In [None]:
# This code finds the path of the Hugging Face embeddings class
# import os

# langchain_path = '/usr/local/lib/python3.10/dist-packages/langchain'
# target_class_name = 'HuggingFaceEmbeddings'

# for root, dirs, files in os.walk(langchain_path):
#     for file in files:
#         if file.endswith(".py"):
#             file_path = os.path.join(root, file)
#             with open(file_path, 'r', encoding='utf-8') as f:
#                 contents = f.read()
#                 if target_class_name in contents:
#                     relative_path = file_path.replace(langchain_path, 'langchain')
#                     print(f'Found in: {relative_path}')


In [None]:
# !pip install --upgrade langchain