In [1]:
!pwd
!source /home/monoshi/venv/bin/activate

/home/monoshi/COMS_579_NLP


In [2]:
import os
import argparse
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Weaviate
import weaviate
from weaviate.embedded import EmbeddedOptions
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def upload_pdf(pdf_file):
    """
    Load a PDF through ``PyPDFLoader`` and return its split documents.

    :param pdf_file: Path of the PDF file handed to ``PyPDFLoader``.
    :return: Whatever ``load_and_split()`` produces, or ``None`` when loading
             raised an exception (the error is printed, not re-raised).
    """
    try:
        pages = PyPDFLoader(pdf_file).load_and_split()
    except Exception as err:
        # Best-effort load: report the problem and signal failure with None.
        print(f"Error uploading PDF: {err}")
        pages = None
    return pages

In [4]:
def chunk_text(text, chunk_size=128, chunk_overlap=0):
    """
    Split loaded documents into smaller chunks.

    :param text: Documents to split; passed to ``split_documents``.
    :param chunk_size: Chunk size given to ``RecursiveCharacterTextSplitter``.
                       Defaults to 128, the value that used to be hard-coded.
    :param chunk_overlap: Overlap between consecutive chunks given to the
                          splitter. Defaults to 0, the previous hard-coded value.
    :return: The chunked documents returned by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(text)

In [5]:
# Folder that holds the knowledge-base PDFs.
KB_DIR = "/home/monoshi/COMS_579_NLP/KB/"
# os.listdir() returns entries in arbitrary order, so sort the names to ingest
# the PDFs in the same order on every run. The extension check ignores case so
# files named "*.PDF" are picked up as well.
pdf_files = sorted(f for f in os.listdir(KB_DIR) if f.lower().endswith(".pdf"))

In [6]:
import weaviate

# Endpoint of the Weaviate instance. When the WEAVIATE_URL environment variable
# is set it overrides the default cluster below, so the notebook can be pointed
# at another instance without editing the code.
WEAVIATE_URL = os.getenv("WEAVIATE_URL", "https://project-nl7mysdi.weaviate.network")

# NOTE(review): this constructor makes the library print a notice recommending
# the v4 client; it is kept because the rest of the notebook passes this client
# object to the LangChain Weaviate vector store.
client = weaviate.Client(url=WEAVIATE_URL)

            Consider upgrading to the new and improved v4 client instead!
            See here for usage: https://weaviate.io/developers/weaviate/client-libraries/python
            


In [7]:
# specify embedding model (using huggingface sentence transformer)
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA instead of failing at model load.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
model_kwargs = {"device": embedding_device}
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs=model_kwargs,
)

In [8]:
# Collect the loaded documents of every PDF into one flat list.
all_text = []
for pdf_file in pdf_files:
    pdf_path = os.path.join("/home/monoshi/COMS_579_NLP/KB/", pdf_file)
    text = upload_pdf(pdf_path)
    if text is None:
        # upload_pdf() returns None (after printing the error) when a file
        # cannot be loaded; extending the list with None would raise TypeError.
        continue
    all_text.extend(text)
# The per-file chunk_text(text) call that used to sit in this loop was removed:
# its result was overwritten on every iteration and the whole corpus is chunked
# from all_text afterwards anyway.

In [9]:
# Chunk the whole corpus, then build the Weaviate vector store from the chunks
# using the embedding model and client created earlier.
chunked = chunk_text(all_text)

vector_db = Weaviate.from_documents(
    documents=chunked,
    embedding=embeddings,
    client=client,
    by_text=False,
)

In [10]:
# Sanity check of the retriever: fetch the three closest chunks for a sample
# question and print them.
sample_query = "What is the full form of Large Language Models?"
top_matches = vector_db.similarity_search(sample_query, k=3)
print(top_matches)

[Document(page_content='transformer,\nLarge\nLanguage\nModels(LLMs)\nhave\nnot\nonly\nbecome\nthe\nforefront\nof\ntext\ngeneration\ntasks\nbut\nhave\nstarted\nto', metadata={'page': 0, 'source': '/home/monoshi/COMS_579_NLP/KB/Introduction.pdf'}), Document(page_content='Title:\nCapabilities\nof\nLarge\nLanguage\nModels\nin\nProgram\nAnalysis\nTasks\nIntroduction:\nIn\nrecent\nyears,\nafter\nthe\nemergence\nof\nthe', metadata={'page': 0, 'source': '/home/monoshi/COMS_579_NLP/KB/Introduction.pdf'}), Document(page_content='large\nlanguage\nmodels\nand\nmodel\npredictive\ncontrol\nfor\nbuildings\noptimal\noperation,\nDec.\n2023.\ndoi:10.21203/rs.3.rs-3735947/v1', metadata={'page': 0, 'source': '/home/monoshi/COMS_579_NLP/KB/Introduction.pdf'})]


In [21]:
%pip install ipywidgets
from huggingface_hub import notebook_login
notebook_login()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting ipywidgets
  Downloading ipywidgets-8.1.2-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.4/139.4 KB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting jupyterlab-widgets~=3.0.10
  Downloading jupyterlab_widgets-3.0.10-py3-none-any.whl (215 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.0/215.0 KB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting widgetsnbextension~=4.0.10
  Downloading widgetsnbextension-4.0.10-py3-none-any.whl (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-8.1.2 jupyterlab-widgets-3.0.10 widgetsnbextension-4.0.10
Note: you may need to restart the kernel to use updated packages.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
# specify the Hugging Face model name (used for both the tokenizer and the model)
model_name = "meta-llama/Llama-2-13b-chat-hf"

# function for loading 4-bit quantized model
def load_quantized_model(model_name: str):
    """
    Load a causal language model with a 4-bit ``BitsAndBytesConfig``.

    :param model_name: Name or path of the model to be loaded.
    :return: Loaded quantized model.
    """
    compute_dtype = torch.bfloat16
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=compute_dtype,
    )
    load_options = dict(
        quantization_config=quantization,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=compute_dtype,
        # Access token is read from the HF_Token environment variable
        # (os.getenv yields None when the variable is not set).
        token=os.getenv("HF_Token"),
    )
    return AutoModelForCausalLM.from_pretrained(model_name, **load_options)

# function for initializing tokenizer
def initialize_tokenizer(model_name: str):
    """
    Build the tokenizer for ``model_name`` via ``AutoTokenizer.from_pretrained``.

    :param model_name: Name or path of the model for tokenizer initialization.
    :return: The tokenizer, with its ``bos_token_id`` overridden to 1.
    """
    tok = AutoTokenizer.from_pretrained(
        model_name,
        return_token_type_ids=True,
    )
    # Force the beginning-of-sentence token id.
    tok.bos_token_id = 1
    return tok


# The pipeline factory is reached through the `transformers` module here.
# The previous `pipeline = pipeline(...)` rebound the imported factory function
# to the pipeline object, so running this cell a second time failed.
import transformers

# initialize tokenizer
tokenizer = initialize_tokenizer(model_name)
# load model
model = load_quantized_model(model_name)
# specify stop token ids
# NOTE(review): not referenced by any cell visible here; kept in case a later
# cell relies on it.
stop_token_ids = [0]


# build the Hugging Face text-generation pipeline for the model loaded above.
# The variable keeps the name `pipeline` because the following cell passes it
# to HuggingFacePipeline(pipeline=pipeline).
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=2048,
    do_sample=True,
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    # The end-of-sequence token id is reused as the padding token id.
    pad_token_id=tokenizer.eos_token_id,
)



tokenizer_config.json: 100%|██████████| 1.62k/1.62k [00:00<00:00, 4.43MB/s]
tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 11.2MB/s]
tokenizer.json: 100%|██████████| 1.84M/1.84M [00:00<00:00, 15.0MB/s]
special_tokens_map.json: 100%|██████████| 414/414 [00:00<00:00, 1.41MB/s]
config.json: 100%|██████████| 587/587 [00:00<00:00, 2.00MB/s]
model.safetensors.index.json: 100%|██████████| 33.4k/33.4k [00:00<00:00, 30.8MB/s]
model-00001-of-00003.safetensors: 100%|██████████| 9.95G/9.95G [01:33<00:00, 106MB/s] 
model-00002-of-00003.safetensors: 100%|██████████| 9.90G/9.90G [01:32<00:00, 107MB/s]
model-00003-of-00003.safetensors: 100%|██████████| 6.18G/6.18G [00:56<00:00, 109MB/s]
Downloading shards: 100%|██████████| 3/3 [04:03<00:00, 81.25s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:16<00:00,  5.56s/it]
generation_config.json: 100%|██████████| 188/188 [00:00<00:00, 644kB/s]


In [25]:
# HuggingFacePipeline is provided by langchain_community (already used for the
# loaders, embeddings and vector store above); importing it from the top-level
# `langchain` package is deprecated.
from langchain_community.llms import HuggingFacePipeline

# Wrap the transformers pipeline so LangChain chains can call it as an LLM.
llm = HuggingFacePipeline(pipeline=pipeline)

In [26]:
from langchain.chains import RetrievalQA

# Retriever view over the vector store, used by the chain to fetch context.
kb_retriever = vector_db.as_retriever()

# Question-answering chain of type "stuff" on top of the LLM and the retriever.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=kb_retriever,
)

In [27]:
# Chain.run() is deprecated (it triggers a deprecation warning); invoke() is
# the replacement. RetrievalQA takes the question under the "query" key and
# returns a dict whose "result" entry holds the generated text, so `response`
# remains a plain string for the cells that post-process it.
question = "How is Mr. Whisker?"
qa_output = qa_chain.invoke({"query": question})
response = qa_output["result"]
print(response)

  warn_deprecated(


Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Mr .
Whisk ers
was
no
or dinar y
cat.
He
had
a
knack
for
getting
himself
int o
all
sor ts
of
adv entur es,
much
t o
the
chagrin

of
his
owner ,
Mrs.
Smith.
One
sunny
morning,
as
Mrs.
Smith
was
busy
tending
t o
her
gar den,
Mr .
Whisk ers
spotted
a
butterﬂy

Once
upon
a
time
in
a
small
village
nestled
between
r olling
hills,
ther e
liv ed
a
curious
little
cat
named
Mr .
Whisk ers.

holding
a
nut
in
its
tiny
paws.
With
a
twinkle
in
its
e y e,
the
squirr el
off er ed
the
nut
t o
Mr .
Whisk ers,
who
eagerly

Question: How is Mr. Whisker?
Helpful Answer: Mr. Whiskers is a curious little cat who loves to get into all sorts of adventures, much to the chagrin of his owner, Mrs. Smith. One sunny morning, as Mrs. Smith was busy tending to her garden, Mr. Whiskers spotted a butterfly.


In [37]:
# Break the response into lines and show only the question line and the
# answer line, dropping the prompt and the retrieved context.
response_lines = response.split("\n")

wanted_prefixes = ('Question:', 'Helpful Answer:')
for line in response_lines:
    if line.startswith(wanted_prefixes):
        print(line)


Question: How is Mr. Whisker?
Helpful Answer: Mr. Whiskers is a curious little cat who loves to get into all sorts of adventures, much to the chagrin of his owner, Mrs. Smith. One sunny morning, as Mrs. Smith was busy tending to her garden, Mr. Whiskers spotted a butterfly.


In [41]:
# Read the question file, one question per line.
# NOTE(review): the file is assumed to be UTF-8 encoded — confirm.
with open("Query/Questions.txt", encoding="utf-8") as file:
    lines = file.readlines()

# readlines() keeps each trailing newline, which made print() emit an empty
# line after every question; strip the lines and skip blank ones.
questions = [line.strip() for line in lines if line.strip()]
for question in questions:
    print(question)

Who is Mr. Whisker?

What is the cost of Tech giants?

