In [1]:
import joblib
import torch
from torch import bfloat16
import transformers
from transformers import StoppingCriteria, StoppingCriteriaList
from langchain.llms import HuggingFacePipeline
import pandas as pd
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

import os
from dotenv import load_dotenv
# Read variables from a local .env file into the process environment,
# then pick up the Hugging Face personal access token (None when it is unset).
load_dotenv()
hf_pat = os.environ.get('HUGGING_FACE_PAT')


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# GPU setup
# `device` is reused by later cells (stop-token tensors, the status print).
# NOTE(review): the device is hard-coded to CUDA and this cell performs no
# torch.cuda.is_available() check — confirm a GPU is present before running.
device = torch.device("cuda")
device

device(type='cuda')

In [3]:
# Checkpoint identifier passed to every from_pretrained call in this notebook
model_id = 'Llama-2-13b-chat-hf'

# Quantization settings: load weights in 4-bit NF4 with double quantization
# and run the quantized matmuls in bfloat16 to reduce GPU memory usage.
bnb_config = transformers.BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    load_in_4bit=True,
)

In [4]:
# Hugging Face access token, read from the environment in the first cell
# (None when HUGGING_FACE_PAT is not set).
hf_auth = hf_pat

# Load the configuration for the pre-trained model.
# `token` is the current name of the authentication argument; the older
# `use_auth_token` spelling is deprecated in recent transformers releases.
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    token=hf_auth
)



In [5]:
# Load the model for causal language modeling.
#
# - `trust_remote_code` is intentionally NOT enabled: the Llama architecture
#   ships with transformers, so there is no need to allow code bundled with a
#   checkpoint to execute on this machine.
# - `token` replaces the deprecated `use_auth_token` argument.
# - `device_map='auto'` lets the library decide where the weights are placed.
# NOTE(review): torch_dtype is float32 while bnb_config computes in bfloat16;
# presumably the non-quantized modules are therefore kept in float32 — confirm
# this is intended, since it costs extra memory.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=bnb_config,
    #offload_folder="offload",
    #offload_state_dict = True,
    device_map='auto',
    low_cpu_mem_usage=True,
    torch_dtype=torch.float32,
    token=hf_auth
)

Loading checkpoint shards: 100%|██████████| 3/3 [00:11<00:00,  3.71s/it]


In [6]:
# Set the model in evaluation mode for inference
model.eval()

# Print the value of the notebook-level `device` variable.
# NOTE(review): this reports `device` as defined in the GPU setup cell, not the
# placement chosen by device_map='auto' when the model was loaded — confirm
# against the model's own device information if exact placement matters.
print(f"Model loaded on {device}")

Model loaded on cuda


In [7]:
# Tokenizer matching the checkpoint. `token` replaces the deprecated
# `use_auth_token` argument.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    token=hf_auth
)

# Text sequences that should end a generation
stop_list = ['\nHuman:', '\n```\n']

# Encode every stop sequence with the tokenizer's default settings and move the
# ids onto `device` as LongTensors.
# NOTE(review): with default settings the tokenizer prepends start-of-string
# tokens to each encoded sequence (presumably BOS plus a prefix marker), so an
# exact token match against the tail of a generation may never succeed —
# verify against the ids displayed below.
stop_token_ids = [
    torch.LongTensor(tokenizer(stop_text)['input_ids']).to(device)
    for stop_text in stop_list
]

stop_token_ids



[tensor([    1, 29871,    13, 29950,  7889, 29901], device='cuda:0'),
 tensor([    1, 29871,    13, 28956,    13], device='cuda:0')]

In [8]:
class StopOnTokens(StoppingCriteria):
    """Stop generation once the sequence generated so far ends with a stop sequence.

    Reads three notebook globals: ``stop_token_ids`` (encoded stop sequences),
    ``stop_list`` (the same sequences as plain text) and ``tokenizer``.

    Two checks are made on the first sequence of the batch:

    1. an exact token-id match against each entry of ``stop_token_ids``;
    2. a text match: the last few tokens are decoded and compared with the
       strings in ``stop_list``. The ids displayed when ``stop_token_ids`` was
       built start with tokens the tokenizer prepends at the beginning of a
       string (presumably BOS and a prefix marker), which do not occur in the
       middle of a generation, so the exact-id check alone is unlikely to fire.
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when generation should stop, False otherwise."""
        sequence = input_ids[0]  # only the first sequence of the batch is inspected
        seq_len = sequence.shape[-1]

        # 1) Exact token-id match.
        for stop_ids in stop_token_ids:
            n_stop = len(stop_ids)
            # Comparing tensors of different lengths would raise, and an empty
            # stop sequence can never be a meaningful match.
            if n_stop == 0 or seq_len < n_stop:
                continue
            # Align devices: the ids being generated may not live on the device
            # the stop tensors were moved to.
            if bool(torch.eq(sequence[-n_stop:], stop_ids.to(sequence.device)).all()):
                return True

        # 2) Text match on the decoded tail. The longest encoded stop sequence
        # is used as the window size (assumed to be at least as many tokens as
        # any stop string needs in context — TODO confirm for other tokenizers).
        window = max((len(stop_ids) for stop_ids in stop_token_ids), default=0)
        if window == 0:
            return False
        tail_text = tokenizer.decode(sequence[-window:].tolist())
        return any(stop_text and tail_text.endswith(stop_text) for stop_text in stop_list)

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [9]:
# Text-generation pipeline around the quantized model and its tokenizer
generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    # langchain expects the prompt to be part of the returned text
    return_full_text=True,
    # generation settings are passed alongside the pipeline arguments
    max_new_tokens=512,  # upper bound on the number of tokens generated per call
    temperature=0.0001,  # near-zero: as little sampling randomness as possible
    repetition_penalty=1.1,  # without this output begins repeating
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
)

# Expose the pipeline to LangChain as an LLM
llm = HuggingFacePipeline(pipeline=generate_text)

  warn_deprecated(


In [10]:
# Location of the conversation CSV. It can be overridden by setting the
# CONVERSATION_CSV_PATH environment variable (e.g. in the .env file loaded in
# the first cell); when unset, the original local path is used unchanged.
conversation_csv_path = os.getenv(
    "CONVERSATION_CSV_PATH",
    "C:\\Users\\Paul\\Documents\\Masters_Program\\AAI_590_Capstone\\AAI-590-Captstone\\ChatBot\\Conversation.csv",
)

# Read the comma-delimited, UTF-8 encoded CSV into LangChain documents
loader = CSVLoader(file_path=conversation_csv_path,
                   encoding="utf-8", csv_args={'delimiter': ','})
data = loader.load()

# loader = CSVLoader(file_path="test.csv",
#                    encoding="utf-8", csv_args={'delimiter': ','})
# data = loader.load()

In [11]:
# Sentence-embedding model, run on the GPU
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = dict(device="cuda")
embeddings = HuggingFaceEmbeddings(
    model_kwargs=model_kwargs,
    model_name=model_name,
)

# Embed the loaded documents and index them in a FAISS vector store
vectorstore = FAISS.from_documents(data, embeddings)

  warn_deprecated(


In [12]:
# Retrieval chain: answers questions with `llm` over documents fetched from the
# vector store, and returns the source documents along with the answer.
chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    return_source_documents=True,
)

In [13]:
def print_results(query, chat_history):
    """Run ``query`` through the retrieval chain and print the extracted answer.

    The chain's ``answer`` text is searched for the first "Helpful Answer:"
    marker; everything after it, stripped of surrounding whitespace, is
    printed. Stripping makes it irrelevant whether the marker is followed by a
    space, a newline or nothing, so an answer that starts on a new line is
    still found.

    Args:
        query: Question text sent to the chain under the "question" key.
        chat_history: Passed to the chain under the "chat_history" key. This
            function does not modify it.

    Returns:
        None. The result is printed: either the extracted answer or a
        "not found" message when the marker is absent.
    """
    # `invoke` is the non-deprecated way to run a LangChain chain.
    result = chain.invoke({"question": query, "chat_history": chat_history})

    # Split on the first occurrence of the marker; `found` is empty when the
    # marker does not appear in the answer text.
    _, found, answer_text = result['answer'].partition("Helpful Answer:")

    if found:
        print(answer_text.strip())
    else:
        print("Helpful Answer not found in the output.")

In [14]:
# Conversation history handed to the chain; it starts out empty.
# NOTE(review): print_results only passes this list along and never appends to
# it, so it stays empty for the later queries as well — confirm that is intended.
chat_history = []

# First question for the chatbot
query = "Where is paris located?"
print_results(query,chat_history)


  warn_deprecated(
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Paris is located in France.


In [15]:
# Second question; reuses the `chat_history` list created in the previous cell
query = "What is the solar system?"
print_results(query,chat_history)

The solar system consists of eight planets and other celestial bodies orbiting around the sun.


In [17]:
# Third question; reuses the same `chat_history` list.
# NOTE(review): "fariest" looks like a typo for "fairest"; it is left untouched
# here because the string is sent to the chain exactly as written.
query = "Who is the fariest of them all?"
print_results(query,chat_history)

Based on the answers provided, it seems that the person who is the fairest of them all is the one who makes up all these jokes.
