# Importing packages

In [1]:
# from huggingface_hub import notebook_login

# notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [13]:
import torch
import torch.nn as nn
import pandas as pd
import transformers
import pinecone
import langchain

from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline
from langchain.chains import ConversationChain
from langchain.schema import BaseOutputParser
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from tqdm.auto import tqdm



from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, StoppingCriteria, StoppingCriteriaList, pipeline
from peft import LoraConfig, get_peft_model
from datasets import load_dataset, Dataset


import time

In [3]:
device = f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu'

In [4]:
device

'cuda:0'

In [10]:
# model_name = 'microsoft/phi-2'
# model_name = 'bofenghuang/vigogne-2-7b-instruct'

model_name = 'mistralai/Mistral-7B-Instruct-v0.2'

# bnb_config = BitsAndBytesConfig(load_in_4bit=True,
#                                 bnb_4bit_quant_type='nf4',
#                                 bnb_4bit_use_double_quant=True,
#                                 bnb_4bit_compute_dtype=torch.bfloat16)


model = AutoModelForCausalLM.from_pretrained(model_name,
                                             device_map='auto')

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



# Fine-tuning

### Freezing the original weights

In [None]:
for param in model.parameters():
  param.requires_grad = False
  if param.ndim == 1:
    param.data = param.data.to(torch.float16)

model.gradient_checkpointing_enable()
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x):
    return super().forward(x).to(torch.float16)

model.lm_head = CastOutputToFloat(model.lm_head)

### Setting up the LoRa adapters

In [None]:
def print_trainable_parameters(model):

  trainable_params = 0
  all_param = 0
  for _, param in model.named_parameters():
    all_param += param.numel()
    if param.requires_grad:
      trainable_params += param.numel()

  print(f"Trainable params: {trainable_params} || all params: {all_param} || trainable%: {100*trainable_params/all_param}")

In [None]:
lora_config = LoraConfig(r=4,    #attention heads
                         lora_alpha=32,
                         lora_dropout=0.01,
                         bias='none',
                         task_type='CAUSAL_LM')

peft_model = get_peft_model(model, lora_config)
print_trainable_parameters(peft_model)

Trainable params: 2097152 || all params: 3502510080 || trainable%: 0.059875687781032735


### Importing data

In [None]:
qna_df = pd.read_csv('faq_qna_certideal.csv')
qna_df.reset_index(inplace=True)
dataset = Dataset.from_pandas(qna_df)

tokenized_data = dataset.map(lambda samples: tokenizer(samples['Answer']), batched=True)
tokenized_df = tokenized_data.to_pandas()

In [None]:
qna_df.isnull().sum()

In [None]:
to_be_dropped = []
for i in tokenized_df.index:
  if len(tokenized_df.loc[i, 'input_ids']) > 512:
    to_be_dropped.append(i)

train_dataset = Dataset.from_pandas(tokenized_df.drop(index=to_be_dropped).reset_index(drop=True))

In [None]:
# train_args = transformers.TrainingArguments(per_device_train_batch_size=4,
#                                             gradient_accumulation_steps=4,
#                                             max_steps=200,
#                                             fp16=True,
#                                             logging_steps=1,
#                                             output_dir='outputs')


train_args = transformers.TrainingArguments(learning_rate=2e-5,
                                            gradient_accumulation_steps=4,
                                            weight_decay=0.01,
                                            max_steps=10,
                                            push_to_hub=True,
                                            output_dir='outputs')


data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)


trainer = transformers.Trainer(model=peft_model,
                               train_dataset=tokenized_data,
                               args=train_args,
                               data_collator=data_collator)


model.config.use_cache = False
trainer.train()

# Conversation chain

### Text generation task

In [11]:
# list of stopping criteria
stop_token_ids = [tokenizer.convert_tokens_to_ids(x) for x in [['Human', ':'], ['AI', ':']]]
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]

# stopping criteria
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [14]:
generate_text = pipeline(model=model,
                         tokenizer=tokenizer,
                         return_full_text=True,
                         task='text-generation',
                         stopping_criteria=stopping_criteria,  # without this model rambles during chat
                         temperature=0.5,
                         max_new_tokens=1024,  # max number of tokens to generate in the output
                         repetition_penalty=1.1  # without this output begins repeating
                         )

# template for an instruction with no input
prompt = PromptTemplate(input_variables=["instruction"], template="{instruction}")


llm = HuggingFacePipeline(pipeline=generate_text)

In [15]:
from langchain.chains.conversation.memory import ConversationBufferWindowMemory

memory = ConversationBufferWindowMemory(memory_key="history",  # important to align
                                        k=10,
                                        #return_messages=True,
                                        return_only_outputs=True  # for conversation chain
                                        )

In [16]:
class OutputParser(BaseOutputParser):
    def parse(self, text: str) -> str:
        """Cleans output text"""
        text = text.strip()
        # remove suffix containing "Human:" or "AI:"
        stopwords = ['\nHuman:', '\nAI:']
        for word in stopwords:
            text = text.removesuffix(word)
        return text.strip()

    @property
    def _type(self) -> str:
        """Return output parser type for serialization"""
        return "output_parser"

In [17]:
parser = OutputParser()

prompt_template = \
"""
Ce qui suit est une conversation entre une IA et un humain. La conversation est en français.
Tu es un assistant pour les clients de CertiDeal. CertiDeal vends des smartphones reconditionnés.
Ci-dessous est une requête d'un utilisateur ainsi que quelques contextes qui peuvent être pertinents.
Réponds à la question en fonction des informations contenues dans ces contextes.
Si tu ne trouve pas la réponse à la question, dis "Je ne sais pas".
Si les contextes ne sont pas en relation avec la question et n'apporte aucun élément de réponse dis "Je ne sais pas".

Current conversation:
{history}
Humain: {input}
IA:"""

prompt = PromptTemplate(input_variables=["history", "input"],
                        template=prompt_template,
                        output_parser=parser)

# Chain to have a conversation and load context from memory
chat = ConversationChain(llm=llm,
                         memory=memory,
                         verbose=True,
                         prompt=prompt)

In [18]:
from langchain.embeddings import HuggingFaceEmbeddings
from datasets import load_dataset, Dataset

path = '/home/paperspace/certibot/data/certibot-data.jsonl'

raw_dataset = load_dataset('json', data_files=path, split='train')

data = raw_dataset.to_pandas()

data = data.reset_index()
data['context'] = data['topic'] + ': ' + data['article']
data['index'] = data['index'].astype((str))

dataset = Dataset.from_pandas(data)



### Initialize Vectore Store

In [20]:
import pinecone

api_key = 'a344a187-8b52-422d-97c2-96628ef67ef6'

pc = pinecone.Pinecone(api_key=api_key)
index_name = 'certibot-rag'


embed_dim = 1024
batch_size = 4
add = True

In [36]:
import time

# check if index already exists (it shouldn't if this is first time)
if index_name not in pc.list_indexes().names():
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=embed_dim,
        metric='cosine',
        spec=pinecone.PodSpec('gcp-starter')
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.00097,
 'namespaces': {'': {'vector_count': 97}},
 'total_vector_count': 97}

In [24]:
checkpoint = "intfloat/multilingual-e5-large-instruct"

model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}

embedding = HuggingFaceEmbeddings(
    model_name=checkpoint,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)






In [25]:
if add:
  progress_bar = tqdm(range(0, len(data), batch_size))

  for i in range(0, len(data), batch_size):
    end = min(len(data), i + batch_size)
    batch = data[i:end]

    ids = batch['index']
    texts = batch['context']

    embeds = embedding.embed_documents(texts)

    metadata = [{
        'context': x['context'],
        'topic': x['topic']
        } for _, x in batch.iterrows()]

    vectors = list(zip(ids, embeds, metadata))
    index.upsert(vectors=vectors)
    progress_bar.update(1)
else:
  print('No data to upsert')

  0%|          | 0/25 [00:00<?, ?it/s]

In [26]:
vectorstore = Pinecone(index, embedding.embed_query, 'text')

  warn_deprecated(


In [33]:
def augmented_prompt(query:str, vectorstore=vectorstore) -> str:
  """
  Performs similarity search on the vectore database and returns
  an augmented prompt for the Language Model
  """

  res = vectorstore.similarity_search(query, k=3)
  source = '\n'.join([x.page_content for x in res])
  aug_prompt = f"""
  En utilisant les contextes ci-dessous, répondez à la requête.
  Contextes:
  {source}
  Requête:
  {query}
  """
  return aug_prompt, res

def get_chat_response(query:str, chat=chat, rag=True) -> str:
  """
  Returns an answer to a given question or query
  """

  if chat != None:

    if rag:
      prompt = augmented_prompt(query, vectorstore)
    else:
      prompt = query

    res = chat.predict_and_parse(input=prompt)
    return res

  else:
    print('No chat')

In [34]:
_, res = augmented_prompt("bonjour, quelles sont les modalités de livraison ?")



In [35]:
print(res)

[]


In [28]:
res = get_chat_response("Parle moi de CertiDeal")
print(res)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.




[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Ce qui suit est une conversation entre une IA et un humain. La conversation est en français.
Tu es un assistant pour les clients de CertiDeal. CertiDeal vends des smartphones reconditionnés.
Ci-dessous est une requête d'un utilisateur ainsi que quelques contextes qui peuvent être pertinents.
Réponds à la question en fonction des informations contenues dans ces contextes.
Si tu ne trouve pas la réponse à la question, dis "Je ne sais pas".
Si les contextes ne sont pas en relation avec la question et n'apporte aucun élément de réponse dis "Je ne sais pas".

Current conversation:

Humain: 
  En utilisant les contextes ci-dessous, répondez à la requête.
  Contextes:
  
  Requête:
  Parle moi de CertiDeal
  
IA:[0m


KeyboardInterrupt: 