#### Create langchain docs from folder path

In [None]:
from langchain.document_loaders import DirectoryLoader

# Folder handed to DirectoryLoader below.
directory = '/app/dir_path'


def load_docs(directory):
    """Return the documents that ``DirectoryLoader`` loads from ``directory``."""
    return DirectoryLoader(directory).load()


documents = load_docs(directory)
len(documents)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_docs(documents, chunk_size=1024, chunk_overlap=30):
    """Split ``documents`` into chunks with RecursiveCharacterTextSplitter.

    ``chunk_size`` and ``chunk_overlap`` are forwarded unchanged to the
    splitter; the list returned by ``split_documents`` is returned as-is.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)


docs = split_docs(documents)
len(docs)

#### Create custom langchain docs with metadata

In [None]:
from langchain.docstore.document import Document

# Question -> answer pairs; the question becomes the document text and the
# answer is stored in the document metadata.
d = {
    'how are you?': 'I am fine',
    'what is your name?': 'My name is smith',
}

docs = [
    Document(page_content=question, metadata={"answer": answer})
    for question, answer in d.items()
]

len(docs)

#### Create retriever using chromadb
* For more information visit [chroma langchain](https://python.langchain.com/docs/integrations/vectorstores/chroma)

In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# NOTE(review): 'xxx' looks like a placeholder — supply a real OpenAI API key.
openai_api_key = 'xxx'
openai_embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)
# Build a Chroma vector store from `docs` using the OpenAI embedding object.
vectorstore = Chroma.from_documents(documents=docs, embedding=openai_embedding)
# Retriever configured with k=50 as its search keyword argument.
retriever = vectorstore.as_retriever(search_kwargs={"k": 50})
# Alternative retriever configurations (uncomment one to use it instead):
# retriever = vectorstore.as_retriever(search_type="similarity_score_threshold",search_kwargs={"score_threshold": .5})
# retriever = vectorstore.as_retriever(search_type="mmr")

#### Create retriever using FAISS
* For more information visit [FAISS langchain](https://python.langchain.com/docs/integrations/vectorstores/faiss)

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

# NOTE(review): 'xxx' looks like a placeholder — supply a real OpenAI API key.
openai_api_key = 'xxx'
openai_embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)
# Build a FAISS vector store from `docs` using the OpenAI embedding object.
vectorstore = FAISS.from_documents(documents=docs, embedding=openai_embedding)
# Retriever configured with k=50 as its search keyword argument.
retriever = vectorstore.as_retriever(search_kwargs={"k": 50})
# Alternative retriever configurations (uncomment one to use it instead):
# retriever = vectorstore.as_retriever(search_type="similarity_score_threshold",search_kwargs={"score_threshold": .5})
# retriever = vectorstore.as_retriever(search_type="mmr")

In [None]:
# Save the vector store to the local folder 'faiss_index'.
vectorstore.save_local('faiss_index')

# Load the vector store back from the same folder. The embedding object is
# passed again to `load_local`.
# NOTE(review): LangChain's FAISS persistence stores the docstore as a pickle;
# only load index folders from a trusted source — confirm against the docs.
openai_api_key = 'xxx'
openai_embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)
vectorstore = FAISS.load_local("faiss_index", openai_embedding)

##### Faiss vectorstore to dataframe

In [None]:
import pandas as pd

def get_vectorstore_df(vectorstore):
    """Flatten the vector store's docstore into a pandas DataFrame.

    Every entry of ``vectorstore.docstore._dict`` becomes one row holding
    ``chunk_id`` (the dict key), ``content`` (the stripped ``page_content``)
    and one column per metadata key. Metadata is merged last, so a metadata
    key named ``chunk_id`` or ``content`` overrides the built-in value.
    """
    records = [
        {
            "chunk_id": doc_id,
            "content": document.page_content.strip(),
            **document.metadata,
        }
        for doc_id, document in vectorstore.docstore._dict.items()
    ]
    return pd.DataFrame(records)

##### Faiss vectorstore delete records

In [None]:
# Chunk ids can be read from the `chunk_id` column of the DataFrame built by
# get_vectorstore_df. NOTE(review): the ids below look like placeholders —
# replace them with real ids before running.
chunk_id_list = ['abcd','xxxx']
vectorstore.delete(ids=chunk_id_list)

##### Faiss vectorstore add new record docs

In [None]:
from langchain.docstore.document import Document

# Question -> answer pairs to append to the existing vector store; the
# question becomes the document text and the answer goes into the metadata.
d = {
    'how are you?': 'I am fine',
    'what is your name?': 'My name is smith',
}

docs = [
    Document(page_content=question, metadata={"answer": answer})
    for question, answer in d.items()
]

vectorstore.add_documents(docs)

In [None]:
# After adding or deleting records, save the vector store again so the
# 'faiss_index' folder reflects the changes.
vectorstore.save_local('faiss_index')

#### Finding top k similar docs

In [None]:
# Ask the current `retriever` for the documents it considers relevant to the
# query text.
query = 'how many awards did messi won?'
similar_docs = retriever.get_relevant_documents(query)
# Bare expression: displays the result as the notebook cell output.
similar_docs

#### Using retrievers as chatbot

In [None]:
from langchain.vectorstores import FAISS

# Rebuild the FAISS vector store and retriever from the current `docs`.
# `OpenAIEmbeddings` is not imported in this cell; it relies on the import
# made in an earlier cell.
# NOTE(review): 'xxx' looks like a placeholder — supply a real OpenAI API key.
openai_api_key = 'xxx'
openai_embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)
vectorstore = FAISS.from_documents(documents=docs, embedding=openai_embedding)
retriever = vectorstore.as_retriever(search_kwargs={"k": 50})

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationSummaryMemory

# NOTE(review): 'xxx' looks like a placeholder — supply a real OpenAI API key.
openai_api_key = 'xxx'
model_name = 'gpt-3.5-turbo-16k'
# check all available models at https://platform.openai.com/docs/models
# check pricing at https://openai.com/pricing
# Chat model with temperature 0.0.
llm = ChatOpenAI(model_name=model_name,temperature=0.0, openai_api_key=openai_api_key)
# Memory object built on the same LLM; it is registered under the
# "chat_history" key and configured with return_messages=True.
memory = ConversationSummaryMemory(llm=llm,memory_key="chat_history",return_messages=True)
# Conversational retrieval chain combining the LLM, the current `retriever`
# and the memory; verbose=True is passed through to the chain.
qa = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory,verbose=True)

# Run the chain on a single question.
qa.run({'question':'tell me about goglocal?'})

#### Custom langchain response generation using retrievers

In [None]:
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

# NOTE(review): 'xxx' looks like a placeholder — supply a real OpenAI API key.
openai_api_key = 'xxx'
model_name = 'gpt-3.5-turbo-16k'
# check all available models at https://platform.openai.com/docs/models
# check pricing at https://openai.com/pricing
# Chat model used at the end of the custom chain; temperature is 0.0.
chat_model = ChatOpenAI(temperature=0.0, model_name=model_name, openai_api_key=openai_api_key)

In [None]:
# Fields the model is asked to return: an email subject and an email body.
response_schemas = [
    ResponseSchema(
        name="email_subject",
        description="subject of email based on context",
    ),
    ResponseSchema(
        name="email_body",
        description="body of email based on context",
    ),
]

# Parser built from the schemas, plus the format instructions it generates
# for inclusion in the prompt.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

In [None]:
# System message text, assembled sentence by sentence. The fourth and fifth
# sentences are joined without a separating space ("email.It"), exactly as in
# the prompt text this was written from.
system_prompt = "".join(
    [
        "You are a marketing specialist at FoodForGood. ",
        "Your responsibility is to respond to the sender's email using the company context provided below. ",
        "Ensure that your reply is professional and incorporates the specified end template for concluding the email response. ",
        "The sender's email, company context, and end template are provided, so be sure to utilize them to craft a professional subject and body for the email.",
        "It is crucial to precisely respond to each inquiry in the email by leveraging the company context provided below.\n",
    ]
)

In [None]:
# Human message template: four sections (sender's email, end template,
# company context, format instructions) separated by a 60-character '#' rule.
user_query = ("#" * 60).join(
    [
        '\n\nSender\'s Email:"""\n{input_email}\n"""\n',
        "\n\nEnd Template:\n{end_template}\n\n",
        "\n\nCompany Context:\n{context}\n\n",
        "\n\n{format_instructions}\n",
    ]
)

In [None]:
# Variables supplied at invoke time; `format_instructions` is pre-filled
# through `partial_variables` instead.
input_variables = ["input_email", "context", "end_template"]

prompt = ChatPromptTemplate(
    messages=[
        SystemMessagePromptTemplate.from_template(system_prompt),
        HumanMessagePromptTemplate.from_template(user_query),
    ],
    input_variables=input_variables,
    partial_variables={"format_instructions": format_instructions},
)

In [None]:
# pip install langchain==0.0.345
from langchain_core.runnables import (
    RunnableParallel,
    RunnablePassthrough,
    RunnableLambda
)

from operator import itemgetter

def extract_docs(input_email):
    """Return retrieved documents formatted as ``Q.<question>\\nA.<answer>`` strings.

    Queries the module-level ``retriever`` with ``input_email``. Each returned
    document contributes its stripped ``page_content`` as the question and its
    stripped ``metadata['answer']`` as the answer, so every document must carry
    an ``answer`` metadata entry.
    """
    formatted = []
    for match in retriever.get_relevant_documents(input_email):
        question = match.page_content.strip()
        answer = match.metadata['answer'].strip()
        formatted.append(f'Q.{question}\nA.{answer}')
    # formatted = [f'Q.What vendors are in close proximity, and what menu choices do they offer?\nA.{s}'] + formatted
    return formatted

# Wrap extract_docs so it can be composed with other runnables via `|`.
get_docs = RunnableLambda(extract_docs)

# Builds the prompt inputs from the invoke dict: "context" is produced by
# passing the "input_email" value through get_docs, while "input_email" and
# "end_template" are picked from the input by key.
retrieval = RunnableParallel(
    {
        "context": itemgetter("input_email") | get_docs, 
        "input_email": itemgetter("input_email"),
        "end_template": itemgetter("end_template")
    }
)

In [None]:
# Full pipeline: retrieval step -> prompt -> chat model -> output parser.
chain = retrieval | prompt | chat_model | output_parser

In [None]:
# Sample incoming email used to exercise the chain.
full_email = (
    "Hello I’m from Anna McCrea Public School and i want to know which are "
    "nearby vendors and what menu options you provide?"
)

# Closing lines for the generated reply.
end_template = "\n".join(
    [
        "Warmest regards,",
        "Customer Service Team",
        "support@foodforgood.ca",
    ]
)

# Input for the chain; the keys match the names read by `retrieval`.
d = dict(input_email=full_email, end_template=end_template)

d

In [None]:
# Run the whole chain on the sample input; the result is indexed below by the
# field names declared in the response schemas.
parse_response = chain.invoke(d)

print(parse_response['email_subject'])
print(parse_response['email_body'])

In [None]:
# Check the prompt: run only the retrieval and prompt steps (no model call)
# and print the first two messages of the result.
model_input_prompt = (retrieval | prompt).invoke(d)
print(model_input_prompt.messages[0].content)
print(model_input_prompt.messages[1].content)

#### Calculate token length for gpt

In [None]:
#pip install --upgrade tiktoken
#pip install --upgrade openai

import tiktoken
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens ``string`` encodes to under the tiktoken encoding ``encoding_name``."""
    return len(tiktoken.get_encoding(encoding_name).encode(string))

In [None]:
# Example for OpenAI models: count tokens using the "cl100k_base" encoding.
num_tokens_from_string("tiktoken is great!", "cl100k_base")

#### Calculate token length for huggingface models

In [None]:
def huggingface_model_token_len(tokenizer, prompt):
    """Tokenize ``prompt`` and report how many input ids it produced.

    The prompt is tokenized as a one-element batch with ``return_tensors="pt"``
    and the result is moved to the module-level ``device``, which must be
    defined before this function is called.

    Returns a ``(model_inputs, token_count)`` tuple, where ``token_count`` is
    the length of the first row of ``input_ids``.
    """
    encoded = tokenizer([prompt], return_tensors="pt").to(device)
    token_count = len(encoded["input_ids"][0])
    return encoded, token_count

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
# Device name read by huggingface_model_token_len and used for the model below.
device = "cuda"

# Load the model and its tokenizer by the same checkpoint name.
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

prompt = "My favourite condiment is"
# Tokenize the prompt and get its length in tokens.
model_inputs, token_len = huggingface_model_token_len(tokenizer,prompt)

print(token_len)

# huggingface_model_token_len already calls .to(device) on the inputs; this
# call repeats it and its return value is not used.
model_inputs.to(device)
model.to(device)

# Sample up to 100 new tokens, then decode the first sequence of the batch.
generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
response = tokenizer.batch_decode(generated_ids)[0]
# Bare expression: displays the decoded text as the notebook cell output.
response

#### Pinecone

In [None]:
import pinecone
from langchain.embeddings import OpenAIEmbeddings
from tqdm.auto import tqdm
import uuid
import pandas as pd
from pathlib import Path
import os
from dotenv import load_dotenv
load_dotenv('.env')

# Credentials and the target index name come from environment variables
# (load_dotenv('.env') is called just above this block).
openai_api_key = os.environ.get("OPENAI_API_KEY")
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pinecone_region = os.environ.get("PINECONE_REGION")
pinecone_vector_db_name = os.environ.get("PINECONE_VECTOR_DB")

openai_embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)
pinecone.init(api_key=pinecone_api_key, environment=pinecone_region)
index = pinecone.Index(pinecone_vector_db_name)

# Source rows; the loop reads the columns descp, hs_code, parent and country.
df = pd.read_csv('file.csv')

# Pending records waiting to be sent; flushed every 50 rows.
vector = []

for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    # Embed the description text of this row.
    query_vec = openai_embedding.embed_query(row.descp)
    # Each record gets a fresh random id, so re-running this cell adds new
    # records rather than overwriting earlier ones.
    vector.append({'id':uuid.uuid4().hex,
          'values':query_vec,
          'metadata':{'hs_code': row.hs_code,'parent':row.parent,'country':row.country,'descp':row.descp}})
    if len(vector) == 50:
        index.upsert(vectors=vector)
        vector = []

# Flush the final partial batch. Skip the call when nothing is pending (empty
# CSV, or a row count that is an exact multiple of 50) instead of sending an
# empty upsert request.
if vector:
    index.upsert(vectors=vector)

In [None]:
def get_sim_hsn(query, hsn, country, k=3):
    """Query the Pinecone index for the ``k`` closest matches to ``query``.

    ``query`` is embedded with the module-level ``openai_embedding`` and sent
    to the module-level ``index`` with a metadata filter on ``country`` and on
    ``parent`` (set to ``hsn``). Metadata is included in the response, which
    is returned unchanged.
    """
    metadata_filter = {'country': country, 'parent': hsn}
    return index.query(
        vector=openai_embedding.embed_query(query),
        top_k=k,
        include_metadata=True,
        filter=metadata_filter,
    )