# Modeling 진행

- Private AI Search with LangChain and Elasticsearch 참조
- 원본 그대로 되는지부터 확인

## Using Langchain to generate vectors and store in Elasticsearch

- DPR (https://ll2ll.tistory.com/90)
    - title + [SEP] + passage
    - 문서를 100개 단어로 이뤄진 block으로 나누고, 이를 passage라 한다
    

In [1]:
import os
from dotenv import load_dotenv

load_dotenv(verbose=True)
ES_CLOUD_ID = os.getenv("ES_CLOUD_ID")
ES_USER = os.getenv("ES_USER")
ES_PASSWORD = os.getenv("ES_PASSWORD")
ES_API_KEY = os.getenv("ES_API_KEY")

### Langchain (*)

- sentence-transformer는 우선 예제에서 쓰던 그대로
- 예제에서는 json -> pickle 형태로 저장 // df로는 어떻게 반환?
- retrieval, bm25 기반의 것들도 함께 사용해보기
- retrieval 개수 늘리기

In [2]:
from langchain.embeddings import HuggingFaceEmbeddings


def setup_embeddings():
    # Huggingface embedding setup
    print(">> Prep. Huggingface embedding setup")
    model_name = "sentence-transformers/all-mpnet-base-v2"
    return HuggingFaceEmbeddings(model_name=model_name)

hf = setup_embeddings()

>> Prep. Huggingface embedding setup


  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


In [3]:
# ElasticSearch vectorstore in langchain style
from langchain_elasticsearch import ElasticsearchStore

index_name = "hana_bank_eng_data"

db = ElasticsearchStore(
    es_cloud_id=ES_CLOUD_ID,
    es_user=ES_USER,
    es_password=ES_PASSWORD,
    es_api_key=ES_API_KEY,
    index_name=index_name,
    embedding=hf
)

### Data chunking

In [None]:
import pandas as pd
from tqdm import tqdm

data = pd.read_csv('./data/hana_preprocessed.csv')
data['text'] = ''
data.head()

In [None]:
data.shape

In [None]:
# batchtext 형태로 만들어주기
# 그냥 리스트에 개행 문자만 붙여주고 다 때려 넣기
# title + ' ' + passage
# passage는 100 words 단위로 chunking

import copy

N = 100

remove_idx = []
cnt = 0

# content에 title도 포함되어 있음
for idx in tqdm(range(len(data))):
    content_words = data.loc[idx, 'contents'].split(' ')
    title_words = data.loc[idx, 'title'].split(' ')

    # 원본 row 삭제 후 100 단어씩 청킹한거 넣기
    if len(content_words) + len(title_words) > N:
        remove_idx.append(idx)
        chunks = [content_words[i:i+N-len(title_words)] for i in range(0, len(content_words), N-len(title_words))]
        cnt += len(chunks)

        for chunk in chunks:
            tmp = copy.deepcopy(data.loc[idx]) # 행 복사
            tmp['text'] = data.loc[idx, 'title'] + ' ' + ' '.join(chunk)
            data = pd.concat([data, tmp.to_frame().T], ignore_index=True)
    
    else:
        data.loc[idx, 'text'] = data.loc[idx, 'title'] + ' ' + ' '.join(content_words)


print('## chunked data : ', len(remove_idx))
print('## appended data : ', cnt)

data.drop(remove_idx, inplace=True)

data.sort_values(by = ['category', 'sub_category', 'title'])
data.reset_index(drop=True, inplace=True)

In [None]:
34-32+291

In [None]:
data.shape

In [None]:
data.head()

In [None]:
batchtext = list(data['text'])
batchtext

In [None]:
"""
런타임 모두 실행 시 DB에 데이터가 중복 저장되는 것을 막기 위해 주석 처리 해놨습니다.
필요한 코드이니 지우지 말아주세요.
"""
# DB에 텍스트 데이터 추가
# db.from_texts(batchtext, 
#               embedding=hf,
#               es_cloud_id=ES_CLOUD_ID,
#               es_user=ES_USER,
#               es_password=ES_PASSWORD,
#               es_api_key=ES_API_KEY,
#               index_name=index_name)

### 본격 모델링 (*)

huggingface generator model 사용

- flan-t5-base ("google/flan-t5-base")
- llama3-8b ("meta-llama/Meta-Llama-3-8B")
- mistral ("mistralai/Mistral-7B-Instruct-v0.1")

In [4]:
from langchain import PromptTemplate, HuggingFaceHub, LLMChain
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM, AutoModelForCausalLM
import torch

torch.cuda.empty_cache()
topic = "Hanabank for foreigners who speaks english"

cache_dir = "./cache"

def getModel(model_id):
    if model_id == "google/flan-t5-base" or model_id == "google/flan-t5-large":
        tokenizer = AutoTokenizer.from_pretrained(model_id) 
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_id, cache_dir=cache_dir) 
    elif model_id == "mistralai/Mistral-7B-Instruct-v0.1":
        tokenizer = AutoTokenizer.from_pretrained(model_id) 
        model = AutoModelForCausalLM.from_pretrained(
            model_id, cache_dir=cache_dir) 
    
    pipe = pipeline(
        "text2text-generation",
        model=model, 
        tokenizer=tokenizer, 
        max_length=256,
        device=-1
    )
    llm = HuggingFacePipeline(pipeline=pipe)
    print(f">> Prep. Get {model_id} ready to go")
    return llm

def make_the_llm(model_id="google/flan-t5-base"):
    template_informed = """
    I am a helpful AI that answers questions which is related to Hana Bank.
    When I don't know the answer I say I don't know.
    I know context: {context}
    when asked: {question}
    my response using only information in the context is: """

    prompt_informed = PromptTemplate(template=template_informed, input_variables=["context", "question"])

    llm = getModel(model_id)

    return LLMChain(prompt=prompt_informed, llm=llm)

llm_chain_informed = make_the_llm("google/flan-t5-large")

>> Prep. Get google/flan-t5-large ready to go


  warn_deprecated(
  warn_deprecated(


## Inference test (gradio)

In [5]:
## how to ask a question
def ask_a_question(question, top_k=3):
    similar_docs = db.similarity_search(question) # 임베딩한 벡터 간 코사인 유사도
    # bm25 사용하면 full-text 기반 검색 : Tf-idf 방식 활용 가능 
    print(f'## We retrieved top-{top_k} relevant documents!')
    print(f"""## The most relevant passage:\n
          top1 : {similar_docs[0].page_content}\n
          top2 : {similar_docs[1].page_content}\n
          top3 : {similar_docs[2].page_content}\n""")
    
    retrieved_documents = f"""## We retrieved top-{top_k} relevant documents!\n## The most relevant passage with query:\n"""
    print(retrieved_documents)
    
    for i in range(top_k):
      retrieved_documents += f"top{i+1} : {similar_docs[i].page_content}\n"

    ## Ask Local LLM context informed prompt
    informed_context = ' '.join([x.page_content for x in similar_docs[:top_k]])
    informed_response = llm_chain_informed.run(context=informed_context,question=question)
    
    return informed_response, retrieved_documents

In [7]:
import gradio as gr

# conversational loop
def inference(user_query, top_k=3):
  response, documents = ask_a_question(user_query, top_k=top_k)
  return response, documents

gr.Interface(
        fn=inference,
        inputs=[
            gr.components.Textbox(lines=2, label="Input", placeholder="What is an ISIC check card?"),
            gr.components.Slider(
                minimum=0, maximum=10, step=1, value=3, label="Top k"
            )
        ],
        outputs=[
            gr.components.Textbox(
                lines=5,
                label="Output",
            ),
            gr.components.Textbox(
                lines=5,
                label="Retrieved documents",
            )
        ],
        title="Hanabank chatbot (English)",
        description="Hello! I am a QA chat bot for Hanabank, ask me any question about it. (Model : flan-t5-large)",
    ).queue().launch(share=True, debug=True)


Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://bcbf6344139e939e61.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


## We retrieved top-0 relevant documents!
## The most relevant passage:

          top1 : ISIC Check Card ISIC Check Card What is an ISIC check card? The ISIC check card is a card for students that combines the benefits of an International Student ID Card and a check card into one. KEB Hana Bank offers the WingO ISIC and the Viva G ISIC international student check cards. Issuance Guide Fill out an application online. A photo image is required for student ID Online Enrollment Visit your local KEB Hana Bank branch to apply. - Identification, Proof of Student Status (Issued within 1 month) - Issuance Fee of KRW 14,000 Issue Card Select Wingo ISIC

          top2 : ISIC Check Card earnings Preferential overseas usage fee WingO ISIC international student check card Discounts on language test fees Discounts on book purchases, language schools, and family restaurants Discounts when booking movie tickets online Guide For details, visit the ISIC website.(www.isic.co.kr) The above services and b

Token indices sequence length is longer than the specified maximum sequence length for this model (545 > 512). Running this sequence through the model will result in indexing errors


## We retrieved top-3 relevant documents!
## The most relevant passage:

          top1 : Foreign Currency Deposit Account Foreign Currency Deposit Account As a foreign currency deposit account with no eligibility or deposit limitations, customers can conduct unrestricted deposits and withdrawals as well as transactions using ten different currencies with a single account . Deposit Currency : Deposits are available in 27 different currencies that are subject to electronic exchange rate notifications. USD, JPY, EUR, GBP, CHF, CAD, AUD, SEK, DKK, NOK, NZD, HKD, THB, SGD, IDR, SAR, AED, KWD, BHD, RUB, ZAR, MXN, HUF, PLN, TRY, CNY, CZK Product Information Interest Rate Daily-announced deposit interest rate of the respective foreign currency. Deposit Account

          top2 : The Wide Foreign Currency Savings Account The Wide Foreign Currency Savings Account Free-installment foreign currency deposit account that offers unrestricted deposits and split withdrawals as well as a variety of pref

