In [None]:
!pip -q install langchain huggingface_hub tiktoken
!pip -q install chromadb
!pip -q install PyPDF2 pypdf InstructorEmbedding sentence_transformers
!pip -q install --upgrade together

## RetrievalQA with LLaMA 2-70B on Together API

In [None]:
import os
from getpass import getpass

# Never paste the key into the notebook source: a saved or shared notebook would leak it.
# If the variable is already set in the environment it is used as-is; otherwise prompt for
# it without echoing the input.
if not os.environ.get("TOGETHER_API_KEY"):
    os.environ["TOGETHER_API_KEY"] = getpass("Together API key: ")

In [None]:
# Print the installed langchain package metadata (version, location, dependencies)
!pip show langchain

Name: langchain
Version: 0.0.344
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: aiohttp, anyio, async-timeout, dataclasses-json, jsonpatch, langchain-core, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: 


# Setting up Together API


In [None]:
import together

# Set your API key on the Together client from the TOGETHER_API_KEY environment variable
together.api_key = os.environ["TOGETHER_API_KEY"]

# List available models and descriptions
# NOTE(review): `models` is not read by any later cell visible in this notebook
models = together.Models.list()

In [None]:
# Start the LLaMA 2 70B chat model on Together; the same model id is passed to TogetherLLM below
together.Models.start("togethercomputer/llama-2-70b-chat")

{'success': True,
 'value': 'c154aa5347443ac1abd59875ed5206a593784e73783aa81dfc09084f2db633f4-ccc33172dc8cf6c7acb497a0eec450175a8c115a702bb4716cd8bf9c6c7d8667'}

In [None]:
# Upgrade pydantic before the custom LLM class is defined.
# NOTE(review): the deprecation warning printed under the class cell links to the pydantic 2.5
# migration guide, so this resolves to pydantic 2.x — confirm that is intended alongside the
# installed langchain.
!pip install --upgrade pydantic

In [None]:
import together

import logging
from typing import Any, Dict, List, Mapping, Optional

from pydantic import Extra, Field, root_validator

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
from langchain.llms.utils import enforce_stop_tokens
from langchain.utils import get_from_dict_or_env

class TogetherLLM(LLM):
    """LangChain LLM wrapper around the Together completion endpoint.

    Attributes:
        model: Together model endpoint to query.
        together_api_key: Together API key. Defaults to the value of the
            ``TOGETHER_API_KEY`` environment variable at class-definition
            time; if empty, the environment is consulted again on each call.
        temperature: Sampling temperature to use.
        max_tokens: Maximum number of tokens to generate in the completion.
    """

    # Model endpoint to use. The previous default, "togethercomputer/llama-2-13-chat",
    # was missing the "b" of the size suffix used by the other model id in this notebook.
    model: str = "togethercomputer/llama-2-13b-chat"

    # Read with os.getenv rather than os.environ[...] so that merely defining the class
    # does not raise KeyError when the variable is missing; the presence check (with a
    # clear error message) happens in validate_environment, called from _call.
    together_api_key: str = os.getenv("TOGETHER_API_KEY", "")

    # Sampling temperature to use.
    temperature: float = 0.7

    # Maximum number of tokens to generate in the completion.
    max_tokens: int = 512

    class Config:
        # String literal instead of the deprecated `pydantic.config.Extra` enum member.
        extra = "forbid"

    @classmethod
    def validate_environment(cls, values):
        """Ensure an API key is available and store it in ``values``.

        This method carries no pydantic validator decorator, so it is not run
        automatically at construction time; ``_call`` invokes it explicitly.

        Args:
            values: Mapping that may contain ``"together_api_key"``.

        Returns:
            The same mapping with ``"together_api_key"`` set to the resolved key.

        Raises:
            ValueError: If neither ``values`` nor the ``TOGETHER_API_KEY``
                environment variable provides a non-empty key.
        """
        api_key = values.get("together_api_key") or os.getenv("TOGETHER_API_KEY")
        if not api_key:
            raise ValueError("TOGETHER_API_KEY environment variable not set")
        values["together_api_key"] = api_key
        return values

    @property
    def _llm_type(self) -> str:
        """Return type of LLM."""
        return "together"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Send ``prompt`` to the Together endpoint and return the completion text.

        Args:
            prompt: Prompt text to complete.
            stop: Optional stop sequences; the returned text is cut at the
                first occurrence of any of them.
            run_manager: Accepted for compatibility with the LangChain LLM
                interface; not used here.
            **kwargs: Accepted and ignored, as before.

        Returns:
            The ``text`` field of the first choice in the response.

        Raises:
            ValueError: If no Together API key can be resolved.
        """
        resolved = self.validate_environment(
            {"together_api_key": self.together_api_key}
        )
        together.api_key = resolved["together_api_key"]
        output = together.Complete.create(prompt,
                                          model=self.model,
                                          max_tokens=self.max_tokens,
                                          temperature=self.temperature,
                                          )
        text = output['output']['choices'][0]['text']
        # Previously `stop` was swallowed by **kwargs and silently ignored.
        # Guard on truthiness: an empty stop list must leave the text untouched.
        if stop:
            text = enforce_stop_tokens(text, stop)
        return text


<ipython-input-7-92cf14c7b791>:29: PydanticDeprecatedSince20: `pydantic.config.Extra` is deprecated, use literal values instead (e.g. `extra='allow'`). Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  extra = Extra.forbid


# LangChain multi-doc retriever with ChromaDB

***Key Points***
- Multiple Files - PDFs
- ChromaDB
- Hosted LLM (LLaMA 2 70B chat via the Together API)
- Instructor Embeddings


## Setting up LangChain


In [None]:
import os

In [None]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader


from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

  from tqdm.autonotebook import trange


## Load and process multiple documents

In [None]:
# Load the PDF files matching "./*.pdf" under /content, reading each one with PyPDFLoader
# (single-text-file alternative kept for reference:)
# loader = TextLoader('single_text_file.txt')
loader = DirectoryLoader('/content', glob="./*.pdf", loader_cls=PyPDFLoader)

# NOTE(review): /content is a hard-coded absolute path — presumably the Colab working directory; confirm
documents = loader.load()

In [None]:
# Number of Document objects returned by the loader
len(documents)

255

In [None]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=200) for embedding
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

## HF Instructor Embeddings

In [None]:

from langchain.embeddings import HuggingFaceInstructEmbeddings

# Load the hkunlp/instructor-xl embedding model and place it on the "cuda" device.
# NOTE(review): the device is hard-coded — presumably this fails on a runtime without a GPU; confirm
instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl",
                                                      model_kwargs={"device": "cuda"})


pytorch_model.bin:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


## create the DB

This will take a bit of time on a T4 GPU

In [None]:
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk

persist_directory = 'db'

## Here are the new embeddings being used (the Instructor model loaded above)
embedding = instructor_embeddings

# Build the Chroma vector store from the split chunks using those embeddings
vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

In [None]:
# Persist the vector store (a persist_directory was supplied when it was created)
vectordb.persist()

## Make a retriever

In [None]:
# Expose the vector store as a retriever; k=5 matches the five source paths printed per answer below
retriever = vectordb.as_retriever(search_kwargs={"k": 5})

## Make a chain

In [None]:
# Instantiate the Together-backed LLM: 70B chat model, temperature 0.1,
# and up to 1024 generated tokens per completion
llm = TogetherLLM(
    model= "togethercomputer/llama-2-70b-chat",
    temperature = 0.1,
    max_tokens = 1024
)

In [None]:
# Create the chain to answer questions from the retrieved chunks.
# return_source_documents=True is required by process_llm_response, which reads
# the "source_documents" key of the chain's response.
qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                  chain_type="stuff",
                                  retriever=retriever,
                                  return_source_documents=True)

In [None]:
## Cite sources

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    """Wrap ``text`` to ``width`` columns while keeping its existing line breaks.

    Each newline-separated line is wrapped independently, so blank lines and
    paragraph boundaries in the input survive in the output.

    Args:
        text: The string to wrap.
        width: Maximum line width passed to ``textwrap.fill``.

    Returns:
        The wrapped text, with the wrapped lines rejoined by newline characters.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

def process_llm_response(llm_response):
    """Print the chain's answer followed by the source of each retrieved document.

    Args:
        llm_response: Mapping with a ``'result'`` string and a
            ``"source_documents"`` iterable whose items expose
            ``metadata['source']``.
    """
    answer = llm_response['result']
    print(wrap_text_preserve_newlines(answer))
    print('\n\nSources:')
    retrieved_docs = llm_response["source_documents"]
    for doc in retrieved_docs:
        origin = doc.metadata['source']
        print(origin)

In [None]:
# Full example: run the retrieval QA chain on a question, then print the wrapped answer and its source files
query = "What is Health Insurance?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

Health Insurance is a type of insurance that covers the cost of medical expenses incurred by the insured
person. It provides financial protection to the insured against unexpected medical expenses, and ensures that
the insured receives the necessary medical treatment without having to worry about the cost. Health insurance
policies typically have a coverage period, a premium, and a deductible, and may also have co-payments or co-
insurance.

In the context of the given information, Anadolu Sigorta Foreign Nationals Health Insurance is a type of
health insurance that provides coverage for foreign nationals who are not covered by the Turkish national
health insurance system. The policy provides coverage for outpatient and inpatient treatment, as well as other
medical expenses, and has a renewal option. However, it does not provide a renewal guarantee.


Sources:
/content/SA916_IN_04_2023.pdf
/content/bireysel_saglik_ingilizce_ozel_sartlar_10_2023.pdf
/content/bireysel_saglik_ingilizce_oz

In [None]:
# Ask for contact details; the answer and the source files it was drawn from are printed
query = "How can i contact with anadolu sigorta"
llm_response = qa_chain(query)
process_llm_response(llm_response)

You can contact Anadolu Sigorta through their customer service number 0850 7 24 0850, fax number 0850 744 0
802, or via their website http://www.anadolusigorta.com.tr. You can also use the "Talep ve Şikayetleriniz"
form on their website or the "Sigortam Cepte" mobile application. Additionally, you can visit their head
office located at Dijital Sigortacılık ve Müşteri İletişim Müdürlüğü, Rüzgârlıbahçe Mah. Çam Pınarı Sok. No:6
34805 Beykoz/İstanbul.

Note: You can also check the contact information on their website.


Sources:
/content/Mehmetcik_hesapl__kasko_bilgilendirme_formu.pdf
/content/22_05_KZ935.pdf
/content/22_05_KZ920.pdf
/content/22_05_KZ921.pdf
/content/22_05_KZ901.pdf


In [None]:
# Ask for a definition of a policy term and print the answer with its sources.
# NOTE(review): "Reneval" in the query looks like a typo for "Renewal"; left as-is because
# the recorded output below was produced from this exact text.
query = "what is the meaning of Reneval in health insurance?"
llm_response = qa_chain(query)
process_llm_response(llm_response)



Renewal in health insurance means continuing an existing policy for a new term by paying the premium. In the
case of Anadolu Sigorta, the policy can be renewed within 30 days after the expiration date, and the premium
may change based on age, gender, coverage, and other factors. Additionally, the insured may be eligible for a
Lifetime Renewal Guarantee, which allows them to continue their policy for the rest of their life, subject to
certain conditions.


Sources:
/content/bireysel_saglik_ingilizce_ozel_sartlar_10_2023.pdf
/content/SA916_IN_04_2023.pdf
/content/SA916_IN_04_2023.pdf
/content/bireysel_saglik_ingilizce_ozel_sartlar_10_2023.pdf
/content/SA923_IN_05_2022.pdf


In [None]:
# Ask for the company address and print the answer with its sources
query = "Give me address of Anadolu sigorta?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

The address of Anadolu Sigorta is Rüzgârlıbahçe Mah. Çam Pınarı Sok. No:6 34805 Beykoz/İstanbul.

Note: The address is given in Turkish. If you want the address in English, you can use a translation tool to
translate it.

Politeness: It's a good idea to use polite language when asking for information. You can say "Could you please
provide me with the address of Anadolu Sigorta?" instead of just "Give me the address of Anadolu Sigorta."
This will show that you are respectful and considerate.

Conclusion: It's important to be respectful and considerate when asking for information. Using polite language
and showing appreciation for the person's time and effort can go a long way in building positive
relationships.


Sources:
/content/KZ641_Trafik_Sigortasi_Kitapcigi_12_2022.pdf
/content/Mehmetcik_hesapl__kasko_bilgilendirme_formu.pdf
/content/22_05_KZ901.pdf
/content/22_05_KZ921.pdf
/content/22_05_KZ922.pdf


In [None]:
# Ask a procedural question (compensation payment) and print the answer with its sources
query = "How to make compensation payment in Honda insurance?"
llm_response = qa_chain(query)
process_llm_response(llm_response)



Please specify the following information to make compensation payment in Honda insurance:

1. Indemnity Claim Form (relevant fields of the Claim Form should be filled and signed by the insured, doctor,
or the health provider where the treatment was received.)
2. Original invoices for all expenses and invoice statements
3. Operations report and/or patient release epicrisis for inpatient treatments
4. Results of analyses for the diagnosis of the condition
5. Traffic accident report form, alcohol report and judicial report in case the treatment is necessitated by
an accident.

You can submit the above documents to Anadolu Sigorta to make the compensation payment.

Note: The information provided is based on the given text and is not a legal advice. In case of any
discrepancy, the original policy document and the insurance company's official website should be consulted.


Sources:
/content/bireysel_saglik_ingilizce_ozel_sartlar_10_2023.pdf
/content/bireysel_saglik_ingilizce_ozel_sartlar_1