In [3]:
import os
from secret_key import openai_api_key
os.environ["OPENAI_API_KEY"] = openai_api_key

In [4]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader

In [6]:
loader = TextLoader('data.txt', encoding='utf8')
documents = loader.load()

In [7]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)


In [28]:
len(texts)

52

In [29]:
texts[3]

Document(page_content='Adeegga eSim waa adeegga casri ah oo ku sahlaya inaad Taleefoonkaaga ku isticmaasho Lambarka Hormuud adigoo taleefoonkaaga wax Simcard ah gelinin.\nSidoo kale Adeegga eSim wuxuu u sahkayaa macaamisha ku sugan dalka dibadiisa in ee qaadan karaa ama ku badalan karaan si Online ah . laakin wuxuu ku shaqeyn karaa agaagga esim wadamo gaara tusaalo : Kenya ,United Arab Emirates ,Turkey,Saudi Arabia,Qatar,Egypt,Sweden,Kuwait', metadata={'source': 'data.txt'})

In [8]:
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db'

## here we are using OpenAI embeddings but in future we will swap out to local embeddings
embedding = OpenAIEmbeddings()

vectordb = Chroma.from_documents(documents=texts, 
                                 embedding=embedding,
                                 persist_directory=persist_directory)

In [9]:
# persiste the db to disk
vectordb.persist()
vectordb = None

In [10]:
 #Now we can load the persisted database from disk, and use it as normal. 
vectordb = Chroma(persist_directory=persist_directory, 
                  embedding_function=embedding)

In [11]:
retriever = vectordb.as_retriever()

In [12]:
docs = retriever.get_relevant_documents("kawaran adeega esim")

In [13]:
# create the chain to answer questions 
qa_chain = RetrievalQA.from_chain_type(llm=OpenAI(), 
                                  chain_type="stuff", 
                                  retriever=retriever, 
                                  return_source_documents=True)

In [14]:
## Cite sources
def process_llm_response(llm_response):
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [23]:
# full example
query = "Shuruuradaha Codadka Xayasiinta ah adeegga CRBT"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 1-Ganacsiga inuu yahay mid haysta rugsad ganacsi oo aan dhacsaneyn. 2-In Xayaysiinta aysan ka hor imaanayn shareecada islaam-ka, Shuruucda dalka, iyo Danta Shirkadda Hormuud. 3-In Xayaysiinta aysan ku xadgudbaynin ganacsi kale /shakhsi ama wax laga fahmi karo aysan ku jirin. 4-In codku fasiix yahay oo uusan ka dheereyn muddo 1daqiigo ah, size- kisuna yahay ugu badnaan 500Kb ama ka yar.


Sources:
data.txt
data.txt
data.txt
data.txt
