In [32]:
import os
import random
import itertools

from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import Pinecone as pinecone_lang
from langchain.chains.question_answering import load_qa_chain
from langchain_google_genai import GoogleGenerativeAI

from pinecone import Pinecone, ServerlessSpec

from markdown_utils import to_markdown

## Define constants


In [33]:
# Gemini model identifiers used by the notebook.
EMBEDDING_MODEL_NAME = "models/embedding-001"  # passed to the embeddings client
MODEL_NAME = "gemini-1.5-pro-latest"  # passed to the text-generation LLM

## Retrieve the environment variables


In [34]:
# Read the service credentials from the environment.
# A variable that is not set yields None rather than raising.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
MODEL_API_KEY = os.getenv("GOOGLE_API_KEY")

## Pinecone init and configuration


In [35]:
# Name of the Pinecone index this notebook creates and queries
index_name = "gimhara-pinecone-index"

# Pinecone client, authenticated with the key read from PINECONE_API_KEY
pc = Pinecone(api_key=PINECONE_API_KEY)

## Initialize the LLM (Google Gemini model)


In [36]:
# Text-generation model used by the question-answering chain
llm = GoogleGenerativeAI(
    google_api_key=MODEL_API_KEY,
    model=MODEL_NAME,
)

## Prepare the data


In [37]:
# Splitter configured with a chunk size of 500 and an overlap of 50
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)

# Load the source text file
loader = TextLoader(file_path="./data/data.txt")
data = loader.load()

# Split the loaded documents into chunks and keep each chunk's plain text
texts = text_splitter.split_documents(data)
text_contents = [chunk.page_content for chunk in texts]

## Vector embedding data


In [38]:
vector_dim = 768  # number of dimensions of the Google Gemini embedding vectors
vector_count = len(texts)  # one vector per text chunk


# Initialize the embeddings model. The API key is passed explicitly, the same
# way the LLM is configured, instead of relying on the library picking up
# GOOGLE_API_KEY from the environment on its own.
embeddings = GoogleGenerativeAIEmbeddings(
    model=EMBEDDING_MODEL_NAME, google_api_key=MODEL_API_KEY
)

## A helper function to break an iterable into chunks of size batch_size


In [39]:
def chunks(iterable, batch_size=100):
    """Yield successive tuples of at most ``batch_size`` items from ``iterable``.

    The input is consumed lazily. The last tuple may be shorter than
    ``batch_size``, and nothing is yielded for an empty iterable.
    """
    source = iter(iterable)
    # iter(callable, sentinel) keeps calling the slicer until it returns an
    # empty tuple, i.e. until the source iterator is exhausted.
    yield from iter(lambda: tuple(itertools.islice(source, batch_size)), ())

## Data Generator that generates many (id, vector) pairs


In [40]:
# Placeholder data: one (id, vector) pair per text chunk, where each vector is
# vector_dim random floats in [0, 1). These are NOT embeddings of the text.
# A dedicated, seeded RNG makes the vectors reproducible across kernel restarts
# and leaves the global `random` state untouched.
# The result is a one-shot lazy iterator: it is exhausted after one full pass.
_placeholder_rng = random.Random(42)
data_generator = (
    (f"id-{i}", [_placeholder_rng.random() for _ in range(vector_dim)])
    for i in range(vector_count)
)

## Create the Pinecone index


In [41]:
# Create the serverless index only if it does not exist yet, so the notebook
# survives "Restart Kernel -> Run All": creating an index whose name is already
# taken is rejected by Pinecone.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=vector_dim,  # must match the length of the vectors upserted later
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

## Initialize the Pinecone index


In [42]:
# Get a handle to the index identified by index_name; the upsert calls later
# in the notebook go through this object.
index = pc.Index(index_name)

## Upsert data with 100 vectors per upsert request


In [43]:
# Send the (id, vector) pairs to the index, at most 100 per upsert request.
# data_generator is a one-shot iterator, so re-running only this cell after it
# has been consumed sends nothing.
for ids_vectors_chunk in chunks(data_generator, batch_size=100):
    # each chunk is a tuple of up to 100 (id, vector) pairs
    index.upsert(vectors=ids_vectors_chunk)

## Create a docsearch object for the given index


In [44]:
# Embed every text chunk and store it in the index. Deterministic ids are
# passed so that
#   (a) re-running this cell overwrites the same records instead of adding a
#       duplicate copy of every chunk under freshly generated ids, and
#   (b) the real embeddings replace the random placeholder vectors that were
#       upserted earlier under the same "id-<n>" ids, so the placeholders can
#       no longer show up in similarity-search results.
chunk_ids = [f"id-{i}" for i in range(len(text_contents))]
docsearch = pinecone_lang.from_texts(
    text_contents, embeddings, ids=chunk_ids, index_name=index_name
)

## Define query


In [45]:
# Question used both for the similarity search and as the QA chain's question
query: str = "what are the Challenges and Opportunities in Sri Lanka?"

## Get the documents from the Pinecone DB that are similar to the query


In [46]:
# Ask the vector store for up to 10 stored chunks most similar to the query
docs = docsearch.similarity_search(query, k=10)

In [47]:
# Display the retrieved documents as the cell output
docs

[Document(page_content='*Challenges and Opportunities*'),
 Document(page_content="*Tourism*\n\nSri Lanka's natural beauty and cultural heritage have made it a popular tourist destination, attracting millions of visitors each year. From the ancient city of Anuradhapura to the pristine beaches of Mirissa, the island offers a wealth of attractions for travelers seeking adventure, relaxation, and cultural immersion. However, the tourism industry also faces challenges, such as environmental degradation, overcrowding, and the need for sustainable development practices."),
 Document(page_content='Despite its many strengths, Sri Lanka faces numerous challenges in the modern world. Ethnic tensions, political instability, and socioeconomic disparities continue to hamper progress and development. However, the island also possesses significant opportunities for growth and prosperity, including its strategic location, natural resources, and human capital. By addressing these challenges and harnessi

## Create a LangChain chain object for question answering


In [48]:
# Build a question-answering chain around the Gemini LLM.
# NOTE(review): the "stuff" chain type presumably places all input documents
# into a single prompt — confirm against the LangChain documentation.
chain = load_qa_chain(llm, chain_type="stuff")

## Run the chain and store the results


In [49]:
# Run the QA chain over the retrieved documents. `invoke` replaces the
# deprecated `Chain.run`; it returns a dict of the chain's inputs and outputs,
# and the generated answer text is stored under the "output_text" key.
chain_output = chain.invoke({"input_documents": docs, "question": query})[
    "output_text"
]

## Display the results in markdown format


In [50]:
# Show the answer in markdown format (to_markdown is imported from markdown_utils)
to_markdown(chain_output)

> ## Challenges and Opportunities in Sri Lanka:
> 
> **Challenges:**
> 
> * **Ethnic Tensions and Political Instability:**  Sri Lanka's history is marked by ethnic tensions, leading to a long civil war. While the war ended in 2009, achieving lasting peace and reconciliation remains a challenge. Political instability further complicates progress and development. 
> * **Economic Issues:**  High levels of debt, poverty, and unemployment pose significant challenges. The impacts of natural disasters and political instability exacerbate these issues.
> * **Environmental Concerns:**  The tourism industry, while beneficial, also contributes to environmental degradation and overcrowding. Balancing economic growth with sustainable development is crucial.
> 
> **Opportunities:**
> 
> * **Strategic Location:**  Sri Lanka's position in the Indian Ocean offers potential for increased trade and economic growth.
> * **Natural Resources:**  The island's fertile lands and diverse ecosystems provide opportunities for agricultural development and ecotourism.
> * **Human Capital:**  Sri Lanka has a relatively well-educated population, which can contribute to advancements in various sectors.
> * **Tourism Potential:** With its natural beauty and cultural heritage, Sri Lanka can further develop sustainable tourism practices to attract visitors while preserving its environment and cultural identity. 
> 
> **By addressing its challenges and leveraging its opportunities, Sri Lanka can work towards a brighter future for its people.** 
