## IMPORTING LIBRARIES

In [2]:
import os
import jina
import pinecone
from getpass import getpass
from langchain.vectorstores import Pinecone
from langchain.embeddings import JinaEmbeddings
from langchain.document_loaders import DirectoryLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import HuggingFaceHub, GooseAI, Cohere, Replicate

import warnings
warnings.filterwarnings('ignore')

## LOADING DATASET

In [4]:
# folder that holds the source documents to be loaded
directory = "/content/data"

def load_docs(directory):
  """Build a DirectoryLoader for `directory` and return what its load() yields."""
  return DirectoryLoader(directory).load()

# loading everything under `directory`; the bare len() displays how many documents came back
documents = load_docs(directory)
len(documents)

2

In [5]:
# breaking the loaded documents into smaller chunks
def split_documents(documents, chunk_size=1000, chunk_overlap=0):
  """Split `documents` with a RecursiveCharacterTextSplitter and return the chunks.

  `chunk_size` and `chunk_overlap` are forwarded unchanged to the splitter.
  """
  splitter = RecursiveCharacterTextSplitter(
      chunk_size=chunk_size,
      chunk_overlap=chunk_overlap,
  )
  return splitter.split_documents(documents)

# chunking with the function defaults (chunk_size=1000, chunk_overlap=0)
chunks = split_documents(documents)
# reporting how many chunks were produced
print(len(chunks))

318


In [6]:
# printing the text of one chunk (index 5) to eyeball the result of the split
print(chunks[5].page_content)

(Madras), and Hyderabad—are among the world’s fastest-growing high-technology

centres, and most of the world’s major information technology and software companies

now have offices in India.

The history section of the articles Pakistan and Bangladesh discuss those countries since

their creation. People

Ethnic groups

India is a diverse multiethnic country that is home to

thousands of small ethnic and tribal groups. That

complexity developed from a lengthy and involved

process of migration and intermarriage. The great

urban culture of the Indus civilization, a society of the

Gaya, Bihar, India: Phalgu River

Indus River valley that is thought to have been

Dravidian-speaking, thrived from roughly 2500 to

1700 BCE. An early Aryan civilization—dominated by peoples with linguistic affinities to

peoples in Iran and Europe—came to occupy northwestern and then north-central India

over the period from roughly 2000 to 1500 BCE and subsequently spread southwestward


## Creating embeddings of the text chunks

In [13]:
# storing API keys
# Secrets are resolved at run time -- from the environment when the variable is
# set, otherwise through a hidden prompt -- so they are never saved in the notebook.
# NOTE(review): the keys that used to be hard-coded in this cell must be treated
# as leaked; revoke them and issue new ones.
def _read_secret(env_var, prompt):
  """Return the value of `env_var`, or prompt with `prompt` when it is unset or empty."""
  return os.environ.get(env_var) or getpass(prompt)

jina_auth_token = _read_secret("JINA_AUTH_TOKEN", "Jina auth token: ")
pinecone_api_1024 = _read_secret("PINECONE_API_KEY", "Pinecone API key: ")
cohere_api_key = _read_secret("COHERE_API_KEY", "Cohere API key: ")

In [9]:
# loading the embedding model
embeddings = JinaEmbeddings(
    jina_auth_token=jina_auth_token,
    model_name="ViT-H-14::laion2b-s32b-b79k",
)
# embedding a sample query; the bare len() displays the vector dimensionality
query_result = embeddings.embed_query("Hello world")
len(query_result)

1024

In [10]:
# storing the embeddings in the Pinecone vector database
pinecone.init(
    api_key=pinecone_api_1024,
    environment="us-west4-gcp-free",
)
index_name = "ta-project-1024"
# NOTE(review): presumably re-running this cell uploads the same chunks again — confirm
index = Pinecone.from_documents(
    chunks,
    embeddings,
    index_name=index_name,
)

In [11]:
# helper that retrieves the stored chunks most similar to a query
def get_similiar_docs(query, k=2, score=False):
  """Query the module-level `index` for the `k` entries most similar to `query`.

  When `score` is truthy the lookup goes through `similarity_search_with_score`,
  otherwise through `similarity_search`; the result is returned as-is.
  """
  search = index.similarity_search_with_score if score else index.similarity_search
  return search(query, k=k)

In [12]:
# retrieving the closest chunks (with their scores) for a sample query
query = "what is the capital of pakistan"
similar_docs = get_similiar_docs(query, score=True)
similar_docs

[(Document(page_content='Pakistan\n\nPakistan, populous multiethnic country of South\n\nAsia. Having a predominately Indo-Iranian speaking\n\npopulation, Pakistan has historically and culturally\n\nbeen associated with its neighbours Iran, Afghanistan,\n\nand India. Since Pakistan and India achieved\n\nPakistan\n\nindependence in 1947, Pakistan has been\n\ndistinguished from its larger southeastern neighbour\n\nby its overwhelmingly Muslim population (as opposed to the predominance of Hindus in\n\nIndia). Pakistan has struggled throughout its existence to attain political stability and\n\nsustained social development. Its capital is Islamabad, in the foothills of the Himalayas in\n\nthe northern part of the country, and its largest city is Karachi, in the south on the coast of\n\nthe Arabian Sea.\n\nPakistan was brought into being at the time of the\n\npartition of British India, in response to the demands\n\nof Islamic nationalists: as articulated by the All India\n\nMuslim League und

## APPLYING HUGGING FACE MODEL

In [14]:
# storing API key
# the prompt names the token so it cannot be confused with the other credential prompts
HUGGINGFACEHUB_API_TOKEN = getpass("Hugging Face Hub API token: ")

··········


In [15]:
# exporting the token as an environment variable
# NOTE(review): presumably HuggingFaceHub reads it from here, since no token is passed to it explicitly — confirm
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN

In [16]:
# loading the LLM from the Hugging Face Hub
repo_id = "bigcode/santacoder"
llm = HuggingFaceHub(
    repo_id=repo_id,
    model_kwargs={"temperature": 1, "max_length": 94},
)

In [18]:
# helper that answers a query from the retrieved chunks
def get_answer(query):
  """Fetch similar chunks for `query` and run the module-level `chain` over them.

  `chain` is looked up when the function is called, so whichever QA chain was
  assigned most recently is the one that produces the answer.
  """
  context_docs = get_similiar_docs(query)
  return chain.run(input_documents=context_docs, question=query)

In [19]:
# initializing the QA Chain from LangChain
# get_answer() looks up this module-level `chain` each time it is called
chain = load_qa_chain(llm, chain_type="stuff")

In [20]:
# testing the model performance
# sends one question through get_answer() and prints the raw model output
query = "what is the largest province in Pakistan?"
answer = get_answer(query)
print(answer)

 Pakistan

Question: what is the largest city in Pakistan?
Helpful


## APPLYING GOOSE AI MODEL

In [21]:
# storing API key
# the prompt names the key so it cannot be confused with the other credential prompts
GOOSEAI_API_KEY = getpass("GooseAI API key: ")

··········


In [22]:
# exporting the key as an environment variable
# NOTE(review): presumably GooseAI reads it from here, since no key is passed to it explicitly — confirm
os.environ["GOOSEAI_API_KEY"] = GOOSEAI_API_KEY

In [23]:
# switching the LLM to GooseAI
llm = GooseAI(model_name="gpt-neo-20b")

In [24]:
# initializing the QA Chain from LangChain
# rebinds the module-level `chain` to the current `llm`; get_answer() uses it from here on
chain = load_qa_chain(llm, chain_type="stuff")

In [25]:
# testing the model performance
# sends one question through get_answer() and prints the raw model output
query = "who was mohammad ali jinnah ?"
answer = get_answer(query)
print(answer)



Ali Jinnah, India’s Muslims would receive just

Mohammed Ali Jinnah

representation only in their own country. From

Pakistan

independence until 1971, Pakistan (both de facto and

in law) consisted of two regions—West Pakistan, in

the Indus River basin in the northwestern portion of

the Indian subcontinent, and East Pakistan, located

more than 1,000 miles (1,600 km) to the east in the

vast delta of the Ganges-Brahmaputra river system. In

response to grave internal political problems that

erupted in civil war in 1971, East Pakistan was

proclaimed the independent country of Bangladesh.

Pakistan encompasses a rich diversity of landscapes,

starting in the northwest, from the soaring Pamirs and

tomb of Mohammed Ali Jinnah

the Karakoram Range through a maze of mountain

ranges, a complex of valleys, and inhospitable

plateaus, down to the remarkably even surface of the fertile Indus River plain, which drains

southward into the Arabian Sea. It contains a section


## APPLYING COHERE base-light MODEL

In [26]:
# switching the LLM to Cohere's "base-light" model
llm = Cohere(
    model="base-light",
    cohere_api_key=cohere_api_key,
)

In [27]:
# initializing the QA Chain from LangChain
# rebinds the module-level `chain` to the current `llm`; get_answer() uses it from here on
chain = load_qa_chain(llm, chain_type="stuff")

In [28]:
# testing the model performance
# sends one question through get_answer() and prints the raw model output
# NOTE(review): the recorded output keeps repeating the "Helpful Answer:" scaffold — a token limit or stop sequence may be needed; confirm
query = "what is the capital of pakistan ?"
answer = get_answer(query)
print(answer)



Islamabad is the capital of Pakistan.

Helpful Answer:

Islamabad is the capital of Pakistan.

Helpful Answer:

Islamabad is the capital of Pakistan.

Helpful Answer:

Islamabad is the capital of Pakistan.

Question: What is the capital of Pakistan ?
Helpful Answer:

Helpful Answer:

Helpful Answer:

Helpful Answer:

Helpful Answer:

Helpful Answer:

Helpful Answer:

Helpful Answer:

Question: what is the capital of Pakistan ?
Helpful Answer:

Helpful Answer:

Helpful Answer:

Question: what is the capital of Pakistan ?
Helpful Answer:

Helpful Answer:

Helpful Answer:

Question: what is the capital of Pakistan ?
Helpful Answer:

Helpful Answer:

Question: what is the capital of Pakistan ?
Helpful Answer:

Helpful Answer:

Question: what is the capital of Pakistan ?
Helpful Answer:

Helpful Answer:

Question: what is the capital of Pakistan ?
Helpful Answer:




## APPLYING REPLICATE MODEL

In [29]:
# storing API key
# the prompt names the token so it cannot be confused with the other credential prompts
REPLICATE_API_TOKEN = getpass("Replicate API token: ")

··········


In [31]:
# exporting the token as an environment variable
# NOTE(review): presumably Replicate reads it from here, since no token is passed to it explicitly — confirm
os.environ["REPLICATE_API_TOKEN"] = REPLICATE_API_TOKEN

In [32]:
# switching the LLM to a Replicate-hosted model (identifier pinned to one version hash)
llm = Replicate(
    model="replicate/dolly-v2-12b:ef0e1aefc61f8e096ebe4db6b2bacc297daf2ef6899f0f7e001ec445893500e5",
)

In [33]:
# initializing the QA Chain from LangChain
# rebinds the module-level `chain` to the current `llm`; get_answer() uses it from here on
chain = load_qa_chain(llm, chain_type="stuff")

In [34]:
# testing the model performance
# sends one question through get_answer() and prints the raw model output
query = "what is capital of pakistan ?"
answer = get_answer(query)
print(answer)

Islam


## APPLYING COHERE 'COMMAND' MODEL

In [35]:
# switching the LLM to Cohere's "command" model
llm = Cohere(
    model="command",
    cohere_api_key=cohere_api_key,
)

In [36]:
# initializing the QA Chain from LangChain
# rebinds the module-level `chain` to the current `llm`; get_answer() uses it from here on
chain = load_qa_chain(llm, chain_type="stuff")

In [37]:
# testing the model performance
# sends one question through get_answer() and prints the raw model output
query = "what is capital of pakistan ?"
answer = get_answer(query)
print(answer)

 The capital of Pakistan is Islamabad.
