# RAG implementation for Tech-news dataset

### Installation

In [86]:
!pip install -q langchain sentence-transformers cohere

In [87]:
!pip install faiss-cpu



In [88]:
!pip install rank_bm25



In [89]:
# Mount Google Drive at /content/drive; the dataset path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [90]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.vectorstores import FAISS
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CohereRerank
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [None]:
import os
from getpass import getpass

# Read the Hugging Face API token interactively so it is never written into the notebook.
HF_token = getpass()

··········


In [91]:
# Export the token through the environment as well (the embeddings cell also passes it explicitly).
os.environ['HUGGINGFACEHUB_API_TOKEN'] = HF_token

## Dataset
[BBC Tech News Dataset](http://mlg.ucd.ie/datasets/bbc.html)

In [92]:
# Folder on the mounted Drive that holds the news article files.
dataset_folder_path='/content/drive/MyDrive/Tech_news_dataset/dataset_news/'

In [93]:
# Load every file in the dataset folder as LangChain Document objects.
# - sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
#   document order (and therefore the chunk order) reproducible between runs.
# - os.path.join(): does not depend on dataset_folder_path ending with a slash.
documents = []
for file_name in sorted(os.listdir(dataset_folder_path)):
    file_path = os.path.join(dataset_folder_path, file_name)
    if not os.path.isfile(file_path):
        # Skip sub-directories; TextLoader can only read regular files.
        continue
    documents.extend(TextLoader(file_path).load())


In [94]:
# Peek at the first three loaded documents.
documents[:3]

[Document(page_content='Millions buy MP3 players in US\n\nOne in 10 adult Americans - equivalent to 22 million people - owns an MP3 player, according to a survey.\n\nA study by the Pew Internet and American Life Project found that MP3 players are the gadget of choice among affluent young Americans. The survey did not interview teenagers but it is likely that millions of under-18s also have MP3 players. The American love affair with digital music players has been made possible as more and more homes get broadband.\n\nOf the 22 million Americans who own MP3 players, 59% are men compared to 41% of women. Those on high income - judged to be $75,000 (£39,000) or above - are four times more likely to have players than those earning less than $30, 000 ( £15,000). Broadband access plays a big part in ownership too. Almost a quarter of those with broadband at home have players, compared to 9% of those who have dial-up access. MP3 players are still the gadget of choice for younger adults. Almost

In [95]:
# Number of documents loaded from the dataset folder.
len(documents)

110

### Chunking the text

In [96]:
# Split the articles into chunks of size 512 with an overlap of 50 between
# neighbouring chunks, then report how many chunks were produced.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=512,
)
text_splits = text_splitter.split_documents(documents)
print(len(text_splits))

866


### Embedding Model

In [97]:
# Embedding client for the hosted BAAI/bge-base-en-v1.5 model, authenticated
# with the Hugging Face token entered earlier.
embeddings = HuggingFaceInferenceAPIEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
    api_key=HF_token,
)

In [98]:
# Embed every chunk with the embedding model and index the vectors in FAISS.
vectorstore = FAISS.from_documents(text_splits, embeddings)

## **Implementing Hybrid Search with Ensemble Retrieval**

In [99]:
# Dense (embedding-similarity) retriever over the FAISS index, configured with k=5.
retriever_vectordb = vectorstore.as_retriever(search_kwargs={"k": 5})

In [100]:
# Sparse keyword retriever (BM25) built over the same chunks as the vector store.
keyword_retriever = BM25Retriever.from_documents(text_splits)
# Use the same k as the dense retriever.
keyword_retriever.k = 5

In [101]:
# Hybrid search: combine the dense and BM25 retrievers with equal weights.
ensemble_retriever = EnsembleRetriever(
    retrievers=[retriever_vectordb, keyword_retriever],
    weights=[0.5, 0.5],
)

In [102]:
# Sample question used to exercise the retrievers.
query="How many cafes were closed in 2004?"

In [None]:
# Run the hybrid retriever on the sample question and display the returned chunks.
docs_rel=ensemble_retriever.get_relevant_documents(query)
docs_rel

[Document(page_content="China net cafe culture crackdown\n\nChinese authorities closed 12,575 net cafes in the closing months of 2004, the country's government said.", metadata={'source': '/content/drive/MyDrive/Tech_news_dataset/dataset_news/002.txt'}),
 Document(page_content='According to the official news agency most of the net cafes were closed down because they were operating illegally. Chinese net cafes operate under a set of strict guidelines and many of those most recently closed broke rules that limit how close they can be to schools. The move is the latest in a series of steps the Chinese government has taken to crack down on what it considers to be immoral net use.', metadata={'source': '/content/drive/MyDrive/Tech_news_dataset/dataset_news/002.txt'}),
 Document(page_content='The official Xinhua News Agency said the crackdown was carried out to create a "safer environment for young people in China". Rules introduced in 2002 demand that net cafes be at least 200 metres away f

### Implementing Re-ranking with Cohere-Rerank

In [None]:
# Read the Cohere API key interactively so it is never written into the notebook.
Cohere_API_token = getpass()

··········


In [103]:
# Export the key as COHERE_API_KEY; CohereRerank() is later created without an explicit key.
os.environ["COHERE_API_KEY"] =Cohere_API_token

In [104]:
from langchain.llms import HuggingFaceHub

# Hosted Zephyr-7B generator used to answer questions from the retrieved context.
# Only max_new_tokens bounds the answer length. The original also passed
# max_length=64, which contradicts max_new_tokens=512: max_length counts prompt
# plus generated tokens, and is overridden by max_new_tokens when both are set.
model = HuggingFaceHub(
    repo_id='HuggingFaceH4/zephyr-7b-alpha',
    model_kwargs={"temperature": 0.5, "max_new_tokens": 512},
)



In [105]:
# Re-ranking stage: the hybrid retriever supplies candidates and Cohere Rerank
# re-orders them. No key is passed here; COHERE_API_KEY was set in the environment above.
compressor = CohereRerank()
compression_retriever = ContextualCompressionRetriever(
    base_retriever=ensemble_retriever,
    base_compressor=compressor,
)
# Re-ranked results for the sample question.
compressed_docs = compression_retriever.get_relevant_documents(query)

In [106]:
# Show the configuration of the re-ranking retriever.
compression_retriever

ContextualCompressionRetriever(base_compressor=CohereRerank(client=<cohere.client.Client object at 0x7cd149d20760>, top_n=3, model='rerank-english-v2.0', cohere_api_key=None, user_agent='langchain'), base_retriever=EnsembleRetriever(retrievers=[VectorStoreRetriever(tags=['FAISS', 'HuggingFaceInferenceAPIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7cd149d207f0>, search_kwargs={'k': 5}), BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x7cd149e0a8f0>, k=5)], weights=[0.5, 0.5]))

In [107]:
# Display the re-ranked chunks for the sample question.
compressed_docs

[Document(page_content="China net cafe culture crackdown\n\nChinese authorities closed 12,575 net cafes in the closing months of 2004, the country's government said.", metadata={'source': '/content/drive/MyDrive/Tech_news_dataset/dataset_news/002.txt', 'relevance_score': 0.9984022}),
 Document(page_content='According to the official news agency most of the net cafes were closed down because they were operating illegally. Chinese net cafes operate under a set of strict guidelines and many of those most recently closed broke rules that limit how close they can be to schools. The move is the latest in a series of steps the Chinese government has taken to crack down on what it considers to be immoral net use.', metadata={'source': '/content/drive/MyDrive/Tech_news_dataset/dataset_news/002.txt', 'relevance_score': 0.98469365}),
 Document(page_content='The official Xinhua News Agency said the crackdown was carried out to create a "safer environment for young people in China". Rules introduce

In [108]:
# Prompt in the Zephyr chat layout: a <|system|> turn carrying the instructions and
# the retrieved CONTEXT, a <|user|> turn carrying the question, then an open
# <|assistant|> turn for the model to complete. Each turn is closed with </s>.
# The system tag previously read "<|system|>>"; the stray ">" broke the role marker.
template = """
<|system|>
You are an AI Assistant that follows instructions extremely well.
Please be truthful and give direct answers. Please tell 'I don't know' if user query is not in CONTEXT

CONTEXT: {context}
</s>
<|user|>
{query}
</s>
<|assistant|>
"""

In [109]:
# Turn the raw template string into a prompt object with {context} and {query} placeholders.
prompt = ChatPromptTemplate.from_template(template)

In [110]:
# Parser placed at the end of the chain so the final result is a plain string.
output_parser = StrOutputParser()

In [111]:
def _format_docs(docs):
    """Join the text of the retrieved documents into a single context string.

    Without this step the prompt receives the Python repr of the Document list,
    which includes metadata such as source paths and relevance scores.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# RAG pipeline: the incoming question is sent to the re-ranking retriever (whose
# results are flattened to text for {context}) and also passed through unchanged
# as {query}; the filled prompt goes to the LLM and the output is parsed to a string.
chain = (
    {"context": compression_retriever | _format_docs, "query": RunnablePassthrough()}
    | prompt
    | model
    | output_parser
)

In [112]:
# Question to answer end-to-end with the RAG chain.
query="How many cafes were closed in 2004 in China?"

In [113]:
# Run retrieval, re-ranking and generation for the question.
response = chain.invoke(query)

In [114]:
# Show the generated answer.
response

'According to the context provided, 12,575 net cafes were closed in the closing months of 2004 in China.'