# Retrieval Augmented Generation: BBC Politics News

## Package Installation and Imports

In [None]:
# Install Ollama v0.1.30
!curl https://ollama.ai/install.sh | sed 's#https://ollama.ai/download#https://github.com/jmorganca/ollama/releases/download/v0.1.30#' | sh

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0>>> Downloading ollama...
100 10091    0 10091    0     0  42341      0 --:--:-- --:--:-- --:--:-- 42221
############################################################################################# 100.0%
>>> Installing ollama to /usr/local/bin...
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [None]:
%%capture
# Setup the model as a global variable
OLLAMA_MODEL='phi:latest'

# Add the model to the environment of the operating system
import os
os.environ['OLLAMA_MODEL'] = OLLAMA_MODEL
!echo $OLLAMA_MODEL # print the global variable to check it saved

import subprocess
import time

# Start ollama on the server ("serve")
command = "nohup ollama serve&" # "nohup" and "&" means run in the background

# Use subprocess.Popen to run the command
process = subprocess.Popen(command,
                            shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)

time.sleep(5)  # Makes Python wait for 5 seconds

# Install prerequisites
!pip install llama-index-embeddings-huggingface
!pip install llama-index-llms-ollama
!pip install llama-index ipywidgets
!pip install llama-index-llms-huggingface
!pip install llama_index.readers.web
!pip install llama-index-vector-stores-chroma
!pip install chromadb

# Import required modules from the llama_index library
from llama_index.core import VectorStoreIndex, SummaryIndex, SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.core import StorageContext

# Import ChromaVectorStore and chromadb module
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb

# Import the Ollama class
from llama_index.llms.ollama import Ollama

# Use the global variable (OLLAMA_MODEL) as our LLM
# Set a timeout of 8 minutes in case of CPU
llm = Ollama(model=OLLAMA_MODEL, request_timeout=480.0)

In [None]:
# Query the model directly via the command line (only the question is passed, no retrieved context)
# The first run will "pull" (download) the model

# Test question 1: simple question

!ollama run $OLLAMA_MODEL "Who is Neil Kinnock?"

[?25lpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest 
p

In [None]:
# Test question 2: the same question is asked again later through the vector index
!ollama run $OLLAMA_MODEL "Why could CSA close?"

In [None]:
%%capture

# Install prerequisites
!pip install datasets
!pip install llama-index-embeddings-huggingface
!pip install llama-index-llms-ollama
!pip install llama-index-vector-stores-chroma
!pip install llama-index ipywidgets
!pip install llama-index-llms-huggingface
!pip install chromadb

# Import required modules from the llama_index library
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.core import StorageContext

# Import ChromaVectorStore and chromadb module
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb

# Import the Ollama class
from llama_index.llms.ollama import Ollama

In [None]:
# use capture to hide output messages
%%capture

!pip install accelerate -U
!pip install -U sentence-transformers
!pip install faiss-gpu
!pip install arxiv

import faiss
import arxiv
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE

In [None]:
%%capture
# Install the LlamaIndex core package and the file/web reader packages
# (the pip output is hidden by %%capture)
!pip install llama_index.core
!pip install llama_index.readers.file
!pip install llama_index.readers.web

## Data Loading

BBC News data

In [None]:
# Install the `datasets` library and import its loader function
!pip install datasets
from datasets import load_dataset

# Load the "RealTimeData/News_August_2023" dataset
# NOTE(review): the text above this cell says "BBC News data" — confirm this is the intended dataset
dataset = load_dataset("RealTimeData/News_August_2023")



Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/542.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.54M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5059 [00:00<?, ? examples/s]

In [None]:
# Convert the 'train' split of the dataset to a pandas DataFrame.
# Dataset.to_pandas() converts the split in one step, whereas
# pd.DataFrame(dataset['train']) makes pandas iterate over it row by row.
# NOTE: a later cell reassigns `df` from an uploaded CSV file.
df = dataset['train'].to_pandas()


In [None]:
import pandas as pd
import re  # Importing regular expressions library
from google.colab import files
import io

# Create an upload button and get the uploaded file(s), keyed by file name
uploaded_files = files.upload()

# Stop with a clear message if nothing was uploaded (for example when the
# dialog is cancelled); otherwise next(iter(...)) below would raise a bare
# StopIteration that does not explain what went wrong.
if not uploaded_files:
    raise ValueError("No file was uploaded; please upload the articles CSV file.")

# Get the file data from the uploaded files
file_name = next(iter(uploaded_files))  # This gets the name of the first uploaded file
file_data = uploaded_files[file_name]

# Read the CSV data into a DataFrame
df = pd.read_csv(io.BytesIO(file_data))

# Only keep politics data for further exploration
#df = df[df['labels'] == 'politics']

#df = df[df['labels'].isin(['politics', 'tech'])]

# Remove rows whose 'Article text' duplicates an earlier row
df = df.drop_duplicates(subset=['Article text'])

# Display the first rows of the de-duplicated DataFrame
df.head()


Saving CNN_Articels_clean.csv to CNN_Articels_clean.csv


Unnamed: 0,Index,Author,Date published,Category,Section,Url,Headline,Description,Keywords,Second headline,Article text
0,0,"Jacopo Prisco, CNN",2021-07-15 02:46:59,news,world,https://www.cnn.com/2021/07/14/world/tusimple-...,"There's a shortage of truckers, but TuSimple t...",The e-commerce boom has exacerbated a global t...,"world, There's a shortage of truckers, but TuS...","There's a shortage of truckers, but TuSimple t...","(CNN)Right now, there's a shortage of truck d..."
1,2,"Stephanie Bailey, CNN",2021-05-12 07:52:09,news,world,https://www.cnn.com/2021/05/12/world/ironhand-...,Bioservo's robotic 'Ironhand' could protect fa...,Working in a factory can mean doing the same t...,"world, Bioservo's robotic 'Ironhand' could pro...",A robotic 'Ironhand' could protect factory wor...,(CNN)Working in a factory or warehouse can me...
2,3,"Words by Stephanie Bailey, video by Zahra Jamshed",2021-06-16 02:51:30,news,asia,https://www.cnn.com/2021/06/15/asia/swarm-robo...,This swarm of robots gets smarter the more it ...,"In a Hong Kong warehouse, a swarm of autonomou...","asia, This swarm of robots gets smarter the mo...",This swarm of robots gets smarter the more it ...,"(CNN)In a Hong Kong warehouse, a swarm of aut..."
3,4,"Paul R. La Monica, CNN Business",2022-03-15 09:57:36,business,investing,https://www.cnn.com/2022/03/15/investing/brics...,Russia is no longer an option for investors. T...,"For many years, the world's most popular emerg...","investing, Russia is no longer an option for i...",Russia is no longer an option for investors. T...,"New York (CNN Business)For many years, the wor..."
4,7,Reuters,2022-03-15 11:27:02,business,business,https://www.cnn.com/2022/03/15/business/russia...,Russian energy investment ban part of new EU s...,The European Union formally approved on Tuesda...,"business, Russian energy investment ban part o...",EU bans investment in Russian energy in new sa...,The European Union formally approved on Tuesda...


In [None]:
#store each row in column 'data' in separated txt files

!mkdir -p '/content/bbc_data/' # create an empty directory called "bbc_data"

count = 0

for index, row in df.iterrows():
    data_content = row['data']
    fname = "/content/bbc_data/Output" + str(count) + ".txt"
    with open(fname, "w") as text_file:
        text_file.write(data_content)
    count += 1


## Chunking

### Semantic Splitter

In [None]:
# Load documents
reader = SimpleDirectoryReader("/content/bbc_data") # read the article files written to /content/bbc_data
docs = reader.load_data()
print(f"Loaded {len(docs)} docs")

# Initialize a HuggingFace Embedding model
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Alternative embedding model, currently disabled:
#embed_model = HuggingFaceEmbedding(model_name="TaylorAI/gte-tiny")
# Re-create the Ollama LLM client with a longer request timeout (1500 seconds)
llm = Ollama(model=OLLAMA_MODEL, request_timeout=1500.0)

# Specify the LLM and embedding model into LlamaIndex's settings
Settings.llm = llm
Settings.embed_model = embed_model

In [None]:
# Split the loaded documents into semantically coherent chunks ("nodes")
from llama_index.core.node_parser import SemanticSplitterNodeParser
from pathlib import Path  # for finding the file

# The splitter uses the embedding model configured earlier to decide where to
# break the text
parser = SemanticSplitterNodeParser(
    buffer_size=1, breakpoint_percentile_threshold=90, embed_model=embed_model
)

semantic_nodes = parser.get_nodes_from_documents(docs)

# semantic_nodes holds the resulting nodes for further processing.
# Only a short summary is printed: printing the whole list made the notebook
# server stop sending output ("IOPub data rate exceeded").
print(f"Created {len(semantic_nodes)} semantic nodes from {len(docs)} documents")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Show the text of the first semantic node as a quick sanity check
semantic_nodes[0].text

'Baron Kinnock makes Lords debut  Former Labour leader Neil Kinnock has officially been made a life peer during a ceremony in the House of Lords.  He will be known Baron Kinnock of Bedwellty - after his former constituency. Lord Kinnock - who led Labour from 1983 until 1992 - was until recently one of Britains EU commissioners. A former critic of the House of Lords, he has said he will use the Upper House to advocate its reform and to talk on issues like higher education. "I accepted the kind invitation to enter the House of Lords as a working peer for practical political reasons," he said when his peerage was first announced. "It is a good base for campaigning on national issues like education, sustainable transport, industrial change and the ageing society and global concerns, particularly poverty and oppression." During his induction into the Upper House, Lord Kinnock was accompanied by Lords Leader Baroness Amos and Baroness Royall of Blaisdon, a former aide to the ex-Labour leader

In [None]:
# Collect the plain text of every semantic node, keeping the nodes' order

# 'semantic_nodes' is the list of nodes produced by the semantic splitter
all_texts = []
for semantic_node in semantic_nodes:
    all_texts.append(semantic_node.text)

In [None]:
!mkdir -p '/content/splitted_bbc_politics_data/' # create an empty directory called "data"

count = 0

for doc in all_texts: # iterate through the results
  fname = "/content/splitted_bbc_politics_data/Output" + str(count) + ".txt"
  with open(fname, "w") as text_file:
    text_file.write(doc) # save the file
  count += 1 # increment the count

## Embedding
Now that we have our data, we will create embeddings (encodings) of the article texts using sentence-level [DistilBERT](https://huggingface.co/docs/transformers/en/model_doc/distilbert). DistilBERT is a smaller version of classic BERT, designed to have similar performance with 40% fewer parameters (and is therefore faster).

In [None]:
# Instantiate the sentence-level DistilBERT
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

# The DataFrame used in this notebook has no 'abstract' column (df.abstract
# raised AttributeError), so use the first of the known text columns that is
# actually present.
candidate_columns = ('abstract', 'data', 'Article text')
text_column = next((name for name in candidate_columns if name in df.columns), None)
if text_column is None:
    raise KeyError(
        f"None of the expected text columns {candidate_columns} were found; "
        f"available columns: {list(df.columns)}"
    )

# Convert the texts to vectors; missing values are dropped first because only
# strings can be encoded
texts = df[text_column].dropna().astype(str).to_list()
embeddings = model.encode(texts, show_progress_bar=True)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.05k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

AttributeError: 'DataFrame' object has no attribute 'abstract'

## Vector Database Setup

In [None]:
# bbc data: build (or re-open) the Chroma-backed vector index over the chunk files

# Import ChromaVectorStore and chromadb module
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb

# Initialize a HuggingFace Embedding model
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

#embed_model = HuggingFaceEmbedding(model_name="TaylorAI/gte-tiny")
llm = Ollama(model=OLLAMA_MODEL, request_timeout=1500.0)

# Specify the LLM and embedding model into LlamaIndex's settings
Settings.llm = llm
Settings.embed_model = embed_model

# Load documents
reader = SimpleDirectoryReader("/content/splitted_bbc_politics_data") # read the chunk files written earlier
docs = reader.load_data()
print(f"Loaded {len(docs)} docs")

# Create client ("db") backed by an on-disk database ("chroma_db")
db = chromadb.PersistentClient(path="./chroma_db")

# Get the collection/table ("my-demo"), creating it on the first run.
# create_collection() fails when the collection already exists in the
# persistent database, which made this cell impossible to re-run without
# renaming the collection.
chroma_collection = db.get_or_create_collection("my-demo")

# Set up ChromaVectorStore on top of the collection
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# Specify Chroma as our vector db
storage_context = StorageContext.from_defaults(vector_store=vector_store)

if chroma_collection.count() == 0:
    # Empty collection: embed the documents and store them in Chroma
    vector_index = VectorStoreIndex.from_documents(
        docs, # the chunk files loaded above
        storage_context = storage_context,
        embed_model = embed_model
    )
else:
    # The collection already holds embeddings from an earlier run: reuse them
    # instead of inserting every chunk a second time. Delete ./chroma_db (or
    # change the collection name) to force a rebuild after the data changes.
    vector_index = VectorStoreIndex.from_vector_store(
        vector_store,
        embed_model = embed_model
    )

# Print the metadata
print(chroma_collection)

# Print the name of the collection (table)
print(f'Collection name is: {chroma_collection.name}')

Loaded 1397 docs
name='my-demo3' id=UUID('f6d73a74-5660-49fc-b316-229bf8af1b65') metadata=None tenant='default_tenant' database='default_database'
Collection name is: my-demo3


## Prompt Template Setup

In [None]:
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core import ChatPromptTemplate

# User-side prompt: the retrieved context between two separator lines, followed
# by the question. {context_str} and {query_str} are the template placeholders.
qa_prompt_str = "".join(
    [
        "Below is the context information.\n",
        "---------------------\n",
        "{context_str}\n",
        "---------------------\n",
        "Given the context information and not prior knowledge, ",
        "answer the question: {query_str}\n",
    ]
)

# System-side instruction: tell the model to admit when the context has no answer
system_instruction = "Please just say 'I don't know the answer' if the answer is not provided in the context."

# Text QA Prompt: system message first, then the user message
system_message = ChatMessage(role=MessageRole.SYSTEM, content=system_instruction)
user_message = ChatMessage(role=MessageRole.USER, content=qa_prompt_str)
chat_text_qa_msgs = [system_message, user_message]

text_qa_template = ChatPromptTemplate(chat_text_qa_msgs)

## Query Testing

In [None]:
# Exploratory query in "refine" response mode; no custom prompt template is passed here
# (the author marked this cell as one to ignore)
query_engine = vector_index.as_query_engine(response_mode="refine") # use the vector db for queries
response = query_engine.query("Who is Neil Kinnock?") # query the LLM configured via OLLAMA_MODEL, with retrieved context
response.response # display the answer text as the cell output

' Neil Kinnock is a former British politician who served as the leader of the Labour Party from 1993 to 1997. He was born in Tredegar, South Wales in 1942 and went on to become an influential figure in British politics. In recent years, he has been advocating for reform of the House of Lords and expressing his views on important political issues such as higher education, sustainable transport, industrial change, and the ageing society.\n'

In [None]:
# Show the metadata attached to the previous response
# (the recorded output lists file details of the chunk files it refers to)
response.metadata

{'ef1adacd-506f-44d6-8fac-fa8ddf9db24a': {'file_path': '/content/splitted_bbc_politics_data/Output0.txt',
  'file_name': 'Output0.txt',
  'file_type': 'text/plain',
  'file_size': 1378,
  'creation_date': '2024-05-18',
  'last_modified_date': '2024-05-18'},
 'bad518a5-6d3f-479d-b8b0-d6d3166a5903': {'file_path': '/content/splitted_bbc_politics_data/Output1382.txt',
  'file_name': 'Output1382.txt',
  'file_type': 'text/plain',
  'file_size': 450,
  'creation_date': '2024-05-18',
  'last_modified_date': '2024-05-18'}}

In [None]:
# Test1: Answerable question (politics)
# Build a query engine that uses the custom QA prompt, run the question, then print the answer
test1_engine = vector_index.as_query_engine(
    text_qa_template=text_qa_template,
    llm=llm,
)
test1_response = test1_engine.query("Why could CSA close?")
print(test1_response)

 The Child Support Agency (CSA) may close because it has failed to improve its service and is facing criticism from a report by the Commons work and pensions committee.



In [None]:
# Test2: Not answerable question
# Same engine configuration as Test1, asked about a topic unrelated to politics news
test2_engine = vector_index.as_query_engine(
    text_qa_template=text_qa_template,
    llm=llm,
)
test2_response = test2_engine.query("What are the most famous plays written by William Shakespeare?")
print(test2_response)

 I do not have access to the specific list of plays written by william shakespeare. however, some of his most famous plays include "romeo and juliet," "hamlet," "macbeth," "a Midsummer Night's Dream," "poetry," and many more.



In [None]:
# Test3: Complex question
# Uses the custom QA prompt together with the "refine" response mode
test3_engine = vector_index.as_query_engine(
    response_mode="refine",
    text_qa_template=text_qa_template,
    llm=llm,
)
test3_response = test3_engine.query("What are the main issues that the Muslim Association of Britain (MAB) believes Muslims should consider before voting in the next general election?")
print(test3_response)

 The Muslim Association of Britain (MAB) believes there are several main issues that should be considered by Muslim voters before casting their votes in the next general election. These include: 
1. The war on Iraq, which is expected to have a significant impact due to its influence on voting intentions and the number of seats Muslims could potentially sway. 
2. The Palestinian situation, an issue that holds importance for many Muslim voters and may affect their decision-making process. 
3. The erosion of civil liberties for Muslims in the United Kingdom, which is another factor worth considering while making informed choices during the election. 
4. Economic, social, and educational problems faced by the UK as a whole, regardless of one's religious background. It is crucial to be aware of these issues when casting votes to shape the future of the nation.

