# **Installing the required libraries**

In [1]:
!pip install llama-index

Collecting llama-index
  Downloading llama_index-0.12.11-py3-none-any.whl.metadata (12 kB)
Collecting llama-index-agent-openai<0.5.0,>=0.4.0 (from llama-index)
  Downloading llama_index_agent_openai-0.4.2-py3-none-any.whl.metadata (727 bytes)
Collecting llama-index-cli<0.5.0,>=0.4.0 (from llama-index)
  Downloading llama_index_cli-0.4.0-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.13.0,>=0.12.11 (from llama-index)
  Downloading llama_index_core-0.12.11-py3-none-any.whl.metadata (2.5 kB)
Collecting llama-index-embeddings-openai<0.4.0,>=0.3.0 (from llama-index)
  Downloading llama_index_embeddings_openai-0.3.1-py3-none-any.whl.metadata (684 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.4.0 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.6.3-py3-none-any.whl.metadata (3.8 kB)
Collecting llama-index-llms-openai<0.4.0,>=0.3.0 (from llama-index)
  Downloading llama_index_llms_openai-0.3.13-py3-none-any.whl.metadata (3.3 kB)
Collec

In [2]:
!pip install -q transformers>=4.41.0 einops accelerate langchain bitsandbytes sentence-transformers
!pip install sentence-transformers
%pip install llama-index-llms-huggingface
%pip install llama-index-llms-huggingface-api
!pip install langchain_community
%pip install llama-index-embeddings-huggingface
%pip install llama-index-embeddings-instructor
%pip install llama-index-embeddings-langchain
!pip install llama-index-readers-file


Collecting llama-index-llms-huggingface
  Downloading llama_index_llms_huggingface-0.4.2-py3-none-any.whl.metadata (2.9 kB)
Collecting text-generation<0.8.0,>=0.7.0 (from llama-index-llms-huggingface)
  Downloading text_generation-0.7.0-py3-none-any.whl.metadata (8.5 kB)
Downloading llama_index_llms_huggingface-0.4.2-py3-none-any.whl (11 kB)
Downloading text_generation-0.7.0-py3-none-any.whl (12 kB)
Installing collected packages: text-generation, llama-index-llms-huggingface
Successfully installed llama-index-llms-huggingface-0.4.2 text-generation-0.7.0
Collecting llama-index-llms-huggingface-api
  Downloading llama_index_llms_huggingface_api-0.3.1-py3-none-any.whl.metadata (1.3 kB)
Downloading llama_index_llms_huggingface_api-0.3.1-py3-none-any.whl (4.8 kB)
Installing collected packages: llama-index-llms-huggingface-api
Successfully installed llama-index-llms-huggingface-api-0.3.1
Collecting langchain_community
  Downloading langchain_community-0.3.14-py3-none-any.whl.metadata (2.9 kB

In [3]:
!pip install llama-index --upgrade




In [4]:
!pip install Flask




In [5]:
!pip install flask-ngrok pyngrok


Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok, flask-ngrok
Successfully installed flask-ngrok-0.0.25 pyngrok-7.2.3


In [6]:
!pip install flask-ngrok
!pip install flask-cors
!pip install flask
!pip install pyngrok


Collecting flask-cors
  Downloading Flask_Cors-5.0.0-py2.py3-none-any.whl.metadata (5.5 kB)
Downloading Flask_Cors-5.0.0-py2.py3-none-any.whl (14 kB)
Installing collected packages: flask-cors
Successfully installed flask-cors-5.0.0


In [7]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, ServiceContext
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
from llama_index.core.prompts.prompts import SimpleInputPrompt

# **Data Retrieval**

In [8]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

#API key
api_key = "387f971f-0b7e-4bf2-9a5c-2bba3a2c638f"

#Particular Sections to fetch
sections = ["technology", "science", "sports", "health", "education"]

#Fetching Articles of last 1 month
from_date = (datetime.now() - timedelta(days=30)).strftime("%Y-%m-%d")
to_date = datetime.now().strftime("%Y-%m-%d")

#Endpoint URL
base_url = "https://content.guardianapis.com/search"

# Function to Fetch Articles
def fetch_articles(section, page=1):
    params = {
        "api-key": api_key,
        "section": section,
        "from-date": from_date,
        "to-date": to_date,
        "show-fields": "body",
        "page-size": 50,
        "page": page
    }
    response = requests.get(base_url, params=params)
    if response.status_code == 200:
        return response.json().get("response", {})
    else:
        print(f"Failed to fetch articles for section {section}. Status code: {response.status_code}")
        return {}

In [9]:
# Fetching and Agregating the Articles
all_articles = []

for section in sections:
    print(f"Fetching articles from section: {section}")
    page = 1
    while True:
        data = fetch_articles(section, page)
        results = data.get("results", [])
        if not results:
            break
        all_articles.extend(results)
        if not data.get("pages", 1) > page:
            break
        page = page+1

Fetching articles from section: technology
Fetching articles from section: science
Fetching articles from section: sports
Fetching articles from section: health
Fetching articles from section: education


In [10]:
#Processing the Articles
if all_articles:
    articles_data = []
    for article in all_articles:
        title = article.get("webTitle", "N/A")
        section = article.get("sectionName", "N/A")
        url = article.get("webUrl", "N/A")
        published_date = article.get("webPublicationDate", "N/A")
        content = article.get("fields", {}).get("body", "Content not available")

        articles_data.append({
            "Title": title,
            "Section": section,
            "URL": url,
            "Published Date": published_date,
            "Content": content
        })


In [11]:
#Converting the data to DatafRame
import pandas as pd
df = pd.DataFrame(articles_data)
print(f"Fetched {len(df)} articles from the past month.")
print(df.head())  # Display the first few articles


Fetched 224 articles from the past month.
                                               Title     Section  \
0  AI could destroy democracy as we know it | Letter  Technology   
1  Who banned TikTok? Politicians toss culpabilit...  Technology   
2  Should I be worried about my obsessive TikTok ...  Technology   
3              Would you let AI choose your outfits?  Technology   
4  ‘Young women can fall pregnant very easily’: i...  Technology   

                                                 URL        Published Date  \
0  https://www.theguardian.com/technology/2025/ja...  2025-01-19T17:24:52Z   
1  https://www.theguardian.com/technology/2025/ja...  2025-01-19T14:00:06Z   
2  https://www.theguardian.com/technology/2025/ja...  2025-01-19T13:00:02Z   
3  https://www.theguardian.com/technology/2025/ja...  2025-01-19T13:00:02Z   
4  https://www.theguardian.com/technology/2025/ja...  2025-01-19T10:00:03Z   

                                             Content  
0  <p>Your editorials and

In [12]:
df.shape

(224, 5)

# **Text Processing**

**Parsing HTML content**

In [13]:
#Creating a function to parse HTML content and extrating text
def parse_html_content(html_content):
    return BeautifulSoup(html_content, "html.parser").get_text(separator=' ', strip=True)

#Applying the fn to Content column
df["Content"] = df["Content"].apply(parse_html_content)

print(df['Content'].head(2))

0    Your editorials and articles about AI, includi...
1    The United States of America deleted TikTok ea...
Name: Content, dtype: object


**Removing StopWords from content**

In [14]:
#Creating a function to remove StopWords from the Content column
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

#Fn to remove stopwords from Content column
def remove_stopwords(text):
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

#Applying the fn to remove stopwords from Content column
df["Content"] = df["Content"].apply(remove_stopwords)
df['Content'].head(2)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Unnamed: 0,Content
0,"editorials articles AI , including Rafael Behr..."
1,United States America deleted TikTok early mor...


In [15]:
#Creating a fn to convert Content text to lower case
def clean_text(text):
  text = text.lower()
  return text

df['Content'] = df['Content'].apply(clean_text)
df['Content'].head(2)

Unnamed: 0,Content
0,"editorials articles ai , including rafael behr..."
1,united states america deleted tiktok early mor...


**Extracting Metadata and storing all the content and metadata in a variable named document**

In [16]:
from llama_index.core import Document
document = []
for index, row in df.iterrows():
  parsed_text = BeautifulSoup(row['Content'], 'html.parser').get_text(separator=' ', strip=True)
  document.append(Document(
  text=parsed_text,
  metadata={
      "Title": row['Title'],
      "Section": row['Section'],
      "Url": row['URL'],
      "date": row['Published Date']
      }
  ))

In [17]:
document[0:2]

[Document(id_='662d110f-e127-4f74-adc6-c1ca1eb7821a', embedding=None, metadata={'Title': 'AI could destroy democracy as we know it | Letter', 'Section': 'Technology', 'Url': 'https://www.theguardian.com/technology/2025/jan/19/ai-could-destroy-democracy-as-we-know-it', 'date': '2025-01-19T17:24:52Z'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='editorials articles ai , including rafael behr ’ piece ( keir starmer right gamble ai revolution , might pay time , 15 january ) , thoughtful contributions debate fifth industrial revolution . much considered democracies might govern ai . little , however , written elephant room : labour markets transformed ai affect democratic governance . since second industrial revolution late 19th century , prevailing national political superstructure industrial capitalism global north , apart interlu

# **Creating Search Workflow using LLAMA INDEX + RAG + LANGCHAIN for Content-based filtering**

**Setting up System prompt and the format supported by LLM (LLama2)**

In [18]:
system_prompt = """
You are a recommendation search assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided.
For each result, provide the article title, URL, publication date, and a brief summary. Ensure the metadata is clearly displayed for reference.
"""

#default format supported by llama2
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

**Hugging Face api login**

In [19]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: fineGrained).
The token `sssss` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authen

**LLM MODEL (LLama2)**

In [20]:
import torch

llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=500,
    generate_kwargs={"temperature": 0.5, "do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
    model_name="meta-llama/Llama-2-7b-chat-hf",
    device_map="auto",
    stopping_ids=[50278, 50279, 50277, 1, 0],
    tokenizer_kwargs={"max_length": 4096},
    model_kwargs={"torch_dtype": torch.float16}
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

**Embeddings**

In [21]:
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding

lc_embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
embed_model = LangchainEmbedding(lc_embed_model)

  lc_embed_model = HuggingFaceEmbeddings(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

**Settings**

In [22]:
from llama_index.core import ServiceContext
from llama_index.core import Settings
from llama_index.core.node_parser import SentenceSplitter

In [23]:
node_parser = SentenceSplitter(chunk_size=900, chunk_overlap=50)

In [24]:
Settings.llm = llm
Settings.embed_model = embed_model
Settings.node_parser = node_parser


In [25]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Settings

Settings.transformations = [SentenceSplitter(chunk_size=900)]

**Indexing**

In [26]:
index = VectorStoreIndex.from_documents(
    document, embed_model=embed_model
)

query_engine = index.as_query_engine(llm=llm)

In [None]:
# Save the embeddings object using pickle
import pickle
with open('vector_store_index.pkl', 'wb') as file:
    pickle.dump(index, file)




In [None]:
import pickle

# Load the index object from the file
with open('/content/vector_store_index.pkl', 'rb') as file:
    loaded_index = pickle.load(file)


**Asking search recommendations from the bot**

In [27]:
response=query_engine.query("Retrieve top articles related to 'Sentiment analysis in finance'.")
print(response)






Based on the provided context information, the top articles related to 'Sentiment analysis in finance' are:

1. "NVIDIA's AI Chips Dominate Stock Market, Triple Share Price Since Start of 2024"
2. "Google's AI-Generated Search Results: A Game-Changer or a Recipe for Disaster?"
3. "British Novelists Criticize Government Over AI 'Theft'"
4. "AI Podcasting Software: The Next $50 Billion Market?"
5. "Sentiment Analysis in Finance: AI's Next Frontier"

These articles provide insights into the latest developments and trends in the field of sentiment analysis, including the growing influence of AI in finance, the potential of AI-generated search results, and the ongoing debate over the use of AI in creative fields.
