In [2]:
from dotenv import load_dotenv
import os

# Pull key/value pairs from a local .env file into the process environment.
load_dotenv()

# Read the credentials used later in the notebook; each is None when the
# corresponding variable is not set.
huggingface_token = os.environ.get('HUGGINGFACE_TOKEN')
openai_key = os.environ.get('OPENAI_KEY')

In [3]:
from langchain.embeddings import HuggingFaceEmbeddings
# Embedding model shared by the indexing and query cells below.
# `normalize_embeddings` is forwarded to the encoder via encode_kwargs.
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-large-en",
    encode_kwargs=dict(normalize_embeddings=True),
)

  embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en", encode_kwargs={'normalize_embeddings': True})


In [5]:
from datasets import load_dataset

# Fetch the 'CC-MAIN-2013-20' configuration of the BBC news dataset from the
# Hugging Face hub.
bbc_news_collection = load_dataset(
    'permutans/fineweb-bbc-news',
    'CC-MAIN-2013-20',
)

# Show the splits / features / row counts as a sanity check.
print(bbc_news_collection)

DatasetDict({
    train: Dataset({
        features: ['url', 'text'],
        num_rows: 179829
    })
})


In [9]:
import os

# Per-split row counts of the loaded dataset (a mapping keyed by split name).
num_entries = bbc_news_collection.num_rows
print(num_entries)

# Folder that will receive one JSON file per sampled article; created on
# demand, and left alone if it is already there.
output_dir = './Week_2/Day_3/BBC_News_Files'
os.makedirs(output_dir, exist_ok=True)

{'train': 179829}


In [10]:
import random
import json
from tqdm import tqdm

# How many articles to sample, and the seed that makes the sample repeatable
# across "Restart & Run All". Set SAMPLE_SEED to None for a fresh sample on
# every run.
NUM_SAMPLES = 100
SAMPLE_SEED = 42

# Never ask for more rows than the split holds (random.sample would raise).
# A dedicated Random instance keeps the global `random` state untouched.
total_rows = num_entries['train']
sample_size = min(NUM_SAMPLES, total_rows)
random_indices = random.Random(SAMPLE_SEED).sample(range(total_rows), sample_size)

# Extract the news items (each is a dict with at least 'url' and 'text').
random_news_items = [bbc_news_collection['train'][i] for i in random_indices]

# Embed each article and save it as its own JSON file.
# tqdm wraps the list itself (not the enumerate object) so it knows the total
# and can render a real progress bar with an ETA.
for idx, news_item in enumerate(tqdm(random_news_items, desc='Embedding news items'), start=1):
    # Store the embedding on the item; the ChromaDB cell reads it back from
    # the 'baai_embedding' key.
    news_item['baai_embedding'] = embeddings.embed_query(news_item['text'])
    file_path = os.path.join(output_dir, f'news_item_{idx}.json')
    with open(file_path, 'w') as f:
        json.dump(news_item, f)

print(f'Saved {len(random_news_items)} random news items to {output_dir}')

100it [00:48,  2.05it/s]

Saved 100 random news items to ./Week_2/Day_3/BBC_News_Files





In [11]:
from chromadb import PersistentClient
from chromadb.config import Settings

# Open the on-disk Chroma store under ./chromaDB with resets enabled, then
# call reset() so the cell starts from a clean store on every run.
chroma_settings = Settings(allow_reset=True)
client = PersistentClient(path='./chromaDB', settings=chroma_settings)
client.reset()

# Create the 'bbc_news' collection.
# NOTE: this rebinds `bbc_news_collection`, which until now referred to the
# Hugging Face dataset; from here on it is the Chroma collection.
bbc_news_collection = client.create_collection(name="bbc_news")

# Show the collection as a sanity check.
print(bbc_news_collection)

Collection(name=bbc_news)


In [12]:
# Add every sampled news item to the ChromaDB collection in a single batched
# call (one call instead of one per item). IDs follow the same 1-based
# `news_item_<n>` scheme as the JSON files written earlier, so each record's
# metadata can point at its file on disk.
item_ids = [f'news_item_{n}' for n in range(1, len(random_news_items) + 1)]

# Skip the call entirely when there is nothing to add.
if random_news_items:
    bbc_news_collection.add(
        ids=item_ids,
        documents=[item['text'] for item in random_news_items],
        # Reuse the embeddings computed when the JSON files were written.
        embeddings=[item['baai_embedding'] for item in random_news_items],
        metadatas=[
            {
                'json_file_path': os.path.join(output_dir, f'{item_id}.json'),
                'url': item['url'],
            }
            for item_id, item in zip(item_ids, random_news_items)
        ],
    )

print(f'Saved {len(random_news_items)} random news items to the ChromaDB collection with metadata.')

Saved 100 random news items to the ChromaDB collection with metadata.


In [13]:
# Function to get the top matching documents (3 by default) for a user query
def search_query(query, n_results=3):
    """Return the closest stored documents for ``query``.

    The query text is embedded with the notebook-level ``embeddings`` model
    and the resulting vector is used to search ``bbc_news_collection``.

    Args:
        query: Free-text question or search phrase.
        n_results: How many matches to request from the collection.
            Defaults to 3.

    Returns:
        The result object produced by ``bbc_news_collection.query`` for a
        single query embedding; the example cell reads its ``'documents'``
        and ``'metadatas'`` entries at index 0.
    """
    # Create the embedding for the query
    query_embedding = embeddings.embed_query(query)

    # Search through the ChromaDB vector store; the embedding is wrapped in a
    # list because the query API takes a batch of embeddings.
    results = bbc_news_collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results
    )

    return results

# Example: run one question through the retriever.
user_query = "How was the voter turnout during the elections in Ireland?"
top_matches = search_query(user_query)

# Raw result object, for inspection.
print(top_matches)

# Results are nested per query; only one query was sent, so take index 0.
matched_documents = top_matches['documents'][0]
matched_metadatas = top_matches['metadatas'][0]

# Human-readable listing: article text followed by its source URL.
print("Top matches:")
for rank, (doc_text, doc_meta) in enumerate(zip(matched_documents, matched_metadatas), start=1):
    print(f"Match {rank}: {doc_text}")
    print(f"URL: {doc_meta['url']}")
    print("\n")

{'ids': [['news_item_27', 'news_item_16', 'news_item_36']], 'embeddings': None, 'documents': [['Turnout during the Irish vote was being watched intently across Europe\nPeople in the Republic of Ireland have voted in a referendum on whether to ratify the EU reform treaty.\nThe BBC\'s Jonny Dymond in Dublin says all eyes are on the turnout, as a low figure would suggest a rejection which could plunge the EU into crisis.\nSome reports suggest Thursday\'s voter turnout was about 40%. Results are expected later on Friday.\nAll 27 member states have to ratify the treaty for it to take effect, but only Ireland has held a public vote on it.\nThe treaty is aimed at streamlining decision-making in the EU to cope with its expansion into Eastern Europe and would reduce countries\' veto powers.\nThe Lisbon Treaty replaces a more ambitious draft constitution that was rejected by French and Dutch voters in 2005.\nOpinion polls suggesting the referendum\'s result is too close to call, despite a high-p

In [14]:
# Concatenate the retrieved articles into one context string, each article
# wrapped in a leading and trailing newline.
context = "".join(f"\n{doc_text}\n" for doc_text in top_matches['documents'][0])

print(context)


Turnout during the Irish vote was being watched intently across Europe
People in the Republic of Ireland have voted in a referendum on whether to ratify the EU reform treaty.
The BBC's Jonny Dymond in Dublin says all eyes are on the turnout, as a low figure would suggest a rejection which could plunge the EU into crisis.
Some reports suggest Thursday's voter turnout was about 40%. Results are expected later on Friday.
All 27 member states have to ratify the treaty for it to take effect, but only Ireland has held a public vote on it.
The treaty is aimed at streamlining decision-making in the EU to cope with its expansion into Eastern Europe and would reduce countries' veto powers.
The Lisbon Treaty replaces a more ambitious draft constitution that was rejected by French and Dutch voters in 2005.
Opinion polls suggesting the referendum's result is too close to call, despite a high-profile "Yes" campaign led by Prime Minister Brian Cowen which had the support of most of the country's mai

In [18]:
import openai
# Set up the OpenAI API key.
# The key is read from OPENAI_KEY (not OPENAI_API_KEY), and an explicitly
# constructed client does not read the module-level `openai.api_key`, so the
# key is passed to the constructor directly. The module-level assignment is
# kept for any code that relies on it.
# NOTE: this rebinds `client`, which previously held the ChromaDB client.
openai.api_key = openai_key
client = openai.Client(api_key=openai_key)

# Function to query OpenAI
def query_with_context(query, context, model="gpt-4o-mini", temperature=0.7, max_tokens=250):
    """Answer ``query`` with an OpenAI chat model, grounded in ``context``.

    Sends a two-message chat to the notebook-level ``client``: an instruction
    message telling the model to answer only from the supplied context, and a
    user message carrying the query and the context.

    Args:
        query: The user's question.
        context: Text the answer should be based on (e.g. retrieved articles).
        model: Chat model name. Defaults to ``"gpt-4o-mini"``.
        temperature: Sampling temperature. Defaults to 0.7.
        max_tokens: Upper bound on generated tokens. Defaults to 250.

    Returns:
        The content of the first choice's message in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "developer", "content": "You are a helpful assistant. You will be given a query in the following format - 'query:<query_text>, context:<contextual information>'. Answer the query only on the basis of the contextual information."},
            {"role": "user", "content": f"'query':{query}, \n'context':{context}"}
        ],
        temperature=temperature,
        max_tokens=max_tokens,
    )

    return response.choices[0].message.content

# Example: answer the sample question from the retrieved context via OpenAI,
# then print a heading followed by the answer on the next line.
response_openai = query_with_context(query=user_query, context=context)
print("Generated Response using OpenAI API:", response_openai, sep="\n")

Generated Response using OpenAI API:
The voter turnout during the elections in Ireland was reported to be about 40%. Reports indicated that turnout was being closely monitored, as a low turnout could suggest a rejection of the referendum on the EU reform treaty. Some areas experienced brisk voting, while others had slower turnout.


In [20]:
import ollama

# Function to query a local Ollama model (llama3.2:3b by default)
def query_with_context_ollama(query, context, model="llama3.2:3b"):
    """Answer ``query`` with an Ollama-served model, grounded in ``context``.

    Builds a single user prompt containing the context followed by the query
    and sends it through ``ollama.chat``.

    Args:
        query: The user's question.
        context: Text the answer should be based on (e.g. retrieved articles).
        model: Name of the Ollama model to use. Defaults to ``"llama3.2:3b"``.

    Returns:
        The ``"content"`` of the ``"message"`` entry in the chat response.
    """
    prompt = f"""
    You are provided with the following context.
    Context: {context}

    Based on the above context, answer the user's query as accurately as possible.
    Query: {query}
    """

    response = ollama.chat(
        model=model,
        messages=[{
            "role": "user",
            "content": prompt
        }]
    )
    return response["message"]["content"]

# Example: answer the same question from the same context via Ollama, then
# print a heading followed by the answer on the next line.
response_ollama = query_with_context_ollama(query=user_query, context=context)
print("Generated Response using Ollama API:", response_ollama, sep="\n")

Generated Response using Ollama API:
According to the BBC report, the voter turnout during the referendum in Ireland was approximately 40%. This low turnout would suggest a rejection of the EU reform treaty, which could have significant implications for the EU. However, it's worth noting that the results are still being counted and will be confirmed later on Friday morning.


0
