In [1]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Get the HUGGINGFACE_TOKEN
huggingface_token = os.getenv('HUGGINGFACE_TOKEN')
openai_key = os.getenv('OPENAI_KEY')

In [2]:
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en", encode_kwargs={'normalize_embeddings': True})

  embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en", encode_kwargs={'normalize_embeddings': True})


In [3]:
from datasets import load_dataset

# Load the dataset from Hugging Face
bbc_news_collection = load_dataset('permutans/fineweb-bbc-news', 'CC-MAIN-2013-20')

# Print the dataset to verify
print(bbc_news_collection)

DatasetDict({
    train: Dataset({
        features: ['url', 'text'],
        num_rows: 179829
    })
})


In [4]:
import os

# Create the directory if it doesn't exist
output_dir = './Week_2/Day_3/BBC_News_Files'
os.makedirs(output_dir, exist_ok=True)

#get number of entries in bbc_news_collection
num_entries = bbc_news_collection.num_rows
print(num_entries)

{'train': 179829}


In [5]:
import random
import json
from tqdm import tqdm

# Select 100 random indices
random_indices = random.sample(range(num_entries['train']), 100)

# Extract the news items
random_news_items = [bbc_news_collection['train'][i] for i in random_indices]

# Save each news item as a separate JSON file
for idx, news_item in tqdm(enumerate(random_news_items)):
    #create an embedding for news_item.text
    #add that as BAAI embedding to the news_item
    news_item['baai_embedding'] = embeddings.embed_query(news_item['text'])
    file_path = os.path.join(output_dir, f'news_item_{idx + 1}.json')
    with open(file_path, 'w') as f:
        json.dump(news_item, f)

print(f'Saved 100 random news items to {output_dir}')

100it [00:33,  2.97it/s]

Saved 100 random news items to ./Week_2/Day_3/BBC_News_Files





In [7]:
from chromadb import PersistentClient
from chromadb.config import Settings

# Create a new client for chromaDB
client = PersistentClient(path='./chromaDB',settings=Settings(allow_reset=True))
client.reset()

# Create a new collection called 'bbc_news'
bbc_news_collection = client.create_collection(name="bbc_news")

# Print the collection to verify
print(bbc_news_collection)

Collection(name=bbc_news)


In [13]:
# Iterate over the random news items and add them to the ChromaDB collection
for idx, news_item in enumerate(random_news_items):
    file_path = os.path.join(output_dir, f'news_item_{idx + 1}.json')
    metadata = {
        'json_file_path': file_path,
        'url': news_item['url']
    }
    bbc_news_collection.add(
        ids=f'news_item_{idx + 1}',
        documents=[news_item['text']],
        embeddings=[news_item['baai_embedding']],
        metadatas=[metadata]
    )

print(f'Saved 100 random news items to the ChromaDB collection with metadata.')

Saved 100 random news items to the ChromaDB collection with metadata.


In [14]:
# Function to get the top 3 matching documents for a user query
def search_query(query):
    # Create the embedding for the query
    query_embedding = embeddings.embed_query(query)
    
    # Search through the ChromaDB vector store
    results = bbc_news_collection.query(
        query_embeddings=[query_embedding],
        n_results=3
    )
    
    return results

# Example usage
user_query = "Do you now anything about bulgaria sucide bombing case?"
top_matches = search_query(user_query)

print(top_matches)

# Print the top matches
print("Top matches:")
for i, (document, metadata) in enumerate(zip(top_matches['documents'][0], top_matches['metadatas'][0])):
    print(f"Match {i+1}: {document}")
    print(f"URL: {metadata['url']}")
    print("\n")

{'ids': [['news_item_72', 'news_item_82', 'news_item_89']], 'embeddings': None, 'documents': [['Bulgaria blast: \'Suicide bomber\' killed Israelis\nThe bombing of an Israeli tourist bus in eastern Bulgaria was probably carried out by a male suicide bomber with fake US documents, officials say.\nAt least seven people died and 34 were injured when the bus exploded at Burgas airport, by the Black Sea.\nIsrael has sent planes to Burgas with doctors and officials to bring back the dead and injured.\nIsraeli Defence Minister Ehud Barak said Lebanese Hezbollah was the direct perpetrator, under Iran\'s auspices.\nFive tourists died along with the Bulgarian bus driver and the suspected bomber. Officials had said a sixth Israeli died overnight but this was later corrected.\nThe BBC\'s Jon Donnison, in Jerusalem, says the attack could be part of a covert but violent war between Israel and Iran, and there is a view among some analysts that this attack could be a response to a series of recent atta

In [15]:
context = ""

for i, document in enumerate(top_matches['documents'][0]):
   context += f"\n{document}\n"
   
print(context)


Bulgaria blast: 'Suicide bomber' killed Israelis
The bombing of an Israeli tourist bus in eastern Bulgaria was probably carried out by a male suicide bomber with fake US documents, officials say.
At least seven people died and 34 were injured when the bus exploded at Burgas airport, by the Black Sea.
Israel has sent planes to Burgas with doctors and officials to bring back the dead and injured.
Israeli Defence Minister Ehud Barak said Lebanese Hezbollah was the direct perpetrator, under Iran's auspices.
Five tourists died along with the Bulgarian bus driver and the suspected bomber. Officials had said a sixth Israeli died overnight but this was later corrected.
The BBC's Jon Donnison, in Jerusalem, says the attack could be part of a covert but violent war between Israel and Iran, and there is a view among some analysts that this attack could be a response to a series of recent attacks on Iranian nuclear scientists.
"All the signs lead to Iran," Israeli Prime Minister Benjamin Netanyah

In [16]:
import openai
# Set up the OpenAI API key
openai.api_key = openai_key
client = openai.Client()

# Function to query OpenAI
def query_with_context(query, context):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "developer", "content": "You are a helpful assistant. You will given a query in following format - '{query:<query_text>, context:<contextual information>'. Answer the query only on the basis of the contectual information."},
            {"role": "user", "content": f"'query':{query}, \n'context':{context}"}
        ],
        temperature=0.7,
        max_tokens=250,
    )

    return response.choices[0].message.content

# Example usage
response_openai = query_with_context(user_query, context)
print("Generated Response using OpenAI API:")
print(response_openai)

Generated Response using OpenAI API:
The Bulgaria suicide bombing case refers to the attack on an Israeli tourist bus at Burgas airport, which resulted in at least seven deaths and 34 injuries. Officials believe the bombing was executed by a male suicide bomber who carried fake US documents. Israeli Defence Minister Ehud Barak has stated that Lebanese Hezbollah was the direct perpetrator of the attack, suggesting it was conducted under Iran's auspices. The explosion occurred shortly after tourists from Israel boarded the bus. The incident has been interpreted as part of a broader conflict between Israel and Iran, possibly in retaliation for previous attacks on Iranian nuclear scientists. The suspected bomber was captured on CCTV footage walking around the terminal before the explosion. The Israeli government has indicated that they will respond forcefully to what they characterize as Iranian terror.


In [27]:
import ollama

# Function to query Ollama Gemma
def query_with_context_ollama(query, context):
    prompt = f"""
    You are provided with the following context.
    Context: {context}

    Based on the above context, answer the user's query as accurately as possible.
    Query: {query}
    """
    
    response = ollama.chat(
        model="gemma:2b",
        messages=[{
            "role": "user",
            "content": prompt
        }]
    )

    print(context)
    return response["message"]["content"]

# Example usage
response_ollama = query_with_context_ollama(user_query, context)
print("Generated Response using Ollama API:")
print(response_ollama)


Bulgaria blast: 'Suicide bomber' killed Israelis
The bombing of an Israeli tourist bus in eastern Bulgaria was probably carried out by a male suicide bomber with fake US documents, officials say.
At least seven people died and 34 were injured when the bus exploded at Burgas airport, by the Black Sea.
Israel has sent planes to Burgas with doctors and officials to bring back the dead and injured.
Israeli Defence Minister Ehud Barak said Lebanese Hezbollah was the direct perpetrator, under Iran's auspices.
Five tourists died along with the Bulgarian bus driver and the suspected bomber. Officials had said a sixth Israeli died overnight but this was later corrected.
The BBC's Jon Donnison, in Jerusalem, says the attack could be part of a covert but violent war between Israel and Iran, and there is a view among some analysts that this attack could be a response to a series of recent attacks on Iranian nuclear scientists.
"All the signs lead to Iran," Israeli Prime Minister Benjamin Netanyah

In [None]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.vectorstores import Chroma

# Initialize the vector store (ChromaDB in this case)
vector_store = Chroma(persist_directory='./chromaDB', embedding_function=embeddings.embed_query)

# Initialize the language model (OpenAI in this case)
llm = OpenAI(api_key=openai_key)

# Create the RetrievalQA chain
rag_chain = RetrievalQA(
    retriever=vector_store.as_retriever(),
    llm=llm,
    return_source_documents=True
)

# Function to perform RAG-based query
def rag_query(query):
    response = rag_chain(query)
    return response

# Example usage
query = "Do you know anything about Bulgaria suicide bombing case?"
response = rag_query(query)

print("Generated Response:")
print(response['result'])

print("\nSource Documents:")
for doc in response['source_documents']:
    print(doc)

  llm = OpenAI(api_key=openai_key)


AttributeError: 'OpenAI' object has no attribute 'as_retriever'

0
