In [22]:
import json
import re
import pandas as pd
from rank_bm25 import BM25Okapi
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import pipeline

In [2]:
#Load the json file
file_path = 'news.article.json'

In [3]:
with open(file_path, 'r', encoding='utf-8') as file:
    articles = json.load(file)

In [4]:
articles

[{'articleBody': 'Sanjay Raut, a member of the Shiv Sena (UBT) party, responded to the Maharashtra chief minister\'s statement that Eknath Shinde "himself is Hamas" and that the Shiv Sena group led by Uddhav Thackeray is capable of collaborating with "Hamas and Lashkar-e-Taiba for their own selfishness" on Wednesday by claiming that Eknath Shinde is Hamas.\n\n\n\nRaut made fun of Shinde by claiming, "He himself is Hamas. Hamas and Lashkar-e-Taiba, two terrorist groups, are completely irrelevant in Maharashtra. But the BJP is to blame for sowing the worms in their (the Shinde faction\'s) thoughts, said Raut.\n\nWhen Shinde made a statement at the Tuesday Dussehra rally in Mumbai\'s Azad Maidan, Raut reacted to it. As part of the opposition alliance INDIA, Uddhav Thackeray\'s Shiv Sena (UBT) has formed an alliance with Congress and the Samajwadi Party. Shinde remarked of this alliance: "For their own selfishness, they will tie the knot with Hamas and Lashkar-e-Taiba."\n\nRaut highlighted

### Preprocessing

In [5]:
# Function to clean text
def clean_text(text):
    # Convert text to lower case
    text = text.lower()
    # Remove newline characters
    text = re.sub(r'\n+', ' ', text)
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters and punctuation
    text = re.sub(r'[^\w\s]', '', text)
    return text.strip()

In [6]:
cleaned_articles = []

In [None]:
for article in articles:
    cleaned_text = clean_text(article['articleBody'])
    cleaned_articles.append({
        'title': article['title'], 
        'content': cleaned_text, 
        'source': article['source']
    })

In [10]:
# Convert to DataFrame for easy manipulation
df = pd.DataFrame(cleaned_articles)

In [11]:
df

Unnamed: 0,title,content,source
0,Shiv Sena MP Sanjay Raut Responds To 'Hamas' R...,sanjay raut a member of the shiv sena ubt part...,https://www.thehansindia.com/
1,At IUML's pro-Palestine rally in Kerala Tharoo...,kozhikode kerala india october 27 ani pointing...,https://www.aninews.in/
2,Uddhav buried Bal Thackeray's 'Hindutva' for p...,mumbai oct 24 pti maharashtra chief minister e...,https://thefederal.com/
3,"New Bills replacing IPC, CrPC, Evidence Act wi...",sensex nifty rebound over 1 pc after six sessi...,https://english.varthabharati.in/
4,"Israel biggest terrorist nation in the world, ...",october 26 2023 0815 pm updated 0838 pm ist ...,https://www.thehindu.com/
...,...,...,...
37416,Lebanese media reports renewed IDF strikes in ...,lebanese media are reporting renewed idf strik...,https://www.timesofisrael.com/
37417,"US approves additional bombs, warplanes sales ...",amid escalating tensions and concerns over pot...,https://www.deccanchronicle.com/
37418,Israel Publishes Video of Islamic Jihad Terror...,haaretzcom the online english edition of haare...,https://www.haaretz.com/
37419,United Nations secretary general condemns expl...,the un secretary general antónio guterres has ...,https://www.theguardian.com/


In [12]:
df.shape

(37421, 3)

In [13]:
# Function to check if an article is relevant
def is_relevant(text):
    keywords = ['israel', 'hamas', 'gaza', 'palestine', 'war', 'conflict']
    return any(keyword in text for keyword in keywords)

In [14]:
# Filter relevant articles
df['is_relevant'] = df['content'].apply(is_relevant)
relevant_articles = df[df['is_relevant']]

In [15]:
# Display the relevant articles
relevant_articles.head()

Unnamed: 0,title,content,source,is_relevant
0,Shiv Sena MP Sanjay Raut Responds To 'Hamas' R...,sanjay raut a member of the shiv sena ubt part...,https://www.thehansindia.com/,True
1,At IUML's pro-Palestine rally in Kerala Tharoo...,kozhikode kerala india october 27 ani pointing...,https://www.aninews.in/,True
2,Uddhav buried Bal Thackeray's 'Hindutva' for p...,mumbai oct 24 pti maharashtra chief minister e...,https://thefederal.com/,True
3,"New Bills replacing IPC, CrPC, Evidence Act wi...",sensex nifty rebound over 1 pc after six sessi...,https://english.varthabharati.in/,True
4,"Israel biggest terrorist nation in the world, ...",october 26 2023 0815 pm updated 0838 pm ist ...,https://www.thehindu.com/,True


In [16]:
relevant_articles.shape

(36313, 4)

### BM25 retrieval

In [17]:
# Tokenize the documents
tokenized_corpus = [doc.split(" ") for doc in relevant_articles['content']]
bm25 = BM25Okapi(tokenized_corpus)

In [18]:
# Function to retrieve relevant articles
def retrieve_articles(query, bm25, articles, top_n=5):
    tokenized_query = query.lower().split(" ")
    scores = bm25.get_scores(tokenized_query)
    top_n_indices = scores.argsort()[-top_n:][::-1]
    return articles.iloc[top_n_indices]

In [19]:
# Example query
query = "What happened at the Al-Shifa Hospital?"
top_articles = retrieve_articles(query, bm25, relevant_articles)

In [20]:
top_articles

Unnamed: 0,title,content,source,is_relevant
33491,Dueling claims after Palestinians die waiting ...,the palestinian health ministry said in the la...,https://thehill.com/,True
2898,Protester lights self on fire outside Israeli ...,newsnation a protester lit themselves on fire...,https://www.newsnationnow.com/,True
207,"Row Over Shashi Tharoor's Speech, CPI(M) Leade...",a row has erupted in kerala over the speech of...,https://www.ndtv.com/,True
26795,Alec Baldwin refuses to condemn Israel in pro-...,american actor alec baldwin found himself in a...,https://www.ynetnews.com/,True
17744,Mysterious: Russian Man Flies SAS To Los Angel...,summary passenger boards usbound scandinavian ...,https://simpleflying.com/,True


### T5 model

In [21]:
# Load T5 model and tokenizer
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [22]:
# Function to generate answers
def generate_answer(context, question):
    input_text = f"question: {question} context: {context}"
    input_ids = t5_tokenizer.encode(input_text, return_tensors='pt', truncation=True)
    outputs = t5_model.generate(input_ids)
    answer = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

### QA Model

In [93]:
# Function to generate answers using T5 model
def question_answer_system(question, top_articles):
    context = ' '.join(top_articles['content'].tolist())
    inputs = t5_tokenizer.encode("question: " + question + " context: " + context, return_tensors="pt", max_length=512, truncation=True)
    outputs = t5_model.generate(inputs, max_length=150, num_return_sequences=1, early_stopping=True)
    answer = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

In [91]:
# Create a pipeline
qa_pipeline = pipeline("question-answering", model=t5_model, tokenizer=t5_tokenizer)

The model 'T5ForConditionalGeneration' is not supported for question-answering. Supported models are ['AlbertForQuestionAnswering', 'BartForQuestionAnswering', 'BertForQuestionAnswering', 'BigBirdForQuestionAnswering', 'BigBirdPegasusForQuestionAnswering', 'BloomForQuestionAnswering', 'CamembertForQuestionAnswering', 'CanineForQuestionAnswering', 'ConvBertForQuestionAnswering', 'Data2VecTextForQuestionAnswering', 'DebertaForQuestionAnswering', 'DebertaV2ForQuestionAnswering', 'DistilBertForQuestionAnswering', 'ElectraForQuestionAnswering', 'ErnieForQuestionAnswering', 'ErnieMForQuestionAnswering', 'FalconForQuestionAnswering', 'FlaubertForQuestionAnsweringSimple', 'FNetForQuestionAnswering', 'FunnelForQuestionAnswering', 'GPT2ForQuestionAnswering', 'GPTNeoForQuestionAnswering', 'GPTNeoXForQuestionAnswering', 'GPTJForQuestionAnswering', 'IBertForQuestionAnswering', 'LayoutLMv2ForQuestionAnswering', 'LayoutLMv3ForQuestionAnswering', 'LEDForQuestionAnswering', 'LiltForQuestionAnswering', 

In [94]:
# Repeatedly ask for questions and provide answers
while True:
    user_question = input("Please enter your question (or type 'exit' to stop): ")
    if user_question.lower() == 'exit':
        print("Exiting the question-answering system. Goodbye!")
        break
    
    # Retrieve relevant articles using BM25
    top_articles = retrieve_articles(user_question, bm25, relevant_articles)
    
    # Generate answer using T5 model
    answer = question_answer_system(user_question, top_articles)
    
    print("Answer:", answer)

Please enter your question (or type 'exit' to stop):  What happened at the Al-Shifa Hospital?


Answer: a protester lit themselves on fire


Please enter your question (or type 'exit' to stop):  exit


Exiting the question-answering system. Goodbye!
