In [2]:

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity





In [3]:
# Read the news articles; the file is decoded as latin1.
df = pd.read_csv('news_dataset.csv', encoding='latin1')

# Preview the first rows.
(df.head())

Unnamed: 0,id,author,date,year,month,topic,article
0,17307,Marlise Simons,1/01/2017,2017,1,architecture,PARIS ? When the Islamic State was about to...
1,17292,Andy Newman,31/12/2016,2016,12,art,Angels are everywhere in the Mu?iz family?s ap...
2,17298,Emma G. Fitzsimmons,2/01/2017,2017,1,business,Finally. The Second Avenue subway opened in Ne...
3,17311,Carl Hulse,3/01/2017,2017,1,business,WASHINGTON ? It?s or time for Republica...
4,17339,Jim Rutenberg,5/01/2017,2017,1,business,"For Megyn Kelly, the shift from Fox News to NB..."


In [4]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       1000 non-null   int64 
 1   author   994 non-null    object
 2   date     1000 non-null   object
 3   year     1000 non-null   object
 4   month    1000 non-null   object
 5   topic    1000 non-null   object
 6   article  1000 non-null   object
dtypes: int64(1), object(6)
memory usage: 54.8+ KB
None


In [5]:
# Download the NLTK data packages used by the tokenizers, the stop-word list
# and the WordNet lemmatizer imported at the top of the notebook.
nltk.download('punkt')  
nltk.download('stopwords')
nltk.download('wordnet') 
nltk.download('omw-1.4') 

import re

# Function to clean text
def clean(text):
    """Replace every '?' with a space and normalise whitespace.

    Args:
        text: The string to clean.

    Returns:
        The string with each '?' turned into a space, every run of whitespace
        collapsed to a single space, and leading/trailing whitespace removed.
    """
    without_marks = text.replace('?', ' ')
    collapsed = re.sub(r'\s+', ' ', without_marks)
    return collapsed.strip()

# Overwrites the 'article' column in place with the cleaned text.
df['article'] = df['article'].apply(clean)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:


# English stop-word set and a WordNet lemmatizer; both are module-level objects
# read by preprocess_text.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to preprocess text for TF-IDF calculation
def preprocess_text(text):
    """Normalise a piece of text into a space-joined string of lemmas.

    The text is lower-cased and split with ``word_tokenize``. Tokens that are
    in ``stop_words`` or that are not purely alphanumeric (``str.isalnum``)
    are dropped; the remaining tokens are lemmatized with ``lemmatizer``.

    Args:
        text: The raw sentence or question.

    Returns:
        The surviving lemmas joined by single spaces (possibly an empty string).
    """
    kept = []
    for word in word_tokenize(text.lower()):
        if word in stop_words or not word.isalnum():
            continue
        kept.append(lemmatizer.lemmatize(word))
    return ' '.join(kept)


In [7]:

# Keep two parallel per-article lists: the raw sentences and their
# preprocessed forms (one preprocessed string per raw sentence).
df['sentences'] = df['article'].apply(sent_tokenize)
df['pre-processed_sentences'] = df['sentences'].apply(
    lambda raw_sentences: [preprocess_text(raw) for raw in raw_sentences]
)


In [9]:
# Preview the frame now that the 'sentences' and 'pre-processed_sentences' columns exist.
df.head()

Unnamed: 0,id,author,date,year,month,topic,article,sentences,pre-processed_sentences
0,17307,Marlise Simons,1/01/2017,2017,1,architecture,PARIS When the Islamic State was about to be d...,[PARIS When the Islamic State was about to be ...,[paris islamic state driven ancient city palmy...
1,17292,Andy Newman,31/12/2016,2016,12,art,Angels are everywhere in the Mu iz family s ap...,[Angels are everywhere in the Mu iz family s a...,[angel everywhere mu iz family apartment bronx...
2,17298,Emma G. Fitzsimmons,2/01/2017,2017,1,business,Finally. The Second Avenue subway opened in Ne...,"[Finally., The Second Avenue subway opened in ...","[finally, second avenue subway opened new york..."
3,17311,Carl Hulse,3/01/2017,2017,1,business,WASHINGTON It s or time for Republicans. After...,"[WASHINGTON It s or time for Republicans., Aft...","[washington time republican, tumultuous decade..."
4,17339,Jim Rutenberg,5/01/2017,2017,1,business,"For Megyn Kelly, the shift from Fox News to NB...","[For Megyn Kelly, the shift from Fox News to N...",[megyn kelly shift fox news nbc host daily day...


In [7]:
# Initialize TfidfVectorizer
# One module-level instance with default settings; find_relevant_sentence calls
# fit_transform on it for every query, so it is re-fitted on each call.
tfidf_vectorizer = TfidfVectorizer()


In [10]:
import spacy
import neuralcoref

# Load the spaCy English model
# NOTE(review): the global `nlp` is rebound to 'en_core_web_lg' in a later cell;
# functions that read `nlp` use whichever pipeline was loaded most recently.
nlp = spacy.load('en_core_web_sm')

# Add neuralcoref to spaCy's pipe
neuralcoref.add_to_pipe(nlp)

# Function to apply coreference resolution
def resolve_coreferences(text):
    """Run the global ``nlp`` pipeline on ``text`` and return its coreference data.

    Args:
        text: The text to analyse.

    Returns:
        A 2-tuple ``(clusters, resolved)`` taken from the document's
        ``_.coref_clusters`` and ``_.coref_resolved`` extension attributes.
    """
    parsed = nlp(text)
    clusters = parsed._.coref_clusters
    resolved = parsed._.coref_resolved
    return clusters, resolved

# Apply coreference resolution to the single string held in `sentence`.
# NOTE(review): `sentence` is not assigned in or above this cell; in the visible
# notebook it is set by the find_relevant_sentence(...) call in a cell further
# down, so a fresh top-to-bottom run raises NameError here.
information_cluster,short_explaination= resolve_coreferences(sentence)

print("cluster:",information_cluster)
print("Explanation:",short_explaination)

cluster: []
Explanation: A Republican in the White House and a Republican majority in Congress present tremendous opportunity to make real progress, Senator Cory Gardner, Republican of Colorado, said in the party s weekly radio address on Saturday.


In [8]:
def find_relevant_sentence(question, article_index):
    """Return the sentence of one article that best matches a question.

    The article's preprocessed sentences plus the preprocessed question are
    vectorized together with the module-level ``tfidf_vectorizer`` (re-fitted
    on every call), and the sentence with the highest cosine similarity to
    the question is selected.

    Args:
        question: The question text.
        article_index: A label of ``df.index`` identifying the article row.

    Returns:
        A 2-tuple ``(sentence, score)``. ``sentence`` is the original
        (un-preprocessed) sentence and ``score`` its cosine similarity to the
        question. When nothing can be retrieved, a descriptive message string
        is returned in place of the sentence, with a score of 0.
    """
    # Check if the article index is valid and if data exists for that index
    if article_index not in df.index:
        return "No article found", 0

    # Get processed sentences and original sentences for the specific article
    pre_processed_sentences = df.loc[article_index, 'pre-processed_sentences']
    original_raw_sentences = df.loc[article_index, 'sentences']

    if not pre_processed_sentences:
        return "No valid sentences in article", 0

    # The question is vectorized together with the sentences so that all rows
    # share one vocabulary; it is the last row of the resulting matrix.
    processed_question = preprocess_text(question)
    processed_sentences_with_question = pre_processed_sentences + [processed_question]

    try:
        sentence_vectors = tfidf_vectorizer.fit_transform(processed_sentences_with_question)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no document
        # yields a token, e.g. every sentence preprocessed to an empty string.
        # Report it like the other "nothing to retrieve" cases instead of
        # letting the exception abort the caller.
        return "No valid sentences in article", 0

    # Calculate cosine similarities between the question and each processed sentence
    similarities = cosine_similarity(sentence_vectors[-1], sentence_vectors[:-1]).flatten()

    if len(similarities) == 0:
        return "No similarities found", 0

    # argmax returns the first maximum, so ties (including the all-zero case)
    # resolve to the earliest sentence.
    most_relevant_idx = similarities.argmax()

    # Defensive: the two sentence lists are expected to be the same length.
    if most_relevant_idx >= len(original_raw_sentences):
        return "No relevant sentence found", 0

    return original_raw_sentences[most_relevant_idx], similarities[most_relevant_idx]


In [9]:
# Demo query against the article whose id is 17311.
# `.index[0]` raises IndexError if no row has that id.
article_index = df[df['id'] == 17311].index[0]  # Adjust based on how the index is set in your DataFrame
question = "Who is the Senator of Colorado?"
sentence, confidence = find_relevant_sentence(question, article_index)
print("Most relevant sentence:", sentence)
print("Confidence score:", confidence)


Most relevant sentence: A Republican in the White House and a Republican majority in Congress present tremendous opportunity to make real progress, Senator Cory Gardner, Republican of Colorado, said in the party s weekly radio address on Saturday.
Confidence score: 0.30232007325256094


In [11]:
import spacy
from spacy.tokens import Token

# Load the large model
# This rebinds the global `nlp`; question_focus_type and correct_answer read it
# at call time, so they use this pipeline once this cell has run.
nlp = spacy.load('en_core_web_lg')

def question_focus_type(question):
    """Return the entity labels an answer to ``question`` is expected to carry.

    The question is tokenized with the global ``nlp`` pipeline and its tokens
    are checked for the interrogatives "who", "where" and "when" (in that
    priority order).

    Args:
        question: The question text.

    Returns:
        ``["PERSON", "ORG"]`` for "who", ``["GPE", "LOC"]`` for "where",
        ``["DATE", "TIME"]`` for "when"; otherwise a broad default list of
        labels.
    """
    focus_words = {"who": ["PERSON", "ORG"], "where": ["GPE", "LOC"], "when": ["DATE", "TIME"]}

    doc = nlp(question)
    # Match whole tokens rather than substrings of the raw question, so that
    # "who" does not fire on "whole", "where" on "somewhere", or "when" on
    # "whenever".
    question_tokens = {tok.text.lower() for tok in doc}

    for key, types in focus_words.items():
        if key in question_tokens:
            return types
    return ["ORG", "PERSON", "GPE", "LOC", "DATE", "TIME", "EVENT"]


In [12]:

def correct_answer(question, sentence):
    """Pick the entity in ``sentence`` that best answers ``question``.

    Entities whose label is among those returned by ``question_focus_type``
    are scored as ``100 - d``, where ``d`` is the smallest token distance
    between the entity's root token and any token that is the dependency ROOT
    or is attached directly to it. The first entity with the highest score wins.

    Args:
        question: The question text.
        sentence: The sentence to extract the answer from.

    Returns:
        A 2-tuple ``(answer, score)``. When no entity qualifies, ``answer`` is
        the whole ``sentence`` and ``score`` is 0.
    """
    wanted_labels = question_focus_type(question)
    parsed = nlp(sentence)

    top_score = float('-inf')
    top_text = None
    for candidate in parsed.ents:
        if candidate.label_ not in wanted_labels:
            continue
        # Distance from the entity's root to the nearest ROOT-adjacent token.
        nearest = min(
            abs(tok.i - candidate.root.i)
            for tok in parsed
            if tok.dep_ == 'ROOT' or tok.head.dep_ == 'ROOT'
        )
        score = 100 - nearest
        if score > top_score:
            top_score, top_text = score, candidate.text

    answer = top_text if top_text else sentence
    confidence = top_score if top_score != float('-inf') else 0
    return answer, confidence
# Extract an answer from the `question` / `sentence` pair currently in scope.
# Note: this rebinds `confidence`, replacing any value it held before.
best_answer, confidence = correct_answer(question, sentence)
print(f"Best Answer: {best_answer}")

Best Answer: Cory Gardner


In [13]:
import pandas as pd
import collections


# Load test data
test_data = pd.read_csv('test_questions.csv')  # Columns read by evaluate_f1: 'Article ID', 'Question', 'Answer'

In [14]:
def tokens(text):
    """Split ``text`` on runs of whitespace and return the list of pieces."""
    words = text.split()
    return words

In [20]:
def f1_score_calculation(gold, pred):
    """Token-level F1 between a reference answer and a predicted answer.

    Both strings are split with ``tokens``; overlap is counted as a multiset
    intersection, so repeated tokens only match as often as they appear on
    both sides. Comparison is exact (case- and punctuation-sensitive).

    Args:
        gold: The reference answer string.
        pred: The predicted answer string.

    Returns:
        1 or 0 when either side has no tokens (1 only if both are empty),
        0 when there is no overlap, otherwise the harmonic mean of token
        precision and recall as a float.
    """
    gold_toks, pred_toks = tokens(gold), tokens(pred)

    # An empty side means "no answer": full credit only if both sides agree.
    if not gold_toks or not pred_toks:
        return int(gold_toks == pred_toks)

    overlap = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(overlap.values())
    if not num_same:
        return 0

    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    return (2 * precision * recall) / (precision + recall)


In [21]:
# Incorporate this function into the evaluation system
def evaluate_f1(test_data):
    """Average token-level F1 of the QA pipeline over a set of test questions.

    For each row, the article is looked up in ``df`` by matching its ``id``
    column against the row's 'Article ID'; the pipeline
    (``find_relevant_sentence`` then ``correct_answer``) is run on the row's
    'Question' and the prediction is scored against the row's 'Answer'.

    Args:
        test_data: DataFrame with 'Article ID', 'Question' and 'Answer' columns.

    Returns:
        The mean F1 over all rows, or 0 when ``test_data`` has no rows.
    """
    scores = []

    for _, sample in test_data.iterrows():
        matches = df[df['id'] == sample['Article ID']].index
        if matches.empty:
            # Unknown article: the row is still counted, with a score of 0.
            scores.append(0)
            continue

        sentence_hit, _ = find_relevant_sentence(sample['Question'], matches[0])
        answer_guess, _ = correct_answer(sample['Question'], sentence_hit)
        scores.append(f1_score_calculation(sample['Answer'], answer_guess))

    if not scores:
        return 0
    return sum(scores) / len(scores)

In [22]:
# Score the pipeline on every row of test_data and report the mean
# token-level F1 formatted as a percentage.
average_f1 = evaluate_f1(test_data)
print(f"Average F1 Score: {average_f1:.2%}")

Average F1 Score: 16.75%


In [None]:
def question_answer():
    """Interactively answer one question about one article.

    Prompts for an article ID and a question, looks the article up in ``df``
    by its ``id`` column, retrieves the best-matching sentence with
    ``find_relevant_sentence`` and prints the answer chosen by
    ``correct_answer``. A message is printed instead of raising when the ID
    is not an integer, when no article has that ID, or when processing fails.
    """
    raw_article_id = input("Enter article ID: ")
    try:
        article_id = int(raw_article_id)
    except ValueError:
        # The conversion used to sit outside any handler, so a non-numeric
        # entry crashed the cell with a traceback.
        print("Article ID must be an integer.")
        return

    question = input("Enter your question: ")
    try:
        # Find the index of the article with the given ID
        matches = df.index[df['id'] == article_id]
        if matches.empty:
            print("No article found with the provided ID.")
            return

        # Use the first match if several rows share the same ID.
        sentence, _ = find_relevant_sentence(question, matches[0])
        best_answer, _ = correct_answer(question, sentence)
        print("Answer:", best_answer)
    except Exception as e:
        print("Error processing your request:", str(e))

# Interactive demo: prompts for input, so this cell waits on the user.
question_answer()



Answer: Cory Gardner
