In [2]:
import json
import time
import os
import re
import nltk
from tqdm.notebook import tqdm
import wikipedia
from wikipedia.exceptions import WikipediaException, DisambiguationError, PageError
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [6]:
print("✅ Working directory set to:", os.getcwd())
os.chdir(os.path.join(os.getcwd(), "CS6200InformationRetrievalProject"))
print("✅ Current working directory:", os.getcwd())

✅ Working directory set to: /Users/shailshah/Desktop/NEU MSCS/Sem4/IR/Final Project
✅ Current working directory: /Users/shailshah/Desktop/NEU MSCS/Sem4/IR/Final Project/CS6200InformationRetrievalProject


In [8]:
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Now download NLTK data
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/shailshah/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shailshah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shailshah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# Load the annotated queries data from JSON file
def load_queries(json_path):
    with open(json_path, 'r') as f:
        data = json.load(f)
    return data['queries']

# Function to extract page title from Wikipedia URL
def extract_title_from_url(url):
    # Extract the page title from the URL path
    parts = url.split('/')
    title = parts[-1]
    # Replace underscores with spaces and URL decode
    title = title.replace('_', ' ')
    return title

# Function to extract content from Wikipedia using the wikipedia library
def extract_wikipedia_content(url):
    try:
        # Extract the title from the URL
        title = extract_title_from_url(url)
        
        # Add a small delay to be respectful
        time.sleep(0.5)
        
        # Get the page content
        page = wikipedia.page(title, auto_suggest=False)
        content = page.content
        
        # Clean the content
        content = re.sub(r'\n+', ' ', content)  # Replace newlines with spaces
        content = re.sub(r'\s+', ' ', content)  # Normalize whitespace
        content = content.strip()
        
        return content, page.url
    except DisambiguationError as e:
        print(f"Disambiguation error for {title}: {e}")
        return "", url
    except PageError as e:
        print(f"Page not found for {title}: {e}")
        return "", url
    except WikipediaException as e:
        print(f"Wikipedia API error for {title}: {e}")
        return "", url
    except Exception as e:
        print(f"Error extracting content for {title}: {str(e)}")
        return "", url


# Function to preprocess text (tokenize, lemmatize, remove stopwords)
def preprocess_text(text):
    # Remove URLs, hyphens, and non-alphanumeric characters (except whitespace)
    finaltext = re.sub(r'http\S+', '', text)  # URLs
    finaltext = re.sub(r'-', ' ', finaltext)       # Hyphens → spaces
    finaltext = re.sub(r'[^\w\s]', ' ', finaltext) # Punctuation
    
    # Remove numbers and extra whitespace
    finaltext = re.sub(r'\d+', ' ', finaltext)     # Numbers
    finaltext = re.sub(r'\s+', ' ', finaltext).strip().lower()
    
    # Tokenize and lowercase
    tokens = word_tokenize(finaltext.lower())
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    # Remove non-alphabetic tokens
    filtered_tokens = [token for token in filtered_tokens if token.isalpha()]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    
    # Join tokens back into a string
    return ' '.join(lemmatized_tokens)

In [None]:
from tqdm import tqdm


# Function to build dataset with highly relevant documents
def build_dataset(queries, output_path, min_relevance=3):
    dataset = []
    
    # Process each query
    for query in tqdm(queries, desc="Processing queries"):
        
        if 'documents' not in query:
            print(f"Skipping query ID {query.get('id', 'unknown')}: No documents found")
            continue
            
        query_id = query.get('id')
        query_text = query.get('query', '')
        narrative = query.get('narrative', '')
        
        # Extract documents with relevance >= min_relevance
        relevant_docs = []
        for doc in query['documents']:
            relevance = doc.get('relevance_score')
            if relevance is not None and relevance >= min_relevance:
                doc_url = doc.get('url', '')
                doc_title = doc.get('title', '')
                
                if not doc_url:
                    print(f"Skipping document {doc_title}: No URL provided")
                    continue
                
                # Extract content from Wikipedia
                content, actual_url = extract_wikipedia_content(doc_url)
                
                if content:
                    preprocessed_content = preprocess_text(content)
                    relevant_docs.append({
                        'title': doc_title,
                        'url': actual_url,
                        'content': preprocessed_content,
                        'relevance_score': relevance
                    })
        
        # Only add the query if it has relevant documents
        if relevant_docs:
            dataset.append({
                'query_id': query_id,
                'query': query_text,
                'narrative': narrative,
                'documents': relevant_docs
            })
    
    # Save the dataset as JSON
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump({'dataset': dataset}, f, indent=2, ensure_ascii=False)
    
    print(f"Dataset saved to {output_path}")
    return dataset

# Main execution
json_path = 'data/ManualAnnotatedQueries.json'
output_path = 'data/WikipediaRelevantDocs.json'

# Load queries and build dataset
queries = load_queries(json_path)
print(f"Loaded {len(queries)} queries from {json_path}")

# Build the dataset with documents that have relevance score >= 3
dataset = build_dataset(queries, output_path, min_relevance=3)

# Print some statistics
total_docs = sum(len(item['documents']) for item in dataset)
print(f"Total queries with relevant documents: {len(dataset)}")
print(f"Total documents extracted: {total_docs}")
print(f"Average documents per query: {total_docs / len(dataset) if len(dataset) > 0 else 0:.2f}")

# Display first query and document as sample (if available)
if dataset and dataset[0].get('documents'):
    sample_query = dataset[0]
    sample_doc = sample_query['documents'][0]
    
    print("\nSample Query:")
    print(f"ID: {sample_query.get('query_id')}")
    print(f"Query: {sample_query.get('query')}")
    print(f"Narrative: {sample_query.get('narrative', '')[:100]}...")
    
    print("\nSample Document:")
    print(f"Title: {sample_doc.get('title')}")
    print(f"URL: {sample_doc.get('url')}")
    print(f"Content (first 200 chars): {sample_doc.get('content', '')[:200]}...")

In [None]:
#Now use the following models to summarize the content generated above:

#Pegasus: https://huggingface.co/docs/transformers/en/model_doc/pegasus
#Bart: https://huggingface.co/transformers/v2.11.0/model_doc/bart.html
#T5: https://huggingface.co/docs/transformers/en/model_doc/t5
#LongFormer: https://huggingface.co/docs/transformers/en/model_doc/longformer

#Steps:
# 1. Get all the values.
# 2. Use the models to summarize the content.
# 3. Save the summarized content in a new file.




In [14]:
import json

# Load JSON file
with open("data/WikipediaRelevantDocs.json", "r") as file:
    data = json.load(file)  # Now 'data' is a Python dictionary

print(len(data['dataset']))
dataset = data['dataset']
# Print the first item in the dataset
print(dataset[0])

50


In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

models = {
    "pegasus": "google/pegasus-xsum",
    "bart": "facebook/bart-large-cnn",
    "t5": "t5-base",
    "longformer": "allenai/led-base-16384"
}

summarized_dataset = dataset

# Longformer summarizer function
def longformer_summarize(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=16384)
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask
    global_attention_mask = torch.zeros_like(input_ids)
    global_attention_mask[:, 0] = 1  # global attention on first token

    summary_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        global_attention_mask=global_attention_mask,
        max_length=150,
        num_beams=4
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Run for each model
for name, model_id in models.items():
    summarized_docs = []

    if name == "longformer":
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

        for doc in documents:
            summary = longformer_summarize(doc["content"], tokenizer, model)
            summarized_docs.append({
                "title": doc["title"],
                "url": doc["url"],
                "summary": summary
            })
    else:
        summarizer = pipeline("summarization", model=model_id, tokenizer=model_id)
        for doc in documents:
            summary = summarizer(doc["content"], max_length=100, min_length=30, do_sample=False)[0]['summary_text']
            summarized_docs.append({
                "title": doc["title"],
                "url": doc["url"],
                "summary": summary
            })

    output_data = {
        "dataset": [
            {
                "query_id": query_info["query_id"],
                "query": query_info["query"],
                "narrative": query_info["narrative"],
                "documents": summarized_docs
            }
        ]
    }

    filename = f"summarized_output_{name}.json"
    with open(filename, "w") as f:
        json.dump(output_data, f, indent=2)

    print(f"✅ Saved: {filename}")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use mps:0
Your max_length is set to 100, but your input_length is only 25. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max_length is set to 100, but your input_length is only 25. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)


✅ Saved: summarized_output_pegasus.json


Device set to use mps:0
Your max_length is set to 100, but your input_length is only 28. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
Your max_length is set to 100, but your input_length is only 26. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


✅ Saved: summarized_output_bart.json


Device set to use mps:0
Your max_length is set to 100, but your input_length is only 34. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)
Your max_length is set to 100, but your input_length is only 28. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)


✅ Saved: summarized_output_t5.json


In [None]:
%pip install torch