In [2]:
import json
import time
import os
import re
import nltk
from tqdm.notebook import tqdm
import wikipedia
from wikipedia.exceptions import WikipediaException, DisambiguationError, PageError
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [33]:
print("✅ Working directory set to:", os.getcwd())
os.chdir(os.path.join(os.getcwd(), "CS6200InformationRetrievalProject"))
print("✅ Current working directory:", os.getcwd())

✅ Working directory set to: /Users/shailshah/Desktop/NEU MSCS/Sem4/IR/Final Project/CS6200InformationRetrievalProject


FileNotFoundError: [Errno 2] No such file or directory: '/Users/shailshah/Desktop/NEU MSCS/Sem4/IR/Final Project/CS6200InformationRetrievalProject/CS6200InformationRetrievalProject'

In [8]:
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Now download NLTK data
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/shailshah/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shailshah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shailshah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# Load the annotated queries data from JSON file
def load_queries(json_path):
    with open(json_path, 'r') as f:
        data = json.load(f)
    return data['queries']

# Function to extract page title from Wikipedia URL
def extract_title_from_url(url):
    # Extract the page title from the URL path
    parts = url.split('/')
    title = parts[-1]
    # Replace underscores with spaces and URL decode
    title = title.replace('_', ' ')
    return title

# Function to extract content from Wikipedia using the wikipedia library
def extract_wikipedia_content(url):
    try:
        # Extract the title from the URL
        title = extract_title_from_url(url)
        
        # Add a small delay to be respectful
        time.sleep(0.5)
        
        # Get the page content
        page = wikipedia.page(title, auto_suggest=False)
        content = page.content
        
        # Clean the content
        content = re.sub(r'\n+', ' ', content)  # Replace newlines with spaces
        content = re.sub(r'\s+', ' ', content)  # Normalize whitespace
        content = content.strip()
        
        return content, page.url
    except DisambiguationError as e:
        print(f"Disambiguation error for {title}: {e}")
        return "", url
    except PageError as e:
        print(f"Page not found for {title}: {e}")
        return "", url
    except WikipediaException as e:
        print(f"Wikipedia API error for {title}: {e}")
        return "", url
    except Exception as e:
        print(f"Error extracting content for {title}: {str(e)}")
        return "", url


# Function to preprocess text (tokenize, lemmatize, remove stopwords)
def preprocess_text(text):
    # Remove URLs, hyphens, and non-alphanumeric characters (except whitespace)
    finaltext = re.sub(r'http\S+', '', text)  # URLs
    finaltext = re.sub(r'-', ' ', finaltext)       # Hyphens → spaces
    finaltext = re.sub(r'[^\w\s]', ' ', finaltext) # Punctuation
    
    # Remove numbers and extra whitespace
    finaltext = re.sub(r'\d+', ' ', finaltext)     # Numbers
    finaltext = re.sub(r'\s+', ' ', finaltext).strip().lower()
    
    # Tokenize and lowercase
    tokens = word_tokenize(finaltext.lower())
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    # Remove non-alphabetic tokens
    filtered_tokens = [token for token in filtered_tokens if token.isalpha()]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    
    # Join tokens back into a string
    return ' '.join(lemmatized_tokens)

In [None]:
from tqdm import tqdm


# Function to build dataset with highly relevant documents
def build_dataset(queries, output_path, min_relevance=3):
    dataset = []
    
    # Process each query
    for query in tqdm(queries, desc="Processing queries"):
        
        if 'documents' not in query:
            print(f"Skipping query ID {query.get('id', 'unknown')}: No documents found")
            continue
            
        query_id = query.get('id')
        query_text = query.get('query', '')
        narrative = query.get('narrative', '')
        
        # Extract documents with relevance >= min_relevance
        relevant_docs = []
        for doc in query['documents']:
            relevance = doc.get('relevance_score')
            if relevance is not None and relevance >= min_relevance:
                doc_url = doc.get('url', '')
                doc_title = doc.get('title', '')
                
                if not doc_url:
                    print(f"Skipping document {doc_title}: No URL provided")
                    continue
                
                # Extract content from Wikipedia
                content, actual_url = extract_wikipedia_content(doc_url)
                
                if content:
                    preprocessed_content = preprocess_text(content)
                    relevant_docs.append({
                        'title': doc_title,
                        'url': actual_url,
                        'content': preprocessed_content,
                        'relevance_score': relevance
                    })
        
        # Only add the query if it has relevant documents
        if relevant_docs:
            dataset.append({
                'query_id': query_id,
                'query': query_text,
                'narrative': narrative,
                'documents': relevant_docs
            })
    
    # Save the dataset as JSON
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump({'dataset': dataset}, f, indent=2, ensure_ascii=False)
    
    print(f"Dataset saved to {output_path}")
    return dataset

# Main execution
json_path = 'data/ManualAnnotatedQueries.json'
output_path = 'data/WikipediaRelevantDocs.json'

# Load queries and build dataset
queries = load_queries(json_path)
print(f"Loaded {len(queries)} queries from {json_path}")

# Build the dataset with documents that have relevance score >= 3
dataset = build_dataset(queries, output_path, min_relevance=3)

# Print some statistics
total_docs = sum(len(item['documents']) for item in dataset)
print(f"Total queries with relevant documents: {len(dataset)}")
print(f"Total documents extracted: {total_docs}")
print(f"Average documents per query: {total_docs / len(dataset) if len(dataset) > 0 else 0:.2f}")

# Display first query and document as sample (if available)
if dataset and dataset[0].get('documents'):
    sample_query = dataset[0]
    sample_doc = sample_query['documents'][0]
    
    print("\nSample Query:")
    print(f"ID: {sample_query.get('query_id')}")
    print(f"Query: {sample_query.get('query')}")
    print(f"Narrative: {sample_query.get('narrative', '')[:100]}...")
    
    print("\nSample Document:")
    print(f"Title: {sample_doc.get('title')}")
    print(f"URL: {sample_doc.get('url')}")
    print(f"Content (first 200 chars): {sample_doc.get('content', '')[:200]}...")

In [None]:
#Now use the following models to summarize the content generated above:

#Pegasus: https://huggingface.co/docs/transformers/en/model_doc/pegasus
#Bart: https://huggingface.co/transformers/v2.11.0/model_doc/bart.html
#T5: https://huggingface.co/docs/transformers/en/model_doc/t5
#LongFormer: https://huggingface.co/docs/transformers/en/model_doc/longformer

#Steps:
# 1. Get all the values.
# 2. Use the models to summarize the content.
# 3. Save the summarized content in a new file.




In [16]:
import json

# Load JSON file
with open("data/WikipediaRelevantDocs.json", "r") as file:
    data = json.load(file)  # Now 'data' is a Python dictionary

print(len(data['dataset']))
dataset = data['dataset']
# Print the first item in the dataset
print(dataset[0])

50


In [51]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import re

def load_dataset(file_path):
    """Load the JSON dataset from file"""
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

def save_dataset(data, output_file):
    """Save the modified dataset to a JSON file"""
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)

def summarize_content(content, model_name="bart", model_id="facebook/bart-large-cnn"):
    """Summarize the content using the provided model"""
    try:
        # Use BART or other summarization models
        device = -1  # Use CPU
        summarizer = pipeline("summarization", model=model_id, tokenizer=model_id, device=device)
        
        # Limit content length to avoid tokenizer issues
        # Split into smaller chunks, with some overlap
        words = content.split()
        max_words = 500  # Smaller chunk size to be safe
        chunks = []
        
        for i in range(0, len(words), max_words - 50):
            chunk = ' '.join(words[i:i + max_words])
            if chunk:  # Make sure chunk isn't empty
                chunks.append(chunk)
        
        # Process each chunk with error handling
        summaries = []
        for chunk in chunks:
            try:
                result = summarizer(chunk, max_length=100, min_length=30, do_sample=False)
                if result and len(result) > 0:
                    summaries.append(result[0]['summary_text'])
            except Exception as e:
                print(f"Error processing chunk: {e}")
                # Try with an even smaller piece if there's an error
                if len(chunk.split()) > 200:
                    smaller_chunks = [' '.join(chunk.split()[j:j+200]) for j in range(0, len(chunk.split()), 200)]
                    for small_chunk in smaller_chunks:
                        try:
                            result = summarizer(small_chunk, max_length=50, min_length=20, do_sample=False)
                            if result and len(result) > 0:
                                summaries.append(result[0]['summary_text'])
                        except:
                            # If still failing, just create a basic summary
                            summary_words = chunk.split()[:100]
                            summaries.append(' '.join(summary_words) + "...")
        
        if summaries:
            return ' '.join(summaries)
        else:
            # Fallback if all summarization attempts fail
            return content[:500] + "... (Text truncated due to processing limitations)"
    except Exception as e:
        print(f"Summarization error: {e}")
        # Return a truncated version of the original if summarization fails
        return content[:500] + "... (Text truncated due to processing error)"

def clean_text(text):
    """Clean and prepare text for summarization"""
    # Remove special characters, extra spaces, etc.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def process_dataset(dataset, model_name="bart", model_id="facebook/bart-large-cnn"):
    """Process the dataset by summarizing each document's content"""
        
    # Now process the dictionary as before
    for query_data in dataset["dataset"]:
        print(f"Processing query ID: {query_data}")
        documents = query_data.get("documents", [])
        for doc in documents:
            if "content" in doc:
                # Clean the content text
                cleaned_content = clean_text(doc["content"])
                # Generate summary
                print(f"Summarizing document: {doc.get('title', 'Untitled')}")
                summary = summarize_content(cleaned_content, model_name, model_id)
                # Replace the original content with the summary
                doc["content"] = summary
                print(f"Summary generated: {summary[:100]}...")
    
    return dataset

def create_summaries(model_name, model_id):
    # Configuration
    input_file = "data/WikipediaRelevantDocs.json"  # Replace with your input file
    output_file = "data/" + model_name + "_summarized.json"  # Output file for summarized content
    
    # Load dataset
    print(f"Loading dataset from {input_file}...")
    dataset = load_dataset(input_file)
    
    # Process dataset
    print("Summarizing documents...")
    processed_dataset = process_dataset(dataset, model_name, model_id)
    
    # Save processed dataset
    print(f"Saving processed dataset to {output_file}...")
    save_dataset(processed_dataset, output_file)
    
    print("Done!")

In [52]:
create_summaries("bart", "facebook/bart-large-cnn")

Loading dataset from data/WikipediaRelevantDocs.json...
Summarizing documents...
Summarizing document: Cryptocurrency


Device set to use cpu


Summary generated:  cryptocurrency colloquially crypto digital currency designed work computer network reliant central ...
Summarizing document: Bitcoin


Device set to use cpu


Summary generated: Bitcoin is a decentralized cryptocurrency based free market ideology. The first known commercial tra...
Summarizing document: Blockchain


Device set to use cpu


Summary generated: Blockchains may considered secure design exemplify distributed computing system high byzantine fault...
Summarizing document: Ethereum


Device set to use cpu


Summary generated: Ether is a decentralized application platform among cryptocurrencies. It allows anyone to deploy dec...
Summarizing document: Digital currency


Device set to use cpu


Summary generated: Digital currency include cryptocurrency virtual currency central bank digital currency digital curre...
Processing query ID: {'query_id': 2, 'query': 'Blockchain technology explained', 'narrative': "The user wants to understand blockchain's structure, decentralization, and use cases beyond cryptocurrencies. Relevant documents cover consensus mechanisms, distributed ledgers, and industry applications. Documents focusing only on trading are irrelevant.", 'documents': [{'title': 'Blockchain', 'url': 'https://en.wikipedia.org/wiki/Blockchain', 'content': 'blockchain distributed ledger growing list record block securely linked together via cryptographic hash block contains cryptographic hash previous block timestamp transaction data generally represented merkle tree data node represented leaf since block contains information previous block effectively form chain compare linked list data structure additional block linking one consequently blockchain transaction resistant a

Device set to use cpu


Summary generated: Blockchains may considered secure design exemplify distributed computing system high byzantine fault...
Summarizing document: Distributed ledger


Device set to use cpu


Summary generated:  distributed ledger also called shared ledger distributed ledger technology dlt system whereby repli...
Summarizing document: Consensus (computer science)


Device set to use cpu


Summary generated:  distributed computing multi agent system achieve overall system reliability presence number faulty ...
Summarizing document: Smart contract


Device set to use cpu
Your max_length is set to 100, but your input_length is only 68. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=34)


Summary generated: Smart contract refers to traditional natural language legally binding agreement selected term expres...
Summarizing document: Decentralized application


Device set to use cpu


Summary generated:  decentralised application dapp dapp  dapp application operate autonomously typically use smart cont...
Processing query ID: {'query_id': 3, 'query': 'Cryptocurrency Security Best Practices', 'narrative': 'The user wants to understand how to securely store and manage cryptocurrencies. Relevant documents should cover wallet types, security measures, and common risks. Documents about cryptocurrency price speculation without security focus should be considered irrelevant.', 'documents': [{'title': 'Cryptocurrency wallet', 'url': 'https://en.wikipedia.org/wiki/Cryptocurrency_wallet', 'content': 'cryptocurrency wallet device physical medium program online service store public private key cryptocurrency transaction addition basic function storing key cryptocurrency wallet often offer functionality encrypting signing information signing example result executing smart contract cryptocurrency transaction see bitcoin transaction image identification legally signing document se

Device set to use cpu
Your max_length is set to 100, but your input_length is only 78. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)


Summary generated: Bitcoin introduced first cryptocurrency following principle outlined satoshi nakamoto paper bitcoin ...
Summarizing document: Public-key cryptography


Device set to use cpu


Summary generated: Public key cryptography depends keeping private key secret public key openly distributed without com...
Summarizing document: Cryptographic hash function


Device set to use cpu


Summary generated:  cryptographic hash function chf hash algorithm map arbitrary binary string binary string fixed size...
Processing query ID: {'query_id': 4, 'query': 'What is Cryptocurrency Mining?', 'narrative': 'The user wants to understand how cryptocurrency mining works, including the technical processes, equipment needed, and environmental impacts. Relevant documents should cover mining algorithms, hardware, and energy considerations. Documents about cryptocurrency investment without mining focus should be considered irrelevant.', 'documents': [{'title': 'Cryptocurrency mining', 'url': 'https://en.wikipedia.org/wiki/GPU_mining', 'content': 'gpu mining use graphic processing unit gpus mine proof work cryptocurrencies bitcoin miner receive reward performing computationally intensive work calculating hash amend verify transaction open decentralized ledger gpus especially performant calculating hash concern gpu mining rise gpu mining cryptocurrency sparked various discussion concer

Device set to use cpu
Your max_length is set to 100, but your input_length is only 8. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)


Summary generated:  gpu mining use graphic processing unit gpus mine proof work cryptocurrencies bitcoin miner receive ...
Summarizing document: Proof-of-work


Device set to use cpu


Summary generated: Proof work abbreviated pow form cryptographic proof one party prover prof others verifier certain am...
Summarizing document: Mining pool


Device set to use cpu


Summary generated:  cryptocurrency mining mining pool pooling resource miner share processing power network split rewar...
Summarizing document: Application-specific integrated circuit


Device set to use cpu


Summary generated: Asics often include entire microprocessor memory block including rom ram eeprom flash memory large b...
Summarizing document: GPU mining


Device set to use cpu
Your max_length is set to 100, but your input_length is only 8. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)


Summary generated:  gpu mining use graphic processing unit gpus mine proof work cryptocurrencies bitcoin miner receive ...
Processing query ID: {'query_id': 5, 'query': 'Stablecoins and Their Role in Crypto', 'narrative': 'The user wants to understand what stablecoins are, their types, and their importance in the cryptocurrency ecosystem. Relevant documents should cover different stablecoin mechanisms, use cases, and regulatory considerations. Documents about volatile cryptocurrencies without stablecoin focus should be considered irrelevant.', 'documents': [{'title': 'Stablecoin', 'url': 'https://en.wikipedia.org/wiki/Stablecoin', 'content': 'stablecoin type cryptocurrency value digital asset supposed pegged reference asset either fiat money exchange traded commodity precious metal industrial metal another cryptocurrency theory backing reference asset could make stablecoin value track value peg subject radical change value common market many digital asset practice stablecoin issuer yet

Device set to use cpu


Summary generated:  stablecoin type cryptocurrency value digital asset supposed pegged reference asset either fiat mone...
Summarizing document: Tether (cryptocurrency)


Device set to use cpu
Your max_length is set to 100, but your input_length is only 97. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)


Summary generated: Tether is one of fourteen protocol blockchains tether minted tether face criticism regarding transpa...
Summarizing document: USD Coin


Device set to use cpu


Summary generated:  usd coin usdc cryptocurrency stablecoin issued circle pegged united state dollar distinct central b...
Processing query ID: {'query_id': 6, 'query': 'Investment strategies for beginners', 'narrative': 'The user is seeking basic information about getting started with investing, including fundamental concepts, entry-level investment options, and risk management approaches. Relevant documents should cover investment basics, starter portfolios, and simple explanations of investment vehicles suitable for novices. Documents about complex advanced trading strategies should be considered less relevant.', 'documents': [{'title': 'Investment', 'url': 'https://en.wikipedia.org/wiki/Investment', 'content': 'investment traditionally defined commitment resource achieve later benefit investment involves money defined commitment money receive money later broader viewpoint investment defined tailor pattern expenditure receipt resource optimise desirable pattern flow expenditure rece

Device set to use cpu


Summary generated: Investing involves money defined commitment money receive money later broader viewpoint. Investment ...
Summarizing document: Mutual fund


Device set to use cpu


Summary generated:  mutual fund investment fund pool money many investor purchase security term typically used united s...
Summarizing document: Exchange-traded fund


Device set to use cpu


Summary generated:  exchange traded fund etf type investment fund also exchange traded product e traded stock exchange ...
Summarizing document: Index fund


Device set to use cpu
Your max_length is set to 100, but your input_length is only 99. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)


Summary generated:  index fund also index tracker mutual fund exchange traded fund etf designed follow certain preset r...
Summarizing document: Dollar cost averaging


Device set to use cpu


Summary generated: Dollar cost averaging is an investment strategy aim apply value investing principle regular investme...
Summarizing document: Risk-return Spectrum


Device set to use cpu


Summary generated: Risk return spectrum also called risk return tradeoff risk reward relationship amount return gained ...
Summarizing document: Asset allocation


Device set to use cpu


Summary generated:  asset allocation implementation investment strategy attempt balance risk versus reward adjusting pe...
Summarizing document: Real estate investing


Device set to use cpu
Your max_length is set to 100, but your input_length is only 12. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)


Summary generated:  real estate investing involves purchasing owning managing renting selling real estate generate prof...
Summarizing document: Real estate investment trust


Device set to use cpu


Summary generated: Real estate investment trust reit pronounced reet company owns case operates income producing real e...
Summarizing document: Property management


Device set to use cpu
Your max_length is set to 100, but your input_length is only 59. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=29)


Summary generated: Property management involves process system workforce required manage life cycle acquired property d...
Summarizing document: Rental property


Device set to use cpu
Your max_length is set to 100, but your input_length is only 32. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)


Summary generated: Renting also known hiring letting agreement payment made use good service property owned another fix...
Summarizing document: Mortgage


Device set to use cpu


Summary generated: Mortgage loan simply mortgage civil law jurisdiction known also hypothec loan loan used either purch...
Processing query ID: {'query_id': 8, 'query': 'How to build a diversified investment portfolio', 'narrative': 'The user wants to understand the principles of portfolio diversification and how to construct a well-diversified investment portfolio. Relevant documents should cover asset allocation, diversification strategies, and correlation concepts. Documents about specific individual stocks without diversification context should be considered less relevant.', 'documents': [{'title': 'Modern portfolio theory', 'url': 'https://en.wikipedia.org/wiki/Modern_portfolio_theory', 'content': 'modern portfolio theory mpt mean variance analysis mathematical framework assembling portfolio asset expected return maximized given level risk formalization extension diversification investing idea owning different kind financial asset less risky owning one type key insight asset risk 

Device set to use cpu


Summary generated:  modern portfolio theory mpt mean variance analysis mathematical framework assembling portfolio asse...
Summarizing document: Diversification (finance)


Device set to use cpu
Your max_length is set to 100, but your input_length is only 23. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)


Summary generated: Finance diversification process allocating capital way reduces exposure one particular asset risk co...
Summarizing document: Asset allocation


Device set to use cpu


Summary generated:  asset allocation implementation investment strategy attempt balance risk versus reward adjusting pe...
Summarizing document: Risk parity


Device set to use cpu


Summary generated: Risk parity approach asserts asset allocation adjusted leveraged deleveraged risk level risk parity ...
Processing query ID: {'query_id': 9, 'query': 'Passive vs Active investing comparison', 'narrative': 'The user wants to understand the differences between passive and active investing approaches. Relevant documents should cover passive and active investment strategies, performance comparisons, fee differences, and considerations for choosing between them. Documents about specific stocks without contextual comparison of approaches should be considered less relevant.', 'documents': [{'title': 'Passive management', 'url': 'https://en.wikipedia.org/wiki/Passive_management', 'content': 'passive management also called passive investing investing strategy track market weighted index portfolio passive management common equity market index fund track stock market index becoming common investment type including bond commodity hedge fund substantial increase passive investing

Device set to use cpu


Summary generated:  passive management also called passive investing investing strategy track market weighted index por...
Summarizing document: Active management


Device set to use cpu
Your max_length is set to 100, but your input_length is only 36. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)


Summary generated:  active management also called active investing approach investing actively managed portfolio invest...
Summarizing document: Index fund


Device set to use cpu
Your max_length is set to 100, but your input_length is only 99. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)


Summary generated:  index fund also index tracker mutual fund exchange traded fund etf designed follow certain preset r...
Summarizing document: Exchange-traded fund


Device set to use cpu


Summary generated:  exchange traded fund etf type investment fund also exchange traded product e traded stock exchange ...
Summarizing document: Mutual fund


Device set to use cpu


Summary generated:  mutual fund investment fund pool money many investor purchase security term typically used united s...
Summarizing document: Expense ratio


Device set to use cpu


Summary generated: Investor must consider choosing fund significantly affect return factor influencing expense ratio in...
Summarizing document: Efficient-market hypothesis


Device set to use cpu


Summary generated: Emh formulated term risk adjustment make testable prediction coupled particular model risk result re...
Processing query ID: {'query_id': 10, 'query': 'Best Retirement investment planning', 'narrative': 'The user is looking for information about investment strategies specifically for retirement planning. Relevant documents should cover retirement accounts, long-term investment approaches, and retirement-specific financial planning. Documents about short-term trading strategies should be considered less relevant.', 'documents': [{'title': '401(k)', 'url': 'https://en.wikipedia.org/wiki/401(k)', 'content': 'united state k plan employer sponsored defined contribution personal pension saving account defined subsection k u internal revenue code periodic employee contribution come directly paycheck may matched employer pre tax option make k plan attractive employee many employer offer option full time worker k payable general ledger account contains amount k plan pension p

Device set to use cpu


Summary generated: K plan attractive employee many employer offer option full time worker k payable general ledger acco...
Summarizing document: Individual retirement account


Device set to use cpu


Summary generated:  individual retirement account ira united state form pension provided many financial institution pro...
Summarizing document: Pension


Device set to use cpu


Summary generated: Retirement work pension may either defined benefit plan defined periodic payment made retirement spo...
Summarizing document: Retirement planning


Device set to use cpu


Summary generated:  retirement planning financial context refers allocation saving revenue retirement goal retirement p...
Summarizing document: Roth IRA


Device set to use cpu
Your max_length is set to 100, but your input_length is only 72. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=36)


Summary generated: Roh ira individual retirement account containing investment security usually common stock bond often...
Summarizing document: Volcanic eruption


Device set to use cpu


Summary generated: Volcano may exhibit one characteristic type eruption period activity others may display entire seque...
Summarizing document: Volcano


Device set to use cpu
Your max_length is set to 100, but your input_length is only 25. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)


Summary generated:  volcano commonly defined vent fissure crust planetary mass object earth allows hot lava volcanic as...
Summarizing document: Types of volcanic eruptions


Device set to use cpu


Summary generated: Volcano may exhibit one characteristic type eruption period activity others may display entire seque...
Summarizing document: Plate tectonics


Device set to use cpu


Summary generated: Tectonic plate composed oceanic lithosphere thicker continental lithosphere topped kind crust along ...
Summarizing document: Magma


Device set to use cpu


Summary generated:  magma sometimes colloquially incorrectly referred lava found beneath surface earth evidence magmati...
Processing query ID: {'query_id': 12, 'query': 'Different types of volcanoes and their structures', 'narrative': 'The user wants to understand the various types of volcanic structures. Relevant documents should explain shield volcanoes, stratovolcanoes, cinder cones, and other volcanic landforms. Documents about volcanic rocks without structural explanations should be considered less relevant.', 'documents': [{'title': 'Shield volcano', 'url': 'https://en.wikipedia.org/wiki/Shield_volcano', 'content': 'shield volcano type volcano named low profile resembling shield lying ground formed eruption highly fluid low viscosity lava travel farther form thinner flow viscous lava erupted stratovolcano repeated eruption result steady accumulation broad sheet lava building shield volcano distinctive form shield volcano found wherever fluid low silica lava reach surface rocky p

Device set to use cpu


Summary generated:  shield volcano type volcano named low profile resembling shield lying ground formed eruption highly...
Summarizing document: Stratovolcano


Device set to use cpu
Your max_length is set to 100, but your input_length is only 78. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)


Summary generated: Stratovolcano also known composite volcano typically conical volcano built many alternating layer st...
Summarizing document: Cinder cone


Device set to use cpu


Summary generated: Cinder cone usually basaltic andesitic composition often glassy contains numerous gas bubble frozen ...
Summarizing document: Volcanic hazards


Device set to use cpu
Your max_length is set to 100, but your input_length is only 70. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)


Summary generated:  volcanic hazard probability volcanic eruption related geophysical event occur given geographic area...
Summarizing document: Geothermal energy


Device set to use cpu


Summary generated: Geothermal energy thermal energy extracted earth crust combine energy formation planet radioactive d...
Summarizing document: Volcanic island


Device set to use cpu


Summary generated:  volcanic island usually range size square kilometre sq mi island certain size usually fresh groundw...
Summarizing document: Island arc


Device set to use cpu


Summary generated:  island arc long chain active volcano intense seismic activity found along convergent tectonic plate...
Summarizing document: Seamount


Device set to use cpu


Summary generated: Seamounts follow distinctive evolutionary pattern eruption build subsidence erosion recent year seve...
Summarizing document: Hawaiian Islands


Device set to use cpu


Summary generated: Hawaiian island hawaiian mokupuni hawaiʻi archipelago eight major volcanic island several atoll nume...
Summarizing document: Prediction of volcanic activity


Device set to use cpu


Summary generated: Volcano awaken prepare erupt important link eruption volcano normally continuing low level seismic a...
Summarizing document: Seismology


Device set to use cpu
Your max_length is set to 100, but your input_length is only 90. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)


Summary generated: Earthquake is study scientific study earthquake generally quake generation propagation elastic wave ...
Summarizing document: United States presidential election


Device set to use cpu


Summary generated: Presidential election differ many republic around world operating either presidential system semi pr...
Summarizing document: United States Electoral College


Device set to use cpu


Summary generated:  united state electoral college group presidential elector formed every four year presidential elect...
Summarizing document: Primary election


Device set to use cpu


Summary generated: Primary election primary election held determine candidate run upcoming general election partisan pr...
Summarizing document: United States presidential primary


Device set to use cpu


Summary generated: Primary election caucus usually indirect election instead voter directly selecting particular person...
Summarizing document: 2020 United States presidential election


Device set to use cpu


Summary generated: U.S. presidential election held united state november democratic ticket former vice president joe bi...
Processing query ID: {'query_id': 17, 'query': 'How does the Electoral College work?', 'narrative': 'The user wants a detailed explanation of the Electoral College system, including elector selection, voting procedures, and historical rationale. Relevant documents cover Electoral College mechanics, criticisms, and constitutional basis. Documents about popular vote statistics without Electoral College context are irrelevant.', 'documents': [{'title': 'United States Electoral College', 'url': 'https://en.wikipedia.org/wiki/United_States_Electoral_College', 'content': 'united state electoral college group presidential elector formed every four year presidential election sole purpose voting president vice president process described article two constitution number electoral vote exercised state equal state congressional delegation number senator two plus number represe

Device set to use cpu


Summary generated:  united state electoral college group presidential elector formed every four year presidential elect...
Summarizing document: Twelfth Amendment to the United States Constitution


Device set to use cpu
Your max_length is set to 100, but your input_length is only 78. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)


Summary generated:  twelfth amendment amendment xii united state constitution provides procedure electing president vic...
Processing query ID: {'query_id': 18, 'query': 'Historical evolution of the US election process', 'narrative': 'The user seeks to understand how presidential elections have changed since 1789, including amendments and voting rights expansions. Relevant documents cover suffrage movements, constitutional changes, and electoral reforms. Documents about current elections only are less relevant.', 'documents': [{'title': 'History of the United States Constitution', 'url': 'https://en.wikipedia.org/wiki/History_of_the_United_States_Constitution', 'content': 'united state constitution served supreme law united state since taking effect document written philadelphia convention ratified series state convention held since constitution amended twenty seven time particularly important amendment include ten amendment united state bill right three reconstruction amendment consti

Device set to use cpu


Summary generated: United state constitution served supreme law united state since taking effect document written phila...
Summarizing document: President of the United States


Device set to use cpu


Summary generated: Presidential power ebbed flowed time presidency played increasing role american political life since...
Summarizing document: Voting Rights Act of 1965


Device set to use cpu


Summary generated: Voting right act considered effective piece federal civil right legislation ever enacted country nat...
Processing query ID: {'query_id': 19, 'query': 'How are electoral votes distributed by state?', 'narrative': 'The user seeks to understand the distribution mechanism of electoral votes across U.S. states. Relevant documents explain how the number of electoral votes per state is determined based on congressional representation, and may include historical context, population influence, and discussions about fairness or reform. Documents solely about state election results are less relevant.', 'documents': [{'title': 'United States Electoral College', 'url': 'https://en.wikipedia.org/wiki/United_States_Electoral_College', 'content': 'united state electoral college group presidential elector formed every four year presidential election sole purpose voting president vice president process described article two constitution number electoral vote exercised state equal sta

Device set to use cpu


Summary generated:  united state electoral college group presidential elector formed every four year presidential elect...
Summarizing document: United States congressional apportionment


Device set to use cpu


Summary generated: united state congressional apportionment process seat united state house representative distributed ...
Processing query ID: {'query_id': 20, 'query': 'What is the difference between primaries and caucuses in the U.S.?', 'narrative': 'The user wants to understand the differences between primaries and caucuses in the U.S. election process. Relevant documents should explain the mechanics, types, and significance of each method in selecting party nominees. Documents about general election processes without primary or caucus context should be considered less relevant.', 'documents': [{'title': 'Primary election', 'url': 'https://en.wikipedia.org/wiki/Primary_election', 'content': 'primary election primary election held determine candidate run upcoming general election partisan primary political party selects candidate depending state party may open primary voter eligible participate closed primary member political party vote less common nonpartisan primary candidate run 

Device set to use cpu


Summary generated: Primary election primary election held determine candidate run upcoming general election partisan pr...
Summarizing document: Caucus


Device set to use cpu
Your max_length is set to 100, but your input_length is only 50. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=25)


Summary generated: Caucus generally refers regular meeting member parliament mp belong parliamentary party party caucus...
Summarizing document: United States presidential primary


Device set to use cpu


Summary generated: Primary election caucus usually indirect election instead voter directly selecting particular person...
Processing query ID: {'query_id': 21, 'query': 'Benefits of meditation', 'narrative': 'The user wants to learn about the potential benefits of regular meditation practice. Relevant documents should cover physical, mental, and emotional benefits of meditation, scientific research on meditation effects, and different meditation techniques. Documents focusing only on spiritual or religious aspects without discussing benefits should be considered less relevant.', 'documents': [{'title': 'Meditation', 'url': 'https://en.wikipedia.org/wiki/Meditation', 'content': 'meditation practice individual us technique train attention awareness detach reflexive discursive thinking achieving mentally clear emotionally calm stable state judging meditation process technique broadly classified focused concentrative open monitoring method focused method involve attention specific object 

Device set to use cpu


Summary generated:  meditation practiced numerous religious tradition though also practised independently religious spi...
Processing query ID: {'query_id': 22, 'query': 'Meditation techniques for stress reduction', 'narrative': 'The user seeks effective meditation methods to alleviate stress. Relevant documents cover mindfulness, breathing exercises, and body scan techniques. Documents about non-stress-related meditation practices are less relevant.', 'documents': [{'title': 'Mindfulness', 'url': 'https://en.wikipedia.org/wiki/Mindfulness', 'content': 'mindfulness cognitive skill usually developed meditation sustaining meta attentive awareness towards content one mind present moment mindfulness derives sati significant element hindu buddhist tradition based chan guān tibetan meditation technique though definition technique mindfulness wide ranging buddhist tradition describe constitutes mindfulness perception past present future arise cease momentary sense impression mental phenomenon

Device set to use cpu


Summary generated:  mindfulness cognitive skill usually developed meditation sustaining meta attentive awareness toward...
Summarizing document: Progressive muscle relaxation


Device set to use cpu
Your max_length is set to 100, but your input_length is only 83. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=41)


Summary generated:  progressive muscle relaxation pmr method deep muscle relaxation involve medication meaning non phar...
Summarizing document: Yoga


Device set to use cpu


Summary generated: Yoga originated philosophy ancient india aimed controlling body mind attain various salvation goal p...
Summarizing document: Transcendental Meditation technique


Device set to use cpu
Your max_length is set to 100, but your input_length is only 44. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)


Summary generated:  transcendental meditation developed indian spiritual figure maharishi mahesh yogi us private mantra...
Summarizing document: Academic achievement


Device set to use cpu
Your max_length is set to 100, but your input_length is only 7. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)


Summary generated: School achievement california achievement school measured academic performance index academic achiev...
Summarizing document: Attention


Device set to use cpu
Your max_length is set to 100, but your input_length is only 31. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=15)


Summary generated:  attention focus concentration awareness phenomenon exclusion stimulus selective concentration discr...
Processing query ID: {'query_id': 24, 'query': 'Cultural origins of meditation practices', 'narrative': 'The user wants historical context without spiritual focus, covering practices across civilizations. Relevant documents cover ancient India, China, and secular adaptations. Religious doctrines are less relevant.', 'documents': [{'title': 'Yoga', 'url': 'https://en.wikipedia.org/wiki/Yoga', 'content': 'yoga uk u sanskrit य ग yoga joːɡɐ lit yoke union group physical mental spiritual practice discipline originated philosophy ancient india aimed controlling body mind attain various salvation goal practiced hindu jain buddhist tradition yoga may pre vedic origin first attested early first millennium bce developed various tradition eastern ganges basin drew common body practice including vedic element yoga like practice mentioned rigveda number early upanishad systemat

Device set to use cpu


Summary generated: Yoga originated philosophy ancient india aimed controlling body mind attain various salvation goal p...
Summarizing document: Taoist meditation


Device set to use cpu
Your max_length is set to 100, but your input_length is only 4. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=2)


Summary generated:  taoist meditation also spelled daoist refers traditional meditative practice associated with chines...
Processing query ID: {'query_id': 25, 'query': 'What are the scientifically proven benefits of meditation?', 'narrative': 'The user seeks peer-reviewed or widely accepted scientific findings on the physical and psychological benefits of meditation. Relevant documents include neuroscience studies, clinical trials, or summaries of physiological effects. Documents that focus only on anecdotal or spiritual claims are less relevant.', 'documents': [{'title': 'Effects of meditation', 'url': 'https://en.wikipedia.org/wiki/Effects_of_meditation', 'content': 'psychological physiological effect meditation studied recent year study meditation increasingly involved use modern instrument functional magnetic resonance imaging electroencephalography able observe brain physiology neural activity living subject either act meditation meditation correlation thus established meditativ

Device set to use cpu


Summary generated:  meditation generally deemed useful recent meta analysis show small moderate effect size mean effect...
Summarizing document: Mindfulness-based stress reduction


Device set to use cpu


Summary generated:  mindfulness based stress reduction mbsr educational program designed learning mindfulness discoveri...
Summarizing document: Brain activity and meditation


Device set to use cpu


Summary generated:  meditation effect brain activity central nervous system became focus collaborative research neurosc...
Summarizing document: Zen and the Brain


Device set to use cpu


Summary generated:  zen brain toward understanding meditation consciousness book neurologist zen practitioner james h a...
Summarizing document: Renewable energy


Device set to use cpu


Summary generated:  renewable energy also called green energy energy made renewable natural resource replenished human ...
Summarizing document: Solar energy


Device set to use cpu


Summary generated: Renewable energy technology broadly characterized either passive solar active solar depending captur...
Summarizing document: Wind power


Device set to use cpu
Your max_length is set to 100, but your input_length is only 61. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=30)


Summary generated: Wind power generated almost completely using wind turbine generally grouped wind farm connected elec...
Summarizing document: Hydroelectricity


Device set to use cpu


Summary generated: Hydropower used since ancient time grind flour perform task late th century hydraulic power provided...
Summarizing document: Energy storage


Device set to use cpu


Summary generated:  energy storage capture energy produced one time use later time reduce imbalance energy demand energ...
Summarizing document: Lithium-ion battery


Device set to use cpu


Summary generated: Li ion battery enabled portable consumer electronics laptop computer cellular phone electric car li ...
Summarizing document: Pumped-storage hydroelectricity


Device set to use cpu


Summary generated: Pumped storage hydroelectricity allows energy intermittent source solar wind renewables excess elect...
Summarizing document: Hydrogen storage


Device set to use cpu


Summary generated: Several method exist storing hydrogen including mechanical approach using high pressure low temperat...
Processing query ID: {'query_id': 28, 'query': 'Global renewable energy adoption rates', 'narrative': 'The user seeks data on countries leading in renewables (e.g., Germany, China) and adoption challenges. Relevant documents cover policy frameworks and energy transitions. Documents about non-renewable adoption are irrelevant.', 'documents': [{'title': 'List of renewable energy topics by country and territory', 'url': 'https://en.wikipedia.org/wiki/List_of_renewable_energy_topics_by_country_and_territory', 'content': 'list renewable energy topic country territory link used compare development renewable energy different country territory help encourage new writer participate writing development country country interest list refers renewable energy general well solar power wind power geothermal energy biofuel hydropower china germany japan india four world largest eco

Device set to use cpu


Summary generated: List refers renewable energy general well solar power wind power geothermal energy biofuel hydropowe...
Summarizing document: Energiewende


Device set to use cpu


Summary generated: Germany new system intends rely heavily renewable energy particularly wind photovoltaics hydroelectr...
Summarizing document: Renewable energy in China


Device set to use cpu


Summary generated: China world top electricity producer renewable energy source china renewable energy sector growing f...
Processing query ID: {'query_id': 29, 'query': 'Environmental impact of renewable energy', 'narrative': 'The user wants to understand the ecological effects of renewable energy sources. Relevant documents should cover land use, wildlife impacts, and lifecycle assessments. Documents about fossil fuel impacts without renewable context are less relevant.', 'documents': [{'title': 'Environmental impact of renewable energy', 'url': 'https://en.wikipedia.org/wiki/Environmental_impact_of_electricity_generation', 'content': 'electric power system consist generation plant different energy source transmission network distribution line component environmental impact multiple stage development use including construction generation electricity decommissioning disposal impact split operational impact fuel sourcing global atmospheric localized pollution construction impact manufa

Device set to use cpu
Your max_length is set to 100, but your input_length is only 55. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)


Summary generated: Electric power system consist generation plant different energy source transmission network distribu...
Summarizing document: Life cycle assessment


Device set to use cpu
Your max_length is set to 100, but your input_length is only 46. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)


Summary generated: Lca study involves thorough inventory energy material required across supply chain value chain produ...
Summarizing document: Solar power


Device set to use cpu


Summary generated: Solar power also known solar electricity conversion energy sunlight electricity either directly usin...
Summarizing document: Wind power


Device set to use cpu
Your max_length is set to 100, but your input_length is only 61. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=30)


Summary generated: Wind power generated almost completely using wind turbine generally grouped wind farm connected elec...
Summarizing document: Biofuel


Device set to use cpu


Summary generated:  biofuel fuel produced short time span biomass rather slow natural process involved formation fossil...
Summarizing document: Energy transition


Device set to use cpu


Summary generated:  energy transition energy system transformation major structural change energy supply consumption en...
Processing query ID: {'query_id': 31, 'query': 'How does excessive social media use impact adolescent mental health?', 'narrative': 'User is interested in understanding the psychological effects of social media platforms on teenagers, including issues like anxiety, depression, self-esteem, and cyberbullying.', 'documents': [{'title': 'Social media and mental health', 'url': 'https://en.wikipedia.org/wiki/Digital_media_use_and_mental_health', 'content': 'relationship digital medium use mental health investigated various researcher predominantly psychologist sociologist anthropologist medical expert especially since mid growth world wide web rise text messaging significant body research explored overuse phenomenon commonly known digital addiction digital dependency phenomenon manifest differently many society culture expert investigated benefit moderate digital mediu

Device set to use cpu


Summary generated: Overuse phenomenon commonly known digital addiction digital dependency phenomenon manifest different...
Summarizing document: Cyberbullying


Device set to use cpu


Summary generated:  cyberbullying cyberharassment online bullying form bullying harassment using electronic mean since ...
Processing query ID: {'query_id': 32, 'query': 'Social media addiction and its effects', 'narrative': 'User is looking for information on the addictive nature of social media platforms and its consequences on daily life, productivity, and relationships.', 'documents': [{'title': 'Internet addiction disorder', 'url': 'https://en.wikipedia.org/wiki/Internet_addiction_disorder', 'content': 'internet addiction disorder iad also known problematic internet use pathological internet use problematic compulsive use internet particularly social medium impairs individual function prolonged period time young people particular risk developing internet addiction disorder case study highlighting student whose academic performance decline spend time online experience health consequence loss sleep stay continue scrolling chatting gaming excessive internet use recognized disorder am

Device set to use cpu


Summary generated: Internet addiction disorder iad also known problematic internet use pathological internet use proble...
Summarizing document: Social media addiction


Device set to use cpu


Summary generated:  problematic social medium use refers use social medium virtual online community engagement often ex...
Summarizing document: Digital detox


Device set to use cpu


Summary generated:  digital detox deliberate break digital device mitigate screen overuse promote offline activity emer...
Processing query ID: {'query_id': 33, 'query': 'Impact of social media on body image', 'narrative': 'User wants to understand how social media influences perceptions of body image, including issues like eating disorders, self-esteem, and societal standards.', 'documents': [{'title': 'Body image', 'url': 'https://en.wikipedia.org/wiki/Body_image', 'content': 'body image person thought feeling perception aesthetic sexual attractiveness body concept body image used several discipline including neuroscience psychology medicine psychiatry psychoanalysis philosophy cultural feminist study medium also often us term across discipline single consensus definition broadly speaking body image consists way people view memory experience assumption comparison appearance overall attitude towards respective height shape weight shaped prevalent social cultural ideal body image negat

Device set to use cpu
Your max_length is set to 100, but your input_length is only 6. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)


Summary generated: Body image consists way people view memory experience assumption comparison appearance overall attit...
Summarizing document: Eating disorder


Device set to use cpu


Summary generated: Eating disorder mental disorder defined abnormal eating behavior adversely affect person physical me...
Summarizing document: Self-esteem


Device set to use cpu


Summary generated: Self esteem encompasses belief oneself example loved worthy well emotional state triumph despair pri...
Summarizing document: Social comparison theory


Device set to use cpu
Your max_length is set to 100, but your input_length is only 35. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)


Summary generated:  social comparison theory initially proposed social psychologist leon festinger center belief indivi...
Summarizing document: Misinformation


Device set to use cpu


Summary generated:  misinformation incorrect misleading information misinformation disinformation interchangeable term ...
Summarizing document: Fake news


Device set to use cpu


Summary generated: Fake news involves spreading false information harmful intent sometimes generated propagated hostile...
Summarizing document: Disinformation


Device set to use cpu


Summary generated:  disinformation misleading content deliberately spread deceive people secure economic political gain...
Processing query ID: {'query_id': 35, 'query': "Social media's role in political polarization", 'narrative': 'User wants to explore how social media contributes to political polarization and echo chambers, including its effects on public discourse and democracy.', 'documents': [{'title': 'Political polarization', 'url': 'https://en.wikipedia.org/wiki/Political_polarization', 'content': 'political polarization spelled polarisation british english australian english new zealand english divergence political attitude away center towards ideological extreme scholar distinguish ideological polarization difference policy position affective polarization emotional dislike distrust political group discussion polarization political science consider polarization context political party democratic system government two party system political polarization usually embodies tensio

Device set to use cpu


Summary generated: Polarisation is the study of political polarization. Political polarization usually embodies tension...
Summarizing document: Echo chamber (media)


Device set to use cpu
Your max_length is set to 100, but your input_length is only 54. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)


Summary generated:  echo chamber may increase social political polarization extremism social medium thought echo chambe...
Summarizing document: Social media and politics


Device set to use cpu


Summary generated: Online medium audience member largely passive consumer content creation dominated small number socia...
Summarizing document: Video game


Device set to use cpu


Summary generated: Video game industry expanded onto mobile gaming mobile device smartphones tablet computer virtual au...
Summarizing document: Cognitive development


Device set to use cpu


Summary generated: Cognitive development defined emergence ability consciously cognize understand articulate understand...
Summarizing document: Child development


Device set to use cpu


Summary generated: Child development involves biological psychological emotional change occur human being birth conclus...
Processing query ID: {'query_id': 37, 'query': 'How does bilingualism affect cognitive skills?', 'narrative': 'User is interested in understanding the cognitive advantages or disadvantages of being bilingual, including effects on memory, attention, and problem-solving.', 'documents': [{'title': 'Bilingualism', 'url': 'https://en.wikipedia.org/wiki/Multilingualism', 'content': 'multilingualism use one language either individual speaker group speaker language two usually called bilingualism believed multilingual speaker outnumber monolingual speaker world population half european claim speak least one language mother tongue many read write one language multilingual advantageous people wanting participate trade globalization cultural openness owing ease access information facilitated internet individual exposure multiple language become increasingly possible people sp

Device set to use cpu


Summary generated:  multilingualism use one language either individual speaker group speaker language two usually calle...
Summarizing document: Cognitive skills


Device set to use cpu


Summary generated: Cognitive functioning refers person ability process thought defined ability individual perform vario...
Summarizing document: Memory


Device set to use cpu
Your max_length is set to 100, but your input_length is only 84. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=42)


Summary generated:  memory faculty mind data information encoded stored retrieved needed retention information time pur...
Processing query ID: {'query_id': 38, 'query': 'The impact of early childhood education on cognitive development', 'narrative': 'User seeks to understand how early childhood education programs influence cognitive skills, including language, math, and social skills.', 'documents': [{'title': 'Early childhood education', 'url': 'https://en.wikipedia.org/wiki/Early_childhood_education', 'content': 'early childhood education ece also known nursery education branch education theory relates teaching child formally informally birth age eight traditionally equivalent third grade ece described important period child development ece emerged field study enlightenment particularly european country high literacy rate continued grow nineteenth century universal primary education became norm western world recent year early childhood education become prevalent public policy issue 

Device set to use cpu


Summary generated:  early childhood education ece also known nursery education branch education theory relates teaching...
Summarizing document: Cognitive development


Device set to use cpu


Summary generated: Cognitive development defined emergence ability consciously cognize understand articulate understand...
Summarizing document: Language development


Device set to use cpu


Summary generated: language development human process start early life infant start without knowing language yet month ...
Processing query ID: {'query_id': 39, 'query': 'The role of nutrition in cognitive development', 'narrative': 'User is interested in understanding how nutrition affects cognitive skills, including memory, attention, and learning abilities.', 'documents': [{'title': 'Nutrition', 'url': 'https://en.wikipedia.org/wiki/Nutrition', 'content': 'nutrition biochemical physiological process organism us food support life provides organism nutrient metabolized create energy chemical structure failure obtain required amount nutrient cause malnutrition nutritional science study nutrition though typically emphasizes human nutrition type organism determines nutrient need obtains organism obtain nutrient consuming organic matter consuming inorganic matter absorbing light combination produce nutrient internally consuming basic element must consume organism obtain pre existing nutri

Device set to use cpu
Your max_length is set to 100, but your input_length is only 55. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)


Summary generated:  nutrition biochemical physiological process organism us food support life provides organism nutrien...
Summarizing document: Cognitive development


Device set to use cpu


Summary generated: Cognitive development defined emergence ability consciously cognize understand articulate understand...
Summarizing document: Memory


Device set to use cpu
Your max_length is set to 100, but your input_length is only 84. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=42)


Summary generated:  memory faculty mind data information encoded stored retrieved needed retention information time pur...
Processing query ID: {'query_id': 40, 'query': 'The effects of screen time on cognitive development in children', 'narrative': 'User seeks to understand how screen time affects cognitive skills, including attention, memory, and learning abilities.', 'documents': [{'title': 'Screen time', 'url': 'https://en.wikipedia.org/wiki/Screen_time', 'content': 'screen time amount time spent using electronic device display screen smartphone computer television video game console tablet concept significant research related concept digital medium use mental health screen time correlated mental physical harm child development positive negative health effect screen time particular individual influenced level content exposure prevent harmful excess screen time government placed regulation usage history statistic first electronic screen cathode ray tube crt invented commercialized c

Device set to use cpu


Summary generated:  screen time amount time spent using electronic device display screen smartphone computer television...
Summarizing document: Cognitive development


Device set to use cpu


Summary generated: Cognitive development defined emergence ability consciously cognize understand articulate understand...
Summarizing document: Attention (psychology)


Device set to use cpu
Your max_length is set to 100, but your input_length is only 31. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=15)


Summary generated:  attention focus concentration awareness phenomenon exclusion stimulus selective concentration discr...
Summarizing document: Sleep deprivation in higher education


Device set to use cpu
Your max_length is set to 100, but your input_length is only 62. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=31)


Summary generated:  sleep deprivation condition enough sleep common health issue student higher education issue several...
Summarizing document: Sleep deprivation


Device set to use cpu


Summary generated:  sleep deprivation also known sleep insufficiency sleeplessness condition adequate duration quality ...
Summarizing document: Cognitive function


Device set to use cpu


Summary generated: Cognitive functioning refers person ability process thought defined ability individual perform vario...
Summarizing document: Effects on sleep and mental health


Device set to use cpu


Summary generated: Polar night phenomenon occurs northernmost southernmost region earth sun remains horizon hour occurs...
Summarizing document: Anxiety disorder


Device set to use cpu


Summary generated: Anxiety is a significant uncontrollable feeling anxiety fear person social occupational personal fun...
Summarizing document: Depression (mood)


Device set to use cpu
Your max_length is set to 100, but your input_length is only 15. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)


Summary generated:  depression is a mental state low mood aversion activity affect global population million people wor...
Processing query ID: {'query_id': 43, 'query': 'How does sleep affect physical health?', 'narrative': 'User seeks to understand the impact of sleep on physical health, including its role in immune function, metabolism, and chronic diseases.', 'documents': [{'title': 'Sleep and health', 'url': 'https://en.wikipedia.org/wiki/Sleep', 'content': 'sleep state reduced mental physical activity consciousness altered certain sensory activity inhibited sleep marked decrease muscle activity interaction surrounding environment sleep differs wakefulness term ability react stimulus still involves active brain pattern making reactive coma disorder consciousness sleep occurs repeating period body alternate two distinct mode rapid eye movement sleep rem non rem sleep although rem stand rapid eye movement mode sleep many aspect including virtual paralysis body dream succession image

Device set to use cpu
Your max_length is set to 100, but your input_length is only 20. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=10)


Summary generated: Human may suffer various sleep disorder including dyssomnias insomnia hypersomnia narcolepsy sleep a...
Summarizing document: Immune system


Device set to use cpu


Summary generated:  immune system network biological system protects organism disease detects responds wide variety pat...
Summarizing document: Metabolism


Device set to use cpu


Summary generated:  metabolic reaction is a chemical reaction that occurs in a living organism. metabolic reaction may ...
Summarizing document: Sleep disorder


Device set to use cpu


Summary generated:  sleep disorder somnipathy medical disorder affecting individual sleep pattern sometimes impacting p...
Summarizing document: Insomnia


Device set to use cpu


Summary generated:  insomnia also known sleeplessness sleep disorder people difficulty sleeping may difficulty falling ...
Summarizing document: Sleep apnea


Device set to use cpu


Summary generated:  sleep apnea is a chronic breathing disorder that disrupts normal sleep. Sleep apnea affects one mil...
Processing query ID: {'query_id': 45, 'query': 'Sleep hygiene practices for better sleep quality', 'narrative': 'User seeks to learn about effective sleep hygiene practices that can improve sleep quality and duration.', 'documents': [{'title': 'Sleep hygiene', 'url': 'https://en.wikipedia.org/wiki/Sleep_hygiene', 'content': 'sleep hygiene behavioral environmental practice developed late method help people mild moderate insomnia clinician assess sleep hygiene people insomnia condition depression offer recommendation based assessment sleep hygiene recommendation include establishing regular sleep schedule using nap care exercising physically mentally close bedtime limiting worry limiting exposure light hour sleep getting bed sleep come using bed anything sleep sex avoiding alcohol well nicotine caffeine stimulant hour bedtime peaceful comfortable dark sleep environme

Device set to use cpu
Your max_length is set to 100, but your input_length is only 80. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


Summary generated: Sleep hygiene recommendation include establishing regular sleep schedule using nap care exercising p...
Summarizing document: Circadian rhythm


Device set to use cpu


Summary generated:  circadian rhythm refers process originates within organism e endogenous responds environment entrai...
Processing query ID: {'query_id': 46, 'query': 'History of human spaceflight', 'narrative': 'The user wants to understand the major milestones in human space exploration from the first launch to the present. Relevant documents should discuss key missions, astronauts, and space agencies.', 'documents': [{'title': 'Human spaceflight', 'url': 'https://en.wikipedia.org/wiki/Human_spaceflight', 'content': 'human spaceflight also referred manned spaceflight crewed spaceflight spaceflight crew passenger aboard spacecraft often spacecraft operated directly onboard human crew spacecraft also remotely operated ground station earth autonomously without direct human involvement people trained spaceflight called astronaut american cosmonaut russian taikonauts chinese non professional referred spaceflight participant spacefarers first human space soviet cosmonaut yuri gagarin la

Device set to use cpu


Summary generated: Human spaceflight also referred manned spaceflight crewed spaceflight. Spaceflight crew passenger ab...
Summarizing document: Vostok 1


Device set to use cpu


Summary generated: Vostok programme first human orbital spaceflight history vostok ka space capsule launched baikonur c...
Summarizing document: Apollo program


Device set to use cpu


Summary generated:  apollo program also known project apollo united state human spaceflight program led nasa successful...
Processing query ID: {'query_id': 47, 'query': 'Current space missions and projects', 'narrative': 'The user is looking for information on active space missions such as Artemis, Mars rovers, and private space flights. Documents should focus on ongoing or upcoming efforts in space exploration.', 'documents': [{'title': 'Artemis program', 'url': 'https://en.wikipedia.org/wiki/Artemis_program', 'content': 'artemis program moon exploration program led united state national aeronautics space administration nasa formally established via space policy directive intended reestablish human presence moon first time since apollo mission program stated long term goal establish permanent base moon facilitate human mission mar two principal element artemis program derived cancelled constellation program orion spacecraft esm instead u built service module space launch system solid

Device set to use cpu


Summary generated:  artemis program moon exploration program led united state national aeronautics space administration...
Summarizing document: Mars 2020


Device set to use cpu


Summary generated:  mar nasa mission includes rover perseverance retired small robotic helicopter ingenuity associated ...
Summarizing document: James Webb Space Telescope


Device set to use cpu


Summary generated: James webb space telescope designed conduct infrared astronomy largest telescope space equipped high...
Summarizing document: SpaceX


Device set to use cpu
Your max_length is set to 100, but your input_length is only 15. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)


Summary generated:  space exploration technology corp commonly referred spacex american space technology company headqu...
Summarizing document: Blue Origin


Device set to use cpu


Summary generated:  blue origin enterprise l p american space technology company headquartered kent washington company ...
Summarizing document: Virgin Galactic


Device set to use cpu


Summary generated:  virgin galactic holding inc british american spaceflight company founded richard branson virgin gro...
Summarizing document: Commercial spaceflight


Device set to use cpu


Summary generated: Private spaceflight accomplishment date include flying suborbital spaceplanes spaceshipone spaceship...
Processing query ID: {'query_id': 49, 'query': 'Challenges of long-term space travel', 'narrative': 'The user seeks information on health risks, life support, and engineering challenges faced during prolonged space missions like those to Mars.', 'documents': [{'title': 'Human mission to Mars', 'url': 'https://en.wikipedia.org/wiki/Human_mission_to_Mars', 'content': 'idea sending human mar subject aerospace engineering scientific study since late part broader exploration mar long term proposal included sending settler terraforming planet currently robotic lander rover helicopter mar farthest human beyond earth moon u national aeronautics space administration nasa apollo program ended conceptual proposal mission would involve human explorer started early planned mission typically stated taking place year time drafted list crewed mar mission plan show various mission 

Device set to use cpu
Your max_length is set to 100, but your input_length is only 60. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=30)


Summary generated: Long term proposal included sending settler terraforming planet currently robotic lander rover helic...
Summarizing document: Space medicine


Device set to use cpu


Summary generated:  space medicine subspecialty emergency medicine fellowship training pathway evolved aerospace medici...
Summarizing document: Life support system


Device set to use cpu


Summary generated: Life support system combination equipment allows survival environment situation would support life a...
Summarizing document: International Space Station


Device set to use cpu


Summary generated: Largest space station ever constructed primarily serf platform conducting scientific experiment micr...
Summarizing document: European Space Agency


Device set to use cpu


Summary generated: European space agency esa member international organization devoted space exploration headquarters p...
Saving processed dataset to data/bart_summarized.json...
Done!


In [28]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import torch

device = 0 if torch.backends.mps.is_available() else -1  # -1 = CPU

def summarize_documents(model_name, model_id, content):

    def longformer_summarize(text, tokenizer, model):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=16384)
        input_ids = inputs.input_ids
        attention_mask = inputs.attention_mask
        global_attention_mask = torch.zeros_like(input_ids)
        global_attention_mask[:, 0] = 1

        summary_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask,
            max_length=150,
            num_beams=4
        )
        return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    if model_name == "longformer":
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

        summary = longformer_summarize(content, tokenizer, model)
    else:
        summarizer = pipeline("summarization", model=model_id, tokenizer=model_id, device=-1)
        summary = summarizer(content['content'], max_length=100, min_length=30, do_sample=False)

    return model, summary

In [29]:
for name, model_id in models.items():
    print(f"Loading {name} model...")
    docs = [dataset[0]['documents'][0]] # A list of one document
    model, summary = summarize_documents(name, model_id, docs)
    print(f"Model {name} loaded successfully.")
    print(f"Summary for {dataset[0]['documents'][0]['title']}:")
    print(summary)
    print("\n")

Loading pegasus model...


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


TypeError: list indices must be integers or slices, not str

In [None]:


models = {
    "pegasus": "google/pegasus-xsum",
    "bart": "facebook/bart-large-cnn",
    "t5": "t5-base",
    "longformer": "allenai/led-base-16384"
}

summarized_dataset = dataset

# Longformer summarizer function
def longformer_summarize(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=16384)
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask
    global_attention_mask = torch.zeros_like(input_ids)
    global_attention_mask[:, 0] = 1  # global attention on first token

    summary_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        global_attention_mask=global_attention_mask,
        max_length=150,
        num_beams=4
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Run for each model
for name, model_id in models.items():
    summarized_docs = []

    if name == "longformer":
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

        for doc in documents:
            summary = longformer_summarize(doc["content"], tokenizer, model)
            summarized_docs.append({
                "title": doc["title"],
                "url": doc["url"],
                "summary": summary
            })
    else:
        summarizer = pipeline("summarization", model=model_id, tokenizer=model_id)
        for doc in documents:
            summary = summarizer(doc["content"], max_length=100, min_length=30, do_sample=False)[0]['summary_text']
            summarized_docs.append({
                "title": doc["title"],
                "url": doc["url"],
                "summary": summary
            })

    filename = f"summarized_output_{name}.json"
    with open(filename, "w") as f:
        json.dump(output_data, f, indent=2)

    print(f"✅ Saved: {filename}")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use mps:0
Your max_length is set to 100, but your input_length is only 25. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max_length is set to 100, but your input_length is only 25. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)


✅ Saved: summarized_output_pegasus.json


Device set to use mps:0
Your max_length is set to 100, but your input_length is only 28. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
Your max_length is set to 100, but your input_length is only 26. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


✅ Saved: summarized_output_bart.json


Device set to use mps:0
Your max_length is set to 100, but your input_length is only 34. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)
Your max_length is set to 100, but your input_length is only 28. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)


✅ Saved: summarized_output_t5.json


In [None]:
%pip install torch