In [10]:
import requests
from bs4 import BeautifulSoup
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

In [11]:
# Ensure you have the VADER lexicon downloaded
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/izzymohamed/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [12]:
def fetch_webpage(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        raise Exception(f"Failed to fetch the webpage: Status code {response.status_code}")

In [13]:
def extract_text(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    paragraphs = soup.find_all('p')
    page_text = ' '.join([para.get_text() for para in paragraphs])
    return page_text

In [14]:
def chunk_text(text, tokenizer, max_length=512):
    # Tokenize text and split into chunks of max_length tokens
    inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True, padding="max_length")
    chunks = []
    for i in range(0, len(inputs["input_ids"][0]), max_length):
        chunk = tokenizer.decode(inputs["input_ids"][0][i:i + max_length], skip_special_tokens=True)
        chunks.append(chunk)
    return chunks

In [19]:
def summarize_text(text):
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
    summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
    
    chunks = chunk_text(text, tokenizer)
    summaries = [summarizer(chunk, max_length=500, min_length=30, do_sample=False)[0]['summary_text'] for chunk in chunks]
    return ' '.join(summaries)

In [20]:
def perform_sentiment_analysis(text):
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = sid.polarity_scores(text)
    return sentiment_scores

In [21]:
def main(url):
    # Fetch the webpage content
    html_content = fetch_webpage(url)
    
    # Extract text from the HTML content
    page_text = extract_text(html_content)
    
    # Summarize the text
    summary = summarize_text(page_text)
    
    # Perform sentiment analysis on the summary
    sentiment = perform_sentiment_analysis(summary)
    
    return {
        'summary': summary,
        'sentiment': sentiment
    }

In [22]:
# Example usage
url = 'https://www.atlanticcouncil.org/in-depth-research-reports/report/egypt-stability-gcc-priority/'  # Replace with the actual URL

result = main(url)
print("Summary:", result['summary'])
print("Sentiment:", result['sentiment'])

Summary: New Atlanticist is where top experts and policymakers offer exclusive insight on the most pressing global challenges. UkraineAlert provides regular news and analysis on developments in Ukraine’s politics, economy, civil society, and culture. MENASource offers the latest news from across the Middle East.
Sentiment: {'neg': 0.0, 'neu': 0.892, 'pos': 0.108, 'compound': 0.4336}
