In [None]:
pip install feedparser beautifulsoup4 requests transformers

In [None]:
import feedparser
import requests
from bs4 import BeautifulSoup

# Mapping of publication name -> RSS feed URL for every site to scrape.
# The key is passed around as the article's site label; the value is the
# feed that gets parsed for article links.
RSS_FEEDS = {
    "POWER Magazine": "https://www.powermag.com/feed/",
    "Power Engineering": "https://www.power-eng.com/rss/",
    "Power Technology": "https://www.power-technology.com/feed/",
    "Energy News Network": "https://energynews.us/feed/",
    "Energy-Storage.news": "https://www.energy-storage.news/feed/",
    "Power Info Today": "https://www.powerinfotoday.com/feed/",
    "Renewable Energy World": "https://www.renewableenergyworld.com/feed/",
}

def fetch_links(feed_url, site_name):
    """
    Fetch news article links from an RSS feed.

    A feed flagged as "bozo" (not well-formed) is still used when the parser
    managed to extract entries from it; only a feed with no entries at all is
    treated as unusable. Entries without a link are skipped because there is
    nothing to download for them, and a missing title becomes an empty string
    instead of aborting the whole feed.

    Parameters:
    feed_url (str): The URL of the RSS feed.
    site_name (str): The name of the site being fetched.

    Returns:
    list: A list of tuples containing the site name, title, and link.
          Empty when the feed has no usable entries or an error occurs.
    """
    try:
        feed = feedparser.parse(feed_url)

        entries = feed.get("entries") or []
        if not entries:
            print(f"No valid entries found in RSS feed for {site_name}.")
            return []

        if feed.get("bozo"):
            # Report the problem but keep going: the entries were parsed.
            print(
                f"Warning: RSS feed for {site_name} is not well-formed "
                f"({feed.get('bozo_exception')}); using the entries that were parsed."
            )

        links = []
        for entry in entries:
            link = entry.get("link")
            if not link:
                # Nothing to fetch for this entry.
                continue
            links.append((site_name, entry.get("title", ""), link))
        return links
    except Exception as e:
        # Best-effort: one broken feed must not stop the remaining sites.
        print(f"Error fetching links for {site_name}: {e}")
        return []

def _paragraph_text(node):
    """Join the text of every <p> under ``node`` and drop anything from "Select Country" onward."""
    text = ' '.join(p.get_text() for p in node.find_all('p')).strip()
    # Everything from the first "Select Country" marker is cut off; the
    # original code treated it as the start of trailing non-article text.
    return text.partition("Select Country")[0].strip()


def fetch_main_content(url):
    """
    Fetch and extract the main news content from a webpage.

    Candidate containers are tried in order; the first one that yields
    non-empty paragraph text wins. A container that matches but produces no
    text (no <p> tags, or only blank ones) is skipped rather than ending the
    search. If no candidate yields text, every <p> on the page is used.

    Parameters:
        url (str): The URL of the webpage.

    Returns:
        str: Extracted main content text, or "" if the page could not be
        fetched/parsed or contains no paragraph text.
    """
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Potential main-content containers, most specific guess first.
        potential_tags = [
            ('article', None),
            ('div', {'class': 'content'}),
            ('div', {'class': 'article-content'}),
            ('div', {'class': 'entry-content'}),
            ('div', {'class': 'post-content'}),
            ('div', {'class': 'main-content'}),
        ]

        for tag, attrs in potential_tags:
            container = soup.find(tag, attrs)
            if container is None:
                continue
            text = _paragraph_text(container)
            if text:
                return text

        # Fallback to all <p> tags on the page.
        return _paragraph_text(soup)

    except Exception as e:
        # Best-effort scraping: report the failure and let the caller skip the article.
        print(f"Error fetching webpage content from {url}: {e}")
        return ""

if __name__ == "__main__":
    # Walk every configured feed, download each linked article and keep the
    # ones whose body text could be extracted.
    print("Fetching news links from all sites...")
    all_articles = []

    for site_name, feed_url in RSS_FEEDS.items():
        links = fetch_links(feed_url, site_name)
        print(f"\n{site_name} - Found {len(links)} articles.")

        for site, title, link in links:
            print(f"\nTitle: {title}")
            print(f"Link: {link}")

            article_text = fetch_main_content(link)

            if not article_text:
                print("Failed to extract content.")
            else:
                # Only the first 5000 characters are echoed; the full text is stored.
                print(f"Extracted Main Content:\n{article_text[:5000]}")
                all_articles.append((site, title, article_text, link))

            print("-" * 80)

    print(f"\nTotal Articles Extracted: {len(all_articles)}")


Fetching news links from all sites...

POWER Magazine - Found 10 articles.

Title: Russia Hits Ukrainian Power Stations with Christmas Day Attacks
Link: https://www.powermag.com/russia-hits-ukrainian-power-stations-with-christmas-day-attacks/
Extracted Main Content:
DTEK, Ukraine’s largest private energy company, reported power outages across large areas of the country after Russia fired missiles against DTEK power stations in strikes on Christmas day. DTEK said it was the 13th massive attack on Ukrainian energy systems this year, and the 10th to target DTEK power stations. At least one long-time power engineer—Dmytro Petlenko, who worked at one of DTEK’s thermal power plants—was killed during the attacks. Petlenko had reportedly spent more than 30 years working in Ukraine’s energy sector including at the Zaporizhzhia thermal power plant and more recently at another DTEK facility after relocating as a result of the war. The latest Russian attack caused serious damage to Ukraine’s power

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import itertools

# Rank every pair of articles by TF-IDF cosine similarity and keep the best ones
def find_top_similar_articles(articles, top_n=10):
    """
    Score every pair of articles by textual similarity and return the closest pairs.

    Each article is represented by its title followed by its main content;
    the texts are vectorised with TF-IDF (English stop words removed) and
    compared with cosine similarity.

    Parameters:
    articles (list): A list of tuples containing site_name, title, main_content, and link.
    top_n (int): Number of top similar article pairs to return.

    Returns:
    list: Tuples of (similarity score, first article, second article),
          ordered from most to least similar. Empty if fewer than two
          articles are given.
    """
    article_count = len(articles)
    if article_count < 2:
        print("Not enough articles to compare.")
        return []

    # One document per article: "<title> <main content>"
    documents = [f"{headline} {body}" for _site, headline, body, _url in articles]

    # TF-IDF vectors, then the full pairwise cosine-similarity matrix
    tfidf_matrix = TfidfVectorizer(stop_words='english').fit_transform(documents)
    scores = cosine_similarity(tfidf_matrix)

    # Every unordered pair (first < second) together with its score
    ranked_pairs = [
        (scores[first, second], articles[first], articles[second])
        for first, second in itertools.combinations(range(article_count), 2)
    ]

    # Highest score first; ties keep their generation order
    ranked_pairs.sort(key=lambda pair: pair[0], reverse=True)
    return ranked_pairs[:top_n]

# Sample usage: rank the scraped articles and print the closest pairs
if __name__ == "__main__":
    # Expects all_articles to already hold (site_name, title, main_content, link)
    # tuples produced by the scraping cell.

    print("Comparing articles for similarity...")
    top_similarities = find_top_similar_articles(all_articles, top_n=10)

    print("\nTop 10 Most Similar Articles:")
    for idx, (score, article1, article2) in enumerate(top_similarities, start=1):
        # One print call; sep="\n" puts each part on its own line.
        print(
            f"\n{idx}. Similarity Score: {score:.4f}",
            f"Article 1 - Site: {article1[0]}, Title: {article1[1]}",
            f"Article 2 - Site: {article2[0]}, Title: {article2[1]}",
            "-" * 80,
            sep="\n",
        )


Comparing articles for similarity...

Top 10 Most Similar Articles:

1. Similarity Score: 0.9911
Article 1 - Site: Energy News Network, Title: How one nonprofit is working to build support for solar — and added benefits for communities — in rural North Carolina
Article 2 - Site: Renewable Energy World, Title: How one nonprofit is working to build support for solar — and added benefits for communities — in rural North Carolina
--------------------------------------------------------------------------------

2. Similarity Score: 0.9888
Article 1 - Site: Power Engineering, Title: Ameren Missouri brings 500 MW of new solar online
Article 2 - Site: Renewable Energy World, Title: Ameren Missouri brings 500 MW of new solar online
--------------------------------------------------------------------------------

3. Similarity Score: 0.5444
Article 1 - Site: POWER Magazine, Title: Flamanville 3 Reactor Online in France After 12-Year Delay
Article 2 - Site: Power Technology, Title: France connect

In [None]:
# Collect, for each similar pair, the title and the body text to summarise.
# Both are taken from the SAME article (the first of the pair). Previously the
# title came from the first article and the content from the second, so a
# summary could be written under a headline belonging to a different article.
main_title = [article1[1] for _score, article1, _article2 in top_similarities]
main_content = [article1[2] for _score, article1, _article2 in top_similarities]

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer

# Load BART model and tokenizer
# Both are created once at module level and reused by generate_summary() for
# every article; from_pretrained() is given the checkpoint name below.
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

def generate_summary(text):
    """
    Generate a summary for a single article's main content using BART.

    The article text is passed to the model as-is. The previous "summarize: "
    prefix is a T5-style task prompt; BART has no such convention, so the
    prefix was simply treated as part of the article. Empty or
    whitespace-only input returns "" without running the model.

    Parameters:
        text (str): The main content of the article.

    Returns:
        str: A summary of the main content ("" for empty input).
    """
    if not text or not text.strip():
        return ""

    # Tokenize the text; input longer than 1024 tokens is truncated.
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)

    # Generate the summary with beam search (4 beams), 100-400 tokens long.
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=400,
        min_length=100,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# File path in Google Drive
# NOTE(review): this path presumably only exists once Google Drive is mounted
# in the session; no cell visible in this notebook does that — confirm the
# drive is mounted before running, otherwise open() will fail.
output_file_path = '/content/drive/MyDrive/summaries output.txt'

# Write one "Title / Summary" section per article. UTF-8 is requested
# explicitly because the scraped text contains non-ASCII punctuation and the
# platform default encoding is not guaranteed to handle it.
with open(output_file_path, 'w', encoding='utf-8') as file:
    for i, (title, content) in enumerate(zip(main_title, main_content), start=1):
        # Write the title to the file
        file.write(f"Title {i}: {title}\n")

        # Generate the summary
        summary = generate_summary(content)

        # Write the summary to the file
        file.write("Summary:\n")
        file.write(summary + '\n\n')
        file.write("-" * 80 + '\n')  # Add a separator line for clarity

print(f"Summaries saved to {output_file_path}")


Summaries saved to /content/drive/MyDrive/summaries output.txt
