In [1]:
pip install feedparser nltk scikit-learn pandas




In [5]:
"""
AI-Powered Personalized Newsletter Generator

This script fetches articles from RSS feeds, categorizes them using NLP,
and generates personalized newsletters based on user preferences.
"""

import datetime
import html
import os
import random
import re

import feedparser
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download necessary NLTK resources.
# Each resource is checked on its own so that one missing package does not
# trigger a re-download of the others. 'punkt_tab' is the tokenizer data that
# word_tokenize loads on recent NLTK releases; 'punkt' covers older ones.
_NLTK_RESOURCES = {
    'punkt': 'tokenizers/punkt',
    'punkt_tab': 'tokenizers/punkt_tab',
    'stopwords': 'corpora/stopwords',
}

for _package, _resource_path in _NLTK_RESOURCES.items():
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        # quiet=True keeps the notebook output free of download progress logs
        nltk.download(_package, quiet=True)

class RSSFeedReader:
    """Fetches articles from a fixed set of RSS feeds grouped by category.

    ``self.feeds`` maps a category name to the list of feed URLs that are
    polled for that category.
    """

    # Tags that separate blocks of text; replaced by a space so that words on
    # either side do not get glued together.
    _BLOCK_TAG_PATTERN = re.compile(
        r'</?(?:p|div|br|li|ul|ol|h[1-6]|tr|td|th|table|blockquote)\b[^<>]*>',
        re.IGNORECASE,
    )
    # Any remaining (inline) tag; removed without inserting whitespace.
    _TAG_PATTERN = re.compile(r'</?[A-Za-z][^<>]*>')
    _WHITESPACE_PATTERN = re.compile(r'\s+')

    def __init__(self):
        self.feeds = {
            'general': [
                'http://feeds.bbci.co.uk/news/world/rss.xml',
                'https://rss.nytimes.com/services/xml/rss/nyt/World.xml',
                'https://www.reuters.com/rss/world'
            ],
            'technology': [
                'https://techcrunch.com/feed/',
                'https://www.wired.com/feed/rss',
                'https://www.technologyreview.com/feed/'
            ],
            'finance': [
                'https://www.bloomberg.com/feed/podcast/etf-report',
                'https://www.cnbc.com/id/10001147/device/rss/rss.html',
                'https://www.ft.com/rss/home'
            ],
            'sports': [
                'https://www.espn.com/espn/rss/news',
                'http://feeds.bbci.co.uk/sport/rss.xml',
                'https://www.skysports.com/rss/0'
            ],
            'entertainment': [
                'https://variety.com/feed/',
                'https://www.hollywoodreporter.com/feed/',
                'https://www.billboard.com/feed/'
            ],
            'science': [
                'https://www.nasa.gov/rss/dyn/breaking_news.rss',
                'https://www.sciencedaily.com/rss/all.xml',
                'https://arstechnica.com/science/feed/'
            ]
        }

    @classmethod
    def _strip_markup(cls, text):
        """Return ``text`` as plain text.

        HTML tags are removed, character entities are decoded and runs of
        whitespace are collapsed to a single space.
        """
        text = cls._BLOCK_TAG_PATTERN.sub(' ', text)
        text = cls._TAG_PATTERN.sub('', text)
        # Decode entities only after tag removal so that escaped angle
        # brackets in the content are not mistaken for markup.
        text = html.unescape(text)
        return cls._WHITESPACE_PATTERN.sub(' ', text).strip()

    def fetch_articles(self, categories=None, max_articles_per_feed=5):
        """
        Fetches articles from RSS feeds.

        Titles and summaries are returned as plain text (markup stripped) so
        they can be tokenized and truncated safely downstream.

        Args:
            categories: List of categories to fetch articles from. If None, fetch from all categories.
                Unknown category names are skipped.
            max_articles_per_feed: Maximum number of articles to fetch from each feed.

        Returns:
            List of articles, each being a dictionary with keys:
            title, link, summary, source, category, published.
        """
        if categories is None:
            categories = list(self.feeds.keys())

        # A negative limit means "no articles", never "all but the last N".
        limit = max(max_articles_per_feed, 0)
        articles = []

        for category in categories:
            if category not in self.feeds:
                continue

            for feed_url in self.feeds[category]:
                try:
                    feed = feedparser.parse(feed_url)

                    # feedparser signals download/parse failures through the
                    # bozo flag instead of raising, so report them explicitly.
                    if getattr(feed, 'bozo', False) and not feed.entries:
                        reason = getattr(feed, 'bozo_exception', 'feed could not be parsed')
                        print(f"Error fetching from {feed_url}: {reason}")
                        continue

                    # Fall back to the URL when the feed does not declare a title
                    feed_name = feed.feed.get('title', feed_url)

                    for entry in feed.entries[:limit]:
                        raw_title = entry.get('title', 'No title')
                        raw_summary = entry.get('summary', entry.get('description', 'No summary available'))

                        articles.append({
                            # "or" covers values that are empty once markup is removed
                            'title': self._strip_markup(raw_title) or 'No title',
                            'link': entry.get('link', ''),
                            'summary': self._strip_markup(raw_summary) or 'No summary available',
                            'source': feed_name,
                            'category': category,
                            'published': entry.get('published', datetime.datetime.now().strftime('%Y-%m-%d')),
                        })

                except Exception as e:
                    # Best effort: one broken feed must not abort the whole run
                    print(f"Error fetching from {feed_url}: {e}")

        return articles

class ArticleAnalyzer:
    """Uses NLP to extract keywords from articles and score them against user interests."""

    # Interest terms shorter than this must equal a keyword exactly. Longer
    # terms may also match as a substring (e.g. "blockchain" in "blockchains").
    # Without this, a short term such as "ai" matches "said" or "rain".
    _MIN_SUBSTRING_TERM_LENGTH = 4

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.vectorizer = TfidfVectorizer(stop_words='english')

    def preprocess_text(self, text):
        """Lowercases and tokenizes ``text``, keeping alphabetic non-stopword tokens.

        Returns:
            The surviving tokens joined by single spaces (possibly an empty string).
        """
        tokens = word_tokenize(text.lower())

        # Remove stopwords and non-alphabetic tokens
        filtered_tokens = [token for token in tokens if token.isalpha() and token not in self.stop_words]

        return ' '.join(filtered_tokens)

    def extract_keywords(self, text, num_keywords=5):
        """Extracts the highest-scoring terms from ``text``.

        Args:
            text: Raw text to analyze.
            num_keywords: Maximum number of keywords to return.

        Returns:
            List of up to ``num_keywords`` terms ordered by descending score.
            Empty when the text has no usable terms.
        """
        processed_text = self.preprocess_text(text)

        # Nothing left after preprocessing: the vectorizer would raise on an
        # empty vocabulary, so short-circuit here.
        if not processed_text.strip():
            return []

        try:
            tfidf_matrix = self.vectorizer.fit_transform([processed_text])
        except ValueError:
            # Raised when the vectorizer's own filtering leaves no terms
            return []

        feature_names = self.vectorizer.get_feature_names_out()

        # Pair each term with its score in the single fitted document
        scores = zip(feature_names, tfidf_matrix.toarray()[0])
        sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)

        return [keyword for keyword, _ in sorted_scores[:num_keywords]]

    @staticmethod
    def _interest_terms(user_interests):
        """Splits interest phrases into a set of lowercase single-word terms.

        Keywords are single tokens, so a phrase such as "global markets" can
        only ever match through its individual words.
        """
        return {term for interest in user_interests for term in interest.lower().split()}

    @classmethod
    def _matches_interest(cls, keyword, terms):
        """Returns True when ``keyword`` matches any of the interest ``terms``."""
        keyword = keyword.lower()
        return any(
            keyword == term
            or (len(term) >= cls._MIN_SUBSTRING_TERM_LENGTH and term in keyword)
            for term in terms
        )

    def calculate_relevance_score(self, article, user_interests):
        """
        Calculates relevance score of an article to user interests.

        The score is the fraction of the article's top keywords that match at
        least one word of the user's interests.

        Args:
            article: Dictionary containing article information ('title' and 'summary' are read)
            user_interests: List of user interest keywords or phrases

        Returns:
            Float between 0 and 1 indicating relevance
        """
        # extract_keywords preprocesses the text itself, so pass it through raw
        article_text = article['title'] + ' ' + article['summary']
        article_keywords = self.extract_keywords(article_text, num_keywords=10)

        if not article_keywords:
            return 0.0

        terms = self._interest_terms(user_interests)
        matching_keywords = sum(1 for keyword in article_keywords if self._matches_interest(keyword, terms))

        return matching_keywords / len(article_keywords)

class NewsletterGenerator:
    """Generates personalized Markdown newsletters for a fixed set of user personas.

    ``self.user_personas`` maps a user name to a dict with the keys
    'interests', 'sources' and 'categories'.
    """

    # Added to an article's score when its source contains a preferred source name
    _SOURCE_BOOST = 0.2
    # An article must score strictly above this (after boosting) to be included
    _MIN_RELEVANCE = 0.1
    # Number of entries shown in the "Top Headlines" section
    _TOP_HEADLINE_COUNT = 3
    # Summaries longer than this many characters are cut and end with '...'
    _MAX_SUMMARY_LENGTH = 200
    # Section-header emoji per category; unknown categories use _DEFAULT_EMOJI
    _CATEGORY_EMOJI = {
        'technology': '💻',
        'finance': '💰',
        'sports': '🏆',
        'entertainment': '🎬',
        'science': '🔬',
        'general': '🌎'
    }
    _DEFAULT_EMOJI = '📰'

    def __init__(self, feed_reader, article_analyzer):
        self.feed_reader = feed_reader
        self.article_analyzer = article_analyzer
        self.user_personas = {
            "Alex Parker": {
                "interests": ["AI", "cybersecurity", "blockchain", "startups", "programming"],
                "sources": ["TechCrunch", "Wired", "Ars Technica", "MIT Technology Review"],
                "categories": ["technology", "general"]
            },
            "Priya Sharma": {
                "interests": ["global markets", "startups", "fintech", "cryptocurrency", "economics"],
                "sources": ["Bloomberg", "Financial Times", "Forbes", "CoinDesk"],
                "categories": ["finance", "technology"]
            },
            "Marco Rossi": {
                "interests": ["football", "F1", "NBA", "Olympic sports", "esports"],
                "sources": ["ESPN", "BBC Sport", "Sky Sports"],
                "categories": ["sports"]
            },
            "Lisa Thompson": {
                "interests": ["movies", "celebrity", "TV shows", "music", "books"],
                "sources": ["Variety", "Rolling Stone", "Billboard", "Hollywood Reporter"],
                "categories": ["entertainment"]
            },
            "David Martinez": {
                "interests": ["space exploration", "AI", "biotech", "physics", "renewable energy"],
                "sources": ["NASA", "Science Daily", "Nature", "Ars Technica"],
                "categories": ["science", "technology"]
            }
        }

    def generate_newsletter(self, user_name, max_articles=10):
        """
        Generates a personalized newsletter for a user.

        Articles are fetched for the user's categories, scored against the
        user's interests, boosted when they come from a preferred source, then
        ranked and grouped by category.

        Args:
            user_name: Name of the user to generate newsletter for
            max_articles: Maximum number of articles to include in the newsletter

        Returns:
            Markdown-formatted newsletter as a string, or an error message
            string when ``user_name`` is not a known persona.
        """
        if user_name not in self.user_personas:
            return f"Error: User '{user_name}' not found."

        user = self.user_personas[user_name]
        preferred_sources = [source.lower() for source in user['sources']]

        all_articles = self.feed_reader.fetch_articles(categories=user['categories'])

        scored_articles = []
        for article in all_articles:
            relevance_score = self.article_analyzer.calculate_relevance_score(article, user['interests'])

            # Boost score for preferred sources
            article_source = article['source'].lower()
            if any(source in article_source for source in preferred_sources):
                relevance_score += self._SOURCE_BOOST

            # Cap the score at 1.0
            relevance_score = min(relevance_score, 1.0)

            # Only include articles with some relevance
            if relevance_score > self._MIN_RELEVANCE:
                # Copy so the dicts owned by the feed reader are left untouched
                scored_articles.append({**article, 'relevance_score': relevance_score})

        # Sort by relevance score and limit to max_articles
        ranked_articles = sorted(scored_articles, key=lambda x: x['relevance_score'], reverse=True)[:max_articles]

        # Group by category, keeping the ranked order inside each group
        categorized_articles = {}
        for article in ranked_articles:
            categorized_articles.setdefault(article['category'], []).append(article)

        return self._format_newsletter(user_name, categorized_articles)

    def _format_newsletter(self, user_name, categorized_articles):
        """Formats the newsletter in Markdown.

        Args:
            user_name: Key into ``self.user_personas``; used for the header and footer.
            categorized_articles: Dict mapping category name to a list of
                article dicts, each carrying a 'relevance_score'.

        Returns:
            The complete newsletter as a Markdown string.
        """
        today = datetime.datetime.now().strftime('%A, %B %d, %Y')

        # Start with the header
        newsletter = f"# Personalized Newsletter for {user_name}\n\n"
        newsletter += f"**Date:** {today}\n\n"

        # Add top headlines section
        newsletter += "## 🔥 Top Headlines For You\n\n"

        # Get the top articles across all categories
        all_articles = [article for articles in categorized_articles.values() for article in articles]
        top_articles = sorted(
            all_articles, key=lambda x: x['relevance_score'], reverse=True
        )[:self._TOP_HEADLINE_COUNT]

        if top_articles:
            for article in top_articles:
                newsletter += f"- **[{article['title']}]({article['link']})** - {article['source']}\n"
        else:
            # Avoid an unexplained empty section when nothing cleared the threshold
            newsletter += "_No articles matched your interests today._\n"

        newsletter += "\n"

        # Add sections for each category
        for category, articles in categorized_articles.items():
            # Skip if the category has no articles
            if not articles:
                continue

            emoji = self._CATEGORY_EMOJI.get(category, self._DEFAULT_EMOJI)
            newsletter += f"## {emoji} {category.capitalize()}\n\n"

            for article in articles:
                # Shorten long summaries, reserving three characters for the ellipsis
                summary = article['summary']
                if len(summary) > self._MAX_SUMMARY_LENGTH:
                    summary = summary[:self._MAX_SUMMARY_LENGTH - 3] + '...'

                newsletter += f"### [{article['title']}]({article['link']})\n\n"
                newsletter += f"**Source:** {article['source']}  \n"
                newsletter += f"**Published:** {article.get('published', 'Unknown date')}  \n\n"
                newsletter += f"{summary}\n\n"
                newsletter += f"[Read more]({article['link']})\n\n"
                newsletter += "---\n\n"

        # Add footer
        newsletter += "## 📬 Newsletter Preferences\n\n"
        newsletter += f"This newsletter was curated based on your interests: {', '.join(self.user_personas[user_name]['interests'])}.\n\n"
        newsletter += "Thank you for reading your personalized newsletter today!\n"

        return newsletter

def _newsletter_filename(user_name):
    """Returns the Markdown file name (without directory) for ``user_name``'s newsletter."""
    return f"{user_name.replace(' ', '_').lower()}_newsletter.md"


def main():
    """Generates a newsletter for every persona and writes an HTML index.

    Each newsletter is written as Markdown into the 'newsletters' directory
    (created if missing), followed by an index.html linking to all of them.
    """
    output_dir = 'newsletters'

    # Create instances of the classes
    feed_reader = RSSFeedReader()
    article_analyzer = ArticleAnalyzer()
    newsletter_generator = NewsletterGenerator(feed_reader, article_analyzer)

    # Create the output directory once, up front, so the index can be written
    # even when there are no personas to iterate over.
    os.makedirs(output_dir, exist_ok=True)

    # Generate newsletters for all users
    for user_name in newsletter_generator.user_personas:
        print(f"Generating newsletter for {user_name}...")

        newsletter = newsletter_generator.generate_newsletter(user_name)

        # Save the newsletter to a file
        filename = f"{output_dir}/{_newsletter_filename(user_name)}"
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(newsletter)

        print(f"Newsletter saved to {filename}")

    # Show the absolute path of the newsletters directory
    absolute_path = os.path.abspath(output_dir)
    print("\n" + "="*50)
    print(f"Newsletters are saved at: {absolute_path}")
    print("You can open this directory to view the generated newsletters.")
    print("="*50)

    # Create a simple HTML index file to make it easier to view the newsletters
    index_html = """
    <!DOCTYPE html>
    <html>
    <head>
        <title>AI-Generated Newsletters</title>
        <style>
            body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }
            h1 { color: #333; }
            .newsletter-link { 
                display: block; 
                margin: 15px 0; 
                padding: 15px; 
                background-color: #f5f5f5; 
                border-radius: 5px;
                text-decoration: none;
                color: #333;
                font-weight: bold;
            }
            .newsletter-link:hover { background-color: #e0e0e0; }
            .info { margin-top: 30px; padding: 15px; background-color: #e6f7ff; border-radius: 5px; }
        </style>
    </head>
    <body>
        <h1>AI-Generated Personalized Newsletters</h1>
        <p>Click on a newsletter below to view it:</p>
    """

    # Add links to each newsletter. The file name comes from the same helper
    # used when writing the files, so links cannot drift from what is on disk.
    for user_name in newsletter_generator.user_personas:
        href = html.escape(_newsletter_filename(user_name), quote=True)
        label = html.escape(user_name)
        index_html += f'<a href="{href}" class="newsletter-link">{label}\'s Newsletter</a>\n'

    index_html += """
        <div class="info">
            <p><strong>Note:</strong> These are Markdown (.md) files. If they open as plain text, you may want to use a Markdown viewer or editor for better formatting.</p>
        </div>
    </body>
    </html>
    """

    # Save the index.html file
    with open(f'{output_dir}/index.html', 'w', encoding='utf-8') as f:
        f.write(index_html)

    print("\nAn index.html file has been created in the newsletters directory.")
    print("You can open this file in a web browser to easily access all newsletters.")

# Entry point: run the whole pipeline when this file or cell is executed
# directly (__name__ is "__main__"), but not when it is imported as a module.
if __name__ == "__main__":
    main()

Generating newsletter for Alex Parker...
Newsletter saved to newsletters/alex_parker_newsletter.md
Generating newsletter for Priya Sharma...
Newsletter saved to newsletters/priya_sharma_newsletter.md
Generating newsletter for Marco Rossi...
Newsletter saved to newsletters/marco_rossi_newsletter.md
Generating newsletter for Lisa Thompson...
Newsletter saved to newsletters/lisa_thompson_newsletter.md
Generating newsletter for David Martinez...
Newsletter saved to newsletters/david_martinez_newsletter.md

Newsletters are saved at: C:\Users\MANSI\newsletters
You can open this directory to view the generated newsletters.

An index.html file has been created in the newsletters directory.
You can open this file in a web browser to easily access all newsletters.
