# In [1]:  (Jupyter notebook cell prompt; the cell's code follows)
import feedparser
from datetime import datetime
from sqlalchemy import create_engine, Column, String, DateTime, text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from celery import Celery
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


# Celery application used to register and dispatch background tasks. The
# broker URL points at an AMQP server on localhost using the guest account.
app = Celery('tasks', broker='pyamqp://guest:guest@localhost//')


# Declarative base class that the ORM model classes in this file inherit from.
Base = declarative_base()

class NewsArticle(Base):
    """ORM model for one feed entry, stored in the ``news_articles`` table."""

    __tablename__ = 'news_articles'

    # Primary key; populated from the feed entry's 'id' field by parse_and_store.
    id = Column(String, primary_key=True)
    # Entry headline (feed 'title' field).
    title = Column(String)
    # Entry body text (feed 'summary' field).
    content = Column(String)
    # Publication timestamp built from the feed's parsed date; may be NULL.
    pub_date = Column(DateTime)
    # Link back to the original article (feed 'link' field).
    source_url = Column(String)
    # Category label assigned by classify_category.
    category = Column(String)


# English stop-word set loaded from NLTK's stopwords corpus.
# NOTE(review): not referenced by any code visible in this file.
stop_words = set(stopwords.words('english'))


# Maps each category label to the lowercase keywords that select it.
# classify_category walks this dict in insertion order and returns the first
# category that has a matching keyword, so earlier entries take precedence.
categories_keywords = {
    'Terrorism/Protest/Political Unrest/Riot': ['terrorism', 'protest', 'political unrest', 'riot'],
    'Positive/Uplifting': ['positive', 'uplifting'],
    'Natural Disasters': ['natural disaster', 'earthquake', 'hurricane', 'flood']
}


def classify_category(article):
    """Return the category label whose keywords appear in the article text.

    The article's content and title are lower-cased and tokenized once, then
    each category in ``categories_keywords`` is tried in insertion order.
    Keywords are matched on whole-token boundaries, and multi-word keywords
    (e.g. ``'political unrest'``) match when their words appear as
    consecutive tokens.

    Args:
        article: Object exposing ``content`` and ``title`` attributes
            (strings or ``None``).

    Returns:
        The first matching category label, or ``'Others'`` when no keyword
        is found.
    """
    # Either field may be None (both columns are nullable), so coerce to ''.
    text = (article.content or '').lower() + ' ' + (article.title or '').lower()
    tokens = word_tokenize(text)

    # Re-join the tokens with single spaces and pad both ends. A padded
    # substring test then matches single- and multi-word keywords alike while
    # still requiring whole tokens ('riot' will not match 'patriot').
    haystack = ' ' + ' '.join(tokens) + ' '

    for category, keywords in categories_keywords.items():
        if any(' ' + keyword + ' ' in haystack for keyword in keywords):
            return category
    return 'Others'

def parse_and_store(feed_url):
    """Fetch a feed and persist every entry that is not already stored.

    Each new entry is classified with ``classify_category`` and inserted into
    the ``news_articles`` table. After the insert transaction has committed,
    a ``process_article`` Celery task is queued for every new article.

    Args:
        feed_url: Feed location passed straight to ``feedparser.parse``.

    Raises:
        Exception: Any error raised while querying or committing is re-raised
            after the transaction has been rolled back.
    """
    feed = feedparser.parse(feed_url)
    Session = sessionmaker(bind=engine)
    session = Session()
    new_article_ids = []

    try:
        for entry in feed.entries:
            # The id is the primary key. Fall back to the link so entries
            # that lack an explicit id do not all collapse onto ''.
            article_id = entry.get('id') or entry.get('link', '')
            if not article_id:
                continue

            title = entry.get('title', '')
            content = entry.get('summary', '')
            source_url = entry.get('link', '')

            # Only the parsed time tuple can be unpacked into datetime(); the
            # raw 'published' string cannot, so a missing parse means no date.
            pub_date_struct = entry.get('published_parsed')
            pub_date = datetime(*pub_date_struct[:6]) if pub_date_struct else None

            if session.query(NewsArticle).filter_by(id=article_id).first():
                continue

            article = NewsArticle(
                id=article_id,
                title=title,
                content=content,
                pub_date=pub_date,
                source_url=source_url,
            )
            article.category = classify_category(article)
            session.add(article)
            new_article_ids.append(article_id)

        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()

    # Dispatch only after the commit so a worker never looks up a row that
    # is not yet visible in the database.
    for article_id in new_article_ids:
        process_article.delay(article_id)

@app.task
def process_article(article_id):
    """Celery task: re-classify a stored article and save its category.

    Args:
        article_id: Primary key of the ``NewsArticle`` row to process.

    Returns:
        None. If no row with ``article_id`` exists, a warning is logged and
        nothing is changed.

    Raises:
        Exception: Any error raised while querying or committing is re-raised
            after the transaction has been rolled back.
    """
    import logging

    # Build the session factory here: no module-level ``Session`` exists (the
    # one in parse_and_store is local to that function).
    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        article = session.query(NewsArticle).filter_by(id=article_id).first()
        if article is None:
            logging.getLogger(__name__).warning(
                "process_article: no article found with id %r", article_id
            )
            return

        article.category = classify_category(article)
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()


# Feeds ingested when this file is executed as a script / notebook cell.
rss_feeds = [
    "http://rss.cnn.com/rss/cnn_topstories.rss",
    "http://qz.com/feed",
    "http://feeds.foxnews.com/foxnews/politics",
    "http://feeds.reuters.com/reuters/businessNews",
    "http://feeds.feedburner.com/NewshourWorld",
    "https://feeds.bbci.co.uk/news/world/asia/india/rss.xml"
]


# NOTE(review): the database password is hard-coded here ('%40' is a
# URL-encoded '@'). Move it to configuration / an environment variable before
# sharing or deploying this file.
database_url = 'postgresql://postgres:Rahul%401432@localhost:5432/postgres'
engine = create_engine(database_url)

# Create the news_articles table if it does not exist yet.
Base.metadata.create_all(engine)

# Run the ingestion only when executed directly. Without this guard, a Celery
# worker importing this module to register process_article would re-fetch
# every feed and queue tasks each time it starts.
if __name__ == "__main__":
    for feed_url in rss_feeds:
        parse_and_store(feed_url)


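# Notebook output from the last run:
#   ModuleNotFoundError: No module named 'feedparser'
# (the feedparser package was not installed in the environment that ran this cell)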