In [11]:
# Install necessary libraries
# !pip install requests beautifulsoup4 spacy textblob sqlalchemy pandas
# !python -m spacy download en_core_web_sm

# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import spacy
from textblob import TextBlob
import pandas as pd
from sqlalchemy import create_engine

# Task 1: Data Scraping
def scrape_article(url: str) -> str:
    """
    Fetch a news article and return its headline and paragraph text.

    The page is downloaded with ``requests`` (10 second timeout) and parsed
    with BeautifulSoup's ``html.parser``. The headline is the first ``<h1>``
    on the page and the body is the text of every ``<p>`` element joined by
    single spaces.

    Args:
        url: Address of the article. ``https://`` is prepended when the value
            does not already start with ``http://`` or ``https://``.

    Returns:
        ``"<title>\\n\\n<content>"`` on success. On failure a human-readable
        message is returned instead of raising:

        * ``"No content found in the article."`` when no paragraph text exists,
        * ``"Invalid URL format. ..."`` for ``requests`` ``MissingSchema``,
        * ``"Error fetching the article: ..."`` for any other request error,
        * ``"An unexpected error occurred: ..."`` for anything else.
    """
    try:
        # Input typically comes from an interactive prompt; stray whitespace
        # would otherwise end up inside the requested URL.
        url = url.strip()

        # Ensure URL starts with http/https
        if not url.startswith(('http://', 'https://')):
            url = f"https://{url}"

        # Make a request to the URL
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Parse the HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Look the headline up once and reuse the result.
        heading = soup.find('h1')
        title = _node_text(heading) if heading else "No Title Found"

        # Skip empty paragraphs so they do not contribute doubled spaces.
        paragraph_texts = (_node_text(para) for para in soup.find_all('p'))
        content = " ".join(text for text in paragraph_texts if text)

        # Check if content exists
        if not content:
            return "No content found in the article."

        return f"{title}\n\n{content}"
    except requests.exceptions.MissingSchema:
        return "Invalid URL format. Please include http:// or https:// in the URL."
    except requests.exceptions.RequestException as e:
        return f"Error fetching the article: {e}"
    except Exception as e:
        return f"An unexpected error occurred: {e}"


def _node_text(node) -> str:
    """Return the text of a parsed HTML node with whitespace runs collapsed.

    ``get_text(strip=True)`` strips every text fragment and joins them with an
    empty separator, which fuses words that sit on either side of inline tags
    (``Hello <b>world</b>`` becomes ``Helloworld``). Taking the raw text and
    normalising whitespace keeps the original word boundaries intact.
    """
    return " ".join(node.get_text().split())

# Task 2: Entity Extraction
def extract_entities(text):
    """Extract PERSON and ORG entities using spaCy's Named Entity Recognition.

    Args:
        text: The text to analyse.

    Returns:
        A dict with the keys ``"PERSON"`` and ``"ORG"``, each mapping to a set
        of the distinct entity strings found with that label. Entities with
        any other label are ignored.
    """
    doc = _get_nlp()(text)
    entities = {"PERSON": set(), "ORG": set()}
    for ent in doc.ents:
        if ent.label_ in entities:
            entities[ent.label_].add(ent.text)
    return entities


# Shared spaCy pipeline; populated lazily by _get_nlp().
_NLP_MODEL = None


def _get_nlp():
    """Return the ``en_core_web_sm`` pipeline, loading it only on first use.

    Loading the model reads it from disk, so doing it on every
    ``extract_entities`` call is wasted work when several texts are processed.
    """
    global _NLP_MODEL
    if _NLP_MODEL is None:
        _NLP_MODEL = spacy.load("en_core_web_sm")
    return _NLP_MODEL

# Task 3: Sentiment Analysis
def analyze_sentiment(text):
    """Label the text "Positive", "Negative" or "Neutral" by TextBlob polarity.

    A polarity above zero is positive, below zero is negative, and anything
    else is neutral.
    """
    score = TextBlob(text).sentiment.polarity
    if score > 0:
        return "Positive"
    if score < 0:
        return "Negative"
    return "Neutral"

# Task 4: Storage
def store_in_database(url, article_text, entities, sentiment):
    """Append one article record to the ``articles`` table of ``articles.db``.

    Args:
        url: Address the article was scraped from.
        article_text: The scraped article text.
        entities: Mapping with ``"PERSON"`` and ``"ORG"`` keys, each holding an
            iterable of entity names.
        sentiment: Sentiment label for the article.

    Returns:
        None. Success or failure is reported on stdout; exceptions are caught
        and printed rather than propagated (best-effort storage).
    """
    engine = None
    try:
        # Define a SQLite database
        engine = create_engine('sqlite:///articles.db')

        # Prepare the data. The names are sorted because the entity
        # collections are sets, whose iteration order is not stable between
        # runs; sorting makes the stored strings reproducible.
        data = {
            "URL": [url],
            "Text": [article_text],
            "Persons": [', '.join(sorted(entities["PERSON"]))],
            "Organizations": [', '.join(sorted(entities["ORG"]))],
            "Sentiment": [sentiment]
        }
        df = pd.DataFrame(data)

        # Store the data
        df.to_sql('articles', con=engine, if_exists='append', index=False)
        print("Data successfully stored in the database.")
    except Exception as e:
        print("Error occurred while storing data in the database:", e)
    finally:
        # Release pooled connections; the engine is created per call, so
        # nothing else is holding on to it.
        if engine is not None:
            engine.dispose()

# Task 5: Integrating and Testing
def main():
    """Run the full pipeline for one article entered interactively.

    Prompts for a URL, scrapes the article, prints a short preview, extracts
    PERSON/ORG entities, classifies the sentiment and stores everything in the
    database. If scraping fails, the failure message is printed and nothing is
    analysed or stored.
    """
    url = input("Enter the URL of a news article: ")
    print("\nScraping article...")
    article_text = scrape_article(url)

    # scrape_article reports failures as plain strings, so every failure
    # message it can return has to be recognised here; otherwise an error
    # message would be analysed and stored as if it were article text.
    # NOTE(review): an article whose headline begins with one of these
    # prefixes would be misread as a failure.
    failure_prefixes = ("Error", "Invalid", "An unexpected error occurred")
    if (
        article_text.startswith(failure_prefixes)
        or article_text == "No content found in the article."
    ):
        print(article_text)  # Display the error message
        return

    print("Article successfully scraped!")
    # Show the first 500 characters for brevity; only add the ellipsis when
    # something was actually cut off.
    preview = article_text[:500]
    if len(article_text) > 500:
        preview += '...'
    print(preview)

    print("\nExtracting entities...")
    entities = extract_entities(article_text)
    print("Entities extracted:")
    print("Persons:", entities["PERSON"])
    print("Organizations:", entities["ORG"])

    print("\nAnalyzing sentiment...")
    sentiment = analyze_sentiment(article_text)
    print("Sentiment of the article:", sentiment)

    print("\nStoring results in database...")
    store_in_database(url, article_text, entities, sentiment)

# Run main() only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


Enter the URL of a news article: https://www.timesnownews.com/entertainment-news/bollywood/explained-how-zakir-hussain-went-from-tabla-prodigy-to-global-musical-icon-article-116343244

Scraping article...
Article successfully scraped!
Explained: How Zakir Hussain Went From Tabla Prodigy To Global Musical Icon

Theme Entertainment Bollywood Box Office Reviews TV Web Series Hollywood Korean Telugu Tamil Kannada Malayalam Movies Bollywood Box Office Reviews TV Web Series Hollywood Korean Telugu Tamil Kannada Malayalam Movies Trending: news entertainment bollywood Updated Dec 16, 2024, 08:34 IST Tabla maestro Zakir Hussain was battling heart-related problems Trending: An introvert with a keen interest in book reading and binge wa...

Extracting entities...
Entities extracted:
Persons: {'Bennett', 'RomCom', 'Tamil Kannada', 'Tabla maestro Zakir Hussain'}
Organizations: {'Bollywood-Hollywood'}

Analyzing sentiment...
Sentiment of the article: Positive

Storing results in database...
Data suc