In [35]:
# ==============================================================================
# CELL 1: Setup and Installations (REVISED)
# ==============================================================================
# This cell installs all the necessary libraries and downloads the required
# language models from NLTK and SpaCy.

# --- Install Python packages ---
!pip install newsapi-python wordcloud -q

# --- Download SpaCy model ---
!python -m spacy download en_core_web_sm -q

# --- Download NLTK models ---
import nltk
print("Starting download of required NLTK models...")

# NLTK package name -> resource path checked with nltk.data.find().
# 'punkt_tab' is included to fix the LookupError raised by the tokenizer.
# The vader lexicon is looked up by its '.zip' path.
resource_paths = {
    'stopwords': 'corpora/stopwords',
    'punkt': 'tokenizers/punkt',
    'vader_lexicon': 'sentiment/vader_lexicon.zip',
    'punkt_tab': 'tokenizers/punkt_tab',
}
packages = list(resource_paths)

# Packages whose download did not succeed; reported at the end instead of
# silently claiming success.
failed_packages = []

for package in packages:
    try:
        nltk.data.find(resource_paths[package])
        print(f"‚úÖ Package '{package}' is already downloaded.")
    except LookupError:
        print(f"‚¨áÔ∏è  Downloading package '{package}'...")
        # nltk.download() returns a falsy value when the download fails
        # (e.g. no network), so check it rather than assuming success.
        if nltk.download(package, quiet=True):
            print(f"üëç Download of '{package}' complete.")
        else:
            failed_packages.append(package)
            print(f"Download of '{package}' FAILED.")

if failed_packages:
    print(f"\nThe following NLTK packages could not be downloaded: {failed_packages}. "
          "Later cells that need them will raise LookupError.")
else:
    print("\n‚úÖ All installations and downloads are complete!")

[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/12.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m[90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.5/12.8 MB[0m [31m75.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m[90m‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m11.4/12.8 MB[0m [31m240.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m [32m12.8/12.8 MB[0m [31m262.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m12.8/12.8 MB[0m [31m

In [36]:
# ==============================================================================
# CELL 2: Import Libraries
# ==============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy
import re
from newsapi import NewsApiClient
import os
from datetime import datetime, timedelta

In [48]:
# ==============================================================================
# CELL 3: Section 1 - Dataset Collection Function (NEW QUERY & ALL SOURCES)
# ==============================================================================
def collect_data(query='"pakistan" AND "india"', days_back=6, api_key=None,
                 output_path='raw_headlines.csv'):
    """
    Fetches English-language news articles matching a query from NewsAPI.

    Args:
        query: Search expression passed to NewsAPI as ``q``. Keywords combined
            with boolean operators give the API the most specific instructions.
        days_back: Start of the search window, in days before today. The
            window always ends yesterday.
        api_key: NewsAPI key. When omitted, the ``NEWSAPI_KEY`` environment
            variable is used, so the credential never lives in the notebook.
        output_path: CSV file the collected articles are written to. Pass
            ``None`` to skip writing.

    Returns:
        A DataFrame with the columns ``source``, ``title``, ``description``,
        ``publishedAt`` and ``url``. It is empty when no key is available,
        when the request fails, or when nothing matches.
    """
    print("--- Starting Section 1: Dataset Collection ---")

    # Resolve the credential from the caller or the environment; never hardcode it.
    api_key = api_key or os.environ.get('NEWSAPI_KEY')
    if not api_key:
        print("Could not fetch articles. Error: no API key provided "
              "(pass api_key=... or set the NEWSAPI_KEY environment variable).")
        return pd.DataFrame()
    newsapi = NewsApiClient(api_key=api_key)

    # Search window: from `days_back` days ago up to and including yesterday.
    today = datetime.now()
    to_date = (today - timedelta(days=1)).strftime('%Y-%m-%d')
    from_date = (today - timedelta(days=days_back)).strftime('%Y-%m-%d')

    print(f"Searching for topic '{query}' from {from_date} to {to_date} across all available sources.")

    all_articles = []
    try:
        articles = newsapi.get_everything(q=query,
                                          from_param=from_date,
                                          to=to_date,
                                          language='en',
                                          sort_by='publishedAt',  # Sorts by newest first
                                          page_size=100)
        for article in articles['articles']:
            # The source entry, or its name, may be missing; fall back to 'Unknown'.
            source_name = (article.get('source') or {}).get('name') or 'Unknown'
            all_articles.append({
                'source': source_name,
                'title': article.get('title'),
                'description': article.get('description'),
                'publishedAt': article.get('publishedAt'),
                'url': article.get('url'),
            })
    except Exception as e:
        # Best effort: report the failure and continue with whatever was collected.
        print(f"Could not fetch articles. Error: {e}")

    df = pd.DataFrame(all_articles)

    # No source filtering is applied: the full result set is saved and returned.
    if not df.empty:
        if output_path:
            df.to_csv(output_path, index=False)
            print(f"\nCollected {len(df)} articles from all sources and saved to {output_path}")
        else:
            print(f"\nCollected {len(df)} articles from all sources")

    return df

In [38]:
# ==============================================================================
# CELL 4: Section 2 - Pre-processing Function
# ==============================================================================
def preprocess_data(df, output_path='clean_headlines.csv'):
    """
    Cleans and prepares the collected text data for analysis.

    The input frame is not modified; all work happens on a copy, so the cell
    can be re-run safely on the same raw data.

    Steps:
        1. Drop rows that repeat the same (title, source) pair.
        2. Build ``text`` from title + description (missing values become '').
        3. ``cleaned_text``: lower-cased text with URLs, @mentions, '#',
           punctuation and digits removed, then English stop words removed.
        4. ``lemmatized_text``: spaCy lemmas of ``cleaned_text``.
        5. Drop rows whose lemmatized text is blank.

    Args:
        df: DataFrame with at least ``title``, ``description`` and ``source``.
        output_path: CSV file the cleaned data is written to. Pass ``None``
            to skip writing.

    Returns:
        A new DataFrame with the three text columns added.
    """
    print("\n--- Starting Section 2: Pre-processing ---")
    nlp = spacy.load('en_core_web_sm')
    stop_words = set(nltk.corpus.stopwords.words('english'))

    def clean_text(text):
        """Lower-case the text and strip URLs, mentions, '#', punctuation and digits."""
        if not isinstance(text, str):
            return ""
        text = text.lower()
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'\@\w+|\#', '', text)
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\d+', '', text)
        return text

    def remove_stopwords(text):
        """Tokenize the text and drop tokens found in the English stop-word list."""
        tokens = nltk.word_tokenize(text)
        return ' '.join([word for word in tokens if word not in stop_words])

    def lemmatize_text(text):
        """Replace every token with its spaCy lemma."""
        doc = nlp(text)
        return ' '.join([token.lemma_ for token in doc])

    # Deduplicate first (on a copy) so the expensive NLP steps never run on
    # rows that would be thrown away, and the caller's frame stays untouched.
    prepared = df.drop_duplicates(subset=['title', 'source']).copy()

    prepared['text'] = prepared['title'].fillna('') + ' ' + prepared['description'].fillna('')
    prepared['cleaned_text'] = prepared['text'].apply(clean_text).apply(remove_stopwords)
    prepared['lemmatized_text'] = prepared['cleaned_text'].apply(lemmatize_text)

    # .copy() makes the result an independent frame, so later column
    # assignments on it do not trigger SettingWithCopyWarning.
    cleaned = prepared[prepared['lemmatized_text'].str.strip() != ''].copy()

    if output_path:
        cleaned.to_csv(output_path, index=False)
        print(f"Cleaned data has {len(cleaned)} articles and is saved to {output_path}")
    else:
        print(f"Cleaned data has {len(cleaned)} articles")
    return cleaned


In [39]:
# ==============================================================================
# CELL 5: Section 3 - Analysis Function
# ==============================================================================
def analyze_data(df):
    """
    Performs VADER sentiment analysis on the cleaned data.

    Every value of the ``text`` column is scored with
    ``SentimentIntensityAnalyzer.polarity_scores`` and the four scores are
    added as the columns ``neg``, ``neu``, ``pos`` and ``compound``. The input
    frame is not modified.

    Args:
        df: DataFrame with at least the columns ``text`` and ``source``.

    Returns:
        A tuple ``(scored_df, avg_sentiment)``: the input rows plus the four
        score columns, and the mean of each score per ``source``.
    """
    print("\n--- Starting Section 3: Analysis ---")

    score_columns = ['neg', 'neu', 'pos', 'compound']

    # --- Sentiment Analysis ---
    sid = SentimentIntensityAnalyzer()
    # Build the score table in one go instead of expanding a column of dicts
    # with .apply(pd.Series). Naming the columns and dtype up front keeps the
    # schema stable even when there are no rows.
    records = [sid.polarity_scores(text) for text in df['text']]
    scores = pd.DataFrame(records, index=df.index, columns=score_columns, dtype=float)

    # Drop stale score columns so re-running on an already-scored frame does
    # not produce duplicate column names.
    base = df.drop(columns=score_columns + ['sentiment_scores'], errors='ignore')
    df = pd.concat([base, scores], axis=1)

    avg_sentiment = df.groupby('source')[score_columns].mean().reset_index()
    print("\n--- Average Sentiment per Outlet ---")
    print(avg_sentiment.to_string())

    return df, avg_sentiment

In [40]:
# ==============================================================================
# CELL 6: Section 4 - Visualization Function
# ==============================================================================
def create_visualizations(df, avg_sentiment, output_dir='plots'):
    """
    Generates and saves one word cloud image per news source.

    Args:
        df: DataFrame with the columns ``source`` and ``lemmatized_text``.
        avg_sentiment: Accepted for interface compatibility; not used here.
        output_dir: Directory the PNG files are written to (created if missing).

    Each image is saved as ``word_cloud_<source>.png``, where spaces in the
    source name become underscores and characters that are not valid in file
    names are replaced by underscores as well. Sources without any usable
    text are skipped.
    """
    print("\n--- Starting Section 4: Visualization ---")
    os.makedirs(output_dir, exist_ok=True)

    news_sources = df['source'].unique()

    # 1. Word Clouds
    for source in news_sources:
        text = ' '.join(df.loc[df['source'] == source, 'lemmatized_text'])
        # Skip empty or whitespace-only text: there are no words to draw.
        if not text.strip():
            continue

        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

        fig, ax = plt.subplots(figsize=(10, 5))
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        ax.set_title(f'Word Cloud for {source}')

        # Source names come from the API and may contain path separators or
        # other characters that are illegal in file names.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', str(source).replace(" ", "_"))
        fig.savefig(os.path.join(output_dir, f'word_cloud_{safe_name}.png'))
        print(f"Saved word cloud for {source}.")
        plt.close(fig)  # Close the figure so it is not displayed and memory is released

In [49]:
# ==============================================================================
# CELL 7: Main Execution Block
# ==============================================================================
# This final cell runs the entire pipeline from data collection to visualization.

# Stage 1 - collection: every later stage depends on these raw headlines.
raw_df = collect_data()

# Stage 2 - stop early when nothing came back, otherwise run the remaining stages in order.
if raw_df is None or raw_df.empty:
    print("\n‚ùå Project execution stopped because no articles were found. Please try a different search query or check your API key.")
else:
    # Stage 3 - clean and lemmatize the collected text.
    clean_df = preprocess_data(raw_df)

    # Stage 4 - score sentiment per article and average it per outlet.
    analyzed_df, avg_sentiment = analyze_data(clean_df)

    # Stage 5 - write one word cloud per outlet.
    create_visualizations(analyzed_df, avg_sentiment)

    print("\n‚úÖ Project execution completed successfully!")
    print("\nCheck the file browser on the left to find your CSV files and the 'plots' directory.")


--- Starting Section 1: Dataset Collection ---
Searching for topic '"Mohsin Naqvi" AND "pakistan" AND "india"' from 2025-09-30 to 2025-10-05 across all available sources.

Collected 19 articles from all sources and saved to raw_headlines.csv

--- Starting Section 2: Pre-processing ---
Cleaned data has 19 articles and is saved to clean_headlines.csv

--- Starting Section 3: Analysis ---

--- Average Sentiment per Outlet ---
                          source     neg     neu    pos  compound
0                  ABC News (AU)  0.3330  0.6130  0.053  -0.92870
1             Al Jazeera English  0.2155  0.7425  0.042  -0.69555
2                       BBC News  0.1440  0.6860  0.170   0.27320
3                   BusinessLine  0.1690  0.8310  0.000  -0.25840
4                            CNA  0.0420  0.9010  0.057   0.20230
5                   DW (English)  0.1440  0.8170  0.039  -0.75060
6                 Foreign Policy  0.0820  0.8150  0.103   0.12800
7            Gossiplankanews.com  0.0000  0.7