#**Import Required Libraries**

In [None]:
import csv
import os
import time
from datetime import datetime
from urllib.parse import quote, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil import parser


#**Fetch News HTML Content**

In [None]:
# Google News search URL for every supported category (keys are lower-case).
_CATEGORY_URLS = {
    "sport": "https://news.google.com/search?q=Sport&hl=en-IN&gl=IN&ceid=IN%3Aen",
    "finance": "https://news.google.com/search?q=Finance&hl=en-IN&gl=IN&ceid=IN%3Aen",
    "healthcare": "https://news.google.com/search?q=Health%20Care&hl=en-IN&gl=IN&ceid=IN%3Aen",
    "technology": "https://news.google.com/search?q=Technology&hl=en-IN&gl=IN&ceid=IN%3Aen",
    "entertainment": "https://news.google.com/search?q=Entertainment&hl=en-IN&gl=IN&ceid=IN%3Aen",
    "politics": "https://news.google.com/search?q=Politics&hl=en-IN&gl=IN&ceid=IN%3Aen",
    "world": "https://news.google.com/search?q=World%20News&hl=en-IN&gl=IN&ceid=IN%3Aen",
    "science": "https://news.google.com/search?q=Science&hl=en-IN&gl=IN&ceid=IN%3Aen",
    "business": "https://news.google.com/search?q=Business&hl=en-IN&gl=IN&ceid=IN%3Aen",
    "climate": "https://news.google.com/search?q=Climate%20Change&hl=en-IN&gl=IN&ceid=IN%3Aen",
    "economy": "https://news.google.com/search?q=Economy&hl=en-IN&gl=IN&ceid=IN%3Aen",
    "education": "https://news.google.com/search?q=Education&hl=en-IN&gl=IN&ceid=IN%3Aen",
    "travel": "https://news.google.com/search?q=Travel&hl=en-IN&gl=IN&ceid=IN%3Aen",
    "startups": "https://news.google.com/search?q=Startups&hl=en-IN&gl=IN&ceid=IN%3Aen",
    "real estate": "https://news.google.com/search?q=Real%20Estate&hl=en-IN&gl=IN&ceid=IN%3Aen",
    "automobile": "https://news.google.com/search?q=Automobile&hl=en-IN&gl=IN&ceid=IN%3Aen",
    "cryptocurrency": "https://news.google.com/search?q=Cryptocurrency&hl=en-IN&gl=IN&ceid=IN%3Aen",
    "space": "https://news.google.com/search?q=Space%20Exploration&hl=en-IN&gl=IN&ceid=IN%3Aen",
    "fashion": "https://news.google.com/search?q=Fashion&hl=en-IN&gl=IN&ceid=IN%3Aen",
}


def fetch_news(category, timeout=10):
    """Download and parse the Google News search page for a category.

    Args:
        category: Category name; matched case-insensitively against the
            keys of ``_CATEGORY_URLS``.
        timeout: Seconds to wait for the HTTP request before giving up, so
            a stalled connection cannot hang the whole scraping run.

    Returns:
        A ``BeautifulSoup`` tree built from the response body, or ``None``
        when the category is unknown or the request fails (a message is
        printed in both cases).
    """
    url = _CATEGORY_URLS.get(category.lower())
    if url is None:
        print(f"Category '{category}' not found.")
        return None

    try:
        web = requests.get(url, timeout=timeout)
        web.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {category} news: {e}")
        return None
    return BeautifulSoup(web.content, 'html.parser')


#**Extract News Data**

In [None]:
def extract_news_data(soup):
    """Pull headlines, article links and publication times out of a page.

    Args:
        soup: Parsed page exposing ``find_all`` (e.g. a ``BeautifulSoup``
            tree returned by ``fetch_news``).

    Returns:
        A ``(headlines, active_links, formatted_times)`` tuple:

        * ``headlines`` - text of every ``a.JtKRv`` anchor that has an href.
        * ``active_links`` - absolute URL for each of those anchors, in the
          same order as ``headlines``.
        * ``formatted_times`` - ``(date, label)`` pairs for every
          ``time.hvbAAd`` tag that carries a ``datetime`` attribute, newest
          first; ``date`` is formatted as ``'%d %B %Y'`` and ``label`` is the
          tag's own text.
    """
    base_url = 'https://news.google.com/'

    headlines = []
    active_links = []
    for tag in soup.find_all('a', class_='JtKRv'):
        href = tag.get('href')
        if not href:
            # Skip the anchor entirely so headlines and links stay aligned.
            continue
        headlines.append(tag.text)
        # urljoin resolves "./read/..." and "/read/..." correctly; plain
        # concatenation produced "https://news.google.com./read/...".
        active_links.append(urljoin(base_url, href))

    time_elements = []
    for tag in soup.find_all('time', class_='hvbAAd'):
        stamp = tag.get('datetime')
        if not stamp:
            continue
        time_elements.append((parser.parse(stamp), tag.text))

    # NOTE(review): the times are sorted independently of the headlines, so
    # formatted_times[i] is not guaranteed to belong to headlines[i] -
    # confirm whether callers that pair them by index rely on this order.
    time_elements.sort(key=lambda item: item[0], reverse=True)
    formatted_times = [(date.strftime('%d %B %Y'), text) for date, text in time_elements]
    return headlines, active_links, formatted_times


#**Generate Google News URL**

In [None]:
def generate_google_news_url(topic, hl='en-IN', gl='IN', ceid='IN:en'):
    """Build a Google News search URL for an arbitrary topic.

    Every component is percent-encoded, so topics containing characters such
    as ``&``, ``#`` or ``?`` cannot break the query string, and ``ceid`` is
    emitted in the same ``IN%3Aen`` form as the hard-coded category URLs.

    Args:
        topic: Free-text search term; spaces become ``%20``.
        hl: Value for the ``hl`` query parameter.
        gl: Value for the ``gl`` query parameter.
        ceid: Value for the ``ceid`` query parameter.

    Returns:
        The search URL as a string.
    """
    base_url = 'https://news.google.com/search?q='
    topic_query = quote(topic, safe='')
    return (
        f'{base_url}{topic_query}'
        f'&hl={quote(hl, safe="")}&gl={quote(gl, safe="")}&ceid={quote(ceid, safe="")}'
    )


#**Scrape and Download Images**

In [None]:
def scrape_image_src(soup):
    """Collect the image URLs of every ``img`` tag with class ``Quavad vwBmvb``.

    Tags without a (non-empty) ``src`` are ignored. A ``src`` beginning with
    ``/`` is prefixed with the Google News host; any other value is returned
    unchanged.
    """
    resolved = []
    for img in soup.find_all('img', class_='Quavad vwBmvb'):
        src = img.get('src')
        if not src:
            continue
        if src.startswith('/'):
            src = f'https://news.google.com{src}'
        resolved.append(src)
    return resolved

def download_images(image_src_links, download_folder, timeout=10):
    """Download each image URL into ``download_folder`` as ``image_<n>.jpg``.

    Files are numbered from 1 by position in ``image_src_links``; a failed
    download is reported and skipped, so later images keep their position
    number.

    Args:
        image_src_links: Iterable of image URLs.
        download_folder: Target directory; created if it does not exist.
        timeout: Seconds to wait for each HTTP request before giving up.
    """
    os.makedirs(download_folder, exist_ok=True)
    for index, img_url in enumerate(image_src_links, start=1):
        img_filename = os.path.join(download_folder, f'image_{index}.jpg')
        try:
            response = requests.get(img_url, timeout=timeout)
            # Without this check an HTTP error page would be saved as a .jpg.
            response.raise_for_status()
            with open(img_filename, 'wb') as img_file:
                img_file.write(response.content)
        except (requests.exceptions.RequestException, OSError) as e:
            print(f"Failed to download image {index}: {e}")
        else:
            print(f"Downloaded image {index}: {img_filename}")


#**Automate the Process for All Categories**

In [None]:
def automate_scraping(categories):
    """Scrape every category and write one CSV plus one image folder each.

    For each category the page is fetched, headlines/links/times/images are
    extracted, the images are downloaded to ``news/<category>_images`` and the
    combined rows are saved to ``news/<category>_news.csv``. Categories whose
    page could not be fetched are skipped.
    """
    root_dir = 'news'
    os.makedirs(root_dir, exist_ok=True)

    for category in categories:
        print(f"Processing category: {category}")

        # Fetch news content
        soup = fetch_news(category)
        if not soup:
            continue

        # Extract news data
        headlines, active_links, formatted_times = extract_news_data(soup)

        # Scrape and download images
        image_src_links = scrape_image_src(soup)
        image_folder = os.path.join(root_dir, f'{category}_images')
        download_images(image_src_links, image_folder)

        # zip() stops at the shortest list, so only complete rows are kept.
        rows = [
            {
                "Headline": headline,
                "Time": published,
                "News Link": link,
                "Image Link": image_link,
                "Image Directory": os.path.join(image_folder, f'image_{position}.jpg'),
            }
            for position, (headline, published, link, image_link) in enumerate(
                zip(headlines, formatted_times, active_links, image_src_links),
                start=1,
            )
        ]

        # Save data to CSV
        csv_filename = os.path.join(root_dir, f'{category}_news.csv')
        pd.DataFrame(rows).to_csv(csv_filename, index=False)
        print(f"Saved {category} news to {csv_filename}")

# Categories to scrape; append further category names to extend the run.
categories = [
    "sport",
    "finance",
    "healthcare",
    "technology",
    "entertainment",
    "politics",
    "world",
    "science",
    "business",
    "climate",
    "economy",
    "education",
    "travel",
    "startups",
    "real estate",
    "automobile",
    "cryptocurrency",
    "space",
    "fashion",
]
automate_scraping(categories)


Processing category: sport
Downloaded image 1: news/sport_images/image_1.jpg
Downloaded image 2: news/sport_images/image_2.jpg
Downloaded image 3: news/sport_images/image_3.jpg
Downloaded image 4: news/sport_images/image_4.jpg
Downloaded image 5: news/sport_images/image_5.jpg
Downloaded image 6: news/sport_images/image_6.jpg
Downloaded image 7: news/sport_images/image_7.jpg
Downloaded image 8: news/sport_images/image_8.jpg
Downloaded image 9: news/sport_images/image_9.jpg
Downloaded image 10: news/sport_images/image_10.jpg
Downloaded image 11: news/sport_images/image_11.jpg
Downloaded image 12: news/sport_images/image_12.jpg
Downloaded image 13: news/sport_images/image_13.jpg
Downloaded image 14: news/sport_images/image_14.jpg
Downloaded image 15: news/sport_images/image_15.jpg
Downloaded image 16: news/sport_images/image_16.jpg
Downloaded image 17: news/sport_images/image_17.jpg
Downloaded image 18: news/sport_images/image_18.jpg
Downloaded image 19: news/sport_images/image_19.jpg
Dow

#  **TRENDING_NEWS_SCRAPE**

In [None]:
!pip install nltk
!pip install spacy
!pip install vaderSentiment
!pip install transformers


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
import os
import pandas as pd
import torch
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the Hugging Face sentiment analysis pipeline. Use GPU 0 when CUDA
# is available and fall back to the CPU (-1) otherwise, instead of failing on
# machines without a GPU.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
    device=0 if torch.cuda.is_available() else -1,
)

# Directory containing category-wise news CSV files
news_dir = "news"

# Output CSV to store top 5 trending headlines from all categories
output_file = "top_trending_news.csv"

# Top-5 DataFrames collected from every category file
all_top_trending = []

# Process each CSV file in the `news` directory (sorted for a stable output order)
for file_name in sorted(os.listdir(news_dir)):
    if not file_name.endswith('.csv'):  # Ensure we process only CSV files
        continue

    # The category is the file name without its extension, e.g. "sport_news".
    category = file_name.replace('.csv', '')
    file_path = os.path.join(news_dir, file_name)

    # Read the CSV file; a file written from an empty scrape has no columns
    # and cannot be parsed, so skip it instead of aborting the whole run.
    try:
        df = pd.read_csv(file_path)
    except pd.errors.EmptyDataError:
        print(f"Skipping {file_path}: file is empty")
        continue

    # Rows without a headline cannot be scored.
    df = df.dropna(subset=['Headline']).copy()
    if df.empty:
        print(f"Skipping {file_path}: no headlines")
        continue
    df['Headline'] = df['Headline'].astype(str)

    # Run the model once over all headlines and read both the label and the
    # score from the same result (previously every headline was scored twice).
    results = sentiment_pipeline(df['Headline'].tolist())
    df['sentiment'] = [result['label'] for result in results]
    df['sentiment_score'] = [result['score'] for result in results]

    # Apply TF-IDF vectorizer
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(df['Headline'])

    # Total TF-IDF weight of every vocabulary term across the file
    importance_scores = X.sum(axis=0).A1
    words = vectorizer.get_feature_names_out()
    word_importance = dict(zip(words, importance_scores))

    # Tokenize each headline with the vectorizer's own analyzer so the tokens
    # match the vocabulary (lower-cased, punctuation stripped); a plain
    # str.split() kept case and punctuation and therefore missed most terms.
    analyzer = vectorizer.build_analyzer()
    keyword_score = df['Headline'].apply(
        lambda text: sum(word_importance.get(token, 0.0) for token in analyzer(text))
    )

    # Combined importance score: model confidence plus keyword weight
    df['importance_score'] = df['sentiment_score'] + keyword_score

    # Select the top 5 headlines and tag them with their category; assign()
    # returns a new frame, so no column is written into a slice of `df`.
    top_trending = df.nlargest(5, 'importance_score').assign(Category=category)

    # Append the top 5 headlines to the master list
    all_top_trending.append(top_trending)

if not all_top_trending:
    # pd.concat([]) would raise an unhelpful "No objects to concatenate".
    raise ValueError(f"No news CSV files with headlines found in '{news_dir}'")

# Combine all top headlines into a single DataFrame
final_df = pd.concat(all_top_trending, ignore_index=True)

# Save the final DataFrame to a CSV file
final_df.to_csv(output_file, index=False)

print(f"Top 5 trending news from each csv file saved to {output_file}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Top 5 trending news from each csv file saved to top_trending_news.csv


In [None]:
df = pd.read_csv('top_trending_news.csv')  # Replace with your uploaded file name

# Show a random sample of up to 15 rows; sample() raises when asked for more
# rows than the DataFrame holds, so cap the sample size at len(df).
df.sample(min(15, len(df)))


Unnamed: 0,Headline,Time,News Link,Image Link,Image Directory,sentiment,sentiment_score,importance_score,Category
49,"World News Today highlights on November 27, 20...","('29 November 2024', '4 hours ago')",https://news.google.com./read/CBMiqgFBVV95cUxP...,https://news.google.com/api/attachments/CC8iK0...,news/world_images/image_15.jpg,LABEL_1,0.699376,8.019619,world_news
22,Indian athlete who raised alarm about survival...,"('29 November 2024', '7 hours ago')",https://news.google.com./read/CBMi_gFBVV95cUxP...,https://news.google.com/api/attachments/CC8iK0...,news/sport_images/image_11.jpg,LABEL_1,0.616963,6.670077,sport_news
33,Bengaluru airport’s new tunnel to cut 30 minut...,"('29 November 2024', '3 hours ago')",https://news.google.com./read/CBMihAJBVV95cUxN...,https://news.google.com/api/attachments/CC8iK0...,news/travel_images/image_19.jpg,LABEL_1,0.755582,15.481468,travel_news
59,GenAI startups see 3.4x funding surge in Septe...,"('28 November 2024', '14 hours ago')",https://news.google.com./read/CBMi1wFBVV95cUxO...,https://news.google.com/api/attachments/CC8iK0...,news/startups_images/image_28.jpg,LABEL_2,0.808092,11.159407,startups_news
55,Hero MotoCorp launches programme to support st...,"('29 November 2024', '2 hours ago')",https://news.google.com./read/CBMilAFBVV95cUxP...,https://news.google.com/api/attachments/CC8iK0...,news/startups_images/image_7.jpg,LABEL_1,0.563741,14.705954,startups_news
63,Karnataka school education dept to focus on ab...,"('29 November 2024', '3 hours ago')",https://news.google.com./read/CBMi1gFBVV95cUxO...,https://news.google.com/api/attachments/CC8iK0...,news/education_images/image_16.jpg,LABEL_1,0.804434,13.838621,education_news
50,Video: How to stop election politics from ruin...,"('27 November 2024', 'Yesterday')",https://news.google.com./read/CBMilAFBVV95cUxP...,https://news.google.com/api/attachments/CC8iMk...,news/politics_images/image_45.jpg,LABEL_0,0.623621,10.769618,politics_news
13,Zee Entertainment shareholders reject proposal...,"('29 November 2024', '2 hours ago')",https://news.google.com./read/CBMi4AFBVV95cUxN...,https://news.google.com/api/attachments/CC8iK0...,news/entertainment_images/image_13.jpg,LABEL_1,0.524923,13.793967,entertainment_news
40,Overtime Baku talks fail to deliver on new cli...,"('28 November 2024', '23 hours ago')",https://news.google.com./read/CBMi2gFBVV95cUxQ...,https://news.google.com/api/attachments/CC8iL0...,news/finance_images/image_63.jpg,LABEL_0,0.820597,15.909887,finance_news
9,Best airports for technology and innovation an...,"('13 November 2024', '16 days ago')",https://news.google.com./read/CBMiswFBVV95cUxN...,https://news.google.com/api/attachments/CC8iJ0...,news/technology_images/image_99.jpg,LABEL_2,0.894499,11.872604,technology_news


In [None]:
import torch

df = pd.read_csv("top_trending_news.csv")

# Reload the sentiment model used to build the input file. Use GPU 0 when CUDA
# is available and fall back to the CPU (-1) otherwise.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
    device=0 if torch.cuda.is_available() else -1,
)

# Score every headline once and read both the label and the score from the same
# result (previously the model ran twice per headline).
# NOTE(review): the labels are emitted as LABEL_0 / LABEL_1 / LABEL_2; the
# model card appears to map them to negative / neutral / positive - confirm
# before renaming them.
results = sentiment_pipeline(df['Headline'].astype(str).tolist())
df['sentiment'] = [result['label'] for result in results]
df['sentiment_score'] = [result['score'] for result in results]

# Sort by model confidence, highest first
df = df.sort_values(by='sentiment_score', ascending=False)

# Select top trending headlines
top_trending = df.head(10)

# Save top trending headlines to CSV
output_file = "top_trending_with_sentiment.csv"
top_trending[['Headline', 'sentiment', 'sentiment_score']].to_csv(output_file, index=False)

print(f"Top 10 trending headlines saved to {output_file}")


Top 10 trending headlines saved to top_trending_with_sentiment.csv


# **TRY_ANOTHER_APPROACH_FOR_SENTIMENT_SCORE**

In [None]:
# import os
# import pandas as pd
# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# from sklearn.feature_extraction.text import TfidfVectorizer

# # Initialize sentiment analyzer
# analyzer = SentimentIntensityAnalyzer()

# # Directory containing category-wise news CSV files
# news_dir = "news"

# # Output CSV to store top 5 trending headlines from all categories
# output_file = "top_trending_news.csv"

# # Initialize an empty list to store top 5 trending news from all categories
# all_top_trending = []

# # Process each CSV file in the `news` directory
# for file_name in os.listdir(news_dir):
#     if file_name.endswith('.csv'):  # Ensure we process only CSV files
#         category = file_name.replace('.csv', '')  # Extract category from file name
#         file_path = os.path.join(news_dir, file_name)

#         # Read the CSV file into a DataFrame
#         df = pd.read_csv(file_path)

#         # Apply sentiment analysis
#         df['sentiment_score'] = df['Headline'].apply(
#             lambda x: analyzer.polarity_scores(x)['compound'])

#         # Apply TF-IDF vectorizer
#         vectorizer = TfidfVectorizer(stop_words='english')
#         X = vectorizer.fit_transform(df['Headline'])

#         # Get TF-IDF scores
#         importance_scores = X.sum(axis=0).A1
#         words = vectorizer.get_feature_names_out()
#         word_importance = dict(zip(words, importance_scores))

#         # Calculate combined importance score
#         df['importance_score'] = df['sentiment_score'] + df['Headline'].apply(
#             lambda x: sum([word_importance.get(word, 0) for word in x.split()]))

#         # Select top 5 trending headlines based on combined importance score
#         top_trending = df.nlargest(5, 'importance_score')

#         # Add category information to the DataFrame
#         top_trending['Category'] = category

#         # Append the top 5 headlines to the master list
#         all_top_trending.append(top_trending)

# # Combine all top headlines into a single DataFrame
# final_df = pd.concat(all_top_trending, ignore_index=True)

# # Save the final DataFrame to a CSV file
# final_df.to_csv(output_file, index=False)

# print(f"Top trending news saved to {output_file}")


In [None]:
# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# # Initialize the sentiment analyzer
# analyzer = SentimentIntensityAnalyzer()

# # Apply sentiment analysis to the 'Headline' column
# df['sentiment_score'] = df['Headline'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

# # Display the sentiment scores
# df[['Headline', 'sentiment_score']].head()


In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# # Apply TF-IDF vectorizer to extract important keywords from headlines
# vectorizer = TfidfVectorizer(stop_words='english')
# X = vectorizer.fit_transform(df['Headline'])

# # Get the words with the highest TF-IDF scores
# importance_scores = X.sum(axis=0).A1
# words = vectorizer.get_feature_names_out()

# # Create a dictionary of word importance scores
# word_importance = dict(zip(words, importance_scores))

# # Sort words by their importance scores
# sorted_words = sorted(word_importance.items(), key=lambda x: x[1], reverse=True)

# # Display the top 10 important words from the headlines
# print("Top 10 important words in headlines:")
# for word, score in sorted_words[:10]:
#     print(f"{word}: {score}")


In [None]:
# # Create a combined score by adding sentiment score and TF-IDF importance score
# df['importance_score'] = df['sentiment_score'] + df['Headline'].apply(
#     lambda x: sum([word_importance.get(word, 0) for word in x.split()])
# )

# # Sort headlines by the combined importance score
# df_sorted_combined = df.sort_values(by='importance_score', ascending=False)

# # Display the top 10 most important/trending headlines
# top_combined_news = df_sorted_combined.head(10)
# for i, row in top_combined_news.iterrows():
#     print(f"{i+1}. {row['Headline']} - Combined Importance Score: {row['importance_score']}")
