In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

# Scraper configuration: newsroom listing URL plus the first and last
# publication years to keep.
base_url = "https://nvidianews.nvidia.com/news"
start_year, end_year = 2010, 2025  # widen or narrow the range as needed

# Function to scrape NVIDIA news
def scrape_nvidia_news(base_url, start_year, end_year):
    """Scrape news headlines from a paginated listing, filtered by year.

    Requests ``{base_url}?page=N`` for N = 1, 2, ... and collects the title
    and date of every ``div.index-item-text`` entry whose publication year
    falls within ``start_year``..``end_year``. The walk stops at the first
    page that has no such entries, that answers with an HTTP error status,
    or that merely repeats the previous page.

    Args:
        base_url: URL of the listing, without the ``page`` query parameter.
        start_year: First publication year to keep (inclusive).
        end_year: Last publication year to keep (inclusive).

    Returns:
        A pandas DataFrame with the columns ``date`` (parsed datetime),
        ``title`` (headline text) and ``ticker`` (always ``'NVDA'``), one
        row per kept headline. The columns are present even when no
        headline was kept.

    Raises:
        requests.RequestException: If a page cannot be fetched at all
            (connection failure or timeout).
    """
    # Without a timeout a stalled connection would block the scrape forever.
    request_timeout = 30  # seconds
    headlines_data = []
    previous_page_items = None

    page_number = 1
    while True:
        url = f"{base_url}?page={page_number}"
        print(f"Scraping page {page_number}: {url}")

        response = requests.get(url, timeout=request_timeout)
        if not response.ok:
            # An error page is not a listing; keep what was gathered so far.
            print(f"Got HTTP {response.status_code} for {url}, ending scrape.")
            break
        soup = BeautifulSoup(response.content, 'html.parser')

        articles = soup.find_all('div', class_='index-item-text')
        if not articles:
            print("No more articles found, ending scrape.")
            break

        # Guard against a site that keeps serving the same page for
        # out-of-range page numbers, which would otherwise loop forever.
        page_items = [article.get_text(strip=True) for article in articles]
        if page_items == previous_page_items:
            print("Page repeats the previous one, ending scrape.")
            break
        previous_page_items = page_items

        for article in articles:
            title_tag = article.find('h3', class_='index-item-text-title')
            date_tag = article.find('span', class_='index-item-text-info-date')
            if title_tag is None or date_tag is None:
                continue

            # The headline normally sits in an <a> inside the <h3>; fall back
            # to the heading's own text when the link is missing.
            link_tag = title_tag.find('a')
            title_source = link_tag if link_tag is not None else title_tag
            title = title_source.get_text(strip=True)

            try:
                # Expected date format: "April 03, 2025"
                date = datetime.strptime(date_tag.get_text(strip=True), '%B %d, %Y')
            except ValueError:
                continue  # Skip entries whose date is in another format

            if start_year <= date.year <= end_year:
                headlines_data.append({
                    'date': date,
                    'title': title,
                    'ticker': 'NVDA',
                })

        page_number += 1

    # Naming the columns keeps the frame's schema stable when nothing matched.
    return pd.DataFrame(headlines_data, columns=['date', 'title', 'ticker'])

# Run the scraper over the configured year range
df = scrape_nvidia_news(base_url=base_url, start_year=start_year, end_year=end_year)

# Preview the first five rows
print(df.head(n=5))

# Write the result to a CSV file, leaving out the row index
df.to_csv(path_or_buf='nvidia_news_filtered.csv', index=False)

Scraping page 1: https://nvidianews.nvidia.com/news?page=1
Scraping page 2: https://nvidianews.nvidia.com/news?page=2
Scraping page 3: https://nvidianews.nvidia.com/news?page=3
Scraping page 4: https://nvidianews.nvidia.com/news?page=4
Scraping page 5: https://nvidianews.nvidia.com/news?page=5
Scraping page 6: https://nvidianews.nvidia.com/news?page=6
Scraping page 7: https://nvidianews.nvidia.com/news?page=7
Scraping page 8: https://nvidianews.nvidia.com/news?page=8
Scraping page 9: https://nvidianews.nvidia.com/news?page=9
Scraping page 10: https://nvidianews.nvidia.com/news?page=10
Scraping page 11: https://nvidianews.nvidia.com/news?page=11
Scraping page 12: https://nvidianews.nvidia.com/news?page=12
Scraping page 13: https://nvidianews.nvidia.com/news?page=13
Scraping page 14: https://nvidianews.nvidia.com/news?page=14
Scraping page 15: https://nvidianews.nvidia.com/news?page=15
Scraping page 16: https://nvidianews.nvidia.com/news?page=16
Scraping page 17: https://nvidianews.nvidia.com/news?page=17

In [None]:
from pathlib import Path

from google.colab import drive

# Mount Google Drive at /content/drive so the path below can be written to
drive.mount('/content/drive')

file_path = '/content/drive/My Drive/ML/final_project/data/misc/nvidia_news_filtered.csv'
# to_csv does not create missing folders, so make sure the target exists
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(file_path, index=False)

print(f"File saved to {file_path}")

Mounted at /content/drive
File saved to /content/drive/My Drive/ML/final_project/nvidia_news_filtered.csv
