In [40]:
import os
import re
import shutil
import time
from datetime import datetime

import pandas as pd
import requests

In [30]:
def timestamp_for_date(year, month=None, start=True):
    """
    Return the Unix timestamp marking the first or last second of a period.

    The period is the whole calendar year when ``month`` is None, otherwise a
    single calendar month. The boundary is built as a naive ``datetime`` and
    converted with ``datetime.timestamp()``, so it is interpreted in the
    local timezone of the machine running the code.

    Args:
        year (int): Calendar year of the period.
        month (int, optional): Month of the period (1-12). None selects the whole year.
        start (bool): True for the first second of the period (00:00:00 on day 1),
                      False for the last second (23:59:59 on the final day).

    Returns:
        int: Unix timestamp, truncated to whole seconds.

    Raises:
        ValueError: If ``month`` is given and lies outside 1-12.
    """
    whole_year = month is None
    if not whole_year and (month < 1 or month > 12):
        raise ValueError("Month must be between 1 and 12")

    if start:
        # Both a year and a month start on day 1 at midnight.
        period_month = 1 if whole_year else month
        boundary = datetime(year, period_month, 1, 0, 0, 0)
    elif whole_year:
        # A year always ends on 31 December.
        boundary = datetime(year, 12, 31, 23, 59, 59)
    else:
        # Month length depends on the month and on leap years.
        import calendar
        days_in_month = calendar.monthrange(year, month)[1]
        boundary = datetime(year, month, days_in_month, 23, 59, 59)

    return int(boundary.timestamp())


def get_news_by_ticker_and_period(ticker, year, month=None, api_key=None, page_size=40, wait_time=5, timeout=30):
    """
    Retrieve news for a specific ticker and time period using the Seeking Alpha API.

    Pages are requested one after another until a page comes back empty or
    with fewer than ``page_size`` items. A page that fails (HTTP error, network
    error, malformed payload) is retried exactly once after a longer pause; if
    the retry also fails, pagination stops and the items collected so far are
    returned, so the result may be partial. Progress and errors are reported
    with ``print``.

    Args:
        ticker (str): Stock ticker symbol (e.g., 'AAPL'); sent lower-cased.
        year (int): Year to retrieve news for
        month (int, optional): Specific month (1-12) to retrieve news for. If None, retrieves for entire year.
        api_key (str): Your RapidAPI key
        page_size (int): Number of results per page (max 40 as per API limits)
        wait_time (int): Number of seconds to wait between API requests to avoid rate limiting.
            Twice this value is waited before a retry.
        timeout (float or tuple): Timeout in seconds passed to ``requests.get`` so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        list: List of news items (the raw entries of each page's 'data' array)

    Raises:
        ValueError: If ``api_key`` is None.
    """
    if api_key is None:
        raise ValueError("API key is required")

    url = "https://seeking-alpha.p.rapidapi.com/news/v2/list-by-symbol"
    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": "seeking-alpha.p.rapidapi.com"
    }

    # Convert year/month to start and end timestamps
    since_timestamp = timestamp_for_date(year, month, start=True)
    until_timestamp = timestamp_for_date(year, month, start=False)
    period_desc = f"{year}" if month is None else f"{year}-{month:02d}"
    symbol = ticker.lower()

    def fetch_page(number):
        """Request one page and return its list of news items ([] when the page has none)."""
        querystring = {
            "id": symbol,
            "size": str(page_size),
            "number": str(number),
            "since": str(since_timestamp),
            "until": str(until_timestamp)
        }
        response = requests.get(url, headers=headers, params=querystring, timeout=timeout)
        response.raise_for_status()  # Raise exception for 4XX/5XX responses
        payload = response.json()
        items = payload['data'] if 'data' in payload and payload['data'] else []
        if not isinstance(items, list):
            # Treated like any other failed request: triggers the retry path.
            raise ValueError(f"Unexpected 'data' payload of type {type(items).__name__}")
        return items

    all_news = []
    page_number = 1

    while True:
        retried = False
        print(f"Requesting page {page_number} for {ticker} - {period_desc}...")
        try:
            items = fetch_page(page_number)
        except Exception as e:
            # Deliberately broad: any failure gets one retry, then we give up
            # on further pages but keep what has been collected.
            print(f"Error retrieving page {page_number}: {str(e)}")
            print(f"Waiting {wait_time*2} seconds and then retrying...")
            time.sleep(wait_time*2)  # Wait longer on error before retry
            try:
                items = fetch_page(page_number)
                retried = True
            except Exception as retry_error:
                print(f"Retry failed: {str(retry_error)}")
                break

        if not items:
            if retried:
                print(f"No more news found after retry")
            else:
                print(f"No more news found after page {page_number-1}")
            break

        news_count = len(items)
        all_news.extend(items)
        if retried:
            print(f"Retry successful! Retrieved page {page_number} with {news_count} news items")
        else:
            print(f"Retrieved page {page_number} with {news_count} news items for {ticker} - {period_desc}")

        # If fewer items than page_size, we've reached the end
        if news_count < page_size:
            print(f"Reached end of results (received {news_count} < {page_size})")
            break

        page_number += 1
        # Wait between requests to avoid rate limiting
        print(f"Waiting {wait_time} seconds before next request...")
        time.sleep(wait_time)

    total_retrieved = len(all_news)
    print(f"Total news items retrieved for {ticker} - {period_desc}: {total_retrieved}")
    return all_news


def news_to_dataframe(news_items):
    """
    Convert news items to a pandas DataFrame for easier analysis.

    Each item is expected to be a dict. The fields read are ``id``,
    ``attributes.title``, ``attributes.publishOn``, ``attributes.getAuthor.name``
    and ``links.self``. Missing keys fall back to an empty string. Nested
    objects that are present but null (``attributes``, ``getAuthor``, ``links``)
    are treated as empty instead of raising.

    Args:
        news_items (list): List of news items from the API

    Returns:
        pandas.DataFrame: One row per item with columns ``id``, ``title``,
        ``published_at`` (parsed with ``pd.to_datetime``), ``author`` and ``url``.
        An empty input yields an empty DataFrame with no columns.
    """
    if not news_items:
        return pd.DataFrame()

    base_url = "https://seekingalpha.com"
    records = []

    for item in news_items:
        # `.get(key, {})` only covers a missing key; `or {}` also covers an
        # explicit null, which would otherwise break the chained lookups.
        attributes = item.get('attributes') or {}
        author = attributes.get('getAuthor') or {}
        links = item.get('links') or {}
        self_path = links.get('self')

        records.append({
            'id': item.get('id'),
            'title': attributes.get('title', ''),
            'published_at': attributes.get('publishOn', ''),
            'author': author.get('name', ''),
            # A null/missing path gives an empty URL rather than "...comNone".
            'url': f"{base_url}{self_path}" if self_path is not None else '',
            # 'content': attributes.get('content', ''),
        })

    df = pd.DataFrame(records)

    # Empty strings become NaT here.
    df['published_at'] = pd.to_datetime(df['published_at'])

    return df


def get_stock_news(ticker, year=None, month=None, api_key=None, range_years=None, range_months=None, wait_time=5):
    """
    Main function to retrieve and format stock news for a specific ticker and time period.
    Can handle single month/year or ranges of months/years to minimize API calls.

    Period selection, in order of precedence:
      1. ``range_years`` given: every year in the range is fetched. If
         ``range_months`` is also given, the first year starts at ``start_month``
         and the last year ends at ``end_month`` (both fetched month by month);
         years strictly in between are fetched as whole years. When the range
         is a single year, both bounds apply to that year.
      2. ``year`` and ``range_months`` given: the months of that year, month by month.
      3. ``year`` given: that single month if ``month`` is set, else the whole year.

    Args:
        ticker (str): Stock ticker symbol (e.g., 'AAPL')
        year (int, optional): Year to retrieve news for
        month (int, optional): Month to retrieve news for (1-12)
        api_key (str): Your RapidAPI key
        range_years (tuple, optional): Tuple of (start_year, end_year) inclusive
        range_months (tuple, optional): Tuple of (start_month, end_month) inclusive;
            see the precedence rules above for how it combines with ``range_years``.
        wait_time (int): Number of seconds to wait between API requests. The same
            value is waited between months and twice that between years.

    Returns:
        pandas.DataFrame: DataFrame containing news data

    Raises:
        ValueError: If ``api_key`` is None, or neither ``year`` nor ``range_years`` is given.
    """
    if api_key is None:
        raise ValueError("API key is required")

    # Plan the (year, month) periods up front; month None means "whole year".
    periods = []
    if range_years is not None:
        start_year, end_year = range_years
        for y in range(start_year, end_year + 1):
            is_boundary_year = y == start_year or y == end_year
            if range_months is not None and is_boundary_year:
                start_month, end_month = range_months
                # Apply each bound only to the year it belongs to; a single-year
                # range gets both bounds instead of running through December.
                first_month = start_month if y == start_year else 1
                last_month = end_month if y == end_year else 12
                periods.extend((y, m) for m in range(first_month, last_month + 1))
            else:
                # Middle years, or no month bounds at all: one request series per year.
                periods.append((y, None))
    elif year is not None and range_months is not None:
        start_month, end_month = range_months
        periods.extend((year, m) for m in range(start_month, end_month + 1))
    elif year is not None:
        # Covers both a single month and a whole year (month is None).
        periods.append((year, month))
    else:
        raise ValueError("Must specify either year or range_years")

    all_news_items = []
    previous_year = None
    for index, (y, m) in enumerate(periods):
        if index > 0:
            # Pause between periods, longer when crossing into a new year.
            if y != previous_year:
                print(f"Waiting {wait_time*2} seconds between years...")
                time.sleep(wait_time*2)
            else:
                print(f"Waiting {wait_time} seconds between months...")
                time.sleep(wait_time)
        items = get_news_by_ticker_and_period(ticker, y, m, api_key, wait_time=wait_time)
        all_news_items.extend(items)
        previous_year = y

    # Convert to DataFrame
    df = news_to_dataframe(all_news_items)

    # Report results
    if range_years is not None:
        start_y, end_y = range_years
        period_desc = f"years {start_y}-{end_y}"
    else:
        period_desc = f"year {year}"
        if month:
            period_desc += f", month {month}"

    print(f"Retrieved {len(df)} total news items for {ticker} in {period_desc}")

    return df

In [50]:
# Read the RapidAPI key from the environment so the credential is never stored
# in the notebook. Set RAPIDAPI_KEY before starting the kernel. A key that was
# previously committed in this cell should be treated as compromised and rotated.
API_KEY = os.environ.get("RAPIDAPI_KEY")

# One-off collection step, kept for reference. It performs many API requests,
# so it stays disabled; the CSV it produced is loaded below instead.
# # Get news for a range of years (useful for collecting 5 years of data)
# apple_news = get_stock_news("AAPL", api_key=API_KEY, range_years=(2019, 2025))

# if not apple_news.empty:
#     print(apple_news.head())

#     # Save to CSV
#     apple_news.to_csv("AAPL_news_2019-2025.csv", index=False)

# Load the saved export, drop the author column, and write a cleaned copy.
apple_news = pd.read_csv("AAPL_news_2019-2025.csv")
apple_news = apple_news.drop(columns=['author'])
apple_news.to_csv("AAPL_news_2019-2025_cleaned.csv", index=False)