In [None]:
pip install yfinance


SyntaxError: invalid syntax (3389932788.py, line 1)

In [2]:
pip install praw nltk pandas


Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Collecting websocket-client>=0.54.0 (from praw)
  Downloading websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting click (from nltk)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting joblib (from nltk)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting tqdm (from nltk)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading praw-7.8.1-py3-none-any.whl (189 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   -------------------------


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import yfinance as yf
import pandas as pd
import os
from datetime import datetime, timedelta

def fetch_intraday_chunks(ticker, start_date, end_date, interval="1h", max_days=60):
    """
    Fetch intraday data in chunks for a given ticker from start_date to end_date.
    
    Parameters:
        ticker (str): Stock ticker symbol (e.g., "AAPL").
        start_date (str): Start date in 'YYYY-MM-DD' format.
        end_date (str): End date in 'YYYY-MM-DD' format.
        interval (str): Data interval (default is "1h").
        max_days (int): Maximum number of days per chunk.
        
    Returns:
        DataFrame: Concatenated DataFrame with the entire period's data.
    """
    start_dt = datetime.strptime(start_date, "%Y-%m-%d")
    end_dt = datetime.strptime(end_date, "%Y-%m-%d")
    
    all_data = []
    current_start = start_dt

    while current_start < end_dt:
        current_end = current_start + timedelta(days=max_days)
        if current_end > end_dt:
            current_end = end_dt

        # Convert current chunk start and end to string format
        chunk_start_str = current_start.strftime("%Y-%m-%d")
        chunk_end_str = current_end.strftime("%Y-%m-%d")
        
        print(f"Fetching {ticker} data from {chunk_start_str} to {chunk_end_str} with {interval} interval...")
        chunk_data = yf.download(ticker, start=chunk_start_str, end=chunk_end_str, interval=interval)
        
        if not chunk_data.empty:
            all_data.append(chunk_data)
        else:
            print(f"No data returned for {ticker} from {chunk_start_str} to {chunk_end_str}.")
        
        # Move to the next chunk. To avoid duplicate records at boundaries, add one day.
        current_start = current_end + timedelta(days=1)
    
    if all_data:
        # Concatenate all chunks and sort by index (date/time)
        full_data = pd.concat(all_data)
        full_data.sort_index(inplace=True)
        return full_data
    else:
        return pd.DataFrame()

def get_stocks_data(ticker_list, start_date, end_date, interval="1h", max_days=60):
    """
    Fetch historical intraday stock data for each ticker over a large date range by chunking.
    
    Parameters:
        ticker_list (list): List of stock ticker symbols (e.g., ["AAPL", "GOOG"]).
        start_date (str): Start date in 'YYYY-MM-DD' format.
        end_date (str): End date in 'YYYY-MM-DD' format.
        interval (str): Data interval (default is "1h").
        max_days (int): Maximum number of days per chunk.
        
    Returns:
        dict: A dictionary with ticker symbols as keys and concatenated DataFrames as values.
    """
    stocks_data = {}
    for ticker in ticker_list:
        data = fetch_intraday_chunks(ticker, start_date, end_date, interval, max_days)
        stocks_data[ticker] = data
    return stocks_data

def save_stocks_to_csv(stocks_data, output_dir="."):
    """
    Save each ticker's data to a CSV file.
    
    Parameters:
        stocks_data (dict): Dictionary with ticker symbols as keys and DataFrames as values.
        output_dir (str): Directory to save CSV files.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    for ticker, data in stocks_data.items():
        file_path = os.path.join(output_dir, f"{ticker}_data.csv")
        data.to_csv(file_path)
        print(f"Saved {ticker} data to {file_path}")

if __name__ == "__main__":
    # List of stock tickers to process
    tickers = ["AAPL", "GOOG", "MSFT", "AMZN", "NVDA"]
    
    # Define a longer time range that spans more than 60 days
    start_date = "2023-09-01"
    end_date = "2025-02-10"
    
    # Fetch data in chunks and then combine
    stocks_data = get_stocks_data(tickers, start_date, end_date, interval="1h", max_days=60)
    
    # Save each stock's combined data into CSV files in the 'stock_data' directory
    save_stocks_to_csv(stocks_data, output_dir="stock_data")


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Fetching AAPL data from 2023-09-01 to 2023-10-31 with 1h interval...
Fetching AAPL data from 2023-11-01 to 2023-12-31 with 1h interval...
Fetching AAPL data from 2024-01-01 to 2024-03-01 with 1h interval...
Fetching AAPL data from 2024-03-02 to 2024-05-01 with 1h interval...
Fetching AAPL data from 2024-05-02 to 2024-07-01 with 1h interval...



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Fetching AAPL data from 2024-07-02 to 2024-08-31 with 1h interval...
Fetching AAPL data from 2024-09-01 to 2024-10-31 with 1h interval...
Fetching AAPL data from 2024-11-01 to 2024-12-31 with 1h interval...
Fetching AAPL data from 2025-01-01 to 2025-02-10 with 1h interval...
Fetching GOOG data from 2023-09-01 to 2023-10-31 with 1h interval...



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Fetching GOOG data from 2023-11-01 to 2023-12-31 with 1h interval...
Fetching GOOG data from 2024-01-01 to 2024-03-01 with 1h interval...
Fetching GOOG data from 2024-03-02 to 2024-05-01 with 1h interval...
Fetching GOOG data from 2024-05-02 to 2024-07-01 with 1h interval...
Fetching GOOG data from 2024-07-02 to 2024-08-31 with 1h interval...



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Fetching GOOG data from 2024-09-01 to 2024-10-31 with 1h interval...
Fetching GOOG data from 2024-11-01 to 2024-12-31 with 1h interval...
Fetching GOOG data from 2025-01-01 to 2025-02-10 with 1h interval...
Fetching MSFT data from 2023-09-01 to 2023-10-31 with 1h interval...
Fetching MSFT data from 2023-11-01 to 2023-12-31 with 1h interval...



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Fetching MSFT data from 2024-01-01 to 2024-03-01 with 1h interval...
Fetching MSFT data from 2024-03-02 to 2024-05-01 with 1h interval...
Fetching MSFT data from 2024-05-02 to 2024-07-01 with 1h interval...
Fetching MSFT data from 2024-07-02 to 2024-08-31 with 1h interval...
Fetching MSFT data from 2024-09-01 to 2024-10-31 with 1h interval...



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Fetching MSFT data from 2024-11-01 to 2024-12-31 with 1h interval...
Fetching MSFT data from 2025-01-01 to 2025-02-10 with 1h interval...
Fetching AMZN data from 2023-09-01 to 2023-10-31 with 1h interval...
Fetching AMZN data from 2023-11-01 to 2023-12-31 with 1h interval...
Fetching AMZN data from 2024-01-01 to 2024-03-01 with 1h interval...
Fetching AMZN data from 2024-03-02 to 2024-05-01 with 1h interval...



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Fetching AMZN data from 2024-05-02 to 2024-07-01 with 1h interval...
Fetching AMZN data from 2024-07-02 to 2024-08-31 with 1h interval...
Fetching AMZN data from 2024-09-01 to 2024-10-31 with 1h interval...
Fetching AMZN data from 2024-11-01 to 2024-12-31 with 1h interval...
Fetching AMZN data from 2025-01-01 to 2025-02-10 with 1h interval...



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Fetching NVDA data from 2023-09-01 to 2023-10-31 with 1h interval...
Fetching NVDA data from 2023-11-01 to 2023-12-31 with 1h interval...
Fetching NVDA data from 2024-01-01 to 2024-03-01 with 1h interval...
Fetching NVDA data from 2024-03-02 to 2024-05-01 with 1h interval...
Fetching NVDA data from 2024-05-02 to 2024-07-01 with 1h interval...
Fetching NVDA data from 2024-07-02 to 2024-08-31 with 1h interval...



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Fetching NVDA data from 2024-09-01 to 2024-10-31 with 1h interval...
Fetching NVDA data from 2024-11-01 to 2024-12-31 with 1h interval...
Fetching NVDA data from 2025-01-01 to 2025-02-10 with 1h interval...
Saved AAPL data to stock_data\AAPL_data.csv
Saved GOOG data to stock_data\GOOG_data.csv
Saved MSFT data to stock_data\MSFT_data.csv
Saved AMZN data to stock_data\AMZN_data.csv
Saved NVDA data to stock_data\NVDA_data.csv


In [3]:
import nltk
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\notjo\AppData\Roaming\nltk_data...


True

In [None]:
import os
import pandas as pd
import yfinance as yf
import praw
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from datetime import datetime, timedelta

# Ensure VADER is available
nltk.download('vader_lexicon')

# --- Reddit API Credentials ---
REDDIT_CLIENT_ID = "VQ-NOvyPWyJvGZs1ifD0Ww"
REDDIT_CLIENT_SECRET = "BX_Dlp6miv2eMo4qt5JY_imgYVyMBA"
REDDIT_USER_AGENT = "StockSentimentAnalysis/0.1 by Joseph"

# Initialize PRAW (Reddit client)
reddit = praw.Reddit(
    client_id=REDDIT_CLIENT_ID,
    client_secret=REDDIT_CLIENT_SECRET,
    user_agent=REDDIT_USER_AGENT
)

def fetch_stock_data_in_chunks(ticker, start_date, end_date, interval="1h", chunk_size=60):
    """
    Fetch stock data from Yahoo Finance in chunks to bypass the ~60-day intraday limit.
    
    Parameters:
        ticker (str): Stock ticker symbol (e.g., "AAPL"). 
                      Pass as a string (not a list) for a single ticker to avoid multi-level columns.
        start_date (str): Start date in "YYYY-MM-DD" format.
        end_date (str): End date in "YYYY-MM-DD" format.
        interval (str): Data interval (e.g., "1h", "1d").
        chunk_size (int): Number of days per chunk (default=60).
    
    Returns:
        pd.DataFrame: Concatenated DataFrame of all chunks, indexed by floored hour.
    """
    print(f"Fetching {ticker} stock data in chunks from {start_date} to {end_date} at {interval} intervals...")

    start_dt = pd.to_datetime(start_date)
    end_dt = pd.to_datetime(end_date)
    
    all_chunks = []
    current_start = start_dt
    
    while current_start < end_dt:
        # Calculate chunk end
        current_end = current_start + timedelta(days=chunk_size)
        if current_end > end_dt:
            current_end = end_dt
        
        print(f"  Fetching chunk: {current_start.date()} to {current_end.date()}")
        df_chunk = yf.download(
            ticker,
            start=current_start.strftime("%Y-%m-%d"),
            end=current_end.strftime("%Y-%m-%d"),
            interval=interval
        )
        
        if not df_chunk.empty:
            all_chunks.append(df_chunk)
        else:
            print("  No data returned for this chunk.")
        
        # Move start to the next day after current_end
        current_start = current_end + timedelta(days=1)
    
    if not all_chunks:
        return pd.DataFrame()
    
    # Concatenate all chunks
    full_data = pd.concat(all_chunks)
    
    # ---------------------------------------------------------
    # FIX: If there's a multi-level index (e.g. (ticker, date)),
    # drop the top level so we have a single-level DatetimeIndex.
    # ---------------------------------------------------------
    if full_data.index.nlevels > 1:
        # e.g. drop the first level (the ticker level)
        full_data.index = full_data.index.droplevel(0)
    
    # Also flatten multi-level columns if needed
    # (happens if yfinance returns e.g. ('Open', 'AAPL'), etc.)
    if isinstance(full_data.columns, pd.MultiIndex):
        full_data.columns = ['_'.join(col) if isinstance(col, tuple) else col
                             for col in full_data.columns]
    
    # Floor the index to the hour
    full_data.index = full_data.index.floor('H')
    
    # Drop any duplicate timestamps if they exist
    full_data = full_data[~full_data.index.duplicated(keep='first')]
    
    return full_data

def fetch_reddit_posts(stock_symbol, subreddit="wallstreetbets", limit=100):
    """
    Fetch posts from a given subreddit that mention the stock symbol.
    
    Returns:
        list of dict: Each dict has:
            - 'created': datetime (floored to the hour)
            - 'text': combined title + selftext
    """
    print(f"Fetching Reddit posts for '{stock_symbol}' from r/{subreddit}...")
    posts = []
    try:
        for submission in reddit.subreddit(subreddit).search(stock_symbol, limit=limit):
            # Convert UTC timestamp to datetime and floor to the hour
            created_dt = datetime.fromtimestamp(submission.created_utc).replace(
                minute=0, second=0, microsecond=0
            )
            text = f"{submission.title} {submission.selftext}"
            posts.append({'created': created_dt, 'text': text})
    except Exception as e:
        print(f"Error fetching Reddit posts: {e}")
    return posts

def analyze_posts_sentiment(posts):
    """
    Analyze sentiment for each post using VADER.
    
    Returns:
        pd.DataFrame: 'created' (datetime) and 'compound' (VADER compound score).
    """
    if not posts:
        return pd.DataFrame(columns=['created', 'compound'])
    
    analyzer = SentimentIntensityAnalyzer()
    results = []
    for post in posts:
        scores = analyzer.polarity_scores(post['text'])
        results.append({'created': post['created'], 'compound': scores['compound']})
    
    return pd.DataFrame(results)

def aggregate_sentiment_by_hour(sentiment_df):
    """
    Aggregates sentiment scores by hour (average compound score).
    
    Returns:
        pd.DataFrame: Indexed by 'created' (hour) with column 'avg_sentiment'.
    """
    if sentiment_df.empty:
        return pd.DataFrame(columns=['avg_sentiment'])
    
    grouped = sentiment_df.groupby('created')['compound'].mean().reset_index()
    grouped.rename(columns={'compound': 'avg_sentiment'}, inplace=True)
    grouped.set_index('created', inplace=True)
    return grouped

def merge_stock_and_sentiment(stock_df, sentiment_df):
    """
    Merges the sentiment data (avg_sentiment) with the stock DataFrame on the hourly index.
    
    Returns:
        pd.DataFrame: Combined DataFrame with a new 'avg_sentiment' column.
    """
    if stock_df.empty:
        print("Stock DataFrame is empty, skipping merge.")
        return stock_df
    
    merged_df = stock_df.merge(sentiment_df, how='left', left_index=True, right_index=True)
    merged_df['avg_sentiment'].fillna(0, inplace=True)
    return merged_df

def run_analysis(ticker, start_date, end_date, interval="1h", subreddit="wallstreetbets", limit=100, output_dir="output"):
    """
    1) Fetch stock data in chunks (bypass 60-day limit)
    2) Fix multi-level index if present
    3) Fetch Reddit posts
    4) Analyze sentiment
    5) Aggregate by hour
    6) Merge with stock data
    7) Save final CSV
    """
    # Step 1 & 2: Fetch stock data in chunks and fix multi-level index
    stock_df = fetch_stock_data_in_chunks(ticker, start_date, end_date, interval=interval, chunk_size=60)
    if stock_df.empty:
        print(f"No stock data found for {ticker} in the given date range.")
        return
    
    # Step 3: Fetch Reddit posts
    posts = fetch_reddit_posts(ticker, subreddit=subreddit, limit=limit)
    
    # Step 4: Analyze sentiment
    sentiment_df = analyze_posts_sentiment(posts)
    
    # Step 5: Aggregate sentiment by hour
    aggregated_sentiment = aggregate_sentiment_by_hour(sentiment_df)
    
    # Step 6: Merge with stock data
    merged_df = merge_stock_and_sentiment(stock_df, aggregated_sentiment)
    
    # Step 7: Save final CSV
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    output_path = os.path.join(output_dir, f"{ticker}_merged_data.csv")
    merged_df.to_csv(output_path)
    print(f"Final merged CSV saved to: {output_path}")

if __name__ == "__main__":
    # Example usage
    TICKER = "AAPL"          # Single ticker as a string
    START_DATE = "2023-08-01"
    END_DATE = "2023-12-31"
    INTERVAL = "1h"          # '1d' for daily, '1h' for hourly, etc.
    SUBREDDIT = "wallstreetbets"
    POST_LIMIT = 100         # Number of Reddit posts to fetch
    
    run_analysis(
        ticker=TICKER,
        start_date=START_DATE,
        end_date=END_DATE,
        interval=INTERVAL,
        subreddit=SUBREDDIT,
        limit=POST_LIMIT,
        output_dir="output"
    )


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\notjo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
  full_data.index = full_data.index.floor('H')


Fetching AAPL stock data in chunks from 2023-08-01 to 2023-12-31 at 1h intervals...
  Fetching chunk: 2023-08-01 to 2023-09-30
  Fetching chunk: 2023-10-01 to 2023-11-30
  Fetching chunk: 2023-12-01 to 2023-12-31
Fetching Reddit posts for 'AAPL' from r/wallstreetbets...


TypeError: Cannot join tz-naive with tz-aware DatetimeIndex