## 1. Data Collection & Technical Analysis

In [None]:
!pip install yfinance alpha_vantage pandas_datareader requests

### 1.1 Introduction

- Brief overview of the study objective.
- Importance of analyzing SPY across different market segments.

### 1.2 Data Sources

- Description of data sources (e.g., Yahoo Finance, Alpha Vantage).
- Justification for choosing these sources.

In [98]:
import requests
from alpha_vantage.timeseries import TimeSeries
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from mplfinance.original_flavor import candlestick_ohlc
import seaborn as sns
from datetime import datetime

### 1.3 Data Collection Process

- Python code to fetch hourly time series data for SPY.
- Include pre-market, market hours, and extended hours data.

### 1.4 Initial Data Exploration
Apparently Yahoo doesnt get extended hours data...

In [99]:
spy_hourly_data = yf.download('SPY', period='60d', interval='1h')

[*********************100%%**********************]  1 of 1 completed


In [100]:
# Use alpha_vantage data
def fetch_hourly_data(symbol, api_key):
    # Set up the API parameters
    params = {
        'function': 'TIME_SERIES_INTRADAY',
        'symbol': symbol,
        'interval': '60min',
        'apikey': api_key,
        'outputsize': 'full'
    }
    
    # Make the request to Alpha Vantage API
    response = requests.get("https://www.alphavantage.co/query", params=params)
    response.raise_for_status()  # Raise an exception for HTTP error codes
    
    # Parse the JSON response
    data = response.json()
    # The data is usually under the "Time Series (60min)" key, but this might change
    # Please check the exact structure of the response JSON
    time_series_key = list(data.keys())[1]
    hourly_data = data[time_series_key]
    
    # Convert to a Pandas DataFrame
    df = pd.DataFrame.from_dict(hourly_data, orient='index', dtype=float)
    df.index = pd.to_datetime(df.index)
    df = df.rename(columns={
        '1. open': 'Open',
        '2. high': 'High',
        '3. low': 'Low',
        '4. close': 'Close',
        '5. volume': 'Volume'
    })
    
    return df

# Use the function with your API key
api_key = 'ZKVJP2N2JKBY2WPZ'  # Replace with your actual API key


spy_hourly_alpha_data = fetch_hourly_data('SPY', api_key)
# Make copies of the data to split into different market segments
pre_market_alpha_data = spy_hourly_alpha_data.between_time('04:00:00', market_start).copy()
market_hours_alpha_data = spy_hourly_alpha_data.between_time(market_start, market_end).copy()
extended_hours_alpha_data = spy_hourly_alpha_data.between_time(market_end, '20:00:00').copy()

In [None]:
print(spy_hourly_data.head())

In [None]:
print(spy_hourly_data.describe())

In [None]:
print(spy_hourly_data.isnull().sum())

## 2. Data Cleaning and Splitting Market Segments

### 2.1 Data Preprocessing

In [None]:
# # Define market hours for the NYSE
# market_start = '09:30:00'
# market_end = '16:00:00'

### 2.2 Segmenting Market Hours

In [None]:
# Make copies of the data to split into different market segments
pre_market_data = spy_hourly_data.between_time('04:00:00', market_start).copy()
market_hours_data = spy_hourly_data.between_time(market_start, market_end).copy()
extended_hours_data = spy_hourly_data.between_time(market_end, '20:00:00').copy()

In [None]:
# Display the first few rows of each segment
print("Pre-Market Data:")
print(pre_market_alpha_data.head())

In [None]:
print("\nMarket Hours Data:")
print(market_hours_data.head())

In [None]:
print("\nExtended Hours Data:")
print(extended_hours_data.head())

### 2.3 Ensuring Consistency & Data Quality'
This section is design to help double check data quality before visualization.

In [123]:
# Check and Validate Pre-Market Data
def check_pre_market_data_quality(data):
    # Check for missing values
    if data.isnull().values.any():
        print("Data contains missing values.")
        # Optionally, handle missing values by filling or dropping
        # data = data.dropna() or data = data.fillna(method='ffill')
        
    # Check for duplicate timestamps
    if data.index.duplicated().any():
        print("Data contains duplicate timestamps.")
        # Optionally, drop duplicates
        # data = data[~data.index.duplicated(keep='first')]
    
    # Check for large jumps in price
    data['returns'] = data['Close'].pct_change()
    if data['returns'].abs().max() > 0.03:  # Arbitrary threshold for significant return
        print("Data contains large price jumps.")
        # Optionally, inspect large jumps
        # large_jumps = data[data['returns'].abs() > 0.03]
        
    # Check if close is ever less than low or more than high
    if ((data['Close'] < data['Low']) | (data['Close'] > data['High'])).any():
        print("Data contains Close price outside of Low/High range.")
    
    # More checks can be added as per the data characteristics and requirements
    
    # Return the data with a new column 'returns' if no issues, or with issues handled if needed
#     return data

checked_pre_market_data = check_pre_market_data_quality(pre_market_data)
checked_pre_market_data

Data contains missing values.


In [120]:
import pandas as pd

def check_market_data_quality(df):
    """
    Enhanced function to check for common data issues in market hours stock data and return errors.
    :param df: DataFrame with stock data
    :return: Prints error messages and counts for each type of error found. If no errors, prints "no errors found".
    """
    
    errors = []
    error_counts = {
        'missing_values': 0,
        'duplicate_timestamps': 0,
        'large_price_jumps': 0,
        'large_price_drops': 0
    }
    
    # Ensure data only includes hours in which options can be traded (assumed to be regular market hours for this example)
    market_start, market_end = '09:30:00', '16:00:00'
    df.index = pd.to_datetime(df.index)
    df = df.between_time(market_start, market_end)

    # Check for missing or blank values
    if df.isnull().values.any():
        errors.append("Data contains missing values.")
        error_counts['missing_values'] = df.isnull().sum().sum()

    # Check for duplicate indices (timestamps)
    if df.index.duplicated().any():
        errors.append("Data contains duplicate timestamps.")
        error_counts['duplicate_timestamps'] = df.index.duplicated().sum()

    # Check for large price jumps or drops (> 3%)
    df['Price_Change'] = df['Close'].pct_change()
    jumps = df[df['Price_Change'] > 0.03]
    drops = df[df['Price_Change'] < -0.03]
    
    if not jumps.empty:
        errors.append("Data contains large price jumps (> 3%).")
        error_counts['large_price_jumps'] = len(jumps)
        
    if not drops.empty:
        errors.append("Data contains large price drops (> 3%).")
        error_counts['large_price_drops'] = len(drops)

    # Print errors and counts
    if errors:
        for error in errors:
            print(error)
        for error_type, count in error_counts.items():
            if count > 0: print(f"{error_type}: {count}")
    else:
        print("No errors found.")
    
    return None  # Function modified to not return the DataFrame, but could be adjusted to return error details if needed

# Assuming 'market_hours_data' is a DataFrame loaded with relevant market hours data
# Example usage would be as follows (commented out since 'market_hours_data' is not defined in this cell):
check_market_data_quality(market_hours_data)


No errors found.


In [122]:
# POST MARKET / EXTENDED HOUR DATA
import pandas as pd

def check_post_market_data_quality(data, market_end='16:00:00'):
    """
    Check for common data issues in post-market stock data.
    
    :param data: DataFrame with stock data assumed to be post-market.
    :param market_end: String representing the market closing time to verify against.
    :return: DataFrame after performing checks and flagging rows with issues.
    """

    # Convert index to datetime if it's not already
    if not pd.api.types.is_datetime64_any_dtype(data.index):
        data.index = pd.to_datetime(data.index)

    # Check if data is outside market hours (after market_end)
    if not data.between_time(market_end, '23:59:59').empty:
        print("Data includes times outside expected post-market hours.")

    # Check for missing values
    if data.isnull().any().any():
        print("Data contains missing values.")
        # Optional: Handle missing values here

    # Check for duplicate indices
    if data.index.duplicated().any():
        print("Data contains duplicate indices.")
        # Optional: Handle duplicates here

    # Check for large price swings that may indicate bad ticks or outliers
    data['Price_Change'] = data['Close'].pct_change()
    if (data['Price_Change'].abs() > 0.05).any():  # Threshold at 5%
        print("Data contains large price changes, possible outliers.")

    # More checks can be added based on the specific use case...

    # Return data with quality checks performed
#     return data

# Example usage:
# Assuming you have a DataFrame 'post_market_data' with post-market data
checked_post_market_data = check_post_market_data_quality(extended_hours_alpha_data)
checked_post_market_data


Data includes times outside expected post-market hours.
Data contains missing values.


## 3. Data Visualization of Market Segments

In [None]:
# Insert Python code here for data visualization of market segments
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from mplfinance.original_flavor import candlestick_ohlc
import pandas as pd
import plotly.graph_objects as go

def plot_candlestick_interactive(df, title):
    # Filter out the data based on the condition
    increasing_df = df[df['Close'] > df['Open'] * 1.03]
    decreasing_df = df[df['Close'] < df['Open'] * 0.97]
    neutral_df = df[(df['Close'] <= df['Open'] * 1.03) & (df['Close'] >= df['Open'] * 0.97)]

    # Create figure and add candlestick trace
    fig = go.Figure()

    # Neutral candles
    fig.add_trace(go.Candlestick(x=neutral_df.index, open=neutral_df['Open'], high=neutral_df['High'],
                                 low=neutral_df['Low'], close=neutral_df['Close'], 
                                 increasing_line_color='green', decreasing_line_color='red',
                                 name='Neutral'))
    
    # Increasing candles
    fig.add_trace(go.Candlestick(x=increasing_df.index, open=increasing_df['Open'], high=increasing_df['High'],
                                 low=increasing_df['Low'], close=increasing_df['Close'],
                                 increasing_line_color='black', decreasing_line_color='black',
                                 name='Increase > 3%'))
    
    # Decreasing candles
    fig.add_trace(go.Candlestick(x=decreasing_df.index, open=decreasing_df['Open'], high=decreasing_df['High'],
                                 low=decreasing_df['Low'], close=decreasing_df['Close'],
                                 increasing_line_color='white', decreasing_line_color='white',
                                 name='Decrease > 3%'))

    # Update the layout
    fig.update_layout(title=title, xaxis_title='Date', yaxis_title='Price', xaxis_rangeslider_visible=False)

    # Show the plot
    fig.show()

# Example usage with your dataframe:
# plot_candlestick_interactive(spy_hourly_data, 'SPY Hourly Data Candlestick Chart')


In [None]:
# Check and Validate Post-Market Data


In [None]:
# Call the function with each DataFrame
plot_candlestick_interactive(spy_hourly_data, 'SPY Hourly Data Candlestick Chart')

In [None]:
plot_candlestick_interactive(pre_market_alpha_data, 'SPY Pre-Market Data Candlestick Chart')

In [None]:
plot_candlestick_interactive(market_hours_data, 'SPY Market Hours Data Candlestick Chart')

In [None]:
plot_candlestick_interactive(extended_hours_alpha_data, 'SPY Extended Hours Data Candlestick Chart')

### 3.1 Overview of Visualization Techniques

- Introduction to visualization libraries (e.g., Matplotlib, Seaborn).

### 3.2 Comparative Visualization

In [None]:
# Insert Python code here for comparative visualization

### 3.3 Trends and Patterns

In [None]:
# Insert Python code here to identify trends and patterns

## 4. Forecasting OHLC & Visualizing Tomorrow's Forecasts

In [None]:
# Insert Python code here for forecasting OHLC and visualizing tomorrow's forecasts

### 4.1 Forecasting Models Overview

- Introduction to forecasting models (e.g., ARIMA, LSTM).

### 4.2 Model Implementation

In [None]:
# Insert Python code here for model implementation

### 4.3 Visualization of Forecasts

In [None]:
# Insert Python code here for visualization of forecasts

## 5. Backtesting NightShare Strategy (Sell Premarket, Buy Post-market)

In [None]:
# Insert Python code here for backtesting NightShare strategy

### 5.1 Strategy Rationale

- Explanation of the NightShare strategy.
- Hypothesis on why it might be effective.

### 5.2 Backtesting Framework

In [None]:
# Insert Python code here for the backtesting framework

### 5.3 Strategy Performance

In [101]:
# Insert Python code here for strategy performance

## 6. Conclusions: Practical Business Applications

### 6.1 Key Findings
- Summary of significant insights from each section.

### 6.2 Practical Implications
- How investors/traders can use this study.
- Limitations and considerations for real-world application.

### 6.3 Future Research Directions
- Suggestions for further studies or improvements.
- Potential for algorithmic trading strategies based on findings.