In [None]:
# preprocess imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt
import pickle
import pandas as pd
import glob
import os

# data types
from typing import Dict, Tuple, List

# CNN imports
from tensorflow.keras import Sequential, layers
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dropout, Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Disable pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole notebook; this is a global setting and affects every cell below.
pd.options.mode.chained_assignment = None

# READ ME

1. If running on Google Colab, mount your Google Drive with the commented-out code in the first cell under "Data Import"
and adjust the data path accordingly

2. The master function that runs the whole project is near the bottom, just before the optional plotting functions

3. The whole project can be tested by just running all cells

# Data Import

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
def load_raw_data_dic(folder_path: str) -> Dict:
    '''
    Load every CSV file of a folder into a dictionary of DataFrames.

    Args:
    folder_path (str): Directory that is searched (non-recursively) for ``*.csv`` files.

    Returns:
    Dict: Maps each file's base name without its extension to the DataFrame
    returned by ``pd.read_csv``. Empty if the folder contains no CSV file.
    '''

    # glob returns matches in arbitrary, OS-dependent order; sorting keeps the
    # processing order (and therefore the printed keys) reproducible
    csv_files = sorted(glob.glob(os.path.join(folder_path, '*.csv')))

    dataframes_dic = {}

    for file in csv_files:
        # splitext removes only the trailing extension; str.replace would also
        # strip a '.csv' that happens to appear in the middle of the name
        file_name = os.path.splitext(os.path.basename(file))[0]

        dataframes_dic[file_name] = pd.read_csv(file)

    # Show which data sets were loaded
    print(dataframes_dic.keys())

    return dataframes_dic


# Preprocess and Training

In [None]:
def preprocess_dataframe(data: pd.DataFrame, split_size: float = 0.85) -> Tuple[pd.DataFrame, pd.DataFrame]:
    '''
    Derive log returns and a per-day realized volatility from price bars and
    cut the result into a training part and a testing part.

    Args:
    data (pd.DataFrame): Frame with a "Date Time" and a "Close" column. It is copied, not modified.
    split_size (float): Fraction of the rows aimed at the training part.

    Returns:
    Tuple[pd.DataFrame, pd.DataFrame]: (train, test), both indexed by "Date Time".
    The split position is rounded down to a multiple of 380 rows.

    NOTE(review): days are formed purely by position (blocks of 380 consecutive
    rows), which assumes every trading day has exactly 380 bars after the
    09:35 bar is dropped - confirm against the data.
    '''

    rows_per_day = 380
    first_bar_time = pd.Timestamp('09:35').time()

    bars = data.copy()
    bars["Date Time"] = pd.to_datetime(bars["Date Time"])

    # log return of every bar relative to the bar directly before it
    previous_close = bars["Close"].shift(1)
    bars["Log_Returns"] = np.log(bars["Close"] / previous_close)

    # drop the 09:35 bar of every day; reset_index() keeps the old positions
    # in an "index" column
    is_first_bar = bars["Date Time"].dt.time == first_bar_time
    bars = bars[~is_first_bar].reset_index()

    # positional day counter: rows 0-379 -> day 0, rows 380-759 -> day 1, ...
    bars["day"] = bars.index // rows_per_day

    def _root_sum_of_squares(returns):
        return np.sqrt(np.sum(returns**2))

    # realized volatility of a day, repeated on each of that day's rows
    bars["realized_vol"] = bars.groupby("day")["Log_Returns"].transform(_root_sum_of_squares)

    bars = bars.set_index("Date Time")

    # round the split position down to whole days so no day is shared by both parts
    whole_train_days = np.floor(bars.shape[0] * split_size / rows_per_day)
    split_index = int(whole_train_days * rows_per_day)

    return bars.iloc[:split_index], bars.iloc[split_index:]

In [None]:
def create_subsequences(dataframe: pd.DataFrame) -> Tuple[List[pd.DataFrame], List[float]]:
    '''
    Cut the frame into rolling 21-day windows (380 rows per day, step of one
    day) and collect the realized-volatility target of each window.

    The target of a window is the "realized_vol" value of the first row after
    the window. The final window normally has no following row, so it is
    returned without a target and the target list is one element shorter
    than the window list.

    Args:
    dataframe (pd.DataFrame): Frame with a "realized_vol" column.

    Returns:
    Tuple[List[pd.DataFrame], List[float]]: (windows, targets).

    Raises:
    KeyError: If the frame has no "realized_vol" column.
    '''

    rows_per_day = 380
    window_days = 21

    sequence_list = []
    sequence_target = []

    # number of window start days; zero or negative for short frames -> no iterations
    n_windows = int(len(dataframe) / rows_per_day - (window_days - 1))

    for i in range(n_windows):
        window_end = (window_days + i) * rows_per_day
        sequence_list.append(dataframe.iloc[i * rows_per_day: window_end])

        # Only the positional lookup may legitimately fail (no row after the
        # last window). Catching just IndexError lets real problems, such as a
        # missing "realized_vol" column, surface instead of being hidden.
        try:
            target_row = dataframe.iloc[window_end]
        except IndexError:
            print("last iteration")
            continue

        sequence_target.append(target_row["realized_vol"])

    return sequence_list, sequence_target

In [None]:
def one_month_to_image(one_month: np.ndarray, add_blue: bool = False) -> np.ndarray:
    '''
    Turn one window of log returns into an image.

    Negative returns go (as absolute values) into the red channel, positive
    returns into the green channel; zeros and NaNs leave both channels at 0.
    Each channel is min-max scaled to [0, 255] separately, using only the
    values of this window.

    Args:
    one_month (np.ndarray): 1-D log returns; must hold 21 * 380 = 7980 values
        for the final reshape to succeed.
    add_blue (bool): If True, append an all-zero blue channel (for plotting).

    Returns:
    np.ndarray: Array of shape (21, 380, 2), or (21, 380, 3) with add_blue=True.
    '''

    returns = np.asarray(one_month, dtype=float)

    # Vectorised channel split (replaces a Python loop over all 7980 values).
    # NaN compares False on both sides, so it ends up as 0 in both channels.
    red = np.where(returns < 0, -returns, 0.0)
    green = np.where(returns > 0, returns, 0.0)

    # MinMaxScaler expects 2-D input, hence the (-1, 1) reshape; a fresh scaler
    # per channel scales red and green independently of each other
    red_scaled = MinMaxScaler(feature_range=(0, 255)).fit_transform(red.reshape(-1, 1)).ravel()
    green_scaled = MinMaxScaler(feature_range=(0, 255)).fit_transform(green.reshape(-1, 1)).ravel()

    channels = [red_scaled, green_scaled]

    # the blue channel carries no information; it only completes RGB for plotting
    if add_blue:
        channels.append(np.zeros(returns.size))

    # (7980, C) -> (21, 380, C): one image row per day
    return np.column_stack(channels).reshape((21, 380, len(channels)))

In [None]:
def create_images(sequence_list: List[pd.DataFrame]) -> List[np.ndarray]:
    '''
    Build the image representation of every 21-day window.

    Each window's "Log_Returns" column is converted to a numpy array and
    passed to one_month_to_image (without blue channel).

    Args:
    sequence_list (List[pd.DataFrame]): Windows, each containing 21 days.

    Returns:
    List[np.ndarray]: One image per window, in the same order.
    '''

    return [
        one_month_to_image(np.array(window["Log_Returns"]))
        for window in sequence_list
    ]

In [None]:
def get_subsequence_images(dataframe: pd.DataFrame) -> Tuple[List[np.ndarray], np.ndarray]:
    '''
    Helper: split the frame into rolling windows and return their images
    together with the realized-volatility targets.

    Args:
    dataframe (pd.DataFrame): Frame accepted by create_subsequences.

    Returns:
    Tuple[List[np.ndarray], np.ndarray]: (images, targets); the targets are
    converted from a list to a numpy array.
    '''

    # windows and their RV targets
    windows, targets = create_subsequences(dataframe)

    # one image per window
    images = create_images(windows)

    return images, np.array(targets)

In [None]:
def initialize_model() -> Sequential:
    '''
    Build the (uncompiled) convolutional network used for RV prediction.

    Layer stack, in order:
    - Input of shape (21, 380, 2)
    - Conv2D: 16 filters, kernel (3, 5), 'relu', 'same' padding
    - MaxPooling2D: pool size (2, 2)
    - Dropout: rate 0.2
    - Conv2D: 32 filters, kernel (3, 3), 'relu', 'same' padding
    - Dropout: rate 0.2
    - Flatten
    - Dense: 1 unit, 'linear' activation

    Returns:
    Sequential: The assembled model.
    '''

    first_conv_block = [
        Conv2D(16, (3,5), activation = "relu", padding = "same"),
        MaxPooling2D((2, 2)),
        Dropout(0.2),
    ]

    second_conv_block = [
        Conv2D(32, (3, 3), activation = 'relu', padding = 'same'),
        Dropout(0.2),
        Flatten(),
    ]

    # single linear unit: the network regresses one RV value per image
    regression_head = [Dense(1, activation = "linear")]

    return Sequential(
        [Input(shape = (21, 380, 2))]
        + first_conv_block
        + second_conv_block
        + regression_head
    )

In [None]:
def compile_model(model: Sequential, learning_rate: float = 0.0001) -> Sequential:
    '''
    Compile the given model with mean squared error loss and an Adam optimizer.

    Args:
    model (Sequential): Model to compile; it is compiled in place.
    learning_rate (float): Learning rate passed to Adam. Defaults to 0.0001.

    Returns:
    Sequential: The same model object, now compiled.
    '''

    # The learning_rate argument is used as given; it was previously
    # overwritten by a hard-coded 0.0001 and therefore had no effect.
    optimizer = Adam(learning_rate=learning_rate)

    model.compile(loss = 'mse',
                  optimizer = optimizer)
    return model

In [None]:
def init_and_train_model(X_train: np.ndarray, y_train: np.ndarray, 
                         X_test: np.ndarray, y_test: np.ndarray) -> Tuple[List, List]:
    '''
    Build a fresh CNN, fit it on the training data and score it on the test data.

    Training runs for at most 100 epochs with batch size 16; the last 20% of
    the training data is held out for validation (validation_split = 0.2) and
    an EarlyStopping callback with patience 10 is attached.

    Args:
    X_train (np.ndarray): Training data features.
    y_train (np.ndarray): Training data labels.
    X_test (np.ndarray): Test data features.
    y_test (np.ndarray): Test data labels.

    Returns:
    Tuple[List, List]: Two lists:
        - [model, history]: the trained model and the object returned by fit().
        - [evaluate, test_predict]: the result of model.evaluate and of
          model.predict on the test data.
    '''

    # fresh, compiled network
    cnn = compile_model(initialize_model())

    fit_options = {
        "validation_split": 0.2,
        "callbacks": [EarlyStopping(patience = 10, verbose = 2)],
        "epochs": 100,
        "batch_size": 16,
        "verbose": 2,
    }
    training_history = cnn.fit(X_train, y_train, **fit_options)

    # score and predict on the held-out test data
    test_score = cnn.evaluate(X_test, y_test, verbose=0)
    test_predictions = cnn.predict(X_test, verbose=0)

    trained = [cnn, training_history]
    evaluated = [test_score, test_predictions]
    return trained, evaluated

In [None]:
def prepare_and_train_cnn(df: pd.DataFrame) -> Tuple[List, List]:
    '''
    Preprocess one stock's DataFrame, build the image data sets, normalize
    them and train an ensemble of five independently initialized CNNs.

    Args:
    df (pd.DataFrame): The input DataFrame containing the prices for one stock.

    Returns:
    Tuple[List, List]: A tuple containing two lists:
        - First list: one [model, history] pair per trained CNN.
        - Second list: one [evaluation, test predictions] pair per trained CNN,
          followed by the normalized test images and the test targets.

    Raises:
    ValueError: If the training or the test part yields no (image, target) pair.
    '''

    n_models = 5

    # Preprocess the DataFrame and split it into training and test sets
    train, test = preprocess_dataframe(df, split_size = 0.85)

    # Get subsequence images for training and test sets as well as RV targets
    X_train, y_train = get_subsequence_images(train)
    X_test, y_test = get_subsequence_images(test)

    # Keep only images that have a target. Usually this drops exactly the last
    # window; unlike a blind `del X[-1]` it also stays aligned when the last
    # window does have a target and does not fail on an empty list.
    X_train = X_train[:len(y_train)]
    X_test = X_test[:len(y_test)]

    if len(y_train) == 0 or len(y_test) == 0:
        raise ValueError(
            "Not enough data to build training and test samples: "
            f"{len(y_train)} training and {len(y_test)} test windows have a target."
        )

    # Stack the images and normalize the pixel values from [0, 255] to [0, 1]
    X_train_norm = np.array(X_train) / 255.
    X_test_norm = np.array(X_test) / 255.

    # Train the ensemble; every call builds and fits a new model on the same data
    model_list = []
    num_results_list = []
    for model_number in range(1, n_models + 1):
        trained, evaluated = init_and_train_model(X_train_norm, y_train, X_test_norm, y_test)
        model_list.append(trained)
        num_results_list.append(evaluated)
        print(f"Finished CNN {model_number}")

    # The test data is stored next to the results so they can be analysed later
    num_results_list += [X_test_norm, y_test]

    return model_list, num_results_list

In [None]:
def run_and_save_model(path: str) -> None:
    '''
    Train the CNN ensemble for every CSV file found in `path` and pickle the outcome.

    For a file named <key>.csv the trained models are written to
    all_models/<key>.pkl and the numeric results to all_num_results/<key>.pkl.
    Both folders are created in the current working directory if missing.

    Args:
    path (str): Folder containing the raw CSV files.
    '''

    # Load raw data from the specified path into a dictionary of DataFrames
    dataframes_dic = load_raw_data_dic(path)

    for output_folder in ('all_models', 'all_num_results'):
        os.makedirs(output_folder, exist_ok=True)

    # counter is only progress output: 1-based position of the current data set
    for counter, (key, df) in enumerate(dataframes_dic.items(), start=1):

        print(counter)
        print(key)

        # Train CNN models and get the results
        trained_models, num_results = prepare_and_train_cnn(df)

        # Save the trained models to a file
        with open(f'all_models/{key}.pkl', 'wb') as f:
            pickle.dump(trained_models, f)

        # Save the results to a file
        with open(f'all_num_results/{key}.pkl', 'wb') as f:
            pickle.dump(num_results, f)



# Master function that runs whole project

In [None]:
# Entry point of the notebook: trains and pickles the CNN ensembles for every
# CSV file in "Data_10_year/" (path relative to the current working directory).
run_and_save_model("Data_10_year/")

# Extra plotting functions
Not cleaned

In [None]:
def plot_history(history, title='', axs=None, exp_name=""):
    '''
    Plot the training and validation loss curves of a fit history.

    Args:
    history: Object whose `.history` mapping has the keys 'loss' and 'val_loss'.
    title (str): Currently unused.
    axs: Optional pair of axes (ax1, ax2); a new 1x2 figure is created if omitted.
    exp_name (str): Optional experiment name appended to the legend labels
        (a leading underscore is added if missing).

    Returns:
    The first axis, which holds the loss curves. The second axis is left
    untouched (the MAE curves that were meant for it are disabled).
    '''

    if axs is None:
        _, axs = plt.subplots(1, 2, figsize=(12, 4))
    loss_axis, _mae_axis = axs

    # normalise the label suffix to "_<name>" (or keep it empty)
    if exp_name and exp_name[0] != '_':
        exp_name = '_' + exp_name

    for history_key, label_prefix in (('loss', 'train'), ('val_loss', 'val')):
        loss_axis.plot(history.history[history_key], label = label_prefix + exp_name)

    loss_axis.set_title('loss')
    loss_axis.legend()

    return loss_axis

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def plot_pred_rv(true_rv, predict_rv, segment_size=50, skip_segments=(5, 7),
                 save_path="amgen_good_results.png"):
    '''
    Plot true vs. predicted realized volatility in consecutive segments,
    two subplots per row, save the figure and show it.

    Values beyond the last complete segment are not plotted.

    Args:
    true_rv: Sequence of true RV values (must support len() and slicing).
    predict_rv: Sequence of predicted RV values, aligned with true_rv.
    segment_size (int): Number of consecutive values per subplot. Defaults to 50.
    skip_segments: Zero-based indices of segments that are left out.
        Defaults to (5, 7), i.e. "Segment 6" and "Segment 8".
    save_path (str): File the figure is written to (dpi=300).

    Raises:
    ValueError: If segment_size is smaller than 1 or the data does not fill
        a single segment.
    '''

    if segment_size < 1:
        raise ValueError(f"segment_size must be at least 1, got {segment_size}")

    num_plots = len(true_rv) // segment_size
    if num_plots == 0:
        raise ValueError(
            f"Need at least {segment_size} values to plot one segment, got {len(true_rv)}"
        )

    skipped = set(skip_segments or ())
    cols = 2

    # Rows needed if every segment were plotted (ceiling division); axes that
    # stay empty because of skipped segments are removed further down
    num_rows = (num_plots + cols - 1) // cols

    fig, axes = plt.subplots(num_rows, cols, figsize=(15, num_rows * 5+5))

    # Flatten the axes array for easy iteration
    axes = axes.flatten()

    # Fixed y-range so that all segments are directly comparable
    y_min = 0
    y_max = 0.03
    y_ticks = np.linspace(y_min, y_max, num=5)

    # Next free subplot; differs from the segment index once a segment is skipped
    plot_index = 0

    for i in range(num_plots):
        if i in skipped:
            continue

        start_idx = i * segment_size
        end_idx = start_idx + segment_size
        indices = np.arange(start_idx, end_idx)

        ax = axes[plot_index]
        ax.plot(indices, true_rv[start_idx:end_idx], marker='o', linestyle='-', color='b', label='True RV')
        ax.plot(indices, predict_rv[start_idx:end_idx], marker='x', linestyle='-', color='r', label='Predict RV')
        ax.set_xlabel('Day in Test Set')
        ax.set_ylabel('Realized Volatility')
        ax.set_title(f'Segment {i+1}')
        ax.legend()
        ax.grid(True)

        # Set the y-axis limits and ticks
        ax.set_ylim([y_min, y_max])
        ax.set_yticks(y_ticks)

        plot_index += 1

    # Hide any unused subplots
    for unused_axis in axes[plot_index:]:
        fig.delaxes(unused_axis)

    # Adjust layout for better spacing
    plt.tight_layout()
    plt.subplots_adjust(hspace=0.2, wspace=0.2)  # Add space between subplots

    # Save the figure with higher resolution
    plt.savefig(save_path, dpi=300)
    plt.show()

# Example usage:
# plot_pred_rv(amgen[6], amgen[3][1])
