<a href="https://colab.research.google.com/github/MasterBeard/Proxy-Labels-Implementation/blob/main/Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

To ensure the reproducibility of the article, we provide notebook code that shows how to use proxy labels to identify investment portfolios with higher precision and a greater number of correct upward-signal predictions. The example in this notebook reproduces the test results for **Test Set 2** under **Task 1**.

In [None]:
import numpy as np
import pandas as pd
import yfinance as yf
import numpy.polynomial.polynomial as poly

# Symbols to download: ten individual stocks followed by nine market indices
stock_tickers = [
    # Individual stocks
    'OKE', 'ENPH', 'UHS', 'DLTR', 'AMZN',
    'EFX', 'RSG', 'OXY', 'REGN', 'DECK',
    # Market indices
    '^GSPC',      # S&P 500
    '^IXIC',      # NASDAQ Composite
    '^HSI',       # Hang Seng Index
    '^DJI',       # Dow Jones Industrial Average
    '^FCHI',      # CAC 40
    '^GDAXI',     # DAX
    '^N225',      # Nikkei 225
    '^KS11',      # KOSPI
    '^STOXX50E',  # EURO STOXX 50
]

# (start, end) date strings used when downloading each split
date_ranges = dict(
    train=("2005-01-01", "2015-01-01"),
    val=("2015-01-02", "2019-12-31"),
    test=("2020-01-01", "2024-10-31"),
)

# One set of empty accumulators per split; 'origin' records which ticker
# (stock or index) each matrices_4x4 entry came from
data_splits = {
    split: {
        field: []
        for field in (
            'matrices_4x4',
            'matrices_1st',
            'labels4',
            'labels1',
            'labels2',
            'labels3',
            'origin',
        )
    }
    for split in date_ranges
}

# Sliding-window length (in rows) and degree of the fitted polynomial
window_size, degree = 25, 2

# Fetch and process data for each time period.
#
# For every ticker and every 25-row sliding window this builds:
#   * matrices_4x4 - the first 20 interleaved open/close values (rows 0-9 of
#                    the window), reshaped to (4, 5, 1)
#   * matrices_1st - the last 32 interleaved values, flattened
#   * labels1/2    - sign of the first/second derivative, at the last fitted
#                    point, of a polynomial fitted through three values
#   * labels3      - 1 if the window's last close is above the close used for
#                    normalisation
#   * labels4      - the two derivative values themselves
#
# NOTE: the slice offsets (-16, -30, -32) are tied to window_size == 25.

# x-coordinates of the three fitted points; identical for every window, so it
# is built once instead of once per window.
x = np.arange(3)

for split, (start_date, end_date) in date_ranges.items():
    # Download all data (stocks and indices) for the specified date range
    all_data = {
        ticker: yf.download(ticker, start=start_date, end=end_date, auto_adjust=False)
        for ticker in stock_tickers
    }

    # Create first and second derivative labels for each stock and index
    for ticker, data in all_data.items():
        if data.empty:
            # Nothing was returned for this ticker/date range: no windows to build
            continue

        # Depending on the yfinance version, data['Open'] is either a Series
        # (flat columns) or a one-column DataFrame (MultiIndex columns).
        # Flattening makes every later indexing step 1-D in both cases.
        # Dropping NaNs jointly keeps Open and Close aligned row by row and
        # makes the window count below match the cleaned length.
        prices = pd.DataFrame({
            'Open': np.asarray(data['Open'], dtype=float).reshape(-1),
            'Close': np.asarray(data['Close'], dtype=float).reshape(-1),
        }).dropna()
        open_values = prices['Open'].to_numpy()
        close_values = prices['Close'].to_numpy()

        # Build window matrices
        for start in range(len(prices) - window_size + 1):
            open_row = open_values[start:start + window_size]
            close_row = close_values[start:start + window_size]

            # Normalize using the 16th last value of close_row
            normalization_factor = close_row[-16]
            open_row = open_row / normalization_factor
            close_row = close_row / normalization_factor

            # Interleave as [open0, close0, open1, close1, ...]
            combined = np.column_stack((open_row, close_row)).reshape(-1)
            matrix_4x4 = combined[:-30].reshape(4, 5, 1)
            matrix_4x5 = combined[-32:].reshape(-1)
            # Fitted points: open and close of the normalisation row (the
            # close equals 1.0 after normalisation) and the window's last close
            result = [combined[-32], combined[-31], combined[-1]]
            data_splits[split]['matrices_4x4'].append(matrix_4x4)
            data_splits[split]['matrices_1st'].append(matrix_4x5)
            data_splits[split]['origin'].append(ticker)  # Record origin (stock or index)

            # Fit a polynomial of the configured degree through the three points
            coeffs = poly.polyfit(x, result, deg=degree)

            # First derivative evaluated at every fitted x
            first_derivative_coeffs = poly.polyder(coeffs)
            first_derivatives = poly.polyval(x, first_derivative_coeffs)

            # Second derivative evaluated at every fitted x
            second_derivative_coeffs = poly.polyder(first_derivative_coeffs)
            second_derivatives = poly.polyval(x, second_derivative_coeffs)

            # Generate labels from the derivative values at the last point
            label1 = 1 if first_derivatives[-1] > 0 else 0
            label2 = 1 if second_derivatives[-1] > 0 else 0
            label3 = 1 if close_row[-1] > close_row[-16] else 0
            # Same last-point values as label1/label2, kept as raw numbers
            label4 = [first_derivatives[-1], second_derivatives[-1]]

            data_splits[split]['labels1'].append(label1)
            data_splits[split]['labels2'].append(label2)
            data_splits[split]['labels3'].append(label3)
            data_splits[split]['labels4'].append(label4)

# Convert matrices in each split to NumPy arrays
def convert_to_numpy(split):
    """Return the accumulated lists of one split as a tuple of NumPy arrays.

    The tuple order is: matrices_4x4, matrices_1st, labels1, labels2,
    labels3, labels4, origin.
    """
    bucket = data_splits[split]
    field_order = (
        'matrices_4x4',
        'matrices_1st',
        'labels1',
        'labels2',
        'labels3',
        'labels4',
        'origin',
    )
    return tuple(np.array(bucket[field]) for field in field_order)

# Materialise the three splits as tuples of NumPy arrays
train_data, val_data, test_data = map(convert_to_numpy, ('train', 'val', 'test'))

# Report the shapes of each split (index 0: 4x4 matrices, 1: 1st matrices,
# 2: labels1, 6: origin) as a quick sanity check
shape_report = (
    f"Train 4x4 matrices shape: {train_data[0].shape}",
    f"Train 1st matrices shape: {train_data[1].shape}",
    f"Train labels1 shape: {train_data[2].shape}",
    f"Train origin shape: {train_data[6].shape}",
    f"Validation 4x4 matrices shape: {val_data[0].shape}",
    f"Validation 1st matrices shape: {val_data[1].shape}",
    f"Validation labels1 shape: {val_data[2].shape}",
    f"Validation origin shape: {val_data[6].shape}",
    f"Test 4x4 matrices shape: {test_data[0].shape}",
    f"Test 1st matrices shape: {test_data[1].shape}",
    f"Test labels1 shape: {test_data[2].shape}",
    f"Test origin shape: {test_data[6].shape}",
)
for report_line in shape_report:
    print(report_line)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

Train 4x4 matrices shape: (44871, 4, 5, 1)
Train 1st matrices shape: (44871, 32)
Train labels1 shape: (44871,)
Train origin shape: (44871,)
Validation 4x4 matrices shape: (23353, 4, 5, 1)
Validation 1st matrices shape: (23353, 32)
Validation labels1 shape: (23353,)
Validation origin shape: (23353,)
Test 4x4 matrices shape: (22604, 4, 5, 1)
Test 1st matrices shape: (22604, 32)
Test labels1 shape: (22604,)
Test origin shape: (22604,)


In [None]:
def combine_labels(labels1, labels2, labels3):
    """
    Encode three binary labels per sample as a single class index.

    The code is the weighted sum ``labels1 * 4 + labels2 * 2 + labels3 * 1``,
    so every combination maps to a unique value, e.g. (0, 1, 1) -> 3.
    Because ``labels3`` carries weight 1, a code is odd exactly when
    ``labels3`` is 1.

    Parameters
    ----------
    labels1, labels2, labels3 : array-like of 0/1
        Inputs of equal (or broadcastable) shape. Lists and tuples are
        accepted as well as NumPy arrays.

    Returns
    -------
    numpy.ndarray
        Element-wise combined codes.
    """
    # Coerce first: with plain Python lists, `labels * 4` repeats the list and
    # `+` concatenates, which would silently return garbage instead of codes.
    labels1, labels2, labels3 = (
        np.asarray(part) for part in (labels1, labels2, labels3)
    )
    return labels1 * 4 + labels2 * 2 + labels3 * 1

# Build the combined class codes for each split; positions 2, 3 and 4 of a
# split tuple hold labels1, labels2 and labels3
train_labels, val_labels, test_labels = (
    combine_labels(*bundle[2:5]) for bundle in (train_data, val_data, test_data)
)

# Show which codes occur in each split and how often
for caption, encoded in (
    ("Train Labels Distribution:", train_labels),
    ("Validation Labels Distribution:", val_labels),
    ("Test Labels Distribution:", test_labels),
):
    print(caption, np.unique(encoded, return_counts=True))

Train Labels Distribution: (array([0, 1, 2, 5, 6, 7]), array([17051,   559,   859,  1096,   438, 24868]))
Validation Labels Distribution: (array([0, 1, 2, 5, 6, 7]), array([ 9210,   263,   480,   560,   264, 12576]))
Test Labels Distribution: (array([0, 1, 2, 5, 6, 7]), array([ 9010,   241,   446,   492,   225, 12190]))


In [None]:
import tensorflow as tf
from tensorflow import keras

# Build MLP model.
# The input shape is declared with an explicit Input object rather than an
# `input_shape=` argument on the first layer, which Keras warns about.
model1 = tf.keras.Sequential([
    tf.keras.Input(shape=(4, 5, 1)),
    tf.keras.layers.Flatten(),  # Flatten (4, 5, 1) into (20,)
    tf.keras.layers.Dense(units=64, activation='relu'),  # First Dense layer
    tf.keras.layers.Dense(units=64, activation='relu'),  # Second Dense layer
    tf.keras.layers.Dense(units=8, activation='softmax')  # 8-class classification
])

# Compile the model
model1.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',  # Suitable for integer labels
    metrics=['accuracy']
)

# Stop once val_loss has not improved for 20 epochs and roll back to the
# best weights seen, so the 300-epoch budget is only an upper bound
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=20,
    restore_best_weights=True,
)

# Train the model
history1 = model1.fit(
    train_data[0], train_labels,  # Training data (inputs, labels)
    epochs=300,
    batch_size=128,
    validation_data=(val_data[0], val_labels),  # Validation data
    callbacks=[early_stopping],
    verbose=1
)

# Save model1 (file name kept unchanged so existing loaders still find it)
model1.save('modeltaskmlp1.h5')

Epoch 1/300


  super().__init__(**kwargs)


[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.5350 - loss: 1.0502 - val_accuracy: 0.5385 - val_loss: 0.9795
Epoch 2/300
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5562 - loss: 0.9645 - val_accuracy: 0.5386 - val_loss: 0.9755
Epoch 3/300
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5545 - loss: 0.9628 - val_accuracy: 0.5385 - val_loss: 0.9729
Epoch 4/300
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.5601 - loss: 0.9605 - val_accuracy: 0.5385 - val_loss: 0.9728
Epoch 5/300
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.5522 - loss: 0.9692 - val_accuracy: 0.5385 - val_loss: 0.9729
Epoch 6/300
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5547 - loss: 0.9648 - val_accuracy: 0.5385 - val_loss: 0.9736
Epoch 7/300
[1m351/351[0m [32m━



In [None]:
import zipfile
import os
from google.colab import drive

# Make the Drive contents visible under /content/drive
drive.mount('/content/drive')

# Archive location in Drive and extraction target in the Colab working directory
zip_file_path = '/content/drive/My Drive/SP500_data2020-2024.zip'  # Replace with the path to the file in Google Drive
output_dir = '/content/SP500_data2020-2024'

# Unpack the whole archive into the target directory, creating it if needed
os.makedirs(output_dir, exist_ok=True)
with zipfile.ZipFile(zip_file_path, 'r') as archive:
    archive.extractall(output_dir)

print("Files extracted to:", output_dir)

# Collect the extracted CSV file names in alphabetical order
csv_files = sorted(
    entry for entry in os.listdir(output_dir) if entry.endswith('.csv')
)
print(f"Found {len(csv_files)} CSV files.")
print("Sample files:", csv_files[:5])  # First five names alphabetically

import pandas as pd

# Read all CSV files into a dictionary keyed by stock symbol
csv_suffix = '.csv'
sp500_data = {}
for csv_file in csv_files:
    # Strip only the trailing extension; str.replace would also delete a
    # '.csv' that happens to occur earlier in the file name
    if csv_file.endswith(csv_suffix):
        symbol = csv_file[:-len(csv_suffix)]
    else:
        symbol = csv_file
    file_path = os.path.join(output_dir, csv_file)
    sp500_data[symbol] = pd.read_csv(file_path)

# Filter symbols to keep only those with the most common data length
def filter_symbols_by_most_common_length(sp500_data):
    """Keep only the symbols whose data has the most frequent length.

    Parameters
    ----------
    sp500_data : dict
        Maps symbol -> data object supporting ``len()``.

    Returns
    -------
    tuple
        ``(filtered_data, most_common_length)``: a dict holding the entries
        of ``sp500_data`` whose length equals the most common length (in the
        original order), and that length.
    """
    # Length of each symbol's data, then how often each length occurs
    rows_per_symbol = pd.Series(
        {ticker: len(frame) for ticker, frame in sp500_data.items()}
    )
    length_counts = rows_per_symbol.value_counts()

    # The length shared by the largest number of symbols
    most_common_length = length_counts.idxmax()
    print(f"Most common length: {most_common_length}, Count: {length_counts.max()}")

    # Retain only the symbols with exactly that length
    kept = {
        ticker: frame
        for ticker, frame in sp500_data.items()
        if len(frame) == most_common_length
    }
    return kept, most_common_length

# Filter the data
filtered_sp500_data, most_common_length = filter_symbols_by_most_common_length(sp500_data)

# Check the filtered result
print(f"Filtered data count: {len(filtered_sp500_data)}")

# Window length
window_size = 25
degree = 2

# Data collection
data_set = {'matrices_4x4': [], 'labels1': [], 'labels2': [], 'labels3': []}

# x-coordinates of the three fitted points; identical for every window, so it
# is built once here instead of once per window.
x = np.arange(3)

# Create matrices and labels for each stock.
# NOTE: the slice offsets (-16, -30, -32) are tied to window_size == 25.
for index_name, data in filtered_sp500_data.items():
    # Ensure 'Open' and 'Close' columns are numeric (non-numeric cells become
    # NaN); this updates the DataFrame stored in filtered_sp500_data in place
    data['Open'] = pd.to_numeric(data['Open'], errors='coerce')
    data['Close'] = pd.to_numeric(data['Close'], errors='coerce')

    # Extract 'Open' and 'Close' values, skipping the first two rows
    # (presumably extra header rows in the exported CSVs - TODO confirm)
    open_values = data['Open'].values[2:]
    close_values = data['Close'].values[2:]

    # Build window matrices; the "- 2" accounts for the two skipped rows
    num_samples = len(data) - window_size + 1
    for start in range(num_samples - 2):
        # Extract window data
        open_row = open_values[start:start + window_size]
        close_row = close_values[start:start + window_size]

        # Normalize using the 16th last value of close_row
        normalization_factor = close_row[-16]
        open_row = open_row / normalization_factor
        close_row = close_row / normalization_factor

        # Interleave as [open0, close0, open1, close1, ...]
        combined = np.column_stack((open_row, close_row)).reshape(-1)
        # Model input: the first 20 interleaved values (rows 0-9 of the window)
        matrix_4x4 = combined[:-30].reshape(4, 5, 1)
        # Fitted points: open and close of the normalisation row (the close
        # equals 1.0 after normalisation) and the window's last close
        result = [combined[-32], combined[-31], combined[-1]]

        # Fit a polynomial of the configured degree through the three points
        coeffs = poly.polyfit(x, result, deg=degree)

        # Calculate first and second derivatives at every fitted x
        first_derivative_coeffs = poly.polyder(coeffs)
        first_derivatives = poly.polyval(x, first_derivative_coeffs)
        second_derivative_coeffs = poly.polyder(first_derivative_coeffs)
        second_derivatives = poly.polyval(x, second_derivative_coeffs)

        # Calculate labels from the derivative signs at the last point and
        # from whether the last close is above the normalisation close
        label1 = 1 if first_derivatives[-1] > 0 else 0
        label2 = 1 if second_derivatives[-1] > 0 else 0
        label3 = 1 if close_row[-1] > close_row[-16] else 0

        # Add to the unified data collection
        data_set['matrices_4x4'].append(matrix_4x4)
        data_set['labels1'].append(label1)
        data_set['labels2'].append(label2)
        data_set['labels3'].append(label3)

# Convert data to NumPy arrays
matrices_4x4 = np.array(data_set['matrices_4x4'])
labels1 = np.array(data_set['labels1'])
labels2 = np.array(data_set['labels2'])
labels3 = np.array(data_set['labels3'])

# Output results for verification
print(f"4x4 matrices shape: {matrices_4x4.shape}")
print(f"Labels1 shape: {labels1.shape}")
print(f"Labels2 shape: {labels2.shape}")
print(f"Labels3 shape: {labels3.shape}")

Mounted at /content/drive
Files extracted to: /content/SP500_data2020-2024
Found 501 CSV files.
Sample files: ['A.csv', 'AAPL.csv', 'ABBV.csv', 'ABNB.csv', 'ABT.csv']
Most common length: 1239, Count: 489
Filtered data count: 489
4x4 matrices shape: (593157, 4, 5, 1)
Labels1 shape: (593157,)
Labels2 shape: (593157,)
Labels3 shape: (593157,)


In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score

# Get model prediction probabilities
predictions2_pro = model1.predict(matrices_4x4)

# Most likely class per sample and the probability the model assigns to it
predicted_classes = np.argmax(predictions2_pro, axis=1)
top_probabilities = np.max(predictions2_pro, axis=1)

# Get the total number of classes
num_classes = predictions2_pro.shape[1]

# Collapse the predicted class to a binary up/down call: odd -> 1, even -> 0
predicted_direction = predicted_classes % 2

# Initialize the final prediction array with -1 to indicate unclassified
final_predictions = np.full(predictions2_pro.shape[0], -1)

# Confidence thresholds: 0.50, 0.55, ..., 0.90 (np.arange excludes the stop
# value, so 0.95 itself is not evaluated)
thresholds = np.arange(0.5, 0.95, 0.05)

# Iterate over each threshold for evaluation
for threshold in thresholds:
    # A sample is kept only when the probability of its predicted class
    # reaches the threshold; everything else stays -1. This single mask is
    # equivalent to visiting each class in turn and filtering its samples.
    final_predictions = np.where(
        top_probabilities >= threshold, predicted_direction, -1
    )

    # Filter out valid predictions
    valid_indices = final_predictions != -1
    filtered_final_predictions = final_predictions[valid_indices]
    filtered_true_labels = labels3[valid_indices]  # Corresponding true labels

    if filtered_true_labels.size == 0:
        # No sample is confident enough at this threshold: report NaN accuracy
        # and an all-zero matrix rather than averaging an empty array
        accuracy = float('nan')
        cm = np.zeros((2, 2), dtype=int)
    else:
        # Calculate accuracy
        accuracy = np.mean(filtered_final_predictions == filtered_true_labels)

        # Create confusion matrix (rows: true label, columns: prediction)
        cm = confusion_matrix(filtered_true_labels, filtered_final_predictions, labels=[0, 1])

    # Calculate Precision
    tp = cm[1, 1]  # True Positives
    fp = cm[0, 1]  # False Positives
    precision = tp / (tp + fp) if (tp + fp) != 0 else 0  # Handle division by zero

    # Calculate Recall
    fn = cm[1, 0]  # False Negatives
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0  # Handle division by zero

    # Calculate F1 Score
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    # Print results
    print(f"Threshold: {threshold:.2f}")
    print(f"Accuracy: {accuracy:.2%}")
    print(f"Precision (TP / (TP + FP)): {precision:.2%}")
    print(f"Recall (TP / (TP + FN)): {recall:.2%}")
    print(f"F1 Score: {f1:.2%}")
    print(f"Confusion Matrix:\n{cm}\n")

[1m18537/18537[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 1ms/step
Threshold: 0.50
Accuracy: 55.18%
Precision (TP / (TP + FP)): 55.26%
Recall (TP / (TP + FN)): 99.44%
F1 Score: 71.04%
Confusion Matrix:
[[   957 215846]
 [  1503 266636]]

Threshold: 0.55
Accuracy: 55.29%
Precision (TP / (TP + FP)): 55.33%
Recall (TP / (TP + FN)): 99.75%
F1 Score: 71.18%
Confusion Matrix:
[[   287 159375]
 [   488 197372]]

Threshold: 0.60
Accuracy: 62.39%
Precision (TP / (TP + FP)): 63.46%
Recall (TP / (TP + FN)): 95.94%
F1 Score: 76.39%
Confusion Matrix:
[[  86 1959]
 [ 144 3402]]

Threshold: 0.65
Accuracy: 70.91%
Precision (TP / (TP + FP)): 71.19%
Recall (TP / (TP + FN)): 98.79%
F1 Score: 82.75%
Confusion Matrix:
[[  31  758]
 [  23 1873]]

Threshold: 0.70
Accuracy: 73.55%
Precision (TP / (TP + FP)): 73.54%
Recall (TP / (TP + FN)): 99.87%
F1 Score: 84.71%
Confusion Matrix:
[[   6  549]
 [   2 1526]]

Threshold: 0.75
Accuracy: 75.39%
Precision (TP / (TP + FP)): 75.36%
Recall (TP / (TP + FN