In [None]:
import pandas as pd
import numpy as np
import json

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

import sys
import os
sys.path.append(os.path.abspath('../'))
from tool import *

import matplotlib.pyplot as plt


# Base directory for data files, given as a path relative to the working directory.
# NOTE(review): not referenced in the cells visible here — presumably used by later cells or by `tool`; confirm.
data_root = '../../MyData/'

# 1 layer, 128 dim, 1e-4 lr, 300 epochs, 256 batch size

In [None]:
# Define a simple model
class ProjectionNet(nn.Module):
    """Single linear layer that projects inputs from `input_dim` to `output_dim`."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # The whole network is one affine map, stored under the name `fc`.
        self.fc = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return the linear projection of `x`."""
        projected = self.fc(x)
        return projected

# Define contrastive loss
class ContrastiveLoss(nn.Module):
    """Pairwise contrastive loss with a distance margin.

    Per pair the loss is ``label * d**2 + (1 - label) * relu(margin - d)**2``,
    where ``d`` is the Euclidean distance between the two embeddings. The
    per-pair values are averaged into a scalar.
    """

    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def forward(self, z1, z2, labels):
        """Return the mean contrastive loss over all pairs.

        Args:
            z1, z2: embeddings compared row by row (norm taken over dim 1).
            labels: per-pair weights; 1 selects the squared-distance term and
                0 selects the margin term.
        """
        dist = (z1 - z2).norm(p=2, dim=1)
        # Pairs labelled 1 are penalised by their squared distance.
        pull_term = labels * dist.pow(2)
        # Pairs labelled 0 are penalised only while closer than the margin.
        push_term = (1 - labels) * torch.relu(self.margin - dist).pow(2)
        return (pull_term + push_term).mean()

In [None]:
input_dim = 256
output_dim = 128

# Model, loss, and optimizer
model = ProjectionNet(input_dim, output_dim)
criterion = ContrastiveLoss(margin=1.0)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Training loop
# NOTE(review): `dataloader` must already exist in the kernel and is expected to
# yield (features, label_rows, sample_idx) triples — it is not defined above this
# cell in the visible notebook; confirm which dataset cell produced it.
num_epochs = 300
for epoch in range(num_epochs):
    # Sum of the per-batch mean losses for this epoch (not a per-sample average).
    total_loss = 0
    for batch_data, batch_labels, idx in dataloader:

        batch_size = batch_data.size(0)
        # Pair every sample with a randomly chosen partner from the same batch.
        shuffled_indices = torch.randperm(batch_size)

        z1 = model(batch_data)
        z2 = z1[shuffled_indices]
        shuffled_idx = idx[shuffled_indices]
        # Row i of `batch_labels` is read at the partner's sample index.
        # Assumes each row holds that sample's relation to every sample — TODO confirm.
        pair_labels = batch_labels[torch.arange(batch_size), shuffled_idx]

        # NOTE(review): a permutation can map a sample onto itself; such pairs have
        # z1 == z2, so they add no gradient but still enter the mean. Consider
        # masking them out as well.

        # Filter out neutral pairs (label -1)
        mask = pair_labels != -1
        if not mask.any():
            # No usable pair in this batch: skip the update entirely. Stepping the
            # optimizer on a dummy zero loss can still move the weights through
            # Adam's running moments when gradients are zero-filled rather than None.
            continue

        z1 = z1[mask]
        z2 = z2[mask]
        pair_labels = pair_labels[mask]

        # Compute loss
        loss = criterion(z1, z2, pair_labels)
        total_loss += loss.item()

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}")

#### Clustering results

Normalized Within-Cluster Sum of Squares (WCSS): 0.027439458427970968

Normalized Within-Cluster Sum of Squares (WCSS): 0.018166965111753995


#### Stock correlation

0.36964344557233986

0.43379651782101963


#### Pair evaluation

0	cluster_10	0.781770	0.980949	10	119.70

1	cluster_100	0.531989	0.866828	100	11.97

# 2 layers, 128 dim, 1e-4 lr, 500 epochs, 128 batch size

In [None]:
# Define a custom dataset
class ContrastiveDataset(Dataset):
    """Dataset that returns each sample together with its entry in `index_list`."""

    def __init__(self, data, index_list):
        # Both sequences are stored as given and read at the same position.
        self.data, self.index_list = data, index_list

    def __len__(self):
        """Number of samples, taken from `data`."""
        n_samples = len(self.data)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(sample, index)`` pair stored at position `idx`."""
        sample = self.data[idx]
        sample_index = self.index_list[idx]
        return sample, sample_index


print(average_latent.shape)
print(relation_matrix.shape)

# Take the sample count from the data rather than hard-coding it, so this cell
# keeps working when the number of rows in `average_latent` changes.
num_samples = average_latent.shape[0]

# Pair labels are looked up as relation_matrix[i, j] by sample index during
# training, so the matrix must be square with one row/column per sample.
assert tuple(relation_matrix.shape) == (num_samples, num_samples), (
    f"relation_matrix shape {tuple(relation_matrix.shape)} does not match "
    f"{num_samples} samples"
)

# Each item is (latent vector, sample index); the index is what the training
# loop uses to read `relation_matrix`.
dataset = ContrastiveDataset(average_latent, torch.arange(num_samples))
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

torch.Size([1197, 256])
torch.Size([1197, 1197])


In [None]:
class ProjectionNet(nn.Module):
    """Two-layer projection head: Linear -> BatchNorm1d -> LeakyReLU -> Linear.

    `dropout_prob` is accepted but not used: the dropout layer is commented
    out in this version, so the value has no effect on the network.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=256, dropout_prob=0.5):
        super().__init__()

        # Layers are listed in execution order; their positions inside the
        # Sequential container determine the parameter names.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.LeakyReLU(negative_slope=0.01),
            # nn.Dropout(p=dropout_prob),  # disabled
            nn.Linear(hidden_dim, output_dim),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the layer stack and return the projection."""
        projected = self.model(x)
        return projected


# Define contrastive loss
class ContrastiveLoss(nn.Module):
    """Margin-based contrastive loss over embedding pairs.

    Each pair contributes ``label * d**2 + (1 - label) * relu(margin - d)**2``
    with ``d`` the Euclidean distance between its two embeddings; the result
    is the mean over pairs.
    """

    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def forward(self, z1, z2, labels):
        """Return the mean contrastive loss over all pairs.

        Args:
            z1, z2: embeddings compared row by row (norm taken over dim 1).
            labels: per-pair weights; 1 selects the squared-distance term and
                0 selects the margin term.
        """
        dist = (z1 - z2).norm(p=2, dim=1)
        # Label-1 pairs: penalise any remaining distance.
        pull_term = labels * dist.pow(2)
        # Label-0 pairs: penalise only while closer than the margin.
        push_term = (1 - labels) * torch.relu(self.margin - dist).pow(2)
        return (pull_term + push_term).mean()

In [None]:
input_dim = 256
output_dim = 128

# Model, loss, and optimizer
model = ProjectionNet(input_dim, output_dim, hidden_dim=256)
criterion = ContrastiveLoss(margin=5.0)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Training loop
num_epochs = 500
for epoch in range(num_epochs):
    # Sum of the per-batch mean losses for this epoch (not a per-sample average).
    total_loss = 0
    for batch_data, batch_idx in dataloader:

        batch_size = batch_data.size(0)
        # Pair every sample with a randomly chosen partner from the same batch.
        shuffled_indices = torch.randperm(batch_size)

        z1 = model(batch_data)
        z2 = z1[shuffled_indices]
        shuffled_idx = batch_idx[shuffled_indices]
        # Label of pair (i, partner(i)) is read from the global relation matrix.
        pair_labels = relation_matrix[batch_idx, shuffled_idx]

        # NOTE(review): a permutation can map a sample onto itself; such pairs have
        # z1 == z2, so they add no gradient but still enter the mean. Consider
        # masking them out as well.

        # Filter out neutral pairs (label -1)
        mask = pair_labels != -1
        if not mask.any():
            # No usable pair in this batch: skip the update entirely. Stepping the
            # optimizer on a dummy zero loss can still move the weights through
            # Adam's running moments when gradients are zero-filled rather than None.
            continue

        z1 = z1[mask]
        z2 = z2[mask]
        pair_labels = pair_labels[mask]

        # Compute loss
        loss = criterion(z1, z2, pair_labels)
        total_loss += loss.item()

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}")

Epoch [1/500], Loss: 57.0303
Epoch [2/500], Loss: 64.5819
Epoch [3/500], Loss: 76.6168
Epoch [4/500], Loss: 12.1999
Epoch [5/500], Loss: 56.9490
Epoch [6/500], Loss: 28.6786
Epoch [7/500], Loss: 83.5121
Epoch [8/500], Loss: 88.8005
Epoch [9/500], Loss: 30.8483
Epoch [10/500], Loss: 22.6382
Epoch [11/500], Loss: 26.3074
Epoch [12/500], Loss: 73.6596
Epoch [13/500], Loss: 34.8769
Epoch [14/500], Loss: 22.1943
Epoch [15/500], Loss: 27.8497
Epoch [16/500], Loss: 31.1980
Epoch [17/500], Loss: 50.1540
Epoch [18/500], Loss: 36.2814
Epoch [19/500], Loss: 14.7565
Epoch [20/500], Loss: 46.9651
Epoch [21/500], Loss: 18.2522
Epoch [22/500], Loss: 5.0161
Epoch [23/500], Loss: 65.7460
Epoch [24/500], Loss: 15.9814
Epoch [25/500], Loss: 57.8415
Epoch [26/500], Loss: 45.1925
Epoch [27/500], Loss: 36.7640
Epoch [28/500], Loss: 41.5648
Epoch [29/500], Loss: 14.0835
Epoch [30/500], Loss: 28.4817
Epoch [31/500], Loss: 48.3096
Epoch [32/500], Loss: 54.5274
Epoch [33/500], Loss: 46.8167
Epoch [34/500], Loss

#### Clustering results

Normalized Within-Cluster Sum of Squares (WCSS): 0.027554724748272048

Normalized Within-Cluster Sum of Squares (WCSS): 0.016283134071649663


#### Stock correlation

0.3708273747323824

0.4488078773163156


#### Pair evaluation

0	cluster_10	0.799737	0.976797	10	119.70

1	cluster_100	0.649430	0.851235	100	11.97