In [1]:
import pandas as pd
import numpy as np
import json

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

import sys
import os
# Put the parent directory on the module search path; presumably this is what
# lets the project-local `tool` module below be imported from this notebook.
sys.path.append(os.path.abspath('../'))
# NOTE(review): the star import hides which names come from `tool`.
# `convert_to_array`, used in the feature-preparation cell, is presumably one
# of them -- prefer an explicit import once that is confirmed.
from tool import *

import matplotlib.pyplot as plt


# Root directory for project data, expressed relative to the working directory.
# NOTE(review): not referenced by any cell visible here -- confirm it is still needed.
data_root = '../../MyData/'

# Load data (With GICS_Sector)

In [3]:
# Read the embedding table from disk, report its row count, then preview the first rows.
new_embedding_df = pd.read_csv("../6_with_neural_network/data/embedding_256.csv")
print(new_embedding_df.shape[0])
new_embedding_df.head()

1197


Unnamed: 0,cik,tic,Year,GICS_Sector,item1_embeddings,SP_LONG_DESC_embeddings,SP_SHORT_DESC_embeddings,ORBIS_PROD_SERV_embeddings,ORBIS_OVERVIEW_embeddings
0,850460,WIRE,2021,20.0,"[-0.5898345708847046, 0.980162501335144, -0.96...","[-0.8353348970413208, 0.973402738571167, -0.98...","[-0.6867204904556274, 0.9938115477561951, -0.9...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
1,352541,LNT,2021,55.0,"[0.9820889830589294, 0.4754180908203125, 0.869...","[0.9716767072677612, 0.7131884098052979, 0.879...","[0.9754056930541992, 0.2400549054145813, 0.895...","[0.972804069519043, 0.9935750365257263, 0.9685...","[0.8935458064079285, 0.9151054620742798, 0.755..."
2,1704715,AMR,2021,15.0,"[0.9945562481880188, -0.4087281823158264, 0.68...","[0.9718917012214661, -0.33507728576660156, 0.9...","[0.9910370111465454, -0.38968169689178467, 0.1...","[0.9949943423271179, -0.8197710514068604, 0.87...","[0.9967610239982605, -0.36475804448127747, 0.7..."
3,1575515,SFM,2021,30.0,"[-0.9608494639396667, -0.926054060459137, -0.9...","[-0.9911385178565979, -0.9708278775215149, -0....","[-0.9603582620620728, -0.9756494164466858, -0....","[-0.9450609087944031, -0.40782180428504944, -0...","[-0.13962775468826294, -0.8745130896568298, -0..."
4,1125376,ENSG,2021,35.0,"[-0.9341953992843628, -0.137389674782753, -0.9...","[-0.9762463569641113, -0.8669420480728149, -0....","[-0.8951124548912048, -0.10518676787614822, -0...","[0.1766112595796585, 0.7543022036552429, -0.99...","[-0.07263371348381042, 0.22911271452903748, -0..."


### Create a CIK ↔ index mapping

In [4]:
# Distinct CIK values, in order of first appearance in the table
unique_cik = new_embedding_df['cik'].unique()

# Positional index -> CIK, and the inverse lookup CIK -> positional index
index_to_cik = dict(enumerate(unique_cik))
cik_to_index = {cik: idx for idx, cik in index_to_cik.items()}

print("CIK to Index Mapping:", cik_to_index)

CIK to Index Mapping: {850460: 0, 352541: 1, 1704715: 2, 1575515: 3, 1125376: 4, 1674335: 5, 1023128: 6, 1262039: 7, 1122976: 8, 1128928: 9, 58492: 10, 40533: 11, 1370946: 12, 1469367: 13, 858655: 14, 1486159: 15, 70145: 16, 77476: 17, 1593034: 18, 1140536: 19, 1353283: 20, 912728: 21, 73309: 22, 1754301: 23, 882796: 24, 1309402: 25, 1524358: 26, 80424: 27, 1551182: 28, 1283699: 29, 1158172: 30, 8947: 31, 804328: 32, 1120370: 33, 1655075: 34, 1361658: 35, 40987: 36, 1320414: 37, 1584509: 38, 1466301: 39, 1552033: 40, 1281761: 41, 93410: 42, 76282: 43, 19584: 44, 723531: 45, 1530804: 46, 101984: 47, 910329: 48, 5513: 49, 31462: 50, 1338749: 51, 1409171: 52, 821026: 53, 817720: 54, 5272: 55, 1013871: 56, 1111928: 57, 1393818: 58, 66382: 59, 812011: 60, 730708: 61, 1672013: 62, 1109242: 63, 883945: 64, 1527166: 65, 1421461: 66, 355811: 67, 1262823: 68, 26058: 69, 14930: 70, 1637459: 71, 892553: 72, 1345016: 73, 844965: 74, 1318220: 75, 819793: 76, 1175454: 77, 936468: 78, 29989: 79, 35291

# Generation

### Prepare features

In [8]:
# Embedding columns to convert, and the identifying columns to carry along
target_list = [
    'item1_embeddings',
    'SP_LONG_DESC_embeddings',
    'SP_SHORT_DESC_embeddings',
    'ORBIS_PROD_SERV_embeddings',
    'ORBIS_OVERVIEW_embeddings',
]
info_list = ['cik', 'tic', 'Year', 'GICS_Sector']

# `convert_to_array` is a project helper; presumably it parses the stringified
# embedding columns into arrays of length 256 -- TODO confirm.
# NOTE(review): an earlier comment here said rows without a GICS label are
# dropped; whether the trailing `False` controls that is not visible from this
# notebook, and the recorded run keeps all 1197 rows.
exp_df = convert_to_array(new_embedding_df, info_list, target_list, 256, False)
print(type(exp_df))
print(exp_df.columns)
print(exp_df.shape, '\n')


# The stacked tensors below are indexed by row position, so the row order of
# exp_df is compared against the CIK order recorded in index_to_cik.
expected_order = [index_to_cik[idx] for idx in sorted(index_to_cik)]
actual_order = exp_df["cik"].tolist()

print(
    "The DataFrame follows the expected order."
    if actual_order == expected_order
    else "The DataFrame is not in the expected order."
)


# One float32 tensor per embedding column, built by stacking that column's
# per-row arrays vertically.
embedding_tensors = {
    col: torch.tensor(np.vstack(exp_df[col].values), dtype=torch.float32)
    for col in target_list
}

<class 'pandas.core.frame.DataFrame'>
Index(['cik', 'tic', 'Year', 'GICS_Sector', 'item1_embeddings',
       'SP_LONG_DESC_embeddings', 'SP_SHORT_DESC_embeddings',
       'ORBIS_PROD_SERV_embeddings', 'ORBIS_OVERVIEW_embeddings'],
      dtype='object')
(1197, 9) 

The DataFrame follows the expected order.


In [12]:
# Calculate the average of embedding for every firm
def average_tensors(tensor_dict):
    """
    Compute the element-wise mean of the tensors in a dictionary, ignoring NaNs.

    At every position the mean is taken over only those tensors whose value
    there is not NaN. A position that is NaN in *every* tensor yields 0.0
    (the sum of zero valid values divided by a count clamped to 1), not NaN.
    Infinite values are treated as ordinary valid values and propagate into
    the result.

    :param tensor_dict: Dictionary whose values are tensors of identical shape
                        (2D in this notebook); the keys are not used.
    :return: A tensor with the same shape as each input, holding the NaN-aware
             element-wise average.
    :raises RuntimeError: Propagated from ``torch.stack`` when the dictionary
                          is empty or the tensors differ in shape.
    """
    tensor_list = list(tensor_dict.values())

    # Stack along a new leading dimension (shape: [num_tensors, *tensor_shape])
    stacked_tensors = torch.stack(tensor_list, dim=0)

    # True wherever a value is present (not NaN)
    valid_mask = ~torch.isnan(stacked_tensors)

    # Zero out only the NaN entries so they do not contribute to the sum.
    # torch.nan_to_num is deliberately avoided: besides NaN it also rewrites
    # +/-inf to the largest/smallest finite value of the dtype, which would
    # silently alter non-NaN inputs.
    zero_filled = stacked_tensors.masked_fill(~valid_mask, 0)

    # Sum over the stacked dimension and count the valid contributors per position
    summed_tensors = zero_filled.sum(dim=0)
    valid_counts = valid_mask.sum(dim=0)

    # Clamp the count to at least 1 to avoid 0/0 where every input was NaN
    average_tensor = summed_tensors / torch.clamp(valid_counts, min=1)

    return average_tensor

In [10]:
from itertools import combinations

# Build every non-empty *proper* subset of the embedding sources, smallest
# subsets first. The size range stops before len(embedding_tensors), so the
# subset containing all sources is not generated.
all_combinations = []
for r in range(1, len(embedding_tensors)):
    for subset in combinations(embedding_tensors.items(), r):
        all_combinations.append(dict(subset))
len(all_combinations)

30

In [13]:
# One NaN-aware averaged tensor per subset of embedding sources
all_average_latent = [average_tensors(source_subset) for source_subset in all_combinations]

# Print the shape of every averaged tensor
for latent in all_average_latent:
    print(latent.shape)

torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])
torch.Size([1197, 256])


# Test: subset enumeration on a toy dictionary

In [None]:

# Toy dictionary used only to illustrate the subset enumeration
original_dict = {'a': 1, 'b': 2, 'c': 3}

# Generate every non-empty *proper* subset of key-value pairs: the size range
# stops before len(original_dict), so the full dictionary itself is not produced.
# The result is stored under its own name so this demo does not overwrite the
# `all_combinations` list built from the embedding tensors, which the
# averaging cell relies on when it is re-run.
demo_combinations = [
    dict(subset) for r in range(1, len(original_dict))
    for subset in combinations(original_dict.items(), r)
]

# Print the result
for combo in demo_combinations:
    print(combo)


{'a': 1}
{'b': 2}
{'c': 3}
{'a': 1, 'b': 2}
{'a': 1, 'c': 3}
{'b': 2, 'c': 3}
