In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
# import torch.nn.functional as F
import pandas as pd
import numpy as np
import json

import sys
import os

# Add the parent directory to sys.path; presumably this is what lets the
# project-local `tool` (and later `model`) packages be imported — confirm.
sys.path.append(os.path.abspath('../'))
# NOTE(review): the star import hides where names come from. `convert_to_array`
# is used later without any other visible definition, so it presumably comes
# from `tool` — confirm and consider importing the needed names explicitly.
from tool import *

# Directory prefix that is concatenated with the CSV file names read below.
data_root = '../../MyData/'

# Load data (With GICS_Sector)

In [2]:
# Item-1 embeddings together with the identifier columns and the GICS sector.
item1_embedding = pd.read_csv(data_root+'merged_1197.csv')[
    ['cik', 'tic', 'Year', 'item1_embeddings', 'GICS_Sector']
]
print(item1_embedding.shape[0])
print(item1_embedding.head())

# The four remaining description embeddings, keyed by `cik` only.
other_embedding = pd.read_csv(data_root+'output_embeddings_2.csv')[
    [
        'cik',
        'SP_SHORT_DESC_embeddings',
        'SP_LONG_DESC_embeddings',
        'ORBIS_PROD_SERV_embeddings',
        'ORBIS_OVERVIEW_embeddings',
    ]
]
print(other_embedding.shape[0])
print(other_embedding.head())

# Join on `cik` (default inner join) so each row carries all five embeddings.
total_embedding = item1_embedding.merge(other_embedding, on=['cik'])

# Rebind both source names to an empty string so the two intermediate frames
# are no longer referenced and their memory can be reclaimed.
item1_embedding = other_embedding = ''

print(total_embedding.shape[0])
total_embedding.head()

1197
       cik   tic  Year                                   item1_embeddings  \
0   850460  WIRE  2021  [0.03955410048365593, -0.04159577935934067, -0...   
1   352541   LNT  2021  [-0.0231856107711792, 0.001279839314520359, 0....   
2  1704715   AMR  2021  [0.014074714854359627, 0.006938479840755463, 0...   
3  1575515   SFM  2021  [-0.023708730936050415, 0.01762891560792923, 0...   
4  1125376  ENSG  2021  [0.04463111609220505, 0.0018805989529937506, 0...   

   GICS_Sector  
0         20.0  
1         55.0  
2         15.0  
3         30.0  
4         35.0  
1197
       cik                           SP_SHORT_DESC_embeddings  \
0   850460  [0.01568225771188736, -0.07636360824108124, -0...   
1   352541  [-0.013836896046996117, -0.028995024040341377,...   
2  1704715  [0.0030535957776010036, 0.0008783274097368121,...   
3  1575515  [-0.025762900710105896, 0.0034106436651200056,...   
4  1125376  [0.04616139456629753, -0.0021259395871311426, ...   

                             SP_LO

Unnamed: 0,cik,tic,Year,item1_embeddings,GICS_Sector,SP_SHORT_DESC_embeddings,SP_LONG_DESC_embeddings,ORBIS_PROD_SERV_embeddings,ORBIS_OVERVIEW_embeddings
0,850460,WIRE,2021,"[0.03955410048365593, -0.04159577935934067, -0...",20.0,"[0.01568225771188736, -0.07636360824108124, -0...","[0.06089901179075241, -0.07059630751609802, -0...",,
1,352541,LNT,2021,"[-0.0231856107711792, 0.001279839314520359, 0....",55.0,"[-0.013836896046996117, -0.028995024040341377,...","[-0.021961161866784096, 0.020302705466747284, ...","[-0.015400929376482964, 0.03232719004154205, -...","[-0.041709959506988525, -0.011898815631866455,..."
2,1704715,AMR,2021,"[0.014074714854359627, 0.006938479840755463, 0...",15.0,"[0.0030535957776010036, 0.0008783274097368121,...","[-0.0017570963827893138, 0.006557094398885965,...","[-0.02030838653445244, -0.009386725723743439, ...","[-0.012340557761490345, -0.01428779773414135, ..."
3,1575515,SFM,2021,"[-0.023708730936050415, 0.01762891560792923, 0...",30.0,"[-0.025762900710105896, 0.0034106436651200056,...","[-0.015190708450973034, 0.011541897431015968, ...","[-0.013177500106394291, 0.036401789635419846, ...","[-0.01883152313530445, 0.021741388365626335, 0..."
4,1125376,ENSG,2021,"[0.04463111609220505, 0.0018805989529937506, 0...",35.0,"[0.04616139456629753, -0.0021259395871311426, ...","[0.05935207009315491, 0.0013673527864739299, 0...","[0.029405493289232254, 0.014010551385581493, 0...","[-0.006068837363272905, 0.010537531226873398, ..."


### Check dataset

In [3]:
# Fraction of missing entries per column: the mean of the boolean NaN mask.
nan_proportion = total_embedding.isnull().mean()
print(f"The nan value proportion in each column:\n{nan_proportion}\n\n")

The nan value proportion in each column:
cik                           0.000000
tic                           0.000000
Year                          0.000000
item1_embeddings              0.000000
GICS_Sector                   0.006683
SP_SHORT_DESC_embeddings      0.041771
SP_LONG_DESC_embeddings       0.095238
ORBIS_PROD_SERV_embeddings    0.208020
ORBIS_OVERVIEW_embeddings     0.168755
dtype: float64




# Load the trained autoencoder (AE)

In [4]:
from model.obtain_model import load_ae

# Use the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# `load_ae` is project-local; presumably it rebuilds the autoencoder and loads
# the saved weights from this checkpoint onto `device` — confirm in obtain_model.
trained_ae = load_ae("../model/saved_models/basic_ae.pth", device)

### Prepare data

In [17]:
# Columns that hold embedding vectors (these are the ones that get encoded).
target_list = [
    'item1_embeddings',
    'SP_LONG_DESC_embeddings',
    'SP_SHORT_DESC_embeddings',
    'ORBIS_PROD_SERV_embeddings',
    'ORBIS_OVERVIEW_embeddings',
]
# Columns that are carried along unchanged.
info_list = ['cik', 'tic', 'Year', 'GICS_Sector']

# `convert_to_array` is not defined in this notebook (presumably it comes from
# the `tool` star import). 1536 is presumably the embedding dimensionality and
# False a behavior flag — TODO confirm both against the helper's signature.
exp_df = convert_to_array(total_embedding, info_list, target_list, 1536, False)
print(type(exp_df))
print(exp_df.shape)


# Maps each embedding column name to one float32 tensor holding all of its
# rows stacked vertically.
embedding_tensors = {}
for col in target_list:
    numpy_array = np.vstack(exp_df[col].to_numpy())
    embedding_tensors[col] = torch.tensor(numpy_array, dtype=torch.float32)

# Total row count over all embedding columns (every stacked row is counted).
emb_num = sum(len(embedding_tensors[col]) for col in target_list)
print(f"total number of embedding: {emb_num}")

<class 'pandas.core.frame.DataFrame'>
(1197, 9)
total number of embedding: 5985


In [20]:
# Convert the original embeddings into a new latent space by running them
# through a trained network while tolerating rows that are missing (all NaN).
def safe_inference(model, input_tensor):
    '''
    Run ``model`` on the rows of ``input_tensor`` that are not entirely NaN.

    Rows that contain only NaNs are skipped and come back as all-NaN rows at
    the same index, so the output stays row-aligned with the input. Rows that
    are only partially NaN are still passed to the model unchanged.

    Args:
        model: Callable applied to the valid rows. If it is a
            ``torch.nn.Module`` it is temporarily switched to eval mode so
            that layers such as dropout or batch-norm behave deterministically;
            the previous train/eval flags are restored afterwards. It may
            return a tensor or a 2-tuple, in which case the second element is
            used.
        input_tensor: 2-D tensor of shape ``(rows, features)``.

    Returns:
        Tensor on ``input_tensor.device`` with ``input_tensor.shape[0]`` rows
        and the model output's trailing dimensions. Floating-point model
        outputs keep their dtype; other dtypes fall back to torch's default
        floating dtype so that NaN can be represented.
    '''
    # True for rows that are NaN in every feature.
    nan_mask = torch.isnan(input_tensor).all(dim=1)
    valid_rows = input_tensor[~nan_mask]

    # Remember every sub-module's train/eval flag so it can be restored exactly
    # (a blanket ``model.train()`` would also flip deliberately frozen layers).
    if isinstance(model, torch.nn.Module):
        saved_modes = [(module, module.training) for module in model.modules()]
        model.eval()
    else:
        saved_modes = []

    try:
        with torch.no_grad():
            valid_output = model(valid_rows)
    finally:
        for module, was_training in saved_modes:
            module.training = was_training

    # Some models return a pair; the second element is the one that is kept.
    if isinstance(valid_output, tuple):
        _, valid_output = valid_output

    # NaN needs a floating dtype; keep the model's own precision when it has one
    # instead of silently casting everything to the default float32.
    if valid_output.is_floating_point():
        out_dtype = valid_output.dtype
    else:
        out_dtype = torch.get_default_dtype()

    # Start from an all-NaN buffer and fill in the rows that were computed.
    output = torch.full(
        (input_tensor.shape[0], *valid_output.shape[1:]),
        float('nan'),
        dtype=out_dtype,
        device=input_tensor.device,
    )
    output[~nan_mask] = valid_output

    return output

# Encode every embedding column with the autoencoder's encoder; each column's
# tensor is moved to `device` first, and the result is stored under the same name.
latent_tensors = {}
for col in target_list:
    latent_tensors[col] = safe_inference(
        trained_ae.encoder_net,
        embedding_tensors[col].to(device),
    )

print(latent_tensors)

# Row count over all columns. safe_inference returns one row per input row,
# so rows that were skipped as all-NaN are included in this total.
emb_num = sum(len(latent_tensors[col]) for col in target_list)
print(f"total number of embedding: {emb_num}")

{'item1_embeddings': tensor([[-0.5898,  0.9802, -0.9664,  ...,  0.9971,  0.9972, -0.3560],
        [ 0.9821,  0.4754,  0.8698,  ...,  0.9983,  0.9933,  0.3218],
        [ 0.9946, -0.4087,  0.6804,  ...,  0.9944,  0.9053,  0.3607],
        ...,
        [-0.3125,  0.9592,  0.8852,  ...,  0.9983,  0.9903,  0.9140],
        [-0.6024,  0.8309,  0.6144,  ...,  0.9998,  0.8273, -0.8432],
        [ 0.5317,  0.9640,  0.9502,  ..., -0.9994, -0.1773,  0.8744]],
       device='cuda:0'), 'SP_LONG_DESC_embeddings': tensor([[-0.8353,  0.9734, -0.9823,  ...,  0.9971,  0.9976, -0.7578],
        [ 0.9717,  0.7132,  0.8791,  ...,  0.9911,  0.8676, -0.7440],
        [ 0.9719, -0.3351,  0.9141,  ...,  0.9956,  0.9862,  0.3395],
        ...,
        [    nan,     nan,     nan,  ...,     nan,     nan,     nan],
        [    nan,     nan,     nan,  ...,     nan,     nan,     nan],
        [-0.1758,  0.9904,  0.8948,  ..., -0.9995, -0.1103,  0.8007]],
       device='cuda:0'), 'SP_SHORT_DESC_embeddings': tensor

In [24]:
# Overwrite each embedding column of exp_df in place with its latent rows,
# converted from a tensor to nested Python lists (one inner list per row).
for col in target_list:
    latent_rows = latent_tensors[col].tolist()
    exp_df[col] = latent_rows

In [None]:
# exp_df.to_csv("./data/embedding_256.csv", index=False)

  values = values.astype(str)
