In [1]:
import sys

# Directory added to the module search path so the imports in the next cell
# can resolve (presumably where Parse10K / Parse10Q live -- confirm).
# The path is relative, so it is resolved against the kernel's working directory.
UTILS_DIR = r'../src/utils/'

# Guard the append so re-running this cell does not pile up duplicate entries.
if UTILS_DIR not in sys.path:
    sys.path.append(UTILS_DIR)

In [2]:
import pandas as pd
import os
from Parse10K import Parse_10k
from Parse10Q import Parse_10q
import torch
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class Main_Dataset(Dataset):
    """Map-style dataset that serves one tokenized sample per index.

    Args:
        encodings: Tokenizer output. It must provide ``.to(device)``,
            ``.items()`` and an ``'input_ids'`` entry, because those are the
            only operations this class performs on it.
        device: Device the encodings are moved to. When ``None`` (default),
            ``'cuda'`` is used if a GPU is available, otherwise ``'cpu'``, so
            the dataset no longer fails on CPU-only machines.
    """

    def __init__(self, encodings, device=None):
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device
        self.encodings = encodings.to(device)

    def __getitem__(self, idx):
        """Return a dict mapping every encoding key to a copy of row ``idx``."""
        # torch.tensor(existing_tensor) triggers PyTorch's copy-construct
        # UserWarning; as_tensor + clone().detach() yields the same independent
        # copy without the warning and still accepts non-tensor rows.
        return {
            key: torch.as_tensor(val[idx]).clone().detach()
            for key, val in self.encodings.items()
        }

    def __len__(self):
        """Number of samples, taken from the length of ``'input_ids'``."""
        return len(self.encodings['input_ids'])

In [5]:
from transformers import AutoTokenizer, AutoModel

def bert_embedding(model, tokenizer, data_dict, max_length=512, batch_size=16):
    """Embed every text of every section with the model's pooled output.

    Args:
        model: Transformer model whose forward output exposes ``pooler_output``.
        tokenizer: Tokenizer matching ``model``.
        data_dict: Mapping from section name to the texts of that section
            (presumably a list of strings -- confirm against the parser).
        max_length: Length every text is truncated/padded to (default 512).
        batch_size: Number of texts per forward pass (default 16).

    Returns:
        dict: Section name -> list of embeddings (one list of floats per text),
        in the same order as the input texts. Sections without text map to
        an empty list.
    """
    # Run on whatever device the model already lives on.
    device = next(model.parameters()).device

    # Embeddings must be produced in inference mode (no dropout); remember the
    # caller's mode so it can be restored afterwards.
    was_training = model.training
    model.eval()

    embedding_dict = {}
    try:
        for item in data_dict:
            texts = data_dict[item]
            embedding_dict[item] = []
            if len(texts) == 0:
                # Nothing to embed for this section; skip tokenization entirely.
                continue

            encodings = tokenizer(
                texts,
                truncation=True,
                padding='max_length',
                max_length=max_length,
                return_tensors='pt',
            )
            tokenized_dataset = Main_Dataset(encodings)
            # shuffle=False keeps the embeddings aligned with the input order.
            tokenized_dataloader = DataLoader(tokenized_dataset, batch_size=batch_size, shuffle=False)

            with torch.no_grad():
                for batch in tokenized_dataloader:
                    # Align the batch with the model's device regardless of
                    # where the dataset placed its tensors.
                    batch = {key: val.to(device) for key, val in batch.items()}
                    output = model(**batch)
                    embedding_dict[item].extend(output.pooler_output.cpu().tolist())
    finally:
        model.train(was_training)

    return embedding_dict

In [8]:
# Extract the text of each 10-K filing and embed it with the pretrained transformer model.
# The Euclidean distances between the resulting embeddings are computed in a later cell.
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np

model_type = "roberta-base"
# Use the GPU when there is one; otherwise fall back to CPU instead of crashing.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained(model_type)
model = AutoModel.from_pretrained(model_type)
model.to(device)

# Single source of truth for the 10-K filings directory.
filings_10k_dir = '../data/sample/sec-filings/0000320193/10-K'

# sorted(): os.listdir returns entries in arbitrary order, and later cells read
# the `data_dict` / `embedding_dict` left over from the LAST iteration, so the
# order must be deterministic for those cells to be reproducible.
for filing_name in sorted(os.listdir(filings_10k_dir)):
    file_path = os.path.join(filings_10k_dir, filing_name, 'full-submission.txt')
    if not os.path.isfile(file_path):
        # Stray entries (hidden files, checkpoint folders, ...) have no submission.
        print(f'Skipping {filing_name}: no full-submission.txt')
        continue

    data_dict = Parse_10k(file_path)
    embedding_dict = bert_embedding(model, tokenizer, data_dict)
    torch.cuda.empty_cache()

    for section in embedding_dict:
        embedding_df = pd.DataFrame(embedding_dict[section])
        # Saving is intentionally left disabled, as in the original cell.
        #embedding_df.to_csv(os.path.join(filings_10k_dir, filing_name, f'embedding_{section}.csv'), index=False)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  


In [1]:
# Extract the text of each 10-Q filing, embed it with the pretrained transformer model,
# and save one CSV of embeddings per section. Distances are computed in a later cell.
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np

# Requires `model`, `tokenizer`, `bert_embedding` and `Parse_10q` from earlier
# cells; run the notebook top to bottom (the import cell defines Parse_10q).
# NOTE(review): the 10-K cell reads from '../data/sample/...' while this cell
# uses '../sample/...' -- confirm which data root is correct.
filings_10q_dir = '../sample/sec-filings/0000320193/10-Q'

# sorted(): os.listdir order is arbitrary; a fixed order keeps the leftover
# `data_dict` / `embedding_dict` (read by later cells) reproducible.
for filing_name in sorted(os.listdir(filings_10q_dir)):
    file_path = os.path.join(filings_10q_dir, filing_name, 'full-submission.txt')
    if not os.path.isfile(file_path):
        # Stray entries (hidden files, checkpoint folders, ...) have no submission.
        print(f'Skipping {filing_name}: no full-submission.txt')
        continue

    data_dict = Parse_10q(file_path)
    embedding_dict = bert_embedding(model, tokenizer, data_dict)
    torch.cuda.empty_cache()

    # One CSV per section, written next to the filing it was derived from.
    for section in embedding_dict:
        embedding_df = pd.DataFrame(embedding_dict[section])
        embedding_df.to_csv(
            os.path.join(filings_10q_dir, filing_name, f'embedding_{section}.csv'),
            index=False,
        )

NameError: name 'Parse_10q' is not defined

In [38]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import numpy as np
#np.savetxt('test.csv', cosine_similarity(embedding_dict['item7'], embedding_dict['item7']), delimiter=',')

In [25]:
# Number of entries parsed for section 'item7'.
# `data_dict` is the variable left behind by the last iteration of whichever
# parsing loop ran most recently, so this value depends on cell execution order.
len(data_dict['item7'])

201

In [43]:
# Pairwise Euclidean distances between the two sections' embeddings:
# row r / column c holds the distance between the r-th 'item7' embedding and
# the c-th 'item1a' embedding.
# `embedding_dict` is the leftover from the last filing processed by the most
# recently executed embedding loop, so the result depends on execution order.
euclidean_distances(embedding_dict['item7'], embedding_dict['item1a'])

array([[0.40481595, 0.50042398, 0.44980586, ..., 0.53981367, 0.54969621,
        0.66821801],
       [0.44014344, 0.30664065, 0.41236877, ..., 0.37967468, 0.47161786,
        0.69495852],
       [0.55199175, 0.5351668 , 0.56939966, ..., 0.54923428, 0.60723344,
        0.72769174],
       ...,
       [0.55262761, 0.45134904, 0.4452891 , ..., 0.49442286, 0.48552243,
        0.66906868],
       [0.45705761, 0.39144281, 0.42129927, ..., 0.38523885, 0.44769884,
        0.62019126],
       [0.49915907, 0.46015159, 0.36673597, ..., 0.48791908, 0.45626974,
        0.54283875]])