In [2]:
import pandas as pd
import numpy as np
import os
import json
import torch
from torch.utils.data import Dataset, DataLoader

In [3]:
class Main_Dataset(Dataset):
    """Map-style dataset over the batched output of a tokenizer.

    Parameters
    ----------
    encodings : mapping-like
        Object exposing ``.to(device)``, ``.items()`` and ``['input_ids']``.
        Every value is indexed along its first axis by ``__getitem__``.
        NOTE(review): presumably a Hugging Face ``BatchEncoding`` created with
        ``return_tensors='pt'`` -- confirm against the caller.
    device : str or torch.device, optional
        Device handed to ``encodings.to``. When omitted, ``'cuda'`` is used if
        CUDA is available and ``'cpu'`` otherwise.
    """

    def __init__(self, encodings, device=None):
        if device is None:
            # Keep the GPU behaviour where possible, but do not crash on
            # CPU-only machines.
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.encodings = encodings.to(device)

    def __getitem__(self, idx):
        """Return a ``{key: tensor}`` dict holding row ``idx`` of every field."""
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            if isinstance(row, torch.Tensor):
                # torch.tensor(<tensor>) emits a copy-construct UserWarning;
                # detach().clone() is the equivalent warning-free copy.
                item[key] = row.detach().clone()
            else:
                item[key] = torch.tensor(row)
        return item

    def __len__(self):
        """Number of rows, taken from the ``'input_ids'`` field."""
        return len(self.encodings['input_ids'])

In [5]:
from transformers import AutoTokenizer, AutoModel

def bert_embedding(model, tokenizer, data, max_length=512, batch_size=16):
    """Embed each text in ``data`` with ``model`` and return the pooled outputs.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(**batch)``; its result must expose ``pooler_output``.
    tokenizer : callable
        Called with ``truncation=True, padding='max_length'`` and
        ``return_tensors='pt'``.
    data : list of str
        Texts to embed. An empty list/tuple yields ``[]``.
    max_length : int, optional
        Padding / truncation length passed to the tokenizer (default 512).
    batch_size : int, optional
        Number of texts per forward pass (default 16).

    Returns
    -------
    list of list of float
        One pooled vector per input row, in input order.

    NOTE(review): ``pooler_output`` comes from the model's pooling head; for
    some checkpoints that head is newly initialised at load time rather than
    pretrained -- confirm this is the representation you want.
    """
    # Nothing to tokenize: return early instead of handing the tokenizer an
    # empty batch.
    if isinstance(data, (list, tuple)) and len(data) == 0:
        return []

    encodings = tokenizer(
        data,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
    )
    tokenized_dataloader = DataLoader(
        Main_Dataset(encodings), batch_size=batch_size, shuffle=False
    )

    # Run every batch on whatever device the model lives on, rather than
    # assuming the dataset and the model were placed on the same one.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    embedding_matrix = []
    with torch.no_grad():
        for batch in tokenized_dataloader:
            if device is not None:
                batch = {key: val.to(device) for key, val in batch.items()}
            embedding = model(**batch)
            # Under no_grad the output carries no graph, so a plain
            # cpu().tolist() suffices.
            embedding_matrix.extend(embedding.pooler_output.cpu().tolist())

    return embedding_matrix

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
from transformers import AutoTokenizer, AutoModel

model_type = "roberta-base"
transcript_dir = '../data/sample/transcripts/'
output_suffix = '_embedding.json'

tokenizer = AutoTokenizer.from_pretrained(model_type)
model = AutoModel.from_pretrained(model_type)
# Use the GPU when there is one; otherwise stay on the CPU instead of crashing.
model.to('cuda' if torch.cuda.is_available() else 'cpu')
model.eval()

# sorted(): os.listdir order is arbitrary, so sort for a reproducible run.
for file in sorted(os.listdir(transcript_dir)):
    # Outputs are written next to the inputs; skip them so that re-running
    # this cell does not try to read a JSON file as feather.
    if file.endswith(output_suffix):
        continue
    input_path = os.path.join(transcript_dir, file)
    if not os.path.isfile(input_path):
        continue
    data = pd.read_feather(input_path)

    # One entry per transcript component: id -> list of sentence embeddings.
    embedding_dict = {}
    # Iterate positionally so the loop does not depend on the frame having a
    # default 0..n-1 index.
    for component_id, component_text in zip(data['transcriptcomponentid'], data['componenttext']):
        sentence_list = component_text.split('. ')
        embedding = bert_embedding(model, tokenizer, sentence_list)
        embedding_dict[int(component_id)] = embedding

    # splitext strips only the final extension, so file names containing
    # extra dots keep their full stem (and cannot collide with each other).
    output_path = os.path.join(transcript_dir, os.path.splitext(file)[0] + output_suffix)
    with open(output_path, 'w') as f:
        json.dump(embedding_dict, f)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  


In [10]:
# Rebuild the id -> sentence-embeddings mapping for the frame currently bound
# to `data`.
# NOTE(review): `data` is not defined in this cell; presumably it is the last
# file read by the loop in the preceding cell -- confirm before relying on it.
embedding_dict = dict()
component_texts = data['componenttext']
component_ids = data['transcriptcomponentid']
for i in range(len(data)):
    sentence_list = component_texts[i].split('. ')
    embedding = bert_embedding(model, tokenizer, sentence_list)
    embedding_dict[int(component_ids[i])] = embedding

  


In [14]:
# Sanity check: tabulate the embeddings stored for one component id and show
# the resulting (rows, columns) shape.
component_embedding_frame = pd.DataFrame(embedding_dict[3573155])
component_embedding_frame.shape

(82, 768)

In [12]:
# Display the DataFrame currently bound to `data` as the cell output.
data

Unnamed: 0,transcriptid,transcriptcomponentid,componentorder,transcriptcomponenttypeid,transcriptcomponenttypename,transcriptpersonid,transcriptpersonname,proid,companyofperson,speakertypeid,speakertypename,componenttextpreview,word_count,componenttext
0,46831.0,3573153.0,1.0,1.0,Presentation Operator Message,1.0,Operator,,,1.0,Operator,"Good day, everyone, and welcome to the Apple I...",47.0,"Good day, everyone, and welcome to the Apple I..."
1,46831.0,3573154.0,2.0,2.0,Presenter Speech,10581.0,Nancy Paxton,,,2.0,Executives,Thank you. Good afternoon and thanks to everyo...,153.0,Thank you. Good afternoon and thanks to everyo...
2,46831.0,3573155.0,3.0,2.0,Presenter Speech,106799.0,Peter Oppenheimer,4258680.0,,2.0,Executives,"Thank you, Nancy. Thank you for joining us. We...",2146.0,"Thank you, Nancy. Thank you for joining us. We..."
3,46831.0,3573156.0,4.0,7.0,Question and Answer Operator Message,1.0,Operator,,,1.0,Operator,[Operator Instructions] And your first questio...,14.0,[Operator Instructions] And your first questio...
4,46831.0,3573157.0,5.0,3.0,Question,97937.0,C. Eugene Munster,37850311.0,,3.0,Analysts,The Mac was impressive 33% growth third quarte...,60.0,The Mac was impressive 33% growth third quarte...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,46831.0,3573231.0,79.0,4.0,Answer,106800.0,Timothy Cook,169601.0,,2.0,Executives,"Bill, we saw a small year-over-year increase i...",67.0,"Bill, we saw a small year-over-year increase i..."
79,46831.0,3573232.0,80.0,3.0,Question,101955.0,Bill Fearnley,39178260.0,,3.0,Analysts,And if I could switch gears to education. What...,106.0,And if I could switch gears to education. What...
80,46831.0,3573233.0,81.0,4.0,Answer,106800.0,Timothy Cook,169601.0,,2.0,Executives,"Well the great thing is, and I'm talking about...",171.0,"Well the great thing is, and I'm talking about..."
81,46831.0,3573234.0,82.0,4.0,Answer,10581.0,Nancy Paxton,,,2.0,Executives,"Thank you, Bill, and thanks to everyone, for y...",102.0,"Thank you, Bill, and thanks to everyone, for y..."
