In [1]:
from sentence_transformers import SentenceTransformer
# Path of the saved SentenceTransformer checkpoint. It is a relative path, so it
# is resolved against the kernel's current working directory.
checkpoints_path = 'checkpoints/sentence_transformers/bert_base_cased_200_v7.2_epoch1'
# Load the model once at module level; get_embed() reads this global.
test_model = SentenceTransformer(checkpoints_path)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

In [2]:
import pandas as pd
import numpy as np

def get_sentence_list(data_path, sentence_index=0):
    """Return a single column of a headerless, tab-separated file.

    Args:
        data_path: Path of the tab-separated file. Every row is treated as
            data (the file is read with ``header=None``).
        sentence_index: Zero-based position of the column to extract.

    Returns:
        A one-dimensional NumPy array holding that column, one entry per row.
        Despite the function name, the result is not a Python list.
    """
    table = np.array(pd.read_csv(data_path, sep='\t', header=None))
    # Take the requested column across all rows.
    return table[:, sentence_index]

def get_embed(sentences, batch_size=512):
    """Encode sentences with the module-level ``test_model``.

    Args:
        sentences: Sentences to embed; passed unchanged to
            ``test_model.encode``.
        batch_size: Batch size forwarded to ``test_model.encode``. Defaults
            to 512, the value that used to be hard-coded here; pass a smaller
            value if the device runs out of memory.

    Returns:
        Whatever ``test_model.encode`` returns for ``sentences``.
    """
    return test_model.encode(sentences, batch_size=batch_size)

In [3]:
# Input file paths: tab-separated text files read by get_sentence_list().
# NOTE: the "test_*" names in this notebook point at the dev split (dev_set).
train_data_path = "data/v7/train_set_v1_7.txt"
test_data_path = "data/v7/dev_set_v1_7.txt"

# Output file paths for the embedding matrices. They are passed to np.save(),
# which appends ".npy" when the name lacks it, so the files on disk end in ".npy".
# Other dataset name left by the author for reference: test_tweets_unlabeled_dataframe_v1_7
train_embed_out_path = 'data/v7/train_siamese_bert_base_cased_v7.2_epoch1'
test_embed_out_path = 'data/v7/dev_siamese_bert_base_cased_v7.2_epoch1'


In [4]:
# Read data: take column 1 of each input file and turn the NumPy array returned
# by get_sentence_list() into a plain Python list.
train_sentence = list(get_sentence_list(train_data_path, sentence_index=1))
test_sentence = list(get_sentence_list(test_data_path, sentence_index=1))

# Sanity check: item count, container type, and type of the first element.
# Only the first element is inspected, so this does not prove every row is a str.
print(len(train_sentence), type(train_sentence), type(train_sentence[0]))
print(len(test_sentence), type(test_sentence), type(test_sentence[0]))


292748 <class 'list'> <class 'str'>
36184 <class 'list'> <class 'str'>


In [5]:
%%time
# Embed the dev ("test") sentences first, then the training sentences.
# This is the slow step of the notebook, which is why the cell is timed with %%time.
test_feature_list = get_embed(test_sentence)
train_feature_list = get_embed(train_sentence)
# The two lengths should match the sentence counts printed when the data was read.
print(len(test_feature_list), len(train_feature_list))


36184 292748
CPU times: user 5min 34s, sys: 1min 5s, total: 6min 39s
Wall time: 6min 27s


In [6]:
# Convert the encoder outputs to NumPy arrays (np.asarray does not copy when the
# input is already an ndarray of a compatible dtype).
test_feature_list_np_array = np.asarray(test_feature_list)
train_feature_list_np_array = np.asarray(train_feature_list)

# Check the shapes before writing: expected (number of sentences, embedding size).
print(test_feature_list_np_array.shape)
print(train_feature_list_np_array.shape)

# Write each matrix in NumPy binary format. The output paths carry no extension,
# so np.save appends ".npy" to the file names.
np.save(test_embed_out_path, test_feature_list_np_array)
np.save(train_embed_out_path, train_feature_list_np_array)


(36184, 768)
(292748, 768)
