BERTopic embedding improvement script

In [1]:
import pandas as pd
import numpy as np

from pathlib import Path
import json
from datetime import datetime

import gensim

import nltk

import pyLDAvis

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"          # disable huggingface warning

In [2]:
%load_ext autoreload

In [3]:
import sys

sys.path.append('../')

In [4]:
# load the dataset

%autoreload 2
from dataset_loader import GENRES, load_dataset

genre = GENRES.INDIE
unique_list = ['review_text']

dataset_folder = Path(f'../../dataset/topic_modelling/top_11_genres_unique_[{",".join(unique_list)}]')
dataset, dataset_path = load_dataset(genre, dataset_folder)

dataset.info(verbose=True)

Load dataset from: /root/FYP/NLP/dev-workspace/dataset/topic_modelling/top_11_genres_unique_[review_text]/01_indie.pkl



<class 'pandas.core.frame.DataFrame'>
Index: 725737 entries, 25636 to 4179608
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   index         725737 non-null  int64 
 1   app_id        725737 non-null  int64 
 2   app_name      725737 non-null  object
 3   review_text   725737 non-null  object
 4   review_score  725737 non-null  int64 
 5   review_votes  725737 non-null  int64 
 6   genre_id      725737 non-null  object
 7   category_id   725737 non-null  object
dtypes: int64(4), object(4)
memory usage: 49.8+ MB


In [5]:
# The path of the dataset to be stored to the config file
str(dataset_path.relative_to(dataset_path.parent.parent.parent.parent))

'dataset/topic_modelling/top_11_genres_unique_[review_text]/01_indie.pkl'

In [6]:
# data preprocessing

sys.path.append('../../sa')

%autoreload 2
import str_cleaning_functions


def cleaning(df, review):
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links2(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.clean(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.deEmojify(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))

def cleaning_strlist(str_list):
    str_list = list(map(lambda x: str_cleaning_functions.remove_links(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.remove_links2(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.clean(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.deEmojify(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.unify_whitespaces(x), str_list))
    return str_list

In [7]:
cleaning(dataset, 'review_text')

In [8]:
# remove reviews with too many punctuations

def calculate_nonalphabet_ratio(review: str) -> float:
    count = 0
    for char in review:
        if not char.isalpha():
            count += 1
    return count / (len(review) + 1e-5)

dataset['alphabet_ratio'] = dataset['review_text'].apply(calculate_nonalphabet_ratio)

dataset['alphabet_ratio'].describe([0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

count    725737.000000
mean          0.222376
std           0.059417
min           0.000000
25%           0.200000
50%           0.215385
75%           0.234257
90%           0.263889
95%           0.294964
99%           0.406250
max           1.000000
Name: alphabet_ratio, dtype: float64

In [9]:
# remove reviews with too many punctuations
# ratio = 99 percentile

# this further remove ~7.4K reviews

dataset = dataset[dataset['alphabet_ratio'] < dataset.alphabet_ratio.quantile(0.99)]

In [10]:
X = dataset['review_text'].values

In [11]:
# remove empty strings

X = list(filter(lambda x: len(x) > 0, X))

In [12]:
# check the length when loading in a evaluation script

print(len(X))
print(X[0])

718440
Take one part Faerie Solitaire and two parts Puzzle Quest and mix in a little Poker or Yahtzee for good measure and you will get something like Runespell: Overture. You're a changeling of some sort and you fight monsters and take quests in exchange for coin and buffs (which come in the form of power-up cards). There's a story but it's not the strongest element in the game. Like the Puzzle Quest games, your battles are determined by playing a mini-game. Instead of match-3 though, the game is a card game similar to poker in which making certain combinations of cards (pairs, 5 of a kind, full house, flush, straight) will do a certain amount of damage to your opponent, who is trying to do the same to you. The ability to steal some cards from your opponent, plus the limited number of moves you get per turn to move cards or play power-ups adds just enough strategy to the game to keep it interesting. Admittedly, the game can get a bit repetitive after a while and I found the dialogue o

In [13]:
import platform
import torch

if platform.system() == 'Linux' or platform.system() == 'Windows':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
else:
    device = torch.device('mps')        # m-series machine

print(device)

cuda


In [25]:
from sentence_transformers import SentenceTransformer

sbert = SentenceTransformer('all-MiniLM-L6-v2')

This script is about encoding with long text

First we create two sub array of texts for comparison of different scripts to produce embeddings

In [37]:
longest = max(X, key=len)
len(longest)

8000

In [17]:
X_longtext = list(filter(lambda x: len(x.split()) > 256, X))
print(len(X_longtext))

30122


In [18]:
X_shorttext = list(filter(lambda x: len(x) <= 100, X))

In [19]:
X_longtext[0]

"Runespell: Overture melds together classic RPG components with Windows Live's player-vs-player solitaire with poker. I obtained Runespell for free in a pile of coal during the Steam Holiday Sale 2011 event. Receiving the game for free, I simply assumed that Runespell was merely a terrible, no-name title that was not worth playing. It was not until the Summer 2013 that I truly realized what I had come across. Runespell sports a great soundtrack with honestly a decent storyline. Often times, card games that try to incorporate RPG elements (or just a storyline) end up drowning out the added components with the basic mechanics the card game - Runespell does not do this. If you are a cards-only enthusiast, this may stray a little too far from those roots; if you are an RPG-only enthusaist, this may feel more like a card game. However, if you are willing try a game that can have both, Runespell: Overture is for you. Verdict: While Runespell: Overture is far from a lump of coal turned into a

In [41]:
# check length of the longest review

counter = 0

# o = sbert[0].tokenizer(X, return_attention_mask=False, return_token_type_ids=False)

from tqdm.autonotebook import trange
for start_index in trange(0, len(X), 32, desc="Batches", disable=False):
    sentences_batch = X[start_index : start_index + 32]
    features = sbert[0].tokenizer(sentences_batch, return_attention_mask=False, return_token_type_ids=False)

    for input_id in features['input_ids']:
        if len(input_id) > 256:
            counter += 1

Batches:   0%|          | 0/22452 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (916 > 512). Running this sequence through the model will result in indexing errors
Batches:   1%|          | 120/22452 [00:00<01:17, 287.18it/s]

Batches: 100%|██████████| 22452/22452 [01:30<00:00, 248.37it/s]


In [42]:
test_encode_sbert = sbert.encode(X[:10])

In [43]:
test_encode_sbert.shape

(10, 384)

In [44]:
counter

43594

Work-around 1: Use the tokenizer from sbert, then calculate the embeddings with huggingface tramsformer

In [22]:
def split_tokens_into_smaller_chunks(
    data,
    chunk_size: int,
    stride: int,
    minimal_chunk_length: int,
) -> dict:
    """Splits tokens into overlapping chunks with given size and stride."""

    _new_input_id_chunks = []
    _new_token_type_ids = []
    _new_mask_chunks = []

    for input_id, token_type_id, mask_chunk in zip(data['input_ids'], data['token_type_ids'], data['attention_mask']):
        _input_id_chunk = split_overlapping(input_id, chunk_size, stride, minimal_chunk_length)
        _token_type_id = split_overlapping(token_type_id, chunk_size, stride, minimal_chunk_length)
        _mask_chunk = split_overlapping(mask_chunk, chunk_size, stride, minimal_chunk_length)

        _new_input_id_chunks.extend(_input_id_chunk)
        _new_token_type_ids.extend(_token_type_id)
        _new_mask_chunks.extend(_mask_chunk)    

    return {'input_ids':_new_input_id_chunks, 'token_type_ids':_new_token_type_ids, 'attention_mask': _new_mask_chunks}

def split_overlapping(tensor:list[int], chunk_size: int, stride: int, minimal_chunk_length: int) -> list[list[int]]:
    """Helper function for dividing 1-dimensional tensors into overlapping chunks."""
    # check_split_parameters_consistency(chunk_size, stride, minimal_chunk_length)
    result = [tensor[i : i + chunk_size] for i in range(0, len(tensor), stride)]
    if len(result) > 1:
        # ignore chunks with less than minimal_length number of tokens
        result = [x for x in result if len(x) >= minimal_chunk_length]
    return result

def add_special_tokens_at_beginning_and_end(data) -> dict:
    """
    Adds special CLS token (token id = 101) at the beginning.
    Adds SEP token (token id = 102) at the end of each chunk.
    Adds corresponding attention masks equal to 1 (attention mask is boolean).
    """

    _new_input_id_chunks = []
    _new_token_type_ids = []
    _new_mask_chunks = []


    for input_id, token_type_id, mask_chunk in zip(data['input_ids'], data['token_type_ids'], data['attention_mask']):
        _input_id_chunk = [101] + input_id + [102]
        _token_type_id = [0] + token_type_id + [0]
        _mask_chunk = [1] + mask_chunk + [1]

        _new_input_id_chunks.append(_input_id_chunk)
        _new_token_type_ids.append(_token_type_id)
        _new_mask_chunks.append(_mask_chunk)

    return {'input_ids':_new_input_id_chunks, 'token_type_ids':_new_token_type_ids, 'attention_mask': _new_mask_chunks}

def add_padding_tokens(data, pad_len:int) -> dict:
    """Adds padding tokens (token id = 0) at the end to make sure that all chunks have exactly 512 tokens."""

    _new_input_id_chunks = []
    _new_token_type_ids = []
    _new_mask_chunks = []


    for input_id, token_type_id, mask_chunk in zip(data['input_ids'], data['token_type_ids'], data['attention_mask']):
        _pad_len = pad_len - len(input_id)

        if _pad_len > 0:
            _input_id_chunk = input_id + [0] * _pad_len
            _token_type_id = token_type_id + [0] * _pad_len
            _mask_chunk = mask_chunk + [0] * _pad_len
        else:
            _input_id_chunk = input_id
            _token_type_id = token_type_id
            _mask_chunk = mask_chunk

        _new_input_id_chunks.append(_input_id_chunk)
        _new_token_type_ids.append(_token_type_id)
        _new_mask_chunks.append(_mask_chunk)

    return {'input_ids':_new_input_id_chunks, 'token_type_ids':_new_token_type_ids, 'attention_mask': _new_mask_chunks}


def add_special_tokens_and_padding_tokens(data, pad_len:int) -> dict:
    """
    Adds special CLS token (token id = 101) at the beginning.
    Adds SEP token (token id = 102) at the end of each chunk.
    Adds corresponding attention masks equal to 1 (attention mask is boolean).
    Then adds padding tokens (token id = 0) at the end to make sure that all chunks have exactly same number of tokens for the sbert model.

    Parameters:
    data: dict
        A dictionary containing input_ids, token_type_ids, and attention_mask
    pad_len: int
        The len of the resulting chunks
    """
    _new_input_id_chunks = []
    _new_token_type_ids = []
    _new_mask_chunks = []

    for input_id, token_type_id, mask_chunk in zip(data['input_ids'], data['token_type_ids'], data['attention_mask']):
        _input_id_chunk = [101] + input_id + [102]
        _token_type_id = [0] + token_type_id + [0]
        _mask_chunk = [1] + mask_chunk + [1]

        _pad_len = pad_len - len(_input_id_chunk)

        if _pad_len > 0:
            _input_id_chunk = _input_id_chunk + [0] * _pad_len
            _token_type_id = _token_type_id + [0] * _pad_len
            _mask_chunk = _mask_chunk + [0] * _pad_len

        _new_input_id_chunks.append(_input_id_chunk)
        _new_token_type_ids.append(_token_type_id)
        _new_mask_chunks.append(_mask_chunk)

    return {'input_ids':_new_input_id_chunks, 'token_type_ids':_new_token_type_ids, 'attention_mask': _new_mask_chunks}


In [40]:
from transformers import AutoTokenizer, AutoModel
from datasets import Dataset
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def tokenize_dataset(data):
    # return sbert_model[0].tokenizer(data['text'], return_attention_mask=True, return_token_type_ids=True, add_special_tokens=False, return_tensors=None, truncation=False)
    return tokenizer(data['text'], return_attention_mask=True, return_token_type_ids=True, add_special_tokens=False, return_tensors=None, truncation=False)


# Sentences we want sentence embeddings for
new_sentences = []
sentences = X_longtext[:40]
# sentences = X_shorttext[:10]

# Load model from HuggingFace Hub
# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
tokenizer = sbert_model[0].tokenizer
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

ds_sentences = Dataset.from_dict({'text': sentences})

# Tokenize sentences
# for start_index in trange(0, len(sentences), 32, desc="Batches", disable=False):
#     sentences_batch = sentences[start_index : start_index + 32]
#     features = sbert_model[0].tokenizer(sentences_batch, return_attention_mask=True, return_token_type_ids=True, add_special_tokens=False)

ds_sentences = ds_sentences.map(tokenize_dataset, batched=True)
ds_sentences_2 = Dataset.from_dict({'input_ids': ds_sentences['input_ids'], 'token_type_ids': ds_sentences['token_type_ids'], 'attention_mask': ds_sentences['attention_mask']})

ds_sentences_2 = ds_sentences_2.map(split_tokens_into_smaller_chunks, batched=True, fn_kwargs={'chunk_size': sbert_model.max_seq_length-2, 'stride': sbert_model.max_seq_length-2, 'minimal_chunk_length': 1})
ds_sentences_3 = ds_sentences_2.map(add_special_tokens_at_beginning_and_end, batched=True)
ds_sentences_4 = ds_sentences_3.map(add_padding_tokens, batched=True, fn_kwargs={'pad_len': sbert_model.max_seq_length})

# recreate a new text with splitted chunks
# funcs: https://blog.csdn.net/weixin_42043940/article/details/108015797
for input_id in ds_sentences_2['input_ids']:
    new_sentences.append(tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_id)))

dataset = ds_sentences_4

dataset_dict = {k: torch.tensor(v) for k, v in dataset.to_dict().items()}



# input_id_chunks, mask_chunks = split_tokens_into_smaller_chunks(features, chunk_size=sbert_model.max_seq_length-2, stride=sbert_model.max_seq_length-2, minimal_chunk_length=1)
# add_special_tokens_at_beginning_and_end(input_id_chunks, mask_chunks)
# add_padding_tokens(input_id_chunks, mask_chunks)
# input_ids, attention_mask = stack_tokens_from_all_chunks(input_id_chunks, mask_chunks)

# Compute token embeddings
with torch.no_grad():
    model_output = model(**dataset_dict)

# Perform pooling
sentence_embeddings = mean_pooling(model_output, dataset_dict['attention_mask'])

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

print("Sentence embeddings:")
print(sentence_embeddings)

Map:   0%|          | 0/40 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (914 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 40/40 [00:00<00:00, 1007.41 examples/s]
Map: 100%|██████████| 40/40 [00:00<00:00, 2809.31 examples/s]
Map: 100%|██████████| 107/107 [00:00<00:00, 7210.76 examples/s]
Map: 100%|██████████| 107/107 [00:00<00:00, 6957.56 examples/s]


Sentence embeddings:
tensor([[-0.0596, -0.0540, -0.0341,  ..., -0.0373, -0.0372,  0.0121],
        [-0.0753,  0.0279, -0.0197,  ..., -0.0291, -0.0666,  0.1272],
        [-0.0416, -0.0222, -0.0502,  ...,  0.0350, -0.0647,  0.1271],
        ...,
        [ 0.0123,  0.0493,  0.0251,  ...,  0.0321, -0.0565,  0.0442],
        [-0.0369, -0.0344,  0.0276,  ..., -0.0670, -0.1108,  0.0040],
        [-0.0094, -0.0693,  0.0462,  ..., -0.1637, -0.1084, -0.0200]])


In [68]:
len(ds_sentences_4['input_ids'][0])

256

In [58]:
for i in dataset:
    print(new_sentences[i])
    print('')

TypeError: list indices must be integers or slices, not dict

In [60]:
sentences

["Runespell: Overture melds together classic RPG components with Windows Live's player-vs-player solitaire with poker. I obtained Runespell for free in a pile of coal during the Steam Holiday Sale 2011 event. Receiving the game for free, I simply assumed that Runespell was merely a terrible, no-name title that was not worth playing. It was not until the Summer 2013 that I truly realized what I had come across. Runespell sports a great soundtrack with honestly a decent storyline. Often times, card games that try to incorporate RPG elements (or just a storyline) end up drowning out the added components with the basic mechanics the card game - Runespell does not do this. If you are a cards-only enthusiast, this may stray a little too far from those roots; if you are an RPG-only enthusaist, this may feel more like a card game. However, if you are willing try a game that can have both, Runespell: Overture is for you. Verdict: While Runespell: Overture is far from a lump of coal turned into 

In [103]:
new_sentences

['a good game in the beginning but later it gets boring and eventually even frustrating.',
 'does not work in mac os or no support available from the company.',
 'its good cuz therse cards an teyy fight guys',
 'really cool game.',
 'will not open on mac.',
 'this is actually a really cool game and fun too. also changelings.',
 'nice twist on poker. i liked it',
 'grate game to help past time',
 "it ' s it comes free with windows",
 "oh it ' s so i don ' t it makes you loose track of time : )"]

In [90]:
tokenizer_hg = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

assert new_sentences[0] == tokenizer_hg.convert_ids_to_tokens(tokenizer_hg.encode(sentences[0], return_attention_mask=True, return_token_type_ids=True, add_special_tokens=False,truncation=False, return_tensors=))


Token indices sequence length is longer than the specified maximum sequence length for this model (914 > 512). Running this sequence through the model will result in indexing errors


In [80]:
ds_sentences_2['input_ids']

[[29161,
  11880,
  2140,
  25052,
  1006,
  4003,
  1013,
  11662,
  1013,
  22531,
  1013,
  5656,
  8893,
  1007,
  2005,
  7047,
  2057,
  2031,
  8389,
  3737,
  1006,
  7915,
  1010,
  3435,
  1010,
  3722,
  1010,
  2204,
  1010,
  3376,
  1010,
  1998,
  10392,
  1007,
  1010,
  3332,
  2098,
  2030,
  2440,
  18182,
  1010,
  8011,
  1998,
  4530,
  2030,
  11562,
  1998,
  11562,
  1006,
  2043,
  3048,
  5329,
  2105,
  1007,
  1010,
  1998,
  4165,
  1006,
  2189,
  1998,
  16420,
  2595,
  1007,
  1012,
  2017,
  2064,
  2036,
  5454,
  1037,
  3898,
  5813,
  5973,
  2013,
  19714,
  2595,
  18139,
  2692,
  2000,
  4444,
  2595,
  10790,
  17914,
  1012,
  1996,
  14924,
  4818,
  2003,
  2200,
  2092,
  1011,
  2589,
  1010,
  4252,
  2017,
  1996,
  14607,
  2302,
  2108,
  15703,
  1012,
  2061,
  2054,
  2785,
  1997,
  2208,
  2003,
  2023,
  1029,
  2009,
  3504,
  2066,
  2019,
  22531,
  2008,
  2017,
  2377,
  2007,
  5329,
  1010,
  2021,
  2008,
  1005,
  1055

In [45]:
sentence_embeddings2 = sbert.encode(sentences)

In [34]:
# compare the two embeddings
print(sentence_embeddings.dtype)
print(sentence_embeddings2.dtype)
print('\n\n')

# use numpy.assert_allclose to compare the two embeddings
np.testing.assert_allclose(sentence_embeddings.cpu().numpy(), sentence_embeddings2, rtol=1e-4, atol=1e-4)

torch.float32
float32





In [76]:
# convert input_ids to token, then to str

tokenizer.convert_ids_to_tokens(features[0]['input_ids'][0])

['runes',
 '##pel',
 '##l',
 'overture',
 '(',
 'card',
 '/',
 'poker',
 '/',
 'rpg',
 '/',
 'strategy',
 'hybrid',
 ')',
 'for',
 'options',
 'we',
 'have',
 'graphics',
 'quality',
 '(',
 'fastest',
 ',',
 'fast',
 ',',
 'simple',
 ',',
 'good',
 ',',
 'beautiful',
 ',',
 'and',
 'fantastic',
 ')',
 ',',
 'window',
 '##ed',
 'or',
 'full',
 '##screen',
 ',',
 'drag',
 'and',
 'drop',
 'or',
 'click',
 'and',
 'click',
 '(',
 'when',
 'moving',
 'cards',
 'around',
 ')',
 ',',
 'and',
 'sounds',
 '(',
 'music',
 'and',
 'sf',
 '##x',
 ')',
 '.',
 'you',
 'can',
 'also',
 'choose',
 'a',
 'screen',
 'resolution',
 'anywhere',
 'from',
 '640',
 '##x',
 '##48',
 '##0',
 'to',
 '1920',
 '##x',
 '##10',
 '##80',
 '.',
 'the',
 'tutor',
 '##ial',
 'is',
 'very',
 'well',
 '-',
 'done',
 ',',
 'teaching',
 'you',
 'the',
 'ropes',
 'without',
 'being',
 'annoying',
 '.',
 'so',
 'what',
 'kind',
 'of',
 'game',
 'is',
 'this',
 '?',
 'it',
 'looks',
 'like',
 'an',
 'rpg',
 'that',
 'you',
 

In [81]:
tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(features[0]['input_ids'][0], skip_special_tokens=True))

"runespell overture ( card / poker / rpg / strategy hybrid ) for options we have graphics quality ( fastest, fast, simple, good, beautiful, and fantastic ), windowed or fullscreen, drag and drop or click and click ( when moving cards around ), and sounds ( music and sfx ). you can also choose a screen resolution anywhere from 640x480 to 1920x1080. the tutorial is very well - done, teaching you the ropes without being annoying. so what kind of game is this? it looks like an rpg that you play with cards, but that ' s not it at all. the primary thrust is actually a solitaire it ' s battle - solitaire - poker with rpg elements! * evil grin * whee ~! how does that work? looking at the screenshots will help a bit. as you can see, we and our opponent are each dealt a set of cards, solitaire - style. ours is on top, theirs is on the bottom. we need to create poker hands by grouping cards in sets of five. if you don ' t know the poker hands, no worries - - the game will show you what groupings 

---

Approach 2: use tokenizer to quickly tokenize all text, then split the text by the length of the token to create new text

Then use sbert to embed, as sbert has sophisicated mem control for very large dataset

In [41]:
from transformers import AutoTokenizer, AutoModel
from datasets import Dataset
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def tokenize_dataset(data):
    # return sbert_model[0].tokenizer(data['text'], return_attention_mask=True, return_token_type_ids=True, add_special_tokens=False, return_tensors=None, truncation=False)
    return tokenizer(data['text'], return_attention_mask=True, return_token_type_ids=True, add_special_tokens=False, return_tensors=None, truncation=False)


# Sentences we want sentence embeddings for
new_sentences = []
# sentences = X
sentences = X_longtext[:40]
# sentences = X_shorttext[:10]

# Load model from HuggingFace Hub
# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
tokenizer = sbert_model[0].tokenizer
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2').cuda()

ds_sentences = Dataset.from_dict({'text': sentences})

# Tokenize sentences
# for start_index in trange(0, len(sentences), 32, desc="Batches", disable=False):
#     sentences_batch = sentences[start_index : start_index + 32]
#     features = sbert_model[0].tokenizer(sentences_batch, return_attention_mask=True, return_token_type_ids=True, add_special_tokens=False)

ds_sentences = ds_sentences.map(tokenize_dataset, batched=True)
ds_sentences_2 = Dataset.from_dict({'input_ids': ds_sentences['input_ids'], 'token_type_ids': ds_sentences['token_type_ids'], 'attention_mask': ds_sentences['attention_mask']})

ds_sentences_2 = ds_sentences_2.map(split_tokens_into_smaller_chunks, batched=True, fn_kwargs={'chunk_size': sbert_model.max_seq_length-2, 'stride': sbert_model.max_seq_length-2, 'minimal_chunk_length': 1})
# ds_sentences_3 = ds_sentences_2.map(add_special_tokens_at_beginning_and_end, batched=True)
# ds_sentences_4 = ds_sentences_3.map(add_padding_tokens, batched=True, fn_kwargs={'pad_len': sbert_model.max_seq_length})

# recreate a new text with splitted chunks
# funcs: https://blog.csdn.net/weixin_42043940/article/details/108015797
for input_id in ds_sentences_2['input_ids']:
    new_sentences.append(tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_id)))

embeddings = sbert_model.encode(new_sentences, show_progress_bar=True, batch_size=32)

# dataset = ds_sentences_4.to_iterable_dataset()

# dataset_dict = {k: torch.tensor(v) for k, v in dataset.to_dict().items()}



# input_id_chunks, mask_chunks = split_tokens_into_smaller_chunks(features, chunk_size=sbert_model.max_seq_length-2, stride=sbert_model.max_seq_length-2, minimal_chunk_length=1)
# add_special_tokens_at_beginning_and_end(input_id_chunks, mask_chunks)
# add_padding_tokens(input_id_chunks, mask_chunks)
# input_ids, attention_mask = stack_tokens_from_all_chunks(input_id_chunks, mask_chunks)

# Compute token embeddings
# with torch.no_grad():
#     model_output = model(**dataset)

# # Perform pooling
# sentence_embeddings = mean_pooling(model_output, dataset['attention_mask'])

# # Normalize embeddings
# sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

# print("Sentence embeddings:")
# print(sentence_embeddings)

Map:   0%|          | 0/40 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (914 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 40/40 [00:00<00:00, 1282.51 examples/s]
Map: 100%|██████████| 40/40 [00:00<00:00, 2601.72 examples/s]
Batches: 100%|██████████| 4/4 [00:00<00:00, 16.72it/s]


In [42]:
embeddings

array([[-0.05962738, -0.05401213, -0.03413564, ..., -0.03733548,
        -0.0371554 ,  0.01211602],
       [-0.08233497,  0.02996074, -0.02355522, ..., -0.06725559,
        -0.04395745,  0.10841285],
       [-0.04162469, -0.02216977, -0.05017149, ...,  0.03499959,
        -0.06472465,  0.12707347],
       ...,
       [ 0.01234634,  0.04928512,  0.02506178, ...,  0.03211971,
        -0.0565111 ,  0.04421298],
       [-0.03691207, -0.03443582,  0.02758813, ..., -0.06702632,
        -0.11079085,  0.00399974],
       [-0.00944206, -0.06928544,  0.04619222, ..., -0.16365847,
        -0.10841736, -0.01999812]], dtype=float32)

In [64]:
sentence_embeddings.cpu().numpy()

array([[-0.05962733, -0.05401207, -0.03413556, ..., -0.03733547,
        -0.0371554 ,  0.01211603],
       [-0.07531875,  0.02791587, -0.01967332, ..., -0.02912043,
        -0.06664141,  0.12719508],
       [-0.04162464, -0.02216974, -0.05017136, ...,  0.03499951,
        -0.06472477,  0.12707353],
       ...,
       [ 0.01234631,  0.04928513,  0.0250617 , ...,  0.03211972,
        -0.05651109,  0.04421302],
       [-0.03691205, -0.03443579,  0.02758805, ..., -0.06702632,
        -0.11079085,  0.00399982],
       [-0.00944207, -0.06928538,  0.04619218, ..., -0.16365841,
        -0.10841739, -0.01999813]], dtype=float32)

In [46]:
sentence_embeddings2

array([[-0.05962738, -0.05401213, -0.03413564, ..., -0.03733548,
        -0.0371554 ,  0.01211602],
       [-0.04162469, -0.02216977, -0.05017149, ...,  0.03499959,
        -0.06472465,  0.12707347],
       [-0.04332576, -0.02880152, -0.00841515, ..., -0.00979051,
        -0.05014643,  0.01050244],
       ...,
       [-0.05029024, -0.02632071,  0.02227809, ..., -0.02229666,
         0.0047383 ,  0.04096819],
       [-0.01987775,  0.00541067,  0.07004219, ..., -0.12607142,
        -0.06799727,  0.05826021],
       [-0.03691207, -0.03443582,  0.02758813, ..., -0.06702632,
        -0.11079085,  0.00399974]], dtype=float32)

In [58]:
embeddings.shape

(107, 384)

In [71]:
# regarding the first row (i.e. the first sentence, with truncate=True in default sbert.encode() func)
# they are the same :D
np.testing.assert_allclose(embeddings[0], sentence_embeddings2[0], rtol=1e-4, atol=1e-4)
    
np.testing.assert_allclose(embeddings[0], sentence_embeddings[0], rtol=1e-4, atol=1e-4)

# comparing work-around 1 and work-around 2
# as the sentences are sorted by len in sbert.encode()
# hence we need to sort the embeddings by len of the new_sentences

# sort the embeddings by len of the new_sentences
embeddings_s = embeddings[np.argsort([sum([len(s) for s in new_sentences])])]
sentence_embeddings2_s = sentence_embeddings2[np.argsort([sum([len(s) for s in new_sentences])])]

# test passes
np.testing.assert_allclose(embeddings_s, sentence_embeddings2_s, rtol=1e-4, atol=1e-4)

In [72]:
# ref: https://blog.csdn.net/qysh123/article/details/126438203

example = sentences[0].split(' ')
example_batch_encoding = features[0]

word2char_mapping = {}
char_cursor = 0
print(sentences[0])
print(example_batch_encoding['input_ids'])
print(len(example_batch_encoding['input_ids']))
print(example_batch_encoding.tokens())
print(len(example_batch_encoding.tokens()))
print(example_batch_encoding.word_ids())
print(len(example_batch_encoding.word_ids()))
print('\n\n')

for ind in range(len(example)):
    if len(example[ind]) > 0:
        start = char_cursor
        end = char_cursor + len(example[ind])
        word2char_mapping[ind] = (start, end)
        char_cursor = char_cursor + len(example[ind]) + 1       # consider the white-space length

print(word2char_mapping)

word2token_mapping = {}
for token_index in range(len(example_batch_encoding.tokens())):
    this_token = example_batch_encoding.word_ids()[token_index]
    if this_token is None:
        continue
        
    char_span = example_batch_encoding.token_to_chars(token_index)
    for word in word2char_mapping:
        start = word2char_mapping[word][0]
        end = word2char_mapping[word][1]

        if char_span.start >= start and char_span.end <= end:
            if word in word2token_mapping:
                word2token_mapping[word].append(token_index)
            else:
                word2token_mapping[word] = [token_index]

print(word2token_mapping)


Runespell Overture (Card/Poker/RPG/Strategy Hybrid) For options we have Graphics Quality (Fastest, Fast, Simple, Good, Beautiful, and Fantastic), Windowed or Fullscreen, Drag and Drop or Click and Click (when moving cards around), and Sounds (Music and SFX). You can also choose a screen resolution anywhere from 640x480 to 1920x1080. The tutorial is very well-done, teaching you the ropes without being annoying. So what kind of game is this? It looks like an RPG that you play with cards, but that's not it at all. The primary thrust is actually a solitaire It's battle-solitaire-poker with RPG elements! *evil grin* Whee~! How does that work? Looking at the screenshots will help a bit. As you can see, we and our opponent are each dealt a set of cards, solitaire-style. Ours is on top, theirs is on the bottom. We need to create poker hands by grouping cards in sets of five. If you don't know the poker hands, no worries -- the game will show you what groupings are 'legal' to play. Each 'hand' 

In [62]:
encoding = tokenizer(sentences, return_attention_mask=True, return_token_type_ids=True, add_special_tokens=False, padding=True, truncation=False)

encoding

{'input_ids': [[29161, 11880, 2140, 25052, 1006, 4003, 1013, 11662, 1013, 22531, 1013, 5656, 8893, 1007, 2005, 7047, 2057, 2031, 8389, 3737, 1006, 7915, 1010, 3435, 1010, 3722, 1010, 2204, 1010, 3376, 1010, 1998, 10392, 1007, 1010, 3332, 2098, 2030, 2440, 18182, 1010, 8011, 1998, 4530, 2030, 11562, 1998, 11562, 1006, 2043, 3048, 5329, 2105, 1007, 1010, 1998, 4165, 1006, 2189, 1998, 16420, 2595, 1007, 1012, 2017, 2064, 2036, 5454, 1037, 3898, 5813, 5973, 2013, 19714, 2595, 18139, 2692, 2000, 4444, 2595, 10790, 17914, 1012, 1996, 14924, 4818, 2003, 2200, 2092, 1011, 2589, 1010, 4252, 2017, 1996, 14607, 2302, 2108, 15703, 1012, 2061, 2054, 2785, 1997, 2208, 2003, 2023, 1029, 2009, 3504, 2066, 2019, 22531, 2008, 2017, 2377, 2007, 5329, 1010, 2021, 2008, 1005, 1055, 2025, 2009, 2012, 2035, 1012, 1996, 3078, 7400, 2003, 2941, 1037, 14017, 29422, 2009, 1005, 1055, 2645, 1011, 14017, 29422, 1011, 11662, 2007, 22531, 3787, 999, 1008, 4763, 5861, 1008, 1059, 21030, 1066, 999, 2129, 2515, 2008, 2

In [64]:
encoding._encodings

[Encoding(num_tokens=1266, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=1266, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=1266, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=1266, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=1266, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=1266, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=1266, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=1266, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_to

In [49]:
ttt = BatchEncoding({'input_ids': ds_sentences['input_ids'], 'attention_mask': ds_sentences['attention_mask']})

In [55]:
ttt.token_to_chars(0)

ValueError: token_to_chars() is not available when using Python based tokenizers

In [51]:
word2char_mapping = {}
char_cursor=0
for ind in range(len())

TypeError: token_to_chars() missing 1 required positional argument: 'batch_or_token_index'

In [46]:
ds_sentences['input_ids']

[[29161,
  11880,
  2140,
  25052,
  1006,
  4003,
  1013,
  11662,
  1013,
  22531,
  1013,
  5656,
  8893,
  1007,
  2005,
  7047,
  2057,
  2031,
  8389,
  3737,
  1006,
  7915,
  1010,
  3435,
  1010,
  3722,
  1010,
  2204,
  1010,
  3376,
  1010,
  1998,
  10392,
  1007,
  1010,
  3332,
  2098,
  2030,
  2440,
  18182,
  1010,
  8011,
  1998,
  4530,
  2030,
  11562,
  1998,
  11562,
  1006,
  2043,
  3048,
  5329,
  2105,
  1007,
  1010,
  1998,
  4165,
  1006,
  2189,
  1998,
  16420,
  2595,
  1007,
  1012,
  2017,
  2064,
  2036,
  5454,
  1037,
  3898,
  5813,
  5973,
  2013,
  19714,
  2595,
  18139,
  2692,
  2000,
  4444,
  2595,
  10790,
  17914,
  1012,
  1996,
  14924,
  4818,
  2003,
  2200,
  2092,
  1011,
  2589,
  1010,
  4252,
  2017,
  1996,
  14607,
  2302,
  2108,
  15703,
  1012,
  2061,
  2054,
  2785,
  1997,
  2208,
  2003,
  2023,
  1029,
  2009,
  3504,
  2066,
  2019,
  22531,
  2008,
  2017,
  2377,
  2007,
  5329,
  1010,
  2021,
  2008,
  1005,
  1055

In [27]:
ddd = dataset.to_dict()

In [28]:
ddd['input_ids']

[[101,
  29161,
  11880,
  2140,
  25052,
  1006,
  4003,
  1013,
  11662,
  1013,
  22531,
  1013,
  5656,
  8893,
  1007,
  2005,
  7047,
  2057,
  2031,
  8389,
  3737,
  1006,
  7915,
  1010,
  3435,
  1010,
  3722,
  1010,
  2204,
  1010,
  3376,
  1010,
  1998,
  10392,
  1007,
  1010,
  3332,
  2098,
  2030,
  2440,
  18182,
  1010,
  8011,
  1998,
  4530,
  2030,
  11562,
  1998,
  11562,
  1006,
  2043,
  3048,
  5329,
  2105,
  1007,
  1010,
  1998,
  4165,
  1006,
  2189,
  1998,
  16420,
  2595,
  1007,
  1012,
  2017,
  2064,
  2036,
  5454,
  1037,
  3898,
  5813,
  5973,
  2013,
  19714,
  2595,
  18139,
  2692,
  2000,
  4444,
  2595,
  10790,
  17914,
  1012,
  1996,
  14924,
  4818,
  2003,
  2200,
  2092,
  1011,
  2589,
  1010,
  4252,
  2017,
  1996,
  14607,
  2302,
  2108,
  15703,
  1012,
  2061,
  2054,
  2785,
  1997,
  2208,
  2003,
  2023,
  1029,
  2009,
  3504,
  2066,
  2019,
  22531,
  2008,
  2017,
  2377,
  2007,
  5329,
  1010,
  2021,
  2008,
  1005,

In [50]:
len(ds_sentences_2['input_ids'][0])

254

In [47]:
len(ds_sentences_3['input_ids'][0])

256

In [62]:
for i in ds_sentences_4['input_ids']:
    print(len(i))

256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256


In [49]:
ds_sentences_2['token_type_ids']

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


In [53]:
len(features['input_ids'])

10

In [49]:
sbert.max_seq_length

256

In [40]:
features

{'input_ids': [[101, 2045, 3432, 4995, 1005, 1056, 2438, 2399, 2007, 18148, 1004, 2130, 8491, 1997, 2122, 2008, 2024, 2941, 4276, 2652, 999, 2005, 2033, 1010, 18747, 1024, 19508, 1006, 1051, 1024, 1052, 1007, 2003, 2008, 4678, 6453, 1012, 2348, 2025, 2172, 1999, 2126, 1997, 1037, 2466, 1999, 1996, 2208, 1006, 2053, 2309, 2447, 2044, 2035, 1007, 1010, 2045, 2003, 3243, 1037, 6925, 2006, 2129, 18747, 2234, 2000, 2022, 1996, 2208, 2009, 2003, 1996, 2460, 2544, 2003, 1010, 2008, 1051, 1024, 1052, 2003, 2019, 6622, 1997, 3025, 18747, 4486, 1010, 22412, 21038, 1004, 22412, 3786, 7698, 2761, 5306, 2067, 2000, 1037, 3144, 2249, 14590, 7559, 3334, 3049, 1012, 2348, 28667, 25587, 2220, 6256, 1006, 2008, 2052, 5373, 3102, 1037, 2208, 14489, 2046, 27885, 28817, 15780, 1007, 1010, 1996, 2208, 2506, 2000, 19852, 2007, 5377, 14409, 1006, 2502, 1004, 2235, 1007, 2058, 1996, 2086, 2013, 1996, 9797, 1012, 12313, 5881, 2007, 2009, 999, 2023, 12276, 1004, 16612, 25416, 3170, 3672, 1997, 1996, 2208, 2038, 

In [44]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
features_test = tokenizer(sentences[0], return_attention_mask=True, return_token_type_ids=True, add_special_tokens=True, padding=False, truncation=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (916 > 512). Running this sequence through the model will result in indexing errors


In [46]:
features_test

{'input_ids': [101, 29161, 11880, 2140, 25052, 1006, 4003, 1013, 11662, 1013, 22531, 1013, 5656, 8893, 1007, 2005, 7047, 2057, 2031, 8389, 3737, 1006, 7915, 1010, 3435, 1010, 3722, 1010, 2204, 1010, 3376, 1010, 1998, 10392, 1007, 1010, 3332, 2098, 2030, 2440, 18182, 1010, 8011, 1998, 4530, 2030, 11562, 1998, 11562, 1006, 2043, 3048, 5329, 2105, 1007, 1010, 1998, 4165, 1006, 2189, 1998, 16420, 2595, 1007, 1012, 2017, 2064, 2036, 5454, 1037, 3898, 5813, 5973, 2013, 19714, 2595, 18139, 2692, 2000, 4444, 2595, 10790, 17914, 1012, 1996, 14924, 4818, 2003, 2200, 2092, 1011, 2589, 1010, 4252, 2017, 1996, 14607, 2302, 2108, 15703, 1012, 2061, 2054, 2785, 1997, 2208, 2003, 2023, 1029, 2009, 3504, 2066, 2019, 22531, 2008, 2017, 2377, 2007, 5329, 1010, 2021, 2008, 1005, 1055, 2025, 2009, 2012, 2035, 1012, 1996, 3078, 7400, 2003, 2941, 1037, 14017, 29422, 2009, 1005, 1055, 2645, 1011, 14017, 29422, 1011, 11662, 2007, 22531, 3787, 999, 1008, 4763, 5861, 1008, 1059, 21030, 1066, 999, 2129, 2515, 200

In [48]:
assert features_test['input_ids'] == features['input_ids'][0]

In [31]:
# copied from https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Sentences we want sentence embeddings for
sentences = X_longtext[:256]

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')


# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

print("Sentence embeddings:")
print(sentence_embeddings)

Sentence embeddings:
tensor([[-0.0501, -0.0386, -0.0553,  ...,  0.0457, -0.0483,  0.1054],
        [-0.0410, -0.0452, -0.0067,  ..., -0.0521, -0.0062,  0.0525],
        [-0.0129,  0.0241, -0.0244,  ..., -0.0006, -0.0080,  0.0600],
        ...,
        [-0.0285, -0.0173,  0.0572,  ..., -0.0408, -0.0664,  0.0337],
        [-0.0303, -0.0340,  0.0413,  ..., -0.0179, -0.0451,  0.0331],
        [-0.0461,  0.0472,  0.0062,  ...,  0.0251, -0.0365,  0.0272]])


In [52]:
encoded_input['token_type_ids'].dtype

torch.int64

In [22]:
# how about using ollama to generate embeddings

from langchain_community.embeddings import OllamaEmbeddings

ollama_emb = OllamaEmbeddings(model='llama2')

r1 = ollama_emb.embed_documents(
    X[:10]
)

r1

[[-0.6624012589454651,
  -0.5945977568626404,
  1.897308111190796,
  -2.679438829421997,
  -1.8338860273361206,
  -1.440673828125,
  0.5866485238075256,
  2.6048591136932373,
  -0.19371403753757477,
  2.3323678970336914,
  0.7579759359359741,
  -0.00353318452835083,
  1.2476534843444824,
  2.1306116580963135,
  -0.1454595923423767,
  -0.6225206255912781,
  0.3926948308944702,
  1.7409977912902832,
  -3.165381669998169,
  -0.7255942225456238,
  -0.5962603092193604,
  -1.9199563264846802,
  -2.8454020023345947,
  -5.146990776062012,
  1.776212453842163,
  -1.7746793031692505,
  -0.10192640125751495,
  -0.35242924094200134,
  -2.707256555557251,
  -1.044919490814209,
  2.8377878665924072,
  -1.1090648174285889,
  -1.1648586988449097,
  2.65484356880188,
  3.1633474826812744,
  -1.902618169784546,
  1.7478952407836914,
  1.2832590341567993,
  0.17792284488677979,
  1.15621817111969,
  0.10232443362474442,
  -1.8454188108444214,
  -1.6758105754852295,
  -0.06012474745512009,
  0.55763524770

In [24]:
len(r1[0])

4096

---