In [2]:
from datasets import load_dataset

# Load the arXiv abstracts dataset by name (the recorded run below found it in
# the local Hugging Face cache).
data = load_dataset("gfissore/arxiv-abstracts-2021")
# Bare last expression so the notebook displays the loaded object.
data

Using custom data configuration gfissore--arxiv-abstracts-2021-23556c248bdbe0fc
Found cached dataset json (/Users/olang/.cache/huggingface/datasets/gfissore___json/gfissore--arxiv-abstracts-2021-23556c248bdbe0fc/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'abstract', 'report-no', 'categories', 'versions'],
        num_rows: 1999486
    })
})

In [5]:
# Pull the "title" column out of the train split.
title = data["train"]["title"]

import pandas as pd

# Single-column DataFrame of titles; a later cell adds a cleaned-text column to it.
df = pd.DataFrame(title, columns=["title"])
df.head()

Unnamed: 0,title
0,Calculation of prompt diphoton production cros...
1,Sparsity-certifying Graph Decompositions
2,The evolution of the Earth-Moon system based o...
3,A determinant of Stirling cycle numbers counts...
4,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...


In [6]:
import re

# Compiled once at definition time: clean_text is applied to every title.
_DIGITS_RE = re.compile(r'\d+')
_PUNCT_RE = re.compile(r'[^\w\s]')


def clean_text(text):
    """Normalise a title: drop digits, drop punctuation, lowercase.

    Parameters
    ----------
    text : any
        Value to clean; it is coerced with ``str()`` first, so non-string
        inputs (e.g. ``None`` or numbers) are accepted.

    Returns
    -------
    str
        The lowercased text with every run of digits removed and every
        character that is neither a word character nor whitespace removed.
        Whitespace is left untouched (not collapsed or stripped), and the
        underscore survives because ``\\w`` matches it.
    """
    text = str(text)
    # str.replace() only matches literal substrings, so the patterns must go
    # through the regex engine to have any effect.
    text = _DIGITS_RE.sub('', text)   # remove numbers
    text = _PUNCT_RE.sub('', text)    # remove punctuation
    return text.lower()

In [7]:
# Clean every title element-wise; the function is passed directly, no lambda wrapper needed.
df['clean_text'] = df['title'].apply(clean_text)

In [8]:
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

# One checkpoint name shared by the tokenizer and the encoder, so the two
# cannot drift apart when the model is swapped.
MODEL_NAME = "sentence-transformers/paraphrase-xlm-r-multilingual-v1"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

In [9]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dim 1, weighted by the attention mask.

    Parameters
    ----------
    model_output : sequence
        Its first element is taken as the token-embedding tensor.
    attention_mask : torch.Tensor
        Mask that is given a trailing axis and expanded to the embeddings'
        size; positions with value 0 contribute nothing to the average.

    Returns
    -------
    torch.Tensor
        The masked sum over dim 1 divided by the mask total over dim 1.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = (token_embeddings * mask).sum(1)
    # Clamp the denominator so an all-zero mask row cannot divide by zero.
    mask_total = mask.sum(1).clamp(min=1e-9)
    return masked_sum / mask_total


def embed_text(text_list):
    """Embed a batch of texts with the notebook-level tokenizer and model.

    The texts are tokenized (with padding and truncation enabled) into
    PyTorch tensors, run through the model without gradient tracking,
    mean-pooled with the attention mask, and L2-normalised along dim 1.

    Parameters
    ----------
    text_list : list of str
        Texts to embed in one forward pass.

    Returns
    -------
    torch.Tensor
        One normalised embedding per input text.
    """
    batch = tokenizer(text_list, padding=True, truncation=True, return_tensors='pt')

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**batch)

    pooled = mean_pooling(outputs, batch['attention_mask'])
    return F.normalize(pooled, p=2, dim=1)

In [12]:
# Print the last index label of the dataframe. In the recorded run this is
# 1999485, i.e. one less than the 1999486 rows reported when the dataset loaded.
print(df.index[-1])

1999485


In [17]:
# Materialise the cleaned titles as a plain Python list; the embedding loop
# below slices it into batches.
clean_texts = df['clean_text'].tolist()

In [None]:
# embed items with progress bar

from tqdm import tqdm

BATCH_SIZE = 1000  # number of titles embedded per forward pass

# One tensor of sentence embeddings per batch, in dataset order.
embeddings = []
# Stop at len(clean_texts), not df.index[-1]: range() excludes its stop value,
# so using the last index label (len - 1) would silently skip the final row
# whenever len - 1 is an exact multiple of the batch size.
for start in tqdm(range(0, len(clean_texts), BATCH_SIZE)):
    embeddings.append(embed_text(clean_texts[start:start + BATCH_SIZE]))