This code creates the embeddings. It contains code from Muennighoff, N. (2022), "SGPT: GPT Sentence Embeddings for Semantic Search",
available at https://arxiv.org/abs/2202.08904.
The code has been modified and adapted to fit this task.

In [None]:


import torch
from transformers import AutoModel, AutoTokenizer
import pandas as pd
from tqdm import tqdm
import numpy as np

# Checkpoint used to compute the embeddings. To create the GPT-Neo embeddings
# instead, comment out the BLOOM line and uncomment the GPT-Neo line below.
model_name = 'bigscience/bloom-560m'
#model_name = 'EleutherAI/gpt-neo-1.3B'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# GPT-2 style tokenizers (such as GPT-Neo's) ship without a padding token, and
# calling the tokenizer with padding=True then raises an error. Reusing the
# end-of-sequence token is safe here: padded positions are excluded through
# the attention mask when the token embeddings are pooled.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModel.from_pretrained(model_name)
model.eval()  # inference only

# One text per row, taken from the "selftext" column of the input CSV.
texts = pd.read_csv("combined-set.csv")["selftext"].to_list()
# Uncomment the two lines below to try the pipeline on a small subset first.
#sample_size = 10
#texts = texts[:sample_size]

def _position_weighted_mean_pool(hidden_states, attention_mask):
    """Pool token embeddings into one vector per sequence (SGPT weighting).

    Each real token is weighted by its 1-based position among the real
    tokens of its sequence, so later tokens contribute more. Padding
    tokens get weight zero.

    Args:
        hidden_states: tensor of shape [bs, seq_len, hid_dim].
        attention_mask: tensor of shape [bs, seq_len], 1 for real tokens
            and 0 for padding.

    Returns:
        Tensor of shape [bs, hid_dim] with the weighted mean embeddings.
    """
    mask = attention_mask.to(hidden_states.device).float()
    # The running count of real tokens yields weights 1..n on the real tokens
    # whether the tokenizer pads on the left or on the right; multiplying by
    # the mask zeroes the padded positions.
    weights = (mask.cumsum(dim=1) * mask).unsqueeze(-1)  # [bs, seq_len, 1]

    # Weighted mean across seq_len: [bs, seq_len, hid_dim] -> [bs, hid_dim]
    weighted_sum = torch.sum(hidden_states * weights, dim=1)
    return weighted_sum / weights.sum(dim=1)


# Number of texts embedded per forward pass.
batchsize = 1

# One embedding vector (1-D numpy array) per input text, in input order.
outputs = []

for i in tqdm(range(0, len(texts), batchsize)):
    # Tokenize one batch at a time; tokenizing and padding the whole corpus
    # up front would build a single huge tensor that is never used.
    batch = texts[i:i+batchsize]
    batch_tokens = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")

    with torch.no_grad():
        # Hidden state of shape [bs, seq_len, hid_dim]. Only the last layer is
        # needed, so the per-layer hidden states are not requested.
        last_hidden_state = model(**batch_tokens, return_dict=True).last_hidden_state
        embeddings = _position_weighted_mean_pool(
            last_hidden_state, batch_tokens["attention_mask"]
        )

    embeddings = embeddings.cpu().numpy()
    outputs.extend(embeddings)

# Output file for each documented checkpoint. Choosing the file from
# model_name keeps the saved file in sync with the model that produced it,
# so switching models cannot overwrite the other model's embeddings.
_OUTPUT_FILES = {
    'bigscience/bloom-560m': "BLOOM_embedded.csv",
    'EleutherAI/gpt-neo-1.3B': "GPTNeo_embedded.csv",
}

# One row per input text, one column per embedding dimension.
final_embeddings = pd.DataFrame(data=outputs)

print(final_embeddings)

# Any other checkpoint falls back to a name built from the part of the model
# identifier after the last "/".
output_file = _OUTPUT_FILES.get(model_name, model_name.split("/")[-1] + "_embedded.csv")
final_embeddings.to_csv(output_file)

