In [1]:
import pandas as pd

df = pd.read_excel("../data/repa_data_20230630.xlsx")

TypeError: 'Index' object is not callable

In [2]:
print(df.columns)

Index(['pubpeer_id', 'title', 'comments_total', 'journals', 'published_time',
       'author_1', 'author_1_id', 'link', 'affiliation', 'created',
       'discipline', 'situation', 'rawcommentstext', 'comments',
       'my_prediction_NEU', 'my_prediction_POS', 'my_prediction_NEG',
       'total_comment', 'author_reply', 'author_active', 'reword',
       'is_retracted'],
      dtype='object')


In [6]:
comment_len = []
for comment in df["rawcommentstext"]:
    comment_len.append(len(comment))
print(len(comment_len))

25890


In [4]:
comment_len

[]

In [5]:
df["comments"]

0        0.988941
1        0.998431
2        0.994018
3        0.995968
4        0.987079
           ...   
25885         NaN
25886         NaN
25887         NaN
25888         NaN
25889         NaN
Name: comments, Length: 25890, dtype: float64

In [7]:
df["comment_len"] = comment_len
df.columns

Index(['pubpeer_id', 'title', 'comments_total', 'journals', 'published_time',
       'author_1', 'author_1_id', 'link', 'affiliation', 'created',
       'discipline', 'situation', 'rawcommentstext', 'comments',
       'my_prediction_NEU', 'my_prediction_POS', 'my_prediction_NEG',
       'total_comment', 'author_reply', 'author_active', 'reword',
       'is_retracted', 'comment_len'],
      dtype='object')

In [9]:
df.to_csv("../data/repa_data_20231022.csv", index=False)

In [12]:
from transformers import AutoTokenizer, AutoModel
import torch

# Mean Pooling - Take attention mask into account for correct averaging
def meanpooling(output, mask):
    embeddings = output[0] # First element of model_output contains all token embeddings
    mask = mask.unsqueeze(-1).expand(embeddings.size()).float()
    return torch.sum(embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)

# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'Each sentence is converted']

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained("neuml/pubmedbert-base-embeddings")
model = AutoModel.from_pretrained("neuml/pubmedbert-base-embeddings")

# Tokenize sentences
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    output = model(**inputs)

# Perform pooling. In this case, mean pooling.
embeddings = meanpooling(output, inputs['attention_mask'])

print("Sentence embeddings:")
print(embeddings)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/706k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)lve/main/config.json:   0%|          | 0.00/667 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Sentence embeddings:
tensor([[-0.5490, -0.0099, -0.2638,  ..., -0.1579, -1.2998,  0.8093],
        [-1.0421,  0.7897,  0.5180,  ..., -0.5906, -1.0819,  0.5043]])


In [17]:
affi_list = []
for affi in df["affiliation"]:
    # Tokenize sentences
    inputs = tokenizer(affi, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        output = model(**inputs)

    # Perform pooling. In this case, mean pooling.
    embeddings = meanpooling(output, inputs['attention_mask'])

    affi_list.append(embeddings)
    print(embeddings)

len(affi_list)

tensor([[-1.8979e-01, -6.8017e-01,  5.2341e-01, -7.0386e-01,  8.8785e-01,
          2.3975e-01,  1.3085e-01,  2.0441e-01, -1.5418e+00, -7.4541e-01,
          1.3769e-01, -2.4155e-01,  2.7621e-01, -1.9596e-01, -1.8577e+00,
          1.6384e-01,  1.1339e+00,  3.2634e-01,  1.9482e-01, -1.4316e-01,
         -1.5743e-01, -4.1070e-01, -6.0260e-01, -3.2611e-01,  1.0842e-01,
          6.4312e-02,  6.6766e-01, -8.5240e-02,  4.4024e-01,  8.0997e-01,
         -3.2580e-01, -6.7398e-01,  3.6923e-01,  3.3694e-02,  2.4787e-01,
          3.9415e-01, -1.9272e-01, -1.4738e-01,  1.0770e+00,  1.4688e-01,
         -3.0563e-01,  4.9195e-01,  4.1364e-01, -6.6112e-02, -3.4836e-02,
         -9.3826e-01,  1.9906e-01, -7.1570e-02, -6.7851e-01,  7.0934e-01,
          1.5022e-01,  2.2189e-01, -3.3719e-01,  7.9568e-02,  4.4089e-01,
          3.8067e-02,  7.9688e-02,  3.2315e-02,  8.2612e-01,  8.1624e-01,
         -7.9253e-01,  3.4582e-01,  2.6749e-02, -7.5779e-01, -3.6205e-01,
         -5.2234e-01, -5.3651e-01, -3.

ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).