In [1]:
from pathlib import Path
import pandas as pd

# List of dataset paths
dataset_paths = [
    Path("GoEmotions/data/full_dataset/goemotions_1.csv"),
    Path("GoEmotions/data/full_dataset/goemotions_2.csv"),
    Path("GoEmotions/data/full_dataset/goemotions_3.csv"),
]

# Read and combine all datasets
dfs = [
    pd.read_csv(path, usecols=["text", "id", "author", "subreddit", "parent_id", "created_utc"])
    for path in dataset_paths
]

df_combined = pd.concat(dfs, ignore_index=True)

# Drop duplicate entries based on 'id'
df_deduplicated = df_combined.drop_duplicates(subset="id")

# Add a new 'label' column with dtype int (optional, depending on your needs)
df_deduplicated["label"] = pd.Series(dtype='int')

# Save the cleaned dataset to a new CSV file
output_path = Path("GoEmotions/data/cleaned_dataset.csv")
df_deduplicated.to_csv(output_path, index=False)

print(f"Saved deduplicated dataset to {output_path}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_deduplicated["label"] = pd.Series(dtype='int')


Saved deduplicated dataset to GoEmotions/data/cleaned_dataset.csv


In [2]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer

model_path = "./emotion_model_full"

model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

model.eval()

2025-05-13 15:20:37.730335: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-13 15:20:38.176728: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-13 15:20:38.337277: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8473] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-13 15:20:38.379938: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1471] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-13 15:20:38.708784: I tensorflow/core/platform/cpu_feature_guar

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [1]:
for i, row in df_deduplicated.iterrows():
    text = row['text']
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_id = logits.argmax().item()

    row['label'] = predicted_class_id
    df_deduplicated.at[i, 'label'] = predicted_class_id
    
    if i%1000 == 0:
        print(row)

NameError: name 'df_deduplicated' is not defined

In [16]:
output_path = Path("GoEmotions/data/relabeled_dataset.csv")
df_deduplicated = df_deduplicated.copy()
df_deduplicated["label"] = df_deduplicated["label"].fillna(-1).astype(int)
df_deduplicated.to_csv(output_path, index=False)

print(f"Saved relabeled dataset to {output_path}")

Saved relabeled dataset to GoEmotions/data/relabeled_dataset.csv


In [11]:
for i, row in df_deduplicated.iterrows():
    print(row)

text           That game hurt.
id                     eew5j0j
author                   Brdd9
subreddit                  nrl
parent_id           t1_eew18eq
created_utc       1548381039.0
label                      NaN
Name: 0, dtype: object
text            >sexuality shouldn’t be a grouping category I...
id                                                       eemcysk
author                                               TheGreen888
subreddit                                       unpopularopinion
parent_id                                              t3_ai4q37
created_utc                                         1548084169.0
label                                                        NaN
Name: 1, dtype: object
text           You do right, if you don't care then fuck 'em!
id                                                    ed2mah1
author                                               Labalool
subreddit                                         confessions
parent_id                         

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



text           Did they really have overhead projectors in th...
id                                                       ed3az8l
author                                                  liamemsa
subreddit                                            gatekeeping
parent_id                                              t3_abtd1s
created_utc                                         1546452781.0
label                                                        NaN
Name: 27807, dtype: object
text           Loooooool what?
id                     effyunk
author           MarkellNelson
subreddit       torontoraptors
parent_id            t3_almkpn
created_utc       1548945593.0
label                      NaN
Name: 27809, dtype: object
text           I mean I would be a [NAME] and I’m pretty reas...
id                                                       eel42m0
author                                        Poormidlifechoices
subreddit                                            AskALiberal
parent_id     

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



text               I guess. Thanks.
id                          ef3ww90
author                     zinofire
subreddit      traaaaaaannnnnnnnnns
parent_id                t1_ef3wt5k
created_utc            1548606239.0
label                           NaN
Name: 78214, dtype: object
text           I did and found it basically the same except I...
id                                                       eeuer5u
author                                        loptthetreacherous
subreddit                                     WhitePeopleTwitter
parent_id                                             t1_eeu91cq
created_utc                                         1548339605.0
label                                                        NaN
Name: 78216, dtype: object
text           Is that because it's an accurate description o...
id                                                       eeuheu6
author                                          Buffalo__Buffalo
subreddit                                   

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

