In [2]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the training split for this iteration (semicolon-delimited CSV).
df_train = pd.read_csv(
    "./training_data/merged_train_i2.csv",
    sep=';',
)

# Per-column totals, restricted to the numeric columns of the frame.
numeric_frame = df_train.select_dtypes(include='number')
numeric_column_sums = numeric_frame.sum()

# Show the column totals followed by the number of rows.
print(numeric_column_sums)
print(f"Total data: {df_train.shape[0]}")


anger           2763
anticipation    1697
disgust         2820
fear            1842
joy             2711
love            1514
optimism        2385
pessimism       1598
sadness         2211
surprise        1369
trust           1369
dtype: int64
Total data: 12652


In [6]:
def adaptive_thresholding(predictions, target_count, initial_threshold=0.99, step=0.001, min_threshold=0.1):
    """Lower a decision threshold until enough predictions pass it.

    Starting at ``initial_threshold``, the threshold is reduced by ``step``
    until at least ``target_count`` entries of ``predictions`` are *not below*
    it, or until the threshold itself has dropped under ``min_threshold``.

    Args:
        predictions: Sequence of numeric scores.
        target_count: Minimum number of scores that should pass the threshold.
            When it is ``<= 0`` no search is performed.
        initial_threshold: Threshold the search starts from.
        step: Positive amount subtracted from the threshold on each iteration.
        min_threshold: Floor of the search. The loop stops as soon as the
            current threshold is below this value, so the returned threshold
            may be slightly under the floor.

    Returns:
        The threshold at which the search stopped. If the floor was reached
        first, fewer than ``target_count`` scores may pass it.

    Raises:
        ValueError: If ``step`` is not strictly positive (the search could
            otherwise never terminate).
    """
    if step <= 0:
        raise ValueError(f"step must be positive, got {step!r}")

    # Nothing to search for: hand back the starting threshold unchanged.
    if target_count <= 0:
        return initial_threshold

    # Convert once so each iteration is a single vectorised comparison
    # instead of a Python-level pass over the whole sequence.
    scores = np.asarray(predictions, dtype=float)
    threshold = initial_threshold

    while True:
        # A score passes when it is not strictly below the threshold. Counting
        # it this way (rather than with >=) keeps NaN entries counted as passing.
        passing = scores.size - np.count_nonzero(scores < threshold)

        if passing >= target_count or threshold < min_threshold:
            break

        threshold -= step

    return threshold

# Every column of the training frame other than these is treated as a label column.
not_chosen_columns = ['ID', 'Tweet']
label_columns = [col for col in df_train.columns if col not in not_chosen_columns]

# Target number of pseudo-labels per label column, matched to `label_columns`
# by position (passed as `target_count` to adaptive_thresholding).
increase_needed = [100, 350, 100, 300, 100, 400, 200, 400, 100, 500, 500]

# Fail loudly if the positional pairing is broken, instead of silently
# ignoring surplus targets or hitting an IndexError part-way through.
if len(increase_needed) != len(label_columns):
    raise ValueError(
        f"increase_needed has {len(increase_needed)} entries but there are "
        f"{len(label_columns)} label columns: {label_columns}"
    )

# Scores to be thresholded, one column per label.
df_goemotion = pd.read_csv("./pseudo_labeled_dataset_i1.csv", sep=';')

# One threshold per label, rounded to three decimals.
pseudo_thresholds = [
    round(adaptive_thresholding(df_goemotion[label].tolist(), target), 3)
    for label, target in zip(label_columns, increase_needed)
]

print(pseudo_thresholds)

[0.901, 0.815, 0.863, 0.786, 0.925, 0.782, 0.787, 0.736, 0.752, 0.85, 0.81]


In [7]:
# Work on a copy so the raw scores in df_goemotion stay available.
df_goemotion_result = df_goemotion.copy()

# Binarise each label column with its own threshold: 0 when the score is
# strictly below the threshold, 1 otherwise. Negating the "<" comparison
# (rather than using ">=") keeps NaN scores mapped to 1, as the element-wise
# version did.
for label, threshold in zip(label_columns, pseudo_thresholds):
    below_threshold = df_goemotion_result[label] < threshold
    df_goemotion_result[label] = (~below_threshold).astype('int64')

# Number of positive labels per row, computed once and reused for both splits.
labels_per_row = df_goemotion_result[label_columns].sum(axis=1)

# Rows with at least one positive label are appended to the training data;
# rows with none are kept aside (their label columns are all 0 at this point).
df_filtered = df_goemotion_result.loc[labels_per_row > 0]
df_remaining = df_goemotion_result.loc[labels_per_row == 0]
df_combined = pd.concat([df_train, df_filtered], ignore_index=True)

# NOTE(review): 'merged_train_i2.csv' is written to the working directory and
# has the same file name as the training file read from ./training_data/ —
# confirm this is the intended output name for this iteration.
df_remaining.to_csv('goemotion_train_i2.csv', sep=';', encoding='utf_8', index=False)
df_combined.to_csv('merged_train_i2.csv', sep=';', encoding='utf_8', index=False)
