# Main code scripts

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import label_binarize
from sklearn.ensemble import RandomForestClassifier

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader

# Prepare the Emojibag Translator

In [2]:
MODEL = 't5-small'


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = T5Tokenizer.from_pretrained(MODEL, lagacy=False)

# path_pos = 'dataset/1k_data_tweets_emoticon_pos.csv'
# df_pos = pd.read_csv(path_pos)
# path_neg = 'dataset/1k_data_tweets_emoticon_neg.csv'
# df_neg = pd.read_csv(path_neg)
# data_df = pd.concat([df_pos, df_neg], ignore_index=True)

data_df = pd.read_csv('dataset/1k_data_emoji_tweets_senti_posneg.csv')
from tqdm import tqdm
def set_the_model(path):
    """
    Load various trained T5-based Emojibag models
    """
    model = T5ForConditionalGeneration.from_pretrained(MODEL)
    model.to(DEVICE)
    model.load_state_dict(torch.load(path, weights_only=True))    

    return model


def translate_emoji(texts, tokenizer, model, device='cuda', batch_size=32, max_new_tokens=20):
    model.eval()
    translated_texts = []

    dataloader = DataLoader(texts, batch_size=batch_size)

    for batch in tqdm(dataloader, desc='Translating emojis (batched)'):
        # Tokenize the batch
        inputs = tokenizer(list(batch), return_tensors='pt', padding=True, truncation=True).to(device)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
        # Decode batch
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        translated_texts.extend(decoded)

    return translated_texts


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


## Conduct experiments

In [None]:
def run_experiment(data_df, translate_emoji_func, model, tokenizer, vectorizer = None, description="Raw", binary=False):
    
    X_text = data_df['post'].astype(str)
    y = data_df['sentiment']

    if "raw" not in description.lower():
        X_text = translate_emoji_func(X_text, tokenizer=tokenizer, model=model)

    X_train, X_test, y_train, y_test = train_test_split(X_text, y, test_size=0.2, random_state=42)

    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Train classifier
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train_tfidf, y_train)

    # Predict and evaluate
    y_pred = clf.predict(X_test_tfidf)
    y_proba = clf.predict_proba(X_test_tfidf)

    # Handle binary vs multiclass AUROC
    if binary or len(set(y)) == 2:
        # Convert labels to binary if not already
        if y_proba.shape[1] > 1:
            y_scores = y_proba[:, 1]
        else:
            y_scores = y_proba.ravel()
        auroc = roc_auc_score(y_test, y_scores)
    else:
        y_test_bin = label_binarize(y_test, classes=clf.classes_)
        auroc = roc_auc_score(y_test_bin, y_proba, average='macro', multi_class='ovo')

    # Metrics
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    print(f"[INFO] {description} classification report:")
    print(classification_report(y_test, y_pred))

    return {
        "Description": description,
        "Precision": precision,
        "Recall": recall,
        "F1": f1,
        "AUROC": auroc
    }


# TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')

T5_GPT2_MODEL = set_the_model(path='t5-small_50_GPT2.pt')
T5_CHATGPT_MODEL = set_the_model(path='t5-small_50_chatgpt.pt')

results = []
results.append(run_experiment(data_df, None, None, None, vectorizer, description="Raw", binary=False))
results.append(run_experiment(data_df, translate_emoji, T5_GPT2_MODEL, tokenizer, vectorizer, description="Emojibag-GPT2", binary=False))
results.append(run_experiment(data_df, translate_emoji, T5_CHATGPT_MODEL, tokenizer, vectorizer, description="Emojibag-ChatGPT", binary=False))


# Show all results as a table
results_df = pd.DataFrame(results)
print(results_df)


[INFO] Raw classification report:
              precision    recall  f1-score   support

           0       0.61      0.48      0.54        87
           1       0.66      0.76      0.70       113

    accuracy                           0.64       200
   macro avg       0.63      0.62      0.62       200
weighted avg       0.64      0.64      0.63       200



Translating emojis (batched): 100%|██████████| 32/32 [00:06<00:00,  4.86it/s]


[INFO] Emojibag-GPT2 classification report:
              precision    recall  f1-score   support

           0       0.53      0.71      0.61        87
           1       0.70      0.52      0.60       113

    accuracy                           0.60       200
   macro avg       0.62      0.62      0.60       200
weighted avg       0.63      0.60      0.60       200



Translating emojis (batched): 100%|██████████| 32/32 [00:06<00:00,  5.13it/s]


[INFO] Emojibag-ChatGPT classification report:
              precision    recall  f1-score   support

           0       0.77      0.51      0.61        87
           1       0.70      0.88      0.78       113

    accuracy                           0.72       200
   macro avg       0.74      0.70      0.70       200
weighted avg       0.73      0.72      0.71       200

        Description  Precision    Recall        F1     AUROC
0               Raw   0.632592  0.621910  0.621690  0.726732
1     Emojibag-GPT2   0.618432  0.617384  0.604911  0.677805
2  Emojibag-ChatGPT   0.735615  0.695351  0.696181  0.790255


# Using Transformer-based vectorizer

In [4]:
from transformers import AutoTokenizer, AutoModel
from utils import BertVectorizer


# Load BERT model/tokenizer (can be any transformer)
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModel.from_pretrained("bert-base-uncased").to("cuda")

bert_vectorizer = BertVectorizer(model=bert_model, tokenizer=bert_tokenizer)

results = []
results.append(run_experiment(data_df, None, None, None, bert_vectorizer, description="Raw", binary=False))
results.append(run_experiment(data_df, translate_emoji, T5_GPT2_MODEL, tokenizer, bert_vectorizer, description="Emojibag-GPT2", binary=False))
results.append(run_experiment(data_df, translate_emoji, T5_CHATGPT_MODEL, tokenizer, bert_vectorizer, description="Emojibag-ChatGPT", binary=False))


pd.DataFrame(results)

Encoding with BERT: 100%|██████████| 25/25 [00:00<00:00, 37.42it/s]
Encoding with BERT: 100%|██████████| 7/7 [00:00<00:00, 67.63it/s]


[INFO] Raw classification report:
              precision    recall  f1-score   support

           0       0.61      0.77      0.68        87
           1       0.78      0.62      0.69       113

    accuracy                           0.69       200
   macro avg       0.69      0.69      0.68       200
weighted avg       0.70      0.69      0.69       200



Translating emojis (batched): 100%|██████████| 32/32 [00:06<00:00,  4.96it/s]
Encoding with BERT: 100%|██████████| 25/25 [00:00<00:00, 72.94it/s]
Encoding with BERT: 100%|██████████| 7/7 [00:00<00:00, 81.36it/s]


[INFO] Emojibag-GPT2 classification report:
              precision    recall  f1-score   support

           0       0.49      0.72      0.58        87
           1       0.66      0.42      0.51       113

    accuracy                           0.55       200
   macro avg       0.58      0.57      0.55       200
weighted avg       0.59      0.55      0.54       200



Translating emojis (batched): 100%|██████████| 32/32 [00:06<00:00,  5.28it/s]
Encoding with BERT: 100%|██████████| 25/25 [00:00<00:00, 69.08it/s]
Encoding with BERT: 100%|██████████| 7/7 [00:00<00:00, 87.50it/s]


[INFO] Emojibag-ChatGPT classification report:
              precision    recall  f1-score   support

           0       0.67      0.76      0.71        87
           1       0.79      0.71      0.75       113

    accuracy                           0.73       200
   macro avg       0.73      0.73      0.73       200
weighted avg       0.74      0.73      0.73       200



Unnamed: 0,Description,Precision,Recall,F1,AUROC
0,Raw,0.693434,0.694792,0.684929,0.78639
1,Emojibag-GPT2,0.575172,0.570034,0.547101,0.625928
2,Emojibag-ChatGPT,0.729373,0.733293,0.72867,0.833486
