In [1]:
import torch
import pandas as pd
from transformers import DebertaTokenizer, DebertaForSequenceClassification, TextClassificationPipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import TextClassificationPipeline

In [None]:
# Use the first CUDA GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Load the pretrained DeBERTa-base tokenizer and sequence-classification model,
# then place the model on the device chosen above.
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')
model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base').to(device)

# Data Model

In [11]:
class RedditDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs per-example encodings with their labels.

    ``encodings`` is a mapping from feature name to an indexable collection of
    per-example values; ``labels`` is an indexable collection holding one
    entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including a ``labels`` entry."""
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


# Load the labelled Reddit comments (pipe-delimited CSV).
df = pd.read_csv("data/merged_labeled_cleaned.csv", sep='|', index_col=False)
# Drop rows with a missing body or label *before* the string cast below:
# astype(str) would otherwise turn NaN into the literal text "nan" and the
# model would be trained on it as if it were a real comment.
df = df.dropna(subset=["clean_body", "label"])
df = df.astype({"clean_body": str})

# Map the two string classes to integer ids. LabelEncoder sorts its classes,
# so "fear" -> 0 and "greed" -> 1.
le = LabelEncoder()
le.fit(["fear", "greed"])

# Fixed seed so the 80/20 train/validation split is identical on every
# Restart & Run All.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["clean_body"], df["label"], test_size=0.2, random_state=42
)

# Tokenize each split in one call; truncation and padding are both enabled so
# every example in a split ends up with the same length.
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True)

# Wrap encodings and integer-encoded labels for the Trainer.
train_dataset = RedditDataset(train_encodings, le.transform(train_labels))
val_dataset = RedditDataset(val_encodings, le.transform(val_labels))

# Fine-tuning

In [None]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters and output locations for the fine-tuning run.
training_args = TrainingArguments(
    # where results and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # schedule
    num_train_epochs=3,                # total number of training epochs
    warmup_steps=5,                    # warmup steps for the learning-rate scheduler
    weight_decay=0.01,                 # strength of weight decay
    # batch sizes, per device
    per_device_train_batch_size=1,     # during training
    per_device_eval_batch_size=1,      # during evaluation
)

# Bind the model, the arguments above, and the two datasets into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Run the fine-tuning loop.
trainer.train()

# Accuracy

In [None]:
from sentence_transformers import SentenceTransformer

# NOTE(review): the sentence encoder is no longer fed into the classifier (see
# the loop below). It is still loaded in case a later cell relies on `encoder`;
# remove it if nothing else uses it.
encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Smaller labelled set used for scoring.
# Assumes the same schema as the training CSV, i.e. a `clean_body` text column
# next to `label` — TODO confirm.
df = pd.read_csv('data/small_merged_labeled_cleaned.csv', delimiter='|', index_col=False)

# Build the pipeline on the same device the fine-tuned model lives on, so the
# tokenized inputs and the model weights do not end up on different devices.
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
    device=(model.device.index or 0) if model.device.type == "cuda" else -1,
)

# Classify the raw comment text. The previous version passed a sentence
# embedding of the *label* column to the pipeline, which expects plain strings
# and should never see the gold label. `truncation=True` mirrors the
# tokenization used for training so long comments do not exceed the model's
# maximum sequence length. One pipeline output is kept per row, in row order.
result = [pipe(text, truncation=True) for text in df["clean_body"].astype(str)]