# Load Data

In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
train=pd.read_csv('../weibo_emoji_data/train_weibo_emoji.csv')
test=pd.read_csv('../weibo_emoji_data/test_weibo_emoji.csv')

In [3]:
train.head()

Unnamed: 0,comment_text,label
0,我很佩服国外一些真·女权人士 为追求平等权利 而国内大部分都是田园👊基本上都是为了拿特权,1
1,看来你的脑子真是有问题，你连你讨论的目标该是什么人都搞不清楚吗？我们在讨论的是事件外评论者呢...,1
2,大魔王和马良吧！这两个人简直标准答案，其它人也不抄一下😅,0
3,我一开始了解狗粉丝是因为演员热依扎，当时她因为抑郁症精神状态很不好，大批狗粉丝辱骂她让她早点...,0
4,有意识地建立自己的社群也很重要，一个人不穿bra走在路上孤单单，一群人不穿bra走在路上理直...,0


In [4]:
from datasets import load_dataset
dataset=load_dataset('csv',data_files={'train':'../weibo_emoji_data/train_weibo_emoji.csv','test':'../weibo_emoji_data/test_weibo_emoji.csv'})

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [5]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['comment_text', 'label'],
        num_rows: 2400
    })
    test: Dataset({
        features: ['comment_text', 'label'],
        num_rows: 600
    })
})


In [6]:
from collections import Counter
print(Counter(dataset['train']['label']))
print(Counter(dataset['test']['label']))

Counter({1: 1265, 0: 1135})
Counter({1: 316, 0: 284})


# Tokenization

In [7]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-large')

In [8]:
def tokenize_function(example):
    return tokenizer(example['comment_text'],padding="max_length",truncation=True,return_tensors="pt")
tokenized_dataset=dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [9]:
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['comment_text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2400
    })
    test: Dataset({
        features: ['comment_text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 600
    })
})


In [10]:
tokenized_dataset=tokenized_dataset.remove_columns(['comment_text'])
tokenized_dataset=tokenized_dataset.rename_column('label','labels')
tokenized_dataset.set_format('torch')
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2400
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 600
    })
})


In [11]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(tokenized_dataset['train'], batch_size=4, shuffle=True)
test_dataloader = DataLoader(tokenized_dataset['test'], batch_size=4, shuffle=True)

# Model and Optimizer

In [12]:
id2label = {0: "NONHATE", 1: "HATE"}
label2id = {"NONHATE": 0, "HATE": 1}

In [13]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-large', num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=1e-6)

In [15]:
from transformers import get_scheduler
num_epochs = 5
num_training_steps = len(train_dataloader) * num_epochs
scheduler = get_scheduler(name='linear',optimizer=optimizer,num_warmup_steps=50,num_training_steps=num_training_steps)

In [16]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=1024, out_features=3072, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=1024, out_features=1024, bias=False)
              (pos_q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
   

# Training and evaluation

In [17]:
from tqdm.auto import tqdm

def train(model, train_dataloader, optimizer, scheduler, num_epochs, num_training_steps):
    progress_bar=tqdm(range(num_training_steps))
    model.train()
    
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs=model(**batch)
            loss=outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

In [18]:
import evaluate

def eval(model, test_dataloader):
    acc=evaluate.load('accuracy')
    f1=evaluate.load('f1')
    mf1=evaluate.load('f1')
    progress_bar = tqdm(range(len(test_dataloader)))

    all_predictions = []
    all_references = []

    model.eval()

    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs=model(**batch)
        
        logits=outputs.logits
        predictions=torch.argmax(logits,dim=-1)

        all_predictions.extend(predictions.cpu().numpy())
        all_references.extend(batch['labels'].cpu().numpy())
        progress_bar.update(1)

    accuracy=acc.compute(predictions=all_predictions,references=all_references)
    f1_score=f1.compute(predictions=all_predictions,references=all_references,average=None)
    macro_f1=mf1.compute(predictions=all_predictions,references=all_references,average='macro')
    
    return accuracy, f1_score, macro_f1

In [None]:
train(model, train_dataloader, optimizer, scheduler, num_epochs, num_training_steps)
acc,f1,mf1 = eval(model, test_dataloader)

In [21]:
print('Accuracy: ',acc['accuracy'])
print('F1_sex: ', f1['f1'][1])
print('F1_not: ', f1['f1'][0])
print('F1_macro: ', mf1['f1'])