# Load Data 

In [2]:
import pandas as pd
import numpy as np
import torch

In [3]:
train=pd.read_csv('../weibo_text_data/train_weibo_text.csv')
test=pd.read_csv('../weibo_text_data/test_weibo_text.csv')

In [4]:
train.head()

Unnamed: 0,comment_text,label
0,我在工作后才开始怀疑自己是不是真的偏右，因为在网络上我其实是反对这些很过头的甚至会有恶劣负面...,0
1,唉 上世纪70年代的阿富汗女性还是可以抛头露面的 她们的着装到现在都不过时,0
2,我觉得是这个文章源头pagesix的故意，毕竟写的又不止甜茶一个,0
3,对的女生投稿喜欢男方再说一下男方性生活方面的事直接被骂 最近刚取关 真的无知当个性,0
4,"搞极端吸脑残粉变现挣钱呗，手段多着呢，我这几天看了很多那种“女权”博主，发现的奥秘所在,我觉...",1


In [5]:
print(train.shape)
print(test.shape)

(8072, 2)
(897, 2)


In [6]:
from datasets import load_dataset
dataset=load_dataset('csv',data_files={'train': '../weibo_text_data/train_weibo_text.csv', 'test': '../weibo_text_data/test_weibo_text.csv'})

In [7]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['comment_text', 'label'],
        num_rows: 8072
    })
    test: Dataset({
        features: ['comment_text', 'label'],
        num_rows: 897
    })
})


In [8]:
from collections import Counter
print(Counter(dataset['train']['label']))
print(Counter(dataset['test']['label']))

Counter({0: 5288, 1: 2784})
Counter({0: 588, 1: 309})


# Tokenization

In [9]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-large')

In [10]:
def tokenize_function(example):
    return tokenizer(example['comment_text'],padding="max_length",truncation=True,return_tensors="pt")
tokenized_dataset=dataset.map(tokenize_function, batched=True)

In [11]:
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['comment_text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8072
    })
    test: Dataset({
        features: ['comment_text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 897
    })
})


In [12]:
tokenized_dataset=tokenized_dataset.remove_columns(['comment_text'])
tokenized_dataset=tokenized_dataset.rename_column('label','labels')
tokenized_dataset.set_format('torch')
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8072
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 897
    })
})


In [13]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(tokenized_dataset['train'], batch_size=4, shuffle=True)
test_dataloader = DataLoader(tokenized_dataset['test'], batch_size=4, shuffle=True)

# Model and Optimizer

In [14]:
id2label = {0: "NONHATE", 1: "HATE"}
label2id = {"NONHATE": 0, "HATE": 1}

In [15]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-large', num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=1e-6)

In [17]:
from transformers import get_scheduler
num_epochs = 5
num_training_steps = len(train_dataloader) * num_epochs
scheduler = get_scheduler(name='linear',optimizer=optimizer,num_warmup_steps=50,num_training_steps=num_training_steps)

In [18]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=1024, out_features=3072, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=1024, out_features=1024, bias=False)
              (pos_q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
   

# Training and evaluation

In [19]:
from tqdm.auto import tqdm

def train(model, train_dataloader, optimizer, scheduler, num_epochs, num_training_steps):
    progress_bar=tqdm(range(num_training_steps))
    model.train()
    
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs=model(**batch)
            loss=outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

In [20]:
import evaluate

def eval(model, test_dataloader):
    acc=evaluate.load('accuracy')
    f1=evaluate.load('f1')
    mf1=evaluate.load('f1')
    progress_bar = tqdm(range(len(test_dataloader)))

    all_predictions = []
    all_references = []

    model.eval()

    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs=model(**batch)
        
        logits=outputs.logits
        predictions=torch.argmax(logits,dim=-1)

        all_predictions.extend(predictions.cpu().numpy())
        all_references.extend(batch['labels'].cpu().numpy())
        progress_bar.update(1)

    accuracy=acc.compute(predictions=all_predictions,references=all_references)
    f1_score=f1.compute(predictions=all_predictions,references=all_references,average=None)
    macro_f1=mf1.compute(predictions=all_predictions,references=all_references,average='macro')
    
    return accuracy, f1_score, macro_f1

In [None]:
train(model, train_dataloader, optimizer, scheduler, num_epochs, num_training_steps)
acc, f1, mf1 = eval(model, test_dataloader)

In [None]:
print("Result: Weibo_text_prev")
print('Accuracy: ', acc['accuracy'])
print('F1_hate: ', f1['f1'][1])
print('F1_not: ', f1['f1'][0])
print('F1_macro: ', mf1['f1'])