In [1]:
import numpy as np
import pandas as pd
import torch
import csv
import transformers
import torch.nn as nn
import torch.utils.data as Data
import torch.nn.functional as F
from torch.optim import AdamW
import json
from transformers import AutoTokenizer, AutoModel,AutoModelForSequenceClassification,AutoConfig, get_linear_schedule_with_warmup


In [2]:
device = 'cuda'

In [3]:
#MODEL_NAME = "roberta-base"
MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment-latest'
MAX_LEN = 64
EPOCHS = 10
BATCH_SIZE = 32 
LR = 2e-5 
# WARMUP_STEPS = 100 
# T_TOTAL = 1000 

In [4]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [5]:
def load_dataset(filepath, max_len):
    label = []
    sentences = []
    # load dataset
    f = open(filepath, 'r', encoding='utf-8')
    r = csv.reader(f)
    for item in r:
        if r.line_num == 1:
            continue
        #print(item)
    
        label.append(int(item[2]))
        sentences.append(item[1])
        
    input_ids = []
    attention_masks = []

    # For every sentence...
    for data in sentences:
        encoded_data = tokenizer.encode_plus(
            text=data,                      # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=max_len,             # Max length to truncate/pad
            padding='max_length',           # Pad sentence to max length
            return_attention_mask=True,      # Return attention mask
            truncation= True
            )
        
        # Add the outputs to the lists
        input_ids.append(encoded_data.get('input_ids'))
        attention_masks.append(encoded_data.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    labels = torch.tensor(label)
    return input_ids, attention_masks, labels


In [6]:
import os
os.getcwd()

'f:\\Desktop\\SocialNetworkProj\\model'

In [7]:
train_dataset = load_dataset('train/train.csv', max_len = MAX_LEN)
valid_dataset = load_dataset('train/val.csv', max_len = MAX_LEN)
test_dataset = load_dataset('train/test.csv', max_len = MAX_LEN)

In [8]:
train_dataset[2]

tensor([0, 2, 0,  ..., 0, 2, 0])

In [9]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

train_data = TensorDataset(train_dataset[0], train_dataset[1],train_dataset[2])
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, sampler=train_sampler, batch_size = BATCH_SIZE)

val_data = TensorDataset(valid_dataset[0],valid_dataset[1],valid_dataset[2])
val_sampler = SequentialSampler(val_data)
val_loader = DataLoader(val_data,sampler=val_sampler, batch_size = BATCH_SIZE)

test_data = TensorDataset(test_dataset[0],test_dataset[1],test_dataset[2])

In [10]:
def batch_accuracy(pre, label):
    pre = pre.argmax(dim=1)
    correct = torch.eq(pre, label).sum().float().item()
    accuracy = correct / float(len(label))

    return accuracy

In [11]:
# config = AutoConfig.from_pretrained(MODEL_NAME)
# config.num_labels = 3
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels = 3)
model.cuda()
print()

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).





In [12]:
optimizer = AdamW(model.parameters(), lr = LR)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=total_steps)

In [13]:
from tqdm import tqdm

In [14]:
model.train()
print('training...')
best_acc = 0
for epoch in range(EPOCHS):
    loss_t, batch_loss, batch_acc, batch_counts = 0, 0, 0, 0
    
    model.train()
    for step,batch in tqdm(enumerate(train_loader), desc = "Training"):
        batch_counts +=1
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
        outputs = model(b_input_ids, b_attn_mask, labels=b_labels)
        loss, logits = outputs[:2]
        optimizer.zero_grad()
        
        loss_t += loss.item()
        batch_loss += loss.item()
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        
        
        acc = batch_accuracy(logits, b_labels)
        batch_acc += acc
        
        if (step % 200 == 0 and step != 0) or (step == len(train_loader) - 1):
            print(f'epoch:{epoch} | step:{step} | avg_batch_acc:{batch_acc/batch_counts:^.6f} | avg_batch_loss:{batch_loss/batch_counts:^.6f}')
            batch_acc, batch_loss, batch_counts = 0, 0, 0
        
    avg_train_loss = loss_t / len(train_loader)
    
    #evaluate 
    val_acc, val_loss = [],[]
    model.eval()
    for batch in val_loader:
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            outputs = model(b_input_ids, b_attn_mask, labels=b_labels)
        loss, logits = outputs[:2]
        val_loss.append(loss.item())
        acc = batch_accuracy(logits, b_labels)
        val_acc.append(acc)
        
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_acc)
            
    print(f'epoch:{epoch} | avg_train_loss:{avg_train_loss} | val_loss:{val_loss} | val_accuracy:{val_accuracy}')
    
    if val_accuracy>=best_acc:
        torch.save(model.state_dict(), 'bert_cla2.ckpt')
        best_acc = val_accuracy
        print('saving trained model...')


training...


Training: 201it [01:20,  2.57it/s]

epoch:0 | step:200 | avg_batch_acc:0.765858 | avg_batch_loss:0.560319


Training: 401it [02:38,  2.55it/s]

epoch:0 | step:400 | avg_batch_acc:0.767344 | avg_batch_loss:0.556397


Training: 601it [03:56,  2.55it/s]

epoch:0 | step:600 | avg_batch_acc:0.791562 | avg_batch_loss:0.510752


Training: 687it [04:30,  2.54it/s]

epoch:0 | step:686 | avg_batch_acc:0.796512 | avg_batch_loss:0.510198





epoch:0 | avg_train_loss:0.538473016266566 | val_loss:0.5356500614867654 | val_accuracy:0.7994186046511628
saving trained model...


Training: 201it [01:18,  2.55it/s]

epoch:1 | step:200 | avg_batch_acc:0.837065 | avg_batch_loss:0.420908


Training: 401it [02:37,  2.58it/s]

epoch:1 | step:400 | avg_batch_acc:0.835469 | avg_batch_loss:0.416487


Training: 601it [03:54,  2.59it/s]

epoch:1 | step:600 | avg_batch_acc:0.824063 | avg_batch_loss:0.448980


Training: 687it [04:27,  2.57it/s]

epoch:1 | step:686 | avg_batch_acc:0.839026 | avg_batch_loss:0.426822





epoch:1 | avg_train_loss:0.42853377351681043 | val_loss:0.5138168627786082 | val_accuracy:0.806218853820598
saving trained model...


Training: 201it [01:17,  2.59it/s]

epoch:2 | step:200 | avg_batch_acc:0.881996 | avg_batch_loss:0.323796


Training: 401it [02:34,  2.60it/s]

epoch:2 | step:400 | avg_batch_acc:0.868437 | avg_batch_loss:0.332743


Training: 601it [03:52,  2.56it/s]

epoch:2 | step:600 | avg_batch_acc:0.875469 | avg_batch_loss:0.334079


Training: 687it [04:26,  2.58it/s]

epoch:2 | step:686 | avg_batch_acc:0.881541 | avg_batch_loss:0.315840





epoch:2 | avg_train_loss:0.32839812772156024 | val_loss:0.574549911153871 | val_accuracy:0.8004568106312292


Training: 201it [01:18,  2.56it/s]

epoch:3 | step:200 | avg_batch_acc:0.917600 | avg_batch_loss:0.235902


Training: 401it [02:37,  2.58it/s]

epoch:3 | step:400 | avg_batch_acc:0.915625 | avg_batch_loss:0.243981


Training: 601it [03:55,  2.54it/s]

epoch:3 | step:600 | avg_batch_acc:0.917969 | avg_batch_loss:0.238821


Training: 687it [04:29,  2.55it/s]

epoch:3 | step:686 | avg_batch_acc:0.909520 | avg_batch_loss:0.264463





epoch:3 | avg_train_loss:0.24267895316387889 | val_loss:0.6276241961953252 | val_accuracy:0.8011835548172757


Training: 201it [01:18,  2.55it/s]

epoch:4 | step:200 | avg_batch_acc:0.947450 | avg_batch_loss:0.164868


Training: 401it [02:37,  2.57it/s]

epoch:4 | step:400 | avg_batch_acc:0.947344 | avg_batch_loss:0.162822


Training: 601it [03:55,  2.55it/s]

epoch:4 | step:600 | avg_batch_acc:0.942031 | avg_batch_loss:0.177046


Training: 687it [04:29,  2.55it/s]

epoch:4 | step:686 | avg_batch_acc:0.933866 | avg_batch_loss:0.192720





epoch:4 | avg_train_loss:0.17130433814125637 | val_loss:0.7815324270794558 | val_accuracy:0.7971864617940199


Training: 201it [01:18,  2.54it/s]

epoch:5 | step:200 | avg_batch_acc:0.964397 | avg_batch_loss:0.110018


Training: 401it [02:37,  2.56it/s]

epoch:5 | step:400 | avg_batch_acc:0.958125 | avg_batch_loss:0.125746


Training: 601it [03:55,  2.55it/s]

epoch:5 | step:600 | avg_batch_acc:0.959688 | avg_batch_loss:0.139227


Training: 687it [04:29,  2.55it/s]

epoch:5 | step:686 | avg_batch_acc:0.963663 | avg_batch_loss:0.116856





epoch:5 | avg_train_loss:0.12395617578604314 | val_loss:0.9361229580502177 | val_accuracy:0.7956810631229236


Training: 201it [01:18,  2.54it/s]

epoch:6 | step:200 | avg_batch_acc:0.974658 | avg_batch_loss:0.087821


Training: 401it [02:37,  2.57it/s]

epoch:6 | step:400 | avg_batch_acc:0.968437 | avg_batch_loss:0.103273


Training: 601it [03:55,  2.58it/s]

epoch:6 | step:600 | avg_batch_acc:0.969688 | avg_batch_loss:0.106919


Training: 687it [04:28,  2.56it/s]

epoch:6 | step:686 | avg_batch_acc:0.972384 | avg_batch_loss:0.094272





epoch:6 | avg_train_loss:0.09868683270229571 | val_loss:1.087910391563593 | val_accuracy:0.7919954318936877


Training: 201it [01:18,  2.57it/s]

epoch:7 | step:200 | avg_batch_acc:0.978545 | avg_batch_loss:0.077363


Training: 401it [02:37,  2.56it/s]

epoch:7 | step:400 | avg_batch_acc:0.978594 | avg_batch_loss:0.079324


Training: 601it [03:55,  2.55it/s]

epoch:7 | step:600 | avg_batch_acc:0.972656 | avg_batch_loss:0.088720


Training: 687it [04:29,  2.55it/s]

epoch:7 | step:686 | avg_batch_acc:0.977834 | avg_batch_loss:0.076076





epoch:7 | avg_train_loss:0.08107912474670455 | val_loss:1.1608602089244267 | val_accuracy:0.7910091362126245


Training: 201it [01:18,  2.54it/s]

epoch:8 | step:200 | avg_batch_acc:0.984297 | avg_batch_loss:0.060047


Training: 401it [02:37,  2.54it/s]

epoch:8 | step:400 | avg_batch_acc:0.985313 | avg_batch_loss:0.051239


Training: 601it [03:55,  2.56it/s]

epoch:8 | step:600 | avg_batch_acc:0.980313 | avg_batch_loss:0.073381


Training: 687it [04:29,  2.55it/s]

epoch:8 | step:686 | avg_batch_acc:0.982922 | avg_batch_loss:0.051682





epoch:8 | avg_train_loss:0.06031756949456127 | val_loss:1.3060335652079693 | val_accuracy:0.7932412790697675


Training: 201it [01:19,  2.55it/s]

epoch:9 | step:200 | avg_batch_acc:0.985852 | avg_batch_loss:0.054313


Training: 401it [02:38,  2.51it/s]

epoch:9 | step:400 | avg_batch_acc:0.987500 | avg_batch_loss:0.042552


Training: 601it [03:57,  2.52it/s]

epoch:9 | step:600 | avg_batch_acc:0.988281 | avg_batch_loss:0.042901


Training: 687it [04:31,  2.53it/s]

epoch:9 | step:686 | avg_batch_acc:0.982195 | avg_batch_loss:0.063108





epoch:9 | avg_train_loss:0.048667650760556104 | val_loss:1.3368132090152696 | val_accuracy:0.7950581395348837


In [15]:
print('开始加载训练完成的model...')
model.load_state_dict(torch.load('bert_cla2.ckpt'))

开始加载训练完成的model...


<All keys matched successfully>

In [16]:
print('开始测试...')
model.eval()
test_result = []
for data in test_data:
    b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in data)
    b_input = b_input_ids.unsqueeze(0)
    
    with torch.no_grad():
        outputs = model(b_input)
        pre = outputs.logits.argmax(dim=1)
        test_result.append([b_labels.item(), pre.item(), tokenizer.convert_ids_to_tokens(b_input_ids)])

# 写入csv文件
df = pd.DataFrame(test_result)
df.to_csv('test_result.csv',index=False, header=['id', 'label','text'])

开始测试...


In [17]:
import pandas as pd
df = pd.read_csv('test_result.csv')
df_e = df[df.id!=df.label]

In [18]:
df

Unnamed: 0,id,label,text
0,1,1,"['<s>', 'the', 'Ġgr', 'ates', 'Ġchildren', 'Ġc..."
1,0,0,"['<s>', 'going', 'Ġto', 'Ġsleep', 'Ġgonna', 'Ġ..."
2,1,1,"['<s>', 'loo', 'ool', 'Ġme', 'Ġana', 'Ġi', 'Ġw..."
3,2,1,"['<s>', 'oh', 'Ġno', 'Ġi', 'Ġhope', 'Ġyou', 'Ġ..."
4,1,1,"['<s>', 'see', 'Ġyou', 'Ġon', 'Ġmay', '</s>', ..."
...,...,...,...
2743,1,1,"['<s>', 'its', 'Ġbout', 'Ġsmoking', 'Ġweed', '..."
2744,2,2,"['<s>', 'i', 'Ġthink', 'Ġthats', 'Ġpretty', 'Ġ..."
2745,0,0,"['<s>', 'went', 'Ġto', 'Ġsleep', 'Ġand', 'Ġthe..."
2746,1,2,"['<s>', 'wh', 'ar', 'Ġa', 'Ġnight', 'Ġwoo', 'Ġ..."


In [19]:
df_e

Unnamed: 0,id,label,text
3,2,1,"['<s>', 'oh', 'Ġno', 'Ġi', 'Ġhope', 'Ġyou', 'Ġ..."
10,1,0,"['<s>', 'i', 'Ġhad', 'Ġit', 'Ġon', 'Ġmy', 'Ġit..."
14,0,2,"['<s>', 'i', 'Ġhope', 'Ġso', 'Ġrecorded', 'Ġth..."
18,1,0,"['<s>', 'i', 'Ġfeel', 'Ġsome', 'Ġtype', 'Ġof',..."
26,1,0,"['<s>', 'day', 'Ġof', 'Ġwork', 'Ġtoday', 'Ġwas..."
...,...,...,...
2722,2,1,"['<s>', 'my', 'Ġhub', 'by', 'Ġand', 'Ġhis', 'Ġ..."
2731,1,0,"['<s>', 'i', 'Ġcant', 'Ġfind', 'Ġmy', 'Ġtennis..."
2735,1,2,"['<s>', 'thanks', 'Ġf', 'ot', 'ore', 'port', '..."
2736,1,2,"['<s>', 'y', 'ep', 'Ġthat', 'Ġone', 'Ġworks', ..."


In [20]:
1- len(df_e)/len(df)

0.7914847161572053