In [1]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
import transformers
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm

In [2]:
# Proxy settings (uncomment to route HTTP/HTTPS traffic through a local proxy)
# os.environ["http_proxy"] = "http://127.0.0.1:7890"
# os.environ["https_proxy"] = "http://127.0.0.1:7890"

In [3]:
# Text labels in column ``v1`` are replaced by integer class ids.
LABEL_TO_ID = {"ham": 0, "spam": 1}

train_data = pd.read_csv("./train.csv")
test_data = pd.read_csv("./test.csv")

# Apply the same label encoding to both splits.
for frame in (train_data, test_data):
    frame["v1"] = frame["v1"].replace(LABEL_TO_ID)

train_data.shape, test_data.shape


((3537, 6), (2035, 6))

### 构造 DataSet 和 DataLoader

In [4]:
class SMSDataset(Dataset):
    """Dataset that tokenizes one row of a DataFrame per item.

    Rows are addressed positionally: column 2 of ``data`` is passed to the
    tokenizer as the text and column 1 is returned as the target.

    Args:
        data: DataFrame-like object supporting ``len()`` and ``.iloc[row, col]``.
        tokenizer: object exposing ``encode_plus`` and returning a mapping with
            ``"input_ids"``, ``"token_type_ids"`` and ``"attention_mask"``.
        max_length: length every sequence is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_length):
        super().__init__()
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying data."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return its tensors.

        Returns:
            dict with ``'ids'``, ``'mask'``, ``'token_type_ids'`` and
            ``'target'``, every value a ``torch.long`` tensor.
        """
        message = self.data.iloc[index, 2]

        encoded = self.tokenizer.encode_plus(
            message,
            None,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_attention_mask=True,
        )

        def as_long(values):
            # Every field handed to the model / loss is an int64 tensor.
            return torch.tensor(values, dtype=torch.long)

        return {
            'ids': as_long(encoded["input_ids"]),
            'mask': as_long(encoded["attention_mask"]),
            'token_type_ids': as_long(encoded["token_type_ids"]),
            'target': as_long(self.data.iloc[index, 1]),
        }


### 创建模型

In [5]:
class Classifier(nn.Module):
    """BERT encoder followed by a small MLP head with a sigmoid output.

    The head is ``Linear -> ReLU -> Linear -> ReLU -> Linear(…, 1) -> Sigmoid``,
    so the model emits one value in (0, 1) per input row, suitable for
    ``nn.BCELoss``.

    Args:
        pretrained_name: checkpoint name or path handed to
            ``transformers.BertModel.from_pretrained``. Defaults to
            ``"bert-base-uncased"``, so ``Classifier()`` behaves as before.
    """

    def __init__(self, pretrained_name="bert-base-uncased"):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(pretrained_name)

        # Size the head from the encoder's config instead of hard-coding 768,
        # so other BERT checkpoints work without editing the class.
        hidden = self.bert.config.hidden_size

        # Layer order (and therefore state_dict keys head.0 / head.2 / head.4)
        # is kept identical to the original definition.
        self.head = nn.Sequential(
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            # nn.Dropout(0.5),
            nn.Linear(hidden, 1),
            nn.Sigmoid()
        )

    def forward(self, ids, mask, token_type_ids):
        """Run BERT and the head.

        Args:
            ids: token id tensor passed to BERT as its first argument.
            mask: attention mask tensor.
            token_type_ids: segment id tensor.

        Returns:
            Output of the head applied to the second element of BERT's tuple
            output (the pooled output of Hugging Face's ``BertModel``).
        """
        outputs = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False
        )
        # Index instead of 2-way unpacking: the tuple grows when the encoder is
        # configured to also return hidden states or attentions.
        pooled = outputs[1]
        return self.head(pooled)


### 训练

In [6]:
# Hyper-parameters consumed by the training cells below.
num_epochs = 20   # full passes over the training set
lr = 1e-4         # optimizer learning rate (op = 0.0001)
batch_size = 64   # samples per batch for both data loaders
# weight_decay = 0.005


In [None]:
# Tokenizer shared by both splits; every message is padded/truncated to 100 tokens.
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
train_dataset, test_dataset = (
    SMSDataset(frame, tokenizer, max_length=100)
    for frame in (train_data, test_data)
)

model = Classifier()

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

device

In [None]:
# Optional: freeze BERT so that only the classification head is trained
# for param in model.bert.parameters():
#     param.requires_grad = False

# init_weights
def init_weights(m):
    """Xavier-initialise an ``nn.Linear`` layer in place; ignore other modules.

    Intended for ``module.apply(init_weights)``. The weight is filled with
    Xavier-uniform values and the bias, when the layer has one, with zeros.

    Args:
        m: any ``nn.Module``; only instances whose exact type is ``nn.Linear``
            are modified.
    """
    if type(m) is nn.Linear:
        nn.init.xavier_uniform_(m.weight)
        # Layers created with bias=False have ``bias is None``; zeroing None
        # would raise, so skip it.
        if m.bias is not None:
            nn.init.zeros_(m.bias)

# Re-initialise only the classification head with init_weights; the BERT
# encoder (model.bert) is not passed through this call.
model.head.apply(init_weights)

In [None]:
def eval(model, dataloader, loss_fn, return_pred=False):
    """Evaluate ``model`` on every batch of ``dataloader`` with gradients disabled.

    Relies on the module-level ``device`` for tensor placement. The name
    shadows Python's built-in ``eval``; it is kept because other cells call
    this function by that name.

    Args:
        model: module called as ``model(ids=..., mask=..., token_type_ids=...)``.
        dataloader: sized iterable of batch dicts with the keys ``'ids'``,
            ``'token_type_ids'``, ``'mask'`` and ``'target'``.
        loss_fn: callable ``loss_fn(output, label)`` returning a scalar tensor.
        return_pred: when True, also return the rounded predictions.

    Returns:
        ``(pred_list, test_loss_total, test_accuracy_total)``:
        ``pred_list`` is a 1-D tensor of rounded outputs (``None`` unless
        ``return_pred``), ``test_loss_total`` is the sum of the per-batch loss
        values, and ``test_accuracy_total`` is correct predictions divided by
        the number of samples (``0.0`` when the dataloader yields no samples).
    """
    # set model to eval mode
    model.eval()

    # init
    sample_num = 0
    test_loss_total = 0.0
    test_correct_pred_total = 0
    pred_list = [] if return_pred else None

    # start evaluation
    with torch.no_grad():
        # The description never changes, so set it once instead of per batch.
        loop = tqdm(
            enumerate(dataloader),
            leave=False,
            total=len(dataloader),
            desc='Evalating ...',
        )
        for batch, inputs in loop:
            ids, token_type_ids, mask = inputs['ids'], inputs['token_type_ids'], inputs['mask']
            # (batch,) -> (batch, 1) so the label lines up with the model output
            label = inputs['target'].unsqueeze(1)

            # forward
            output = model(
                ids=ids.to(device),
                mask=mask.to(device),
                token_type_ids=token_type_ids.to(device)
            )

            # update total number
            sample_num += ids.shape[0]

            # update test_loss_total (label is cast to the output's tensor type first)
            label = label.type_as(output)
            test_loss_total += loss_fn(output, label).item()

            # update test_correct_pred_total
            pred = torch.round(output)
            test_correct_pred_total += (pred == label).sum().item()

            # if return_pred is True, add pred to pred_list
            if return_pred:
                pred_list.append(pred.reshape(-1))

    if return_pred:
        # torch.cat rejects an empty list, so an empty loader yields an empty tensor.
        pred_list = torch.cat(pred_list) if pred_list else torch.empty(0)

    # Guard against division by zero when no sample was seen.
    test_accuracy_total = test_correct_pred_total / sample_num if sample_num else 0.0

    return pred_list, test_loss_total, test_accuracy_total


def train(model, dataloader, loss_fn, optimizer, epoch):
    """Train ``model`` for one pass over ``dataloader``.

    Relies on the module-level ``device`` for tensor placement.

    Args:
        model: module called as ``model(ids=..., mask=..., token_type_ids=...)``.
        dataloader: sized iterable of batch dicts with the keys ``'ids'``,
            ``'token_type_ids'``, ``'mask'`` and ``'target'``.
        loss_fn: callable ``loss_fn(output, label)`` returning a scalar tensor.
        optimizer: optimizer over the model's parameters; stepped once per batch.
        epoch: zero-based epoch index, used only for the progress bar text.

    Returns:
        ``(train_loss_total, train_accuracy_total)``: the sum of the per-batch
        loss values and correct predictions divided by the number of samples
        (``0.0`` when the dataloader yields no samples).
    """
    # set model to train mode
    model.train()

    # init
    sample_num = 0
    train_loss_total = 0.0
    train_correct_pred_total = 0
    num_batches = len(dataloader)

    # start training
    loop = tqdm(enumerate(dataloader), leave=False, total=num_batches)
    for batch, inputs in loop:
        # get input data
        ids, token_type_ids, mask = inputs['ids'], inputs['token_type_ids'], inputs['mask']
        # (batch,) -> (batch, 1) so the label lines up with the model output
        label = inputs['target'].unsqueeze(1)

        # forward
        optimizer.zero_grad()
        output = model(
            ids=ids.to(device),
            mask=mask.to(device),
            token_type_ids=token_type_ids.to(device)
        )

        # backward (label is cast to the output's tensor type first)
        label = label.type_as(output)
        loss = loss_fn(output, label)
        loss.backward()
        optimizer.step()
        batch_loss = loss.item()

        # calculate accuracy on a detached copy so the metric adds nothing to autograd
        pred = torch.round(output.detach())
        num_correct = (pred == label).sum().item()
        num_samples = label.shape[0]
        accuracy = num_correct / num_samples

        # update total number, train_loss_total and train_correct_pred_total
        sample_num += ids.shape[0]
        train_loss_total += batch_loss
        train_correct_pred_total += num_correct

        # Show progress while training; batch is 0-based, so display batch + 1
        # to match the 1-based epoch number and the batch total.
        loop.set_description(f'Epoch {epoch+1}, batch {batch + 1} / {num_batches}')
        loop.set_postfix(loss=batch_loss, acc=accuracy)

    # Guard against division by zero when no sample was seen.
    train_accuracy_total = train_correct_pred_total / sample_num if sample_num else 0.0
    return train_loss_total, train_accuracy_total


def train_and_eval(model, train_dataset, test_dataset, num_epochs, lr, batch_size):
    """Train ``model`` for ``num_epochs`` epochs, evaluating after each one.

    Uses ``nn.BCELoss`` and ``optim.AdamW`` and moves the model to the
    module-level ``device``. Per-epoch metrics are printed; nothing is returned.

    Args:
        model: module to train (modified in place).
        train_dataset: dataset used for optimisation.
        test_dataset: dataset used for the per-epoch evaluation.
        num_epochs: number of passes over ``train_dataset``.
        lr: learning rate handed to AdamW.
        batch_size: batch size for both data loaders.
    """
    # dataloader -- shuffle the training split so batches differ between
    # epochs; the evaluation split keeps its order so predictions stay aligned
    # with the rows of the dataset.
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size)
    # loss
    loss_fn = nn.BCELoss()
    # optimizer
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    # move model to device
    model.to(device)

    # train() / eval() return the SUM of per-batch losses. The two loaders have
    # different batch counts, so divide by the count to report comparable
    # per-batch means (max(..., 1) avoids dividing by zero on an empty loader).
    train_batches = max(len(train_dataloader), 1)
    test_batches = max(len(test_dataloader), 1)

    for epoch in range(num_epochs):
        # training
        train_loss_total, train_accuracy_total = train(model, train_dataloader, loss_fn, optimizer, epoch)
        # evaluation
        _, test_loss_total, test_accuracy_total = eval(model, test_dataloader, loss_fn)

        train_loss = train_loss_total / train_batches
        test_loss = test_loss_total / test_batches

        print(f"Epoch {epoch + 1} end.")
        print(f"Train loss: {train_loss}, Train accuracy: {train_accuracy_total}")
        print(f"Test loss: {test_loss}, Test accuracy: {test_accuracy_total}")
                
        

In [None]:
# Display the model's module tree as the cell output.
model

Classifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [None]:
# Run the full train / evaluate loop with the hyper-parameters defined above.
train_and_eval(model, train_dataset, test_dataset, num_epochs, lr, batch_size)

In [None]:
# Persist only the state_dict (not the whole module object) to model.pth.
torch.save(model.state_dict(), "model.pth")

In [None]:
# Score the test split once more, this time keeping the rounded predictions.
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size)
loss_fn = nn.BCELoss()
pred, test_loss_total, test_accuracy_total = eval(
    model, test_dataloader, loss_fn, return_pred=True
)
print(test_accuracy_total)

# One label per line, in dataset order: a prediction of exactly 0.0 is written
# as "ham", anything else as "spam".
submission_lines = (
    "ham\n" if value == 0.0 else "spam\n"
    for value in pred.cpu().detach().numpy()
)
with open("submission.txt", "w") as f:
    f.writelines(submission_lines)
