In [None]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
import transformers
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pack_padded_sequence
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm

In [None]:
# os.environ["http_proxy"] = "http://127.0.0.1:7890"
# os.environ["https_proxy"] = "http://127.0.0.1:7890"

In [None]:
# Load the labelled training data and the test data, then carve a validation
# split (20%) out of the training data.
SPLIT_SEED = 42  # fixed seed so the split survives "Restart Kernel -> Run All"
LABEL_TO_ID = {"ham": 0, "spam": 1}

full_train_data = pd.read_csv("./train.csv")
test_data = pd.read_csv("./test.csv")

# A dedicated Generator keeps the split reproducible without touching NumPy's
# global random state.
shuffled_indices = np.random.default_rng(SPLIT_SEED).permutation(len(full_train_data))
split_point = int(0.8 * len(full_train_data))

# .copy() detaches each split from the parent frame; assigning into a bare
# .iloc selection triggers pandas' SettingWithCopyWarning.
train_data = full_train_data.iloc[shuffled_indices[:split_point]].copy()
valid_data = full_train_data.iloc[shuffled_indices[split_point:]].copy()

# Encode the string labels stored in column "v1" as integers (ham -> 0, spam -> 1).
train_data["v1"] = train_data["v1"].replace(LABEL_TO_ID)
test_data["v1"] = test_data["v1"].replace(LABEL_TO_ID)
valid_data["v1"] = valid_data["v1"].replace(LABEL_TO_ID)

train_data.shape, valid_data.shape, test_data.shape

### 构造 DataSet 和 DataLoader

In [None]:
class SMSDataset(Dataset):
    """Map-style dataset that tokenizes one row of a DataFrame per item.

    Args:
        data: DataFrame-like object indexed positionally through ``.iloc``.
            The text is read from positional column 2 and the label from
            positional column 1 (presumably an index column comes first;
            verify against the CSV schema).
        tokenizer: object exposing ``encode_plus`` whose result contains the
            keys ``input_ids``, ``token_type_ids`` and ``attention_mask``.
        max_length: length every sequence is padded / truncated to.
    """

    # Positional column indices read by __getitem__.
    _TEXT_COLUMN = 2
    _LABEL_COLUMN = 1

    def __init__(self, data, tokenizer, max_length):
        super().__init__()
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the encoded text and label of row ``index`` as long tensors."""
        encoding = self.tokenizer.encode_plus(
            self.data.iloc[index, self._TEXT_COLUMN],
            None,  # no second sentence (text pair)
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )
        label = self.data.iloc[index, self._LABEL_COLUMN]

        def as_long(values):
            return torch.tensor(values, dtype=torch.long)

        return {
            'ids': as_long(encoding["input_ids"]),
            'mask': as_long(encoding["attention_mask"]),
            'token_type_ids': as_long(encoding["token_type_ids"]),
            'target': as_long(label),
        }
    

### 创建模型

In [None]:
class Classifier(nn.Module):
    """BERT encoder -> bidirectional LSTM -> MLP head producing one sigmoid score.

    The forward pass returns a tensor whose last dimension is 1 and whose values
    lie in (0, 1) (final ``nn.Sigmoid``), suitable for ``nn.BCELoss``.
    """

    def __init__(self):
        super(Classifier, self).__init__()
        self.bert = transformers.BertModel.from_pretrained("bert-base-uncased")
        self.hidden_size = self.bert.config.hidden_size
        # batch_first is left at its default (False): the LSTM consumes
        # sequence-major input, see the permute in forward().
        self.LSTM = nn.LSTM(self.hidden_size, self.hidden_size, bidirectional=True)
        self.head = nn.Sequential(
            # Input is the concatenation of both LSTM directions.
            nn.Linear(self.hidden_size * 2, self.hidden_size),
            nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(self.hidden_size, 1),
            nn.Sigmoid()
        )

    def forward(self, ids, mask, token_type_ids):
        """Score a batch of tokenized messages.

        Args:
            ids: token id tensor passed to BERT as its first argument.
            mask: attention mask; its sum over the last dimension is used as
                the unpadded length of each sequence, so every row must
                contain at least one non-zero entry.
            token_type_ids: segment ids forwarded to BERT.

        Returns:
            Tensor of sigmoid scores with a trailing dimension of size 1.
        """
        encoded_layers, pooled_output = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False
        )
        # Swap the first two axes so the sequence axis comes first, as the
        # (batch_first=False) LSTM expects.
        encoded_layers = encoded_layers.permute(1, 0, 2)
        # pack_padded_sequence requires the lengths on the CPU.
        lengths = torch.sum(mask, dim=-1).cpu()
        all_hiddens, (last_hidden, last_cell) = self.LSTM(
            pack_padded_sequence(
                encoded_layers,
                lengths,
                enforce_sorted=False
            )
        )
        # Final hidden state of the forward (index 0) and backward (index 1) direction.
        out = torch.cat((last_hidden[0], last_hidden[1]), dim=1)
        # F.dropout defaults to training=True; without tying it to self.training
        # dropout would stay active after model.eval() and make inference random.
        out = F.dropout(out, 0.5, training=self.training)
        out = self.head(out)
        return out


### 训练

In [None]:
# Training hyper-parameters (tune here).
batch_size = 64   # samples per mini-batch
num_epochs = 10   # full passes over the training set
lr = 1e-5         # optimizer learning rate; earlier note: "op = 0.0001"
# weight_decay = 0.005


In [None]:
# Tokenizer shared by all three splits; it matches the checkpoint loaded
# inside Classifier ("bert-base-uncased").
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")

# Every split is padded / truncated to the same sequence length.
train_dataset, test_dataset, valid_dataset = (
    SMSDataset(frame, tokenizer, max_length=100)
    for frame in (train_data, test_data, valid_data)
)

model = Classifier()

# Use the first GPU when CUDA is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

device

In [None]:
# frozen bert
# for param in model.bert.parameters():
#     param.requires_grad = False

# init_weights
def init_weights(m):
    """Xavier-initialise the weight of a linear layer and zero its bias.

    Intended to be passed to ``nn.Module.apply``; modules that are not
    ``nn.Linear`` (or a subclass) are left untouched.

    Args:
        m: the module visited by ``apply``.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        # Layers built with bias=False have m.bias set to None.
        if m.bias is not None:
            nn.init.zeros_(m.bias)

# Re-initialise only the layers of the classification head; the BERT encoder
# and the LSTM are not visited by this call.
model.head.apply(init_weights)

In [None]:
def eval(model, dataloader, loss_fn, return_pred=False):
    """Evaluate ``model`` on every batch of ``dataloader`` without gradients.

    Note: this function shadows Python's builtin ``eval`` within the notebook.

    Args:
        model: module called as ``model(ids=..., mask=..., token_type_ids=...)``;
            its output is rounded to obtain the predicted class, so it is
            expected to be a probability.
        dataloader: yields dicts with the keys ``ids``, ``token_type_ids``,
            ``mask`` and ``target``.
        loss_fn: callable ``loss_fn(output, label)`` returning a scalar tensor.
        return_pred: when True, also return the rounded predictions.

    Returns:
        Tuple ``(pred_list, test_loss_total, test_accuracy_total)``:
        ``pred_list`` is a 1-D tensor of rounded predictions (``None`` unless
        ``return_pred``), ``test_loss_total`` is the sum of the per-batch
        losses, and ``test_accuracy_total`` is the fraction of correctly
        classified samples among those actually evaluated.
    """
    # set model to eval mode
    model.eval()

    # init
    test_loss_total = 0
    test_correct_pred_total = 0
    test_samples_total = 0
    pred_list = [] if return_pred else None

    # start evaluation
    with torch.no_grad():
        loop = tqdm(enumerate(dataloader), leave=False, total=len(dataloader))
        for batch, inputs in loop:
            ids, token_type_ids, mask = inputs['ids'], inputs['token_type_ids'], inputs['mask']
            label = inputs['target'].unsqueeze(1)

            # forward
            output = model(
                ids=ids.to(device),
                mask=mask.to(device),
                token_type_ids=token_type_ids.to(device)
            )

            # update test_loss_total
            label = label.type_as(output)
            loss = loss_fn(output, label)
            test_loss_total += loss.item()

            # update test_correct_pred_total
            pred = torch.round(output)
            test_correct_pred_total += (pred == label).sum().item()
            # Count the samples really seen instead of relying on the size of
            # a global dataset, so the accuracy is right for any dataloader.
            test_samples_total += label.shape[0]

            # if return_pred is True, add pred to pred_list
            if return_pred:
                pred_list.append(pred.reshape(-1))

            # Show progress while evaluating
            loop.set_description(f'Evalating ...')

    if return_pred:
        # torch.cat raises on an empty list (dataloader without batches).
        pred_list = torch.cat(pred_list) if pred_list else torch.empty(0)
    else:
        pred_list = None

    if test_samples_total:
        test_accuracy_total = test_correct_pred_total / test_samples_total
    else:
        test_accuracy_total = 0.0

    return pred_list, test_loss_total, test_accuracy_total


def train(model, dataloader, loss_fn, optimizer, epoch):
    """Run one training epoch over ``dataloader``.

    Args:
        model: module called as ``model(ids=..., mask=..., token_type_ids=...)``;
            its output is rounded to obtain the predicted class.
        dataloader: yields dicts with the keys ``ids``, ``token_type_ids``,
            ``mask`` and ``target``.
        loss_fn: callable ``loss_fn(output, label)`` returning a scalar tensor.
        optimizer: optimizer stepped once per batch.
        epoch: epoch number, used only for the progress-bar description.

    Returns:
        Tuple ``(train_loss_total, train_accuracy_total)``: the sum of the
        per-batch losses and the fraction of correctly classified samples
        among those seen during the epoch.
    """
    # set model to train mode
    model.train()

    # init
    train_loss_total = 0
    train_correct_pred_total = 0
    train_samples_total = 0

    # start training
    loop = tqdm(enumerate(dataloader), leave=False, total=len(dataloader))
    for batch, inputs in loop:
        # get input data
        ids, token_type_ids, mask = inputs['ids'], inputs['token_type_ids'], inputs['mask']
        label = inputs['target'].unsqueeze(1)

        # forward
        output = model(
            ids=ids.to(device),
            mask=mask.to(device),
            token_type_ids=token_type_ids.to(device)
        )

        # backward
        label = label.type_as(output)
        optimizer.zero_grad()
        loss = loss_fn(output, label)
        loss.backward()
        optimizer.step()

        # calculate accuracy (based on the output computed before this step)
        pred = torch.round(output)
        num_correct = (pred == label).sum().item()
        num_samples = label.shape[0]
        accuracy = num_correct / num_samples

        # update the running totals
        train_loss_total += loss.item()
        train_correct_pred_total += num_correct
        train_samples_total += num_samples

        # Show progress while training
        loop.set_description(f'Epoch {epoch}, batch {batch} / {len(dataloader)}')
        loop.set_postfix(loss=loss.item(), acc=accuracy)

    # Divide by the samples really seen rather than by the size of a global
    # dataset, so the accuracy is right for whichever dataloader was passed in.
    if train_samples_total:
        train_accuracy_total = train_correct_pred_total / train_samples_total
    else:
        train_accuracy_total = 0.0
    return train_loss_total, train_accuracy_total


def train_and_eval(model, train_dataset, valid_dataset, test_dataset, num_epochs, lr, batch_size):
    """Train ``model`` for ``num_epochs`` epochs, evaluating on the test set after each.

    Args:
        model: network to optimise; it is moved to the global ``device``.
        train_dataset: dataset used for the optimisation steps.
        valid_dataset: accepted for interface compatibility but currently
            unused. NOTE(review): per-epoch monitoring runs on the test set;
            consider monitoring on this split instead.
        test_dataset: dataset evaluated after every epoch.
        num_epochs: number of passes over ``train_dataset``.
        lr: learning rate for Adam.
        batch_size: mini-batch size for both dataloaders.
    """
    # dataloader
    # Reshuffle the training samples every epoch; the evaluation order stays
    # fixed so predictions line up with the dataset rows.
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size)
    # loss: binary cross-entropy on the model's sigmoid output
    loss_fn = nn.BCELoss()
    # optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # move model to device
    model.to(device)

    for epoch in range(num_epochs):
        # training
        train_loss_total, train_accuracy_total = train(model, train_dataloader, loss_fn, optimizer, epoch)
        # evaluation
        _, test_loss_total, test_accuracy_total = eval(model, test_dataloader, loss_fn)

        print(f"Epoch {epoch + 1} end.")
        print(f"Train loss: {train_loss_total}, Train accuracy: {train_accuracy_total}")
        print(f"Test loss: {test_loss_total}, Test accuracy: {test_accuracy_total}")


In [None]:
# Display the model's module tree as the cell output.
model

In [None]:
# Run the full training loop with the hyper-parameters defined above.
train_and_eval(model, train_dataset, valid_dataset, test_dataset, num_epochs, lr, batch_size)

In [None]:
from IPython.display import FileLink

# Switch to the Kaggle output directory (when it exists) *before* saving, so the
# checkpoint lands in the same directory the relative FileLink below resolves
# against. Outside Kaggle the current directory is used instead of failing on a
# missing path.
KAGGLE_WORKING_DIR = '/kaggle/working'
if os.path.isdir(KAGGLE_WORKING_DIR):
    os.chdir(KAGGLE_WORKING_DIR)

# Persist only the weights (state_dict), not the whole module object.
torch.save(model.state_dict(), "model.pth")

print(os.getcwd())
print(os.listdir(os.getcwd()))
FileLink('model.pth')

In [None]:
# Final pass over the test set, this time keeping the per-sample predictions.
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size)
loss_fn = nn.BCELoss()
pred, test_loss_total, test_accuracy_total = eval(
    model, test_dataloader, loss_fn, return_pred=True
)
print(test_accuracy_total)

# Write one label per line: a prediction of 0 means "ham", anything else "spam".
predicted_classes = pred.cpu().detach().numpy()
with open("submission.txt", "w") as f:
    f.writelines(
        "ham\n" if value == 0.0 else "spam\n"
        for value in predicted_classes
    )


In [None]:
def torch_gc():
    """Release cached CUDA memory; a no-op when CUDA is unavailable.

    The calls run in the currently selected CUDA device context rather than a
    hard-coded ``cuda:1``, which is not the device this notebook uses
    (``cuda:0``) and is an invalid ordinal on single-GPU machines.
    """
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()