## 带有灾难推文的自然语言处理
>预测哪些推文是关于真实灾难的，哪些不是
>来源：https://www.kaggle.com/search?competitionId=17777



In [108]:
# 导包
import numpy as np
import pandas as pd #data processing, CSV file I/O
import os

from transformers import BertTokenizer,BertModel
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader,Dataset
from torch.nn.utils.rnn import pack_padded_sequence
from torch.optim import AdamW
from tqdm import tqdm #查看进度
from argparse import ArgumentParser
# Ignite帮助您在几行代码中编写紧凑但功能齐全的train循环
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss
from ignite.engine.engine import Engine, State, Events
from ignite.handlers import EarlyStopping
from ignite.contrib.handlers import TensorboardLogger, ProgressBar
from ignite.utils import convert_tensor
from torch.optim.lr_scheduler import ExponentialLR
import warnings
warnings.filterwarnings('ignore')

for dirname,_,filenames in os.walk('./dataset'):
    for filename in filenames:
        print(os.path.join(dirname,filename))


./dataset\sample_submission.csv
./dataset\test.csv
./dataset\train.csv


## 读取数据&加载Bert分词器

In [92]:
def readfiles():
    path = 'dataset'
    train = pd.read_csv(os.path.join(path,'train.csv'))
    test = pd.read_csv(os.path.join(path,'test.csv'))
    sample_subs = pd.read_csv(os.path.join(path,'sample_submission.csv'))

    return train,test,sample_subs

train,test,sample_subs = readfiles()
# print(sample_subs)


#加载Bert分词器
from transformers import BertTokenizer
def Bert_Tokenizer(model_name):
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    tokenizer = BertTokenizer.from_pretrained(model_name)
    return tokenizer
tokenizer = Bert_Tokenizer('bert-base-uncased')

## 制作dataset数据集

In [93]:
#自定义dataset
class TextDataset(Dataset):
    def __init__(self,df,tokenizer,max_len):
        self.bert_encode = tokenizer
        self.texts = df.text.values
        self.labels = df.target.values
        self.max_len = max_len

    def __getitem__(self, idx):
        tokens,mask,tokens_len = self.get_token_mask(self.texts[idx],self.max_len)
        label = self.labels[idx]
        return [torch.tensor(tokens),torch.tensor(mask),torch.tensor(tokens_len)],label

    def __len__(self):
        return len(self.texts)

    def get_token_mask(self,text,max_len):
        tokens = [] #分词
        mask = [] #和网络中的ip掩码功能一样
        text = self.bert_encode.encode(text)
        size = len(text)
        pads = self.bert_encode.encode(['PAD']*(max(0,max_len-size))) #不足的补PAD
        tokens[:max(max_len,size)] = text[:max(max_len,size)] #超出的部分截断
        tokens = tokens + pads[1:-1]
        mask = [1]*size+[0]*len(pads[1:-1])
        tokens_len = len(tokens)

        return tokens,mask,tokens_len



from sklearn.model_selection import train_test_split
#加载数据集
def get_data_loaders():
    from sklearn.model_selection import train_test_split
    x_train , x_valid = train_test_split(train, test_size=0.1,random_state=2020)
    train_dataset = TextDataset(x_train,tokenizer=tokenizer,max_len=120)
    train_loader = torch.utils.data.DataLoader(train_dataset,batch_size=1,shuffle=True)
    valid_dataset = TextDataset(x_valid,tokenizer=tokenizer,max_len=120)
    valid_loader = torch.utils.data.DataLoader(valid_dataset,batch_size=1,shuffle=True)

    return train_loader , valid_loader

## 搭建Bert模型

In [94]:
class MixedBertModel(nn.Module):
    def __init__(self,pre_trained='bert-base-uncased'):
        super().__init__()

        self.bert = BertModel.from_pretrained(pre_trained)
        self.hidden_size = self.bert.config.hidden_size
        self.LSTM = nn.LSTM(self.hidden_size,self.hidden_size,bidirectional=True)
        self.clf = nn.Linear(self.hidden_size*2,1)

    def forward(self,inputs):

        encoded_layers, pooled_output = self.bert(input_ids=inputs[0],attention_mask=inputs[1])
        encoded_layers = encoded_layers.permute(1, 0, 2)
        enc_hiddens, (last_hidden, last_cell) = self.LSTM(pack_padded_sequence(encoded_layers, inputs[2]))
        output_hidden = torch.cat((last_hidden[0], last_hidden[1]), dim=1)
        output_hidden = F.dropout(output_hidden,0.2)
        output = self.clf(output_hidden)

        return F.sigmoid(output)

In [106]:
class MixedBertModel(nn.Module):
    def __init__(self,pre_trained='bert-base-uncased'):
        # 待验证
        super().__init__()
        self.bert = BertModel.from_pretrained(pre_trained)
        self.hidden_size = self.bert.config.hidden_size
        # 双向LSTM
        """
        input_size – 输入数据的大小，也就是前面例子中每个单词向量的长度
        hidden_size – 隐藏层的大小（即隐藏层节点数量），输出向量的维度等于隐藏节点数
        num_layers – recurrent layer的数量，默认等于1。
        """
        self.LSTM = nn.LSTM(self.hidden_size,self.hidden_size,bidirectional=True)
        # 分类器
        self.clf = nn.Linear(in_features=self.hidden_size*2,out_features=1)

    def forward(self,inputs):
        print("input.shape:",len(inputs))
        print("input:",inputs)
        encoded_layers, pooled_output = self.bert(input_ids=inputs[0],attention_mask=inputs[1])
        # print("encoded_layers.type:",type(encoded_layers))
        # print("type:",type(self.bert(input_ids=inputs[0],attention_mask=inputs[1])))
        # print("encoded_layers:--",encoded_layers)
        # print(self.bert(input_ids=inputs[0],attention_mask=inputs[1]))
        print("--",len(encoded_layers))
        print("++++:",encoded_layers.squeeze(0),"++++")
        # encoded_layers = torch.from_numpy(encoded_layers)
        encoded_layers = encoded_layers.permute(1, 0, 2) #将tensor的维度换位
        """
        # pack_padded_sequence:压缩填充张量  去除末尾填充的PAD，防止PAD进入模型，浪费资源
        LSTM的输出:enc_hiddens,(last_hidden,last_cell)
        last_hidden为最后1个time step的隐状态结果
        last_cell为最后1个time step的cell状态结果
        """
        # enc_hiddens, (last_hidden, last_cell) = self.LSTM(pack_padded_sequence(encoded_layers, inputs[2]))
        enc_hiddens, (last_hidden, last_cell) = self.LSTM(pack_padded_sequence(encoded_layers, inputs[2]))
        output_hidden = torch.cat((last_hidden[0], last_hidden[1]), dim=1)
        # 避免过拟合，并增强模型的泛化能力
        output_hidden = F.dropout(output_hidden,0.2)
        output = self.clf(output_hidden)

        return F.sigmoid(output)


## 预处理

In [96]:
#将数据集转为tensor类型
def _prepare_batch(batch, device=None, non_blocking=False):

    x, y = batch
    return (convert_tensor(x, device=device, non_blocking=non_blocking),
            convert_tensor(y, device=device, non_blocking=non_blocking))

# 创建监督学习
def create_supervised_trainer1(model, optimizer, loss_fn, metrics={}, device=None):

    def _update(engine, batch):
        model.train()
        optimizer.zero_grad()#每一轮开始前  梯度都要清零
        x,y = _prepare_batch(batch,device=device)
        y_pred = model(x)#模型输出值
        loss = loss_fn(y_pred,y.float()) #计算损失值
        loss.backward()
        optimizer.step()#更新参数值
        return loss.item(),y_pred,y

    def _metrics_transform(output):
        return output[1], output[2]

    engine = Engine(_update)

    for name, metric in metrics.items():
        metric._output_transform = _metrics_transform
        metric.attach(engine, name)

    return engine

def create_supervised_evaluator1(model, metrics=None,
                                device=None, non_blocking=False,
                                prepare_batch=_prepare_batch,
                                output_transform=lambda x, y, y_pred: (y_pred, y,)):

    metrics = metrics or {}

    if device:
        model

    def _inference(engine, batch):
        model.eval()
        with torch.no_grad():
            x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
            y_pred = model(x)
            return output_transform(x, y.float(), y_pred)

    engine = Engine(_inference)

    for name, metric in metrics.items():
        metric.attach(engine, name)

    return engine

训练模型

In [107]:
def run(log_interval=100,epochs=2,lr=0.000006):
    train_loader ,valid_loader = get_data_loaders()
    model = MixedBertModel()
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    criterion = nn.BCELoss() #损失函数
    optimizer = AdamW(model.parameters(),lr=lr) #优化器
    # 学习率衰减，每个epoch中lr都乘以gamma
    lr_scheduler = ExponentialLR(optimizer, gamma=0.90)
    trainer = create_supervised_trainer1(model.to(device), optimizer, criterion, device=device)
    evaluator = create_supervised_evaluator1(model.to(device), metrics={'BCELoss': Loss(criterion)}, device=device)

    if log_interval is None:
        e = Events.ITERATION_COMPLETED
        log_interval = 1
    else:
        e = Events.ITERATION_COMPLETED(every=log_interval)

    desc = "loss: {:.4f} | lr: {:.4f}"
    pbar = tqdm(
        initial=0, leave=False, total=len(train_loader),
        desc=desc.format(0, lr)
    )

    @trainer.on(e)
    def log_training_loss(engine):
        pbar.refresh()
        lr = optimizer.param_groups[0]['lr']
        pbar.desc = desc.format(engine.state.output[0], lr)
        pbar.update(log_interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def update_lr_scheduler(engine):
        lr_scheduler.step()



    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_loss = metrics['BCELoss']
        tqdm.write(
            "Train Epoch: {} BCE loss: {:.2f}".format(engine.state.epoch, avg_loss)
        )

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        pbar.refresh()
        evaluator.run(valid_loader)
        metrics = evaluator.state.metrics
        avg_loss = metrics['BCELoss']
        tqdm.write(
            "Valid Epoch: {} BCE loss: {:.2f}".format(engine.state.epoch, avg_loss)
        )
        pbar.n = pbar.last_print_n = 0


    try:
        trainer.run(train_loader, max_epochs=epochs)

    except Exception as e:
        import traceback
        print(traceback.format_exc())
    return model


#开始训练
model = run()


                                                                   [ASome weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

loss: 0.0000 | lr: 0.0000:   0%|          | 0/6851 [00:00<?, ?it/s][A

input.shape: 3
input: [tensor([[  101, 11865, 24917,  1005, 15527, 15415,  1005,  4871, 11740,  2408,
          1996,  2555,  2048,  2086,  2044,  4517,  7071,  8299,  1024,  1013,
          1013,  1056,  1012,  2522,  1013, 17816, 19481,  4887,  2509, 15042,
          3081,  1030,  5653,  2239,  4179,   102,   100,   100,   100,   100,
           100,   100,   100,   100,   100,   100,   100,   100,   100,   100,
           100,   100,   100,   100,   100,   100,   100,   100,   100,   100,
           100,   100,   100,   100,   100,   100,   100,   100,   100,   100,
           100,   100,   100,   100,   100,   100,   100,   100,   100,   100,
           100,   100,   100,   100,   100,   100,   100,   100,   100,   100,
           100,   100,   100,   100,   100,   100,   100,   100,   100,   100,
           100,   100,   100,   100,   100,   100,   100,   100,   100,   100,
           100,   100,   100,   100,   100,   100,   100,   100,   100,   100]]), tensor([[1, 1, 1, 1, 1, 1,

Current run is terminating due to exception: 'str' object has no attribute 'detach'
Engine run is terminating due to exception: 'str' object has no attribute 'detach'


-- 17
Traceback (most recent call last):
  File "<ipython-input-107-e504e3e4c461>", line 59, in run
    trainer.run(train_loader, max_epochs=epochs)
  File "E:\IT\code\Python\environment\Miniconda3\envs\pytorch\lib\site-packages\ignite\engine\engine.py", line 698, in run
    return self._internal_run()
  File "E:\IT\code\Python\environment\Miniconda3\envs\pytorch\lib\site-packages\ignite\engine\engine.py", line 771, in _internal_run
    self._handle_exception(e)
  File "E:\IT\code\Python\environment\Miniconda3\envs\pytorch\lib\site-packages\ignite\engine\engine.py", line 466, in _handle_exception
    raise e
  File "E:\IT\code\Python\environment\Miniconda3\envs\pytorch\lib\site-packages\ignite\engine\engine.py", line 741, in _internal_run
    time_taken = self._run_once_on_dataset()
  File "E:\IT\code\Python\environment\Miniconda3\envs\pytorch\lib\site-packages\ignite\engine\engine.py", line 845, in _run_once_on_dataset
    self._handle_exception(e)
  File "E:\IT\code\Python\environmen

## 预测

In [98]:
# 定义测试数据集的dataset
class TestTextDataset(Dataset):
    def __init__(self,df,tokenizer,max_len):

        self.bert_encode = tokenizer
        self.texts = df.text.values
        self.max_len = max_len

    def __len__(self):

        return len(self.texts)

    def __getitem__(self,idx):

        tokens,mask,tokens_len = self.get_token_mask(self.texts[idx],self.max_len)
        return [torch.tensor(tokens),torch.tensor(mask),torch.tensor(tokens_len)]

    def get_token_mask(self,text,max_len):

        tokens = []
        mask = []
        text = self.bert_encode.encode(text)
        size = len(text)
        pads = self.bert_encode.encode(['PAD']*(max(0,max_len-size)))
        tokens[:max(max_len,size)] = text[:max(max_len,size)]
        tokens = tokens + pads[1:-1]
        mask = [1]*size+[0]*len(pads[1:-1])
        tokens_len = len(tokens)

        return tokens,mask,tokens_len

In [99]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model.eval()
predictions = []
test_dataset = TestTextDataset(test,tokenizer=tokenizer,max_len=120)
test_loader = torch.utils.data.DataLoader(test_dataset,batch_size=32,shuffle=False)
with torch.no_grad():
    for idx , (inputs) in tqdm(enumerate(test_loader),total=len(test_loader)):
        inputs = [a.to(device) for a in inputs]
        preds = model(inputs)
        predictions.append(preds.cpu().detach().numpy())

predictions = np.vstack(predictions)




  0%|          | 0/102 [00:12<?, ?it/s][A[A

-- 17
++++: last_hidden_state ++++





AttributeError: 'str' object has no attribute 'permute'

## 查看并输出预测结果

In [None]:
sample_subs.target = np.round(np.vstack(predictions)).astype(int)
print(sample_subs.head(20))
sample_subs.to_csv('submission.csv', index = False)
