In [93]:
import pandas  as pd
from collections import Counter
import torch
from torch import nn
import collections
from tqdm import tqdm 
from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

In [2]:
from torchtext.vocab import GloVe
global_vectors = GloVe(name='840B', dim=300, cache='../../glove/')

In [3]:
traindata = pd.read_csv('./data/train.csv')

In [7]:
traindata = traindata.dropna()

In [8]:
Counter(traindata.is_duplicate)

Counter({0: 255024, 1: 149263})

In [12]:
traindata = traindata.reset_index()

In [13]:
loss_func = torch.nn.NLLLoss()
logsoftmax = nn.LogSoftmax(dim=1)

digits = torch.randn(len(traindata.is_duplicate),2) 
print(loss_func(logsoftmax(digits), torch.tensor(traindata['is_duplicate']))) # 随机时 0.9

tensor(0.9039)


In [15]:
traindata.iloc[0]['question1']

'What is the step by step guide to invest in share market in india?'

### 获取词表

In [21]:
def get_tokens_list(df):
    token_idx_map = collections.defaultdict()
    cnt = 0
    for i, sent in tqdm(enumerate(df.question1)):
        if isinstance(sent, str):
            for token in sent.replace('?','').split(' '):
                if token in token_idx_map:
                    continue
                token_idx_map[token] = cnt
                cnt += 1
    for i, sent in tqdm(enumerate(df.question2)):
        if isinstance(sent, str):
            for token in sent.replace('?','').split(' '):
                if token in token_idx_map:
                    continue
                token_idx_map[token] = cnt
                cnt += 1
    return token_idx_map

In [22]:
token_idx_map = get_tokens_list(traindata)

404287it [00:00, 433159.30it/s]
404287it [00:00, 440695.77it/s]


### 整理数据格式到 Dataset 要求类型
- sent1的tokenid
- sent2的tokenid
- label

In [33]:
tmp = []
for word in traindata.iloc[1000]['question1'].split(' '):
    tmp.append(token_idx_map.get(word, len(token_idx_map)+1))
torch.tensor(tmp)

tensor([    17,     18,     19,   3213,   2366,    434,      6,   3214,      8,
          2358,      8,     27, 196368])

In [99]:
train_rawdata, val_rawdata = train_test_split(traindata, test_size=0.2, random_state=24)
print(train_data.shape, val_rawdata.shape)

(323429, 7) (80858, 7)


In [102]:
# 回头用多进程改进一下数据处理的地方

def df_to_predataset(df, token_idx_map):
    n, _ = df.shape
    predataset = []
    for i in tqdm(range(n)):
        row = df.iloc[i]
        tmp_dict = {}
        tmp_dict['text1_token'] = torch.tensor([token_idx_map.get(word, len(token_idx_map)+1) for word in row['question1'].replace('?','').split(' ')])
        tmp_dict['text2_token'] = torch.tensor([token_idx_map.get(word, len(token_idx_map)+1) for word in row['question2'].replace('?','').split(' ')])
        tmp_dict['label'] = row['is_duplicate']
        tmp_dict['id'] = row['id']
        predataset.append(tmp_dict)
    return predataset

In [103]:
train_predataset = df_to_predataset(train_rawdata, token_idx_map)
eval_predataset = df_to_predataset(val_rawdata, token_idx_map)

100%|██████████| 323429/323429 [00:40<00:00, 7985.38it/s]
100%|██████████| 80858/80858 [00:09<00:00, 8321.12it/s]


In [105]:
train_predataset[2]

{'text1_token': tensor([   19,   375, 10545,     8,  6709, 10546,     8,  2061,    44, 10547,
             8,  2180,   103, 10548,   234,    99,     2,   797,  1485,  8386,
             8,  2424,     8,    27, 10549,  2024,   372,   545,    97]),
 'text2_token': tensor([  1239,   3106,   6200,      1, 138656,     22,   2061,      1, 138657,
             22,   1429,      8,   7714,      1, 138658,     44,     19,    375,
            363,    545,   8638,     75,     19,    342,    343,   7608]),
 'label': 0,
 'id': 5434}

In [106]:
class MyDataset(Dataset):

    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]['text1_token'], self.data[index]['text2_token'], self.data[index]['label'], self.data[index]['id']
    
def collate_fn_padd(batch):
    '''
    Padds batch of variable length

    note: it converts things ToTensor manually here since the ToTensor transform
    assume it takes in images rather than arbitrary tensors.
    '''
    x, y, label, pair_id = zip(*batch)
    x_lens = [len(x_i) for x_i in x]
    y_lens = [len(y_i) for y_i in y]
    x_pad = torch.nn.utils.rnn.pad_sequence(x, batch_first=True)
    y_pad = torch.nn.utils.rnn.pad_sequence(y, batch_first=True)
    # z_pad = torch.nn.utils.rnn.pad_sequence(z, batch_first=True)
    return x_pad, torch.tensor(x_lens), y_pad, torch.tensor(y_lens), torch.tensor(label), torch.tensor(pair_id)

In [467]:
train_dataset_loader = DataLoader(
    MyDataset(train_predataset),
    batch_size=256,
    shuffle=True,
    collate_fn=lambda x: collate_fn_padd(x))

eval_dataset_loader = DataLoader(
    MyDataset(eval_predataset),
    batch_size=256,
    shuffle=False,
    collate_fn=lambda x: collate_fn_padd(x))

In [468]:
t = next(iter(train_dataset_loader))

In [469]:
for v in t:
    print(v.shape)

torch.Size([256, 49])
torch.Size([256])
torch.Size([256, 61])
torch.Size([256])
torch.Size([256])
torch.Size([256])


### 定义token-emb
将emb表用token初始化

In [114]:
glove_weight = global_vectors.get_vecs_by_tokens(list(token_idx_map.keys()))
embedding_glove = nn.Embedding.from_pretrained(glove_weight)

In [120]:
print(embedding_glove(torch.tensor(0)))

tensor([-2.0628e-01,  3.6716e-01, -7.1933e-02, -2.0267e-01, -1.7848e-01,
        -1.2100e-01, -7.3975e-04,  1.4043e-01,  7.5546e-02,  1.5698e+00,
        -5.5420e-01, -1.7005e-01,  2.8289e-01, -1.1052e-01, -2.0246e-01,
        -7.4419e-02, -9.0170e-02,  9.0817e-01, -3.8369e-01, -7.6266e-02,
         3.0301e-01, -3.0917e-01, -2.0811e-02,  1.6218e-01, -3.3647e-01,
         1.9596e-01,  7.1785e-02,  5.2944e-01, -8.1967e-02, -1.0554e-01,
        -1.9988e-01,  4.8801e-01,  1.1412e-01,  2.5509e-02,  5.0287e-01,
         4.4282e-01,  2.9300e-01,  3.8677e-01, -3.4214e-01, -4.9262e-02,
         2.1308e-01,  1.7267e-02, -1.8769e-01, -2.1153e-01,  2.6950e-01,
        -1.3365e-01, -6.9443e-01,  1.5104e-01, -1.5997e-01,  3.6818e-02,
         1.7237e-01,  1.4098e-01,  1.0714e-01, -2.3506e-01,  3.2742e-01,
        -1.9066e-01,  1.7212e-01, -2.7347e-01, -1.3840e-01, -9.0581e-02,
        -2.0053e-02, -1.1375e-01,  1.8967e-01,  1.1945e-01, -6.4538e-02,
        -4.2223e-01, -1.2985e-01,  1.0282e-01, -4.8

In [135]:
embedding_glove(t[0]).shape

torch.Size([32, 21, 300])

## 评估函数

In [None]:
import sklearn

# 积累epoch的预测数据
def res_metrics_in_one_epoch(res_dict={},
                         step=0,
                         label=None,
                         pred=None,
                         ):
    if step==0:
        res_dict['label']=label
        res_dict['pred']=pred

    res_dict['label']=torch.cat([res_dict['label'], label], dim=0)
    res_dict['pred']=torch.cat([res_dict['pred'], pred], dim=0)
    return res_dict


def cal_metrics_all_data(label, pred):
    """
    input: label(numpy), pred
    """
    label = label.cpu().numpy()
    pred01 = pred.argmax(dim=1).cpu().numpy()
    pred = pred.cpu().numpy()
    accu = sklearn.metrics.accuracy_score(label,pred01)
    recall = sklearn.metrics.recall_score(label, pred01)
    f1 = sklearn.metrics.f1_score(label, pred01)
    precision = sklearn.metrics.precision_score(label, pred01, zero_division="warn")
    auc_score = sklearn.metrics.roc_auc_score(label, pred[:,1])
    d = {'accuracy':accu, 
         'recall':recall,
         'f1':f1,
         'precision':precision,
         'auc_score':auc_score}
    return d

In [488]:
# 样本权重计算

from sklearn.utils.class_weight import compute_class_weight
import numpy as np
def get_class_weights(train_labels=None):
    class_weights = compute_class_weight(class_weight = 'balanced',
                                         classes = np.unique(train_labels),
                                         y = train_labels)
    class_weights = torch.tensor(class_weights,dtype=torch.float)
    return class_weights

train_labels = train_rawdata.is_duplicate
class_weights = get_class_weights(train_labels=train_labels)
print(class_weights)

tensor([0.7925, 1.3548])


### 实验1：双塔模式+简单模型
1. glove 代入词emb
2. 实现lstm双塔

In [470]:
config = {
    'embedding_dim':300,
    'vocab_size': glove_weight.shape[0],
    'num_layers':2, 
    'bidirectional': True,
    'batch_size':32
}

In [518]:
class Model_gloveemb_bilstm(nn.Module):
    def __init__(self, config=None, embedding_weight = None):
        super(Model_gloveemb_bilstm, self).__init__()
        
        self.config = config
        self.embedding_dim = self.config.get('embedding_dim', 300)
        self.hidden_dim = self.config.get('hidden_dim', 200)
        self.vocab_size = self.config.get('vocab_size', 20*10000)
        self.num_layers = self.config.get('num_layers',2)
        self.bidirectional = self.config.get('bidirectional',True)
        # self.batch_size = self.config.get('batch_size',32)

        if embedding_weight is not None:
            self.word_embeds = nn.Embedding.from_pretrained(embedding_weight)
        else:
            self.word_embeds = nn.Embedding(self.vocab_size, self.embedding_dim)
        
        self.lstm = nn.LSTM(input_size=self.embedding_dim,
                            hidden_size=self.hidden_dim//2,
                            num_layers=self.num_layers, 
                            bidirectional=self.bidirectional,
                            dropout=0.2,
                            batch_first=True,)
        
        self.fc1 = nn.Linear(4*self.hidden_dim, self.hidden_dim)
        self.fc2 = nn.Linear(self.hidden_dim, 2)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.Softmax(dim=1)
        self.layernorm = nn.LayerNorm(self.hidden_dim)
        
    def lstm_model(self, text, text_len):
        emb = self.word_embeds(text)
        emb_packed = pack_padded_sequence(emb, lengths=text_len.to('cpu'),
                                            batch_first=True,
                                            enforce_sorted=False)
        lstm_out, (hidden, cell) = self.lstm(emb_packed)
        lstm_out_with0, lens = pad_packed_sequence(lstm_out, batch_first=True)
        sent = lstm_out_with0.sum(1)/lens.view(-1,1).cuda()
        return sent
    
    def forward(self, text1, text1_len, text2, text2_len):
        sent1 = self.lstm_model(text1, text1_len)
        sent2 = self.lstm_model(text2, text2_len)
        
        # sent1 = self.layernorm(sent1)
        # sent2 = self.layernorm(sent2)
        
        x1 = torch.cat((sent1,sent2,torch.abs(sent1-sent2),sent1*sent2), dim=1)
        x = self.dropout(self.relu(self.fc1(x1)))
        x = self.fc2(self.dropout(x)) # 在这里不能加relu，不然loss不会变化
        output = self.softmax(x)
        return output

In [519]:
model1 = Model_gloveemb_bilstm(config, glove_weight)

In [520]:
def train_valid(if_train,
                model,
                data_loader,
                optimizer,
                scheduler,
                loss_func,
                epoch):
    model.cuda()
    if if_train:
        model.train()
    else:
        model.eval()
    loss_total, res_dict = 0, {}
    with tqdm(data_loader) as tq:
        for step, batch_data in enumerate(tq):
            if if_train:
                optimizer.zero_grad()
            batch_data = [v.cuda() for v in batch_data]
            text1, text1_len, text2, text2_len, labels, ids = batch_data
            logit = model(text1, text1_len, text2, text2_len)
            batch_data = [v.cpu() for v in batch_data]
            loss = loss_func(torch.log(logit+1e-6), labels) # 防止为0
            if if_train:
                loss.backward()
                optimizer.step()

            loss_curr = loss.item()
            res_dict = res_metrics_in_one_epoch(res_dict=res_dict,
                         step=step,
                         label=labels,
                         pred=logit)

            loss_total += loss.item()

            tq.set_postfix({
                'train:' if if_train else 'eval:' :'===',
                'epoch':'%d' % epoch,
                'lr':'%.10f' %optimizer.param_groups[0]['lr'],
                'loss': '%.03f' % loss.item()},
                refresh=True)
        with torch.no_grad():
            curr_step = cal_metrics_all_data(label=res_dict['label'],
                                             pred=res_dict['pred'])
            print(('train:' if if_train else 'eval:')+" loss = {:.5f}".format(loss_total/len(data_loader)))
            print(curr_step)
        if if_train:
            scheduler.step()

In [521]:
loss_func = nn.NLLLoss(weight=class_weights.cuda())
optimizer = torch.optim.Adam(model1.parameters(), lr=1e-3, eps=1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

In [523]:
for epoch in range(20):
    train_valid(if_train=True,
                model=model1, 
                data_loader=train_dataset_loader,
                optimizer=optimizer,
                scheduler=scheduler,
                loss_func=loss_func,
                epoch=epoch)
    train_valid(if_train=False,
                model=model1, 
                data_loader=eval_dataset_loader,
                optimizer=optimizer,
                scheduler=scheduler,
                loss_func=loss_func,
                epoch=epoch)

100%|██████████| 1264/1264 [00:54<00:00, 23.40it/s, train:====, epoch=0, lr=0.0010000000, loss=0.487]


train:
{'accuracy': 0.7550488901246583, 'recall': 0.8137593865369643, 'f1': 0.7103131542314732, 'precision': 0.6302011073220699, 'auc_score': 0.8468113581237229}


100%|██████████| 316/316 [00:07<00:00, 43.76it/s, eval:====, epoch=0, lr=0.0010000000, loss=0.418]


eval: loss = 0.42668
{'accuracy': 0.7810119091648791, 'recall': 0.8708258348330334, 'f1': 0.7463262071033804, 'precision': 0.6529724866931554, 'auc_score': 0.8817439229928139}


100%|██████████| 1264/1264 [00:52<00:00, 24.25it/s, train:====, epoch=1, lr=0.0010000000, loss=0.389]


train:
{'accuracy': 0.8033056829942691, 'recall': 0.8613929348735978, 'f1': 0.763733388750552, 'precision': 0.6859630293778456, 'auc_score': 0.8942506427862419}


100%|██████████| 316/316 [00:07<00:00, 43.65it/s, eval:====, epoch=1, lr=0.0010000000, loss=0.400]


eval: loss = 0.40339
{'accuracy': 0.7911951081194368, 'recall': 0.8969539425448244, 'f1': 0.7606585176287712, 'precision': 0.6603204200299321, 'auc_score': 0.897287231721692}


100%|██████████| 1264/1264 [00:50<00:00, 25.21it/s, train:====, epoch=2, lr=0.0010000000, loss=0.387]


train:
{'accuracy': 0.825305466734634, 'recall': 0.8793413750094173, 'f1': 0.7879312931293129, 'precision': 0.7137363346673099, 'auc_score': 0.9138433824811122}


100%|██████████| 316/316 [00:07<00:00, 43.64it/s, eval:====, epoch=2, lr=0.0010000000, loss=0.372]


eval: loss = 0.37626
{'accuracy': 0.8105505831299159, 'recall': 0.8946544024528428, 'f1': 0.7774736811619386, 'precision': 0.6874343806816727, 'auc_score': 0.910741580845606}


100%|██████████| 1264/1264 [00:48<00:00, 26.17it/s, train:====, epoch=3, lr=0.0010000000, loss=0.434]


train:
{'accuracy': 0.8423034740565674, 'recall': 0.8942138659612584, 'f1': 0.8071526261305849, 'precision': 0.7355399785165395, 'auc_score': 0.9282138338729495}


100%|██████████| 316/316 [00:07<00:00, 43.67it/s, eval:====, epoch=3, lr=0.0010000000, loss=0.372]


eval: loss = 0.36699
{'accuracy': 0.8216707350149173, 'recall': 0.8917883090048657, 'f1': 0.7872262183192857, 'precision': 0.7046106854148564, 'auc_score': 0.9161903446708848}


100%|██████████| 1264/1264 [00:49<00:00, 25.52it/s, train:====, epoch=4, lr=0.0010000000, loss=0.348]


train:
{'accuracy': 0.8571543321439054, 'recall': 0.9068901147654883, 'f1': 0.8241357407203143, 'precision': 0.7552212586788613, 'auc_score': 0.9393091468387225}


100%|██████████| 316/316 [00:07<00:00, 43.35it/s, eval:====, epoch=4, lr=0.0005000000, loss=0.346]


eval: loss = 0.36647
{'accuracy': 0.8173188352195675, 'recall': 0.9118176364727054, 'f1': 0.786907877707153, 'precision': 0.6920975412324193, 'auc_score': 0.9197421759810727}


100%|██████████| 1264/1264 [00:48<00:00, 25.91it/s, train:====, epoch=5, lr=0.0005000000, loss=0.310]


train:
{'accuracy': 0.8795959034246258, 'recall': 0.9265209419919799, 'f1': 0.8502863816039675, 'precision': 0.7856432972712043, 'auc_score': 0.954240845195669}


100%|██████████| 316/316 [00:07<00:00, 43.25it/s, eval:====, epoch=5, lr=0.0005000000, loss=0.338]


eval: loss = 0.38011
{'accuracy': 0.8291910151145302, 'recall': 0.9023861894287809, 'f1': 0.796276963343087, 'precision': 0.7124963818645896, 'auc_score': 0.9246694395304759}


100%|██████████| 1264/1264 [00:48<00:00, 26.06it/s, train:====, epoch=6, lr=0.0005000000, loss=0.164]


train:
{'accuracy': 0.8895005947139967, 'recall': 0.9349065369708436, 'f1': 0.8619743529384525, 'precision': 0.799597634491745, 'auc_score': 0.9602314047471284}


100%|██████████| 316/316 [00:07<00:00, 43.62it/s, eval:====, epoch=6, lr=0.0005000000, loss=0.329]


eval: loss = 0.37731
{'accuracy': 0.8390660058682842, 'recall': 0.8831567019929347, 'f1': 0.8023738153631877, 'precision': 0.7351309365290724, 'auc_score': 0.9247971187244902}


100%|██████████| 1264/1264 [00:48<00:00, 26.22it/s, train:====, epoch=7, lr=0.0005000000, loss=0.200]


train:
{'accuracy': 0.89680399153498, 'recall': 0.9413414368710968, 'f1': 0.8706798762664685, 'precision': 0.8098859863298834, 'auc_score': 0.9647896225710261}


100%|██████████| 316/316 [00:07<00:00, 44.13it/s, eval:====, epoch=7, lr=0.0005000000, loss=0.329]


eval: loss = 0.40625
{'accuracy': 0.8359222822200854, 'recall': 0.891621675664867, 'f1': 0.8008141640600447, 'precision': 0.7267936214718427, 'auc_score': 0.9250254472859047}


100%|██████████| 1264/1264 [00:49<00:00, 25.64it/s, train:====, epoch=8, lr=0.0005000000, loss=0.174]


train:
{'accuracy': 0.904240233560406, 'recall': 0.9467190128996074, 'f1': 0.8794811577523056, 'precision': 0.8211607019683868, 'auc_score': 0.9687047437429382}


100%|██████████| 316/316 [00:08<00:00, 38.06it/s, eval:====, epoch=8, lr=0.0005000000, loss=0.361]


eval: loss = 0.41420
{'accuracy': 0.8396824222698918, 'recall': 0.8844897687129241, 'f1': 0.8032201440590763, 'precision': 0.7356283607738788, 'auc_score': 0.9240540373180745}


100%|██████████| 1264/1264 [00:49<00:00, 25.59it/s, train:====, epoch=9, lr=0.0005000000, loss=0.296]


train:
{'accuracy': 0.9096281878987287, 'recall': 0.9506478722335688, 'f1': 0.8859108566436295, 'precision': 0.8294286048142089, 'auc_score': 0.9718062584701959}


100%|██████████| 316/316 [00:07<00:00, 43.92it/s, eval:====, epoch=9, lr=0.0002500000, loss=0.342]


eval: loss = 0.42658
{'accuracy': 0.8406070468723031, 'recall': 0.8844231153769246, 'f1': 0.8041208999318235, 'precision': 0.7371871440873358, 'auc_score': 0.9243394632622874}


100%|██████████| 1264/1264 [00:48<00:00, 26.04it/s, train:====, epoch=10, lr=0.0002500000, loss=0.161]


train:
{'accuracy': 0.9226562862041799, 'recall': 0.9590675093123509, 'f1': 0.9015087436316069, 'precision': 0.8504676365795725, 'auc_score': 0.9778973902406594}


100%|██████████| 316/316 [00:07<00:00, 43.45it/s, eval:====, epoch=10, lr=0.0002500000, loss=0.337]


eval: loss = 0.45390
{'accuracy': 0.8395098256774416, 'recall': 0.8954875691528361, 'f1': 0.804997153899158, 'precision': 0.7311166739225076, 'auc_score': 0.925214615643045}


100%|██████████| 1264/1264 [00:48<00:00, 25.93it/s, train:====, epoch=11, lr=0.0002500000, loss=0.119]


train:
{'accuracy': 0.9270309096807081, 'recall': 0.9625394486811375, 'f1': 0.9068603674478581, 'precision': 0.8572706668257187, 'auc_score': 0.9802364747629899}


100%|██████████| 316/316 [00:07<00:00, 43.62it/s, eval:====, epoch=11, lr=0.0002500000, loss=0.413]


eval: loss = 0.49650
{'accuracy': 0.8461301378307075, 'recall': 0.8755582216889956, 'f1': 0.8080584390618992, 'precision': 0.7502213084325652, 'auc_score': 0.9249690355685873}


100%|██████████| 1264/1264 [00:48<00:00, 26.11it/s, train:====, epoch=12, lr=0.0002500000, loss=0.218]


train:
{'accuracy': 0.9305435840400389, 'recall': 0.9645909543700455, 'f1': 0.9111186665823264, 'precision': 0.8632634867361387, 'auc_score': 0.9818395903469134}


100%|██████████| 316/316 [00:07<00:00, 43.93it/s, eval:====, epoch=12, lr=0.0002500000, loss=0.409]


eval: loss = 0.50666
{'accuracy': 0.845291811524521, 'recall': 0.877991068452976, 'f1': 0.80764573338034, 'precision': 0.7477364970340306, 'auc_score': 0.9244672216844101}


100%|██████████| 1264/1264 [00:48<00:00, 26.19it/s, train:====, epoch=13, lr=0.0002500000, loss=0.233]


train:
{'accuracy': 0.9334260160341072, 'recall': 0.966012590409858, 'f1': 0.9146036514082135, 'precision': 0.8683899612446853, 'auc_score': 0.9831041231226917}


100%|██████████| 316/316 [00:07<00:00, 44.07it/s, eval:====, epoch=13, lr=0.0002500000, loss=0.431]


eval: loss = 0.52109
{'accuracy': 0.8423823261089332, 'recall': 0.8849563420649204, 'f1': 0.8059733203830454, 'precision': 0.7399336807200378, 'auc_score': 0.9240158578469273}


100%|██████████| 1264/1264 [00:48<00:00, 26.21it/s, train:====, epoch=14, lr=0.0002500000, loss=0.170]


train:
{'accuracy': 0.9366019432472928, 'recall': 0.9685893679363751, 'f1': 0.9185406304457421, 'precision': 0.8734099813538466, 'auc_score': 0.9844275875867231}


 89%|████████▊ | 280/316 [00:06<00:00, 47.49it/s, eval:====, epoch=14, lr=0.0001250000, loss=0.387]

eval: loss = 0.54897
{'accuracy': 0.8454767364450033, 'recall': 0.8737252549490102, 'f1': 0.8070742519394164, 'precision': 0.7498712888278702, 'auc_score': 0.9238372825813119}


100%|██████████| 1264/1264 [00:48<00:00, 26.26it/s, train:====, epoch=15, lr=0.0001250000, loss=0.151]


train:
{'accuracy': 0.943114447688339, 'recall': 0.9719406333447737, 'f1': 0.92653342962363, 'precision': 0.8851795380041168, 'auc_score': 0.9867722745677834}


100%|██████████| 316/316 [00:07<00:00, 43.94it/s, eval:====, epoch=15, lr=0.0001250000, loss=0.447]


eval: loss = 0.56160
{'accuracy': 0.8450822299479744, 'recall': 0.8821569019529427, 'f1': 0.8081702439471193, 'precision': 0.7456338028169014, 'auc_score': 0.923462710669366}


100%|██████████| 1264/1264 [00:48<00:00, 26.21it/s, train:====, epoch=16, lr=0.0001250000, loss=0.108]


train:
{'accuracy': 0.9450144430542039, 'recall': 0.9732204893811163, 'f1': 0.9288973049849389, 'precision': 0.8884354677243097, 'auc_score': 0.9877681449818894}


100%|██████████| 316/316 [00:07<00:00, 43.36it/s, eval:====, epoch=16, lr=0.0001250000, loss=0.481]


eval: loss = 0.58316
{'accuracy': 0.8471164040732796, 'recall': 0.8720255948810238, 'f1': 0.808428467705807, 'precision': 0.753477121548075, 'auc_score': 0.9234247033481827}


100%|██████████| 1264/1264 [00:48<00:00, 26.22it/s, train:====, epoch=17, lr=0.0001250000, loss=0.103]


train:
{'accuracy': 0.9467074470550072, 'recall': 0.9739992298548443, 'f1': 0.930987301663506, 'precision': 0.8916135360470819, 'auc_score': 0.988247885104587}


100%|██████████| 316/316 [00:07<00:00, 43.65it/s, eval:====, epoch=17, lr=0.0001250000, loss=0.468]


eval: loss = 0.59647
{'accuracy': 0.8479424020514338, 'recall': 0.8682263547290542, 'f1': 0.8085912039479811, 'precision': 0.7566217472118959, 'auc_score': 0.923128935038434}


100%|██████████| 1264/1264 [00:48<00:00, 26.12it/s, train:====, epoch=18, lr=0.0001250000, loss=0.068]


train:
{'accuracy': 0.9481409394936435, 'recall': 0.9746197567447662, 'f1': 0.9327613279497533, 'precision': 0.8943503475822867, 'auc_score': 0.9888514596919138}


100%|██████████| 316/316 [00:07<00:00, 43.45it/s, eval:====, epoch=18, lr=0.0001250000, loss=0.511]


eval: loss = 0.60821
{'accuracy': 0.8490519515743274, 'recall': 0.8638272345530894, 'f1': 0.8089382685225641, 'precision': 0.7606080169024004, 'auc_score': 0.923139850911141}


100%|██████████| 1264/1264 [00:48<00:00, 26.04it/s, train:====, epoch=19, lr=0.0001250000, loss=0.102]


train:
{'accuracy': 0.9492037011291843, 'recall': 0.9751326648475819, 'f1': 0.9340864629101056, 'precision': 0.8963561943157863, 'auc_score': 0.9893530104592612}


100%|██████████| 316/316 [00:07<00:00, 43.93it/s, eval:====, epoch=19, lr=0.0000625000, loss=0.496]


eval: loss = 0.60657
{'accuracy': 0.8459821978943216, 'recall': 0.8775244951009799, 'f1': 0.8082572327526668, 'precision': 0.7491251529204245, 'auc_score': 0.9226171481938514}


## 以下为草稿

In [266]:
# lstm 实验

In [251]:
lstm = nn.LSTM(input_size=300,
                            hidden_size=200//2,
                            num_layers=2, 
                            bidirectional=True,
                            dropout=0.2,
                            batch_first=True,)
lstm = lstm.cuda()

In [246]:
embedding_glove = embedding_glove.cuda()
emb1 = embedding_glove(t[0])

In [253]:
tmp_lstm_out = lstm(emb1)

In [None]:
tmp_lstm_out

In [256]:
embed_packed = pack_padded_sequence(emb1, lengths=t[1].to('cpu'),
                                    batch_first=True,
                                    enforce_sorted=False)
lstm_out, (hidden, cell) = lstm(embed_packed) #, self.hidden_init)

In [257]:
lstm_out

PackedSequence(data=tensor([[ 2.1798e-02,  3.7612e-03,  5.0111e-02,  ...,  3.5564e-02,
         -2.0459e-02,  2.5435e-03],
        [ 1.6436e-02,  1.8754e-02,  3.2143e-02,  ...,  2.6828e-02,
         -4.5456e-02, -4.9270e-02],
        [ 2.5027e-03,  1.2035e-02,  4.0902e-02,  ...,  2.5554e-02,
         -5.0032e-02,  1.8674e-02],
        ...,
        [ 1.0788e-02,  4.6041e-02,  7.1057e-02,  ..., -1.8851e-02,
          8.1092e-03,  3.0886e-02],
        [ 6.0315e-03,  4.1953e-02,  7.6154e-02,  ..., -2.4376e-02,
         -2.9777e-02, -4.1289e-02],
        [-5.3446e-05,  4.3082e-02,  6.1371e-02,  ...,  6.7779e-03,
         -1.0706e-02,  2.5017e-02]], device='cuda:0',
       grad_fn=<CudnnRnnBackward0>), batch_sizes=tensor([32, 32, 32, 32, 32, 32, 31, 29, 23, 20, 14, 10,  9,  7,  7,  6,  3,  3,
         3,  2,  1]), sorted_indices=tensor([ 2,  3, 10, 17, 16, 12, 20, 15,  6, 26,  0, 19, 21, 27, 22, 25, 30, 31,
        11,  7, 14, 24,  5, 13,  9, 23,  8, 28, 29, 18,  1,  4],
       device='cuda:

In [258]:
lstm_out, lens = pad_packed_sequence(lstm_out, batch_first=True)

In [262]:
lstm_out.shape, lens

(torch.Size([32, 21, 200]),
 tensor([11,  7, 21, 20,  6,  9, 13, 10,  8,  8, 19, 10, 16,  8,  9, 13, 16, 16,
          7, 11, 15, 11, 10,  8,  9, 10, 12, 11,  8,  8, 10, 10]))

In [265]:
lstm_out[0][0:11], lstm_out[0][12]

(tensor([[-0.0034,  0.0332,  0.0318,  ...,  0.0412, -0.0013,  0.0066],
         [ 0.0127,  0.0398,  0.0434,  ...,  0.0501,  0.0161, -0.0084],
         [ 0.0202,  0.0002,  0.0609,  ...,  0.0473, -0.0124, -0.0212],
         ...,
         [ 0.0082,  0.0268,  0.0966,  ...,  0.0080, -0.0216, -0.0165],
         [ 0.0067,  0.0392,  0.0966,  ...,  0.0046, -0.0061,  0.0052],
         [ 0.0143,  0.0135,  0.1116,  ...,  0.0185, -0.0284, -0.0106]],
        device='cuda:0', grad_fn=<SliceBackward0>),
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [274]:
lstm_out.sum(1)/lens.view(32,1).cuda()

tensor([[-0.0004,  0.0042,  0.0763,  ...,  0.0250, -0.0094, -0.0167],
        [ 0.0144,  0.0132,  0.0642,  ...,  0.0358, -0.0122, -0.0194],
        [ 0.0088,  0.0154,  0.0739,  ..., -0.0097, -0.0171,  0.0118],
        ...,
        [ 0.0372,  0.0294,  0.0614,  ...,  0.0163,  0.0021, -0.0073],
        [-0.0061,  0.0143,  0.0740,  ..., -0.0012, -0.0260,  0.0061],
        [-0.0003,  0.0511,  0.0738,  ...,  0.0124, -0.0034, -0.0126]],
       device='cuda:0', grad_fn=<DivBackward0>)

In [278]:
lstm_out[0][0:11].mean(0)

tensor([-0.0004,  0.0042,  0.0763,  0.0378,  0.0028,  0.0561, -0.0073,  0.0086,
         0.0589,  0.0663,  0.0036, -0.0290, -0.0574,  0.0918, -0.0510, -0.0485,
         0.0072, -0.1047,  0.0128,  0.0142, -0.0080,  0.0321,  0.0638, -0.0602,
        -0.0164,  0.0308,  0.0051,  0.0881,  0.0458, -0.0882, -0.0301,  0.0348,
         0.0584,  0.0750, -0.0235,  0.0631, -0.0719,  0.0722,  0.0197,  0.0270,
        -0.1015,  0.0850, -0.0298, -0.0841,  0.0077,  0.0400,  0.0780, -0.0844,
        -0.0514, -0.0167,  0.0504, -0.0650, -0.0303,  0.0235,  0.0097, -0.0331,
        -0.1171, -0.0282, -0.0704, -0.0288,  0.0165, -0.0255, -0.1225,  0.0124,
         0.0724, -0.0685,  0.0780,  0.0951, -0.0069, -0.0688,  0.0712,  0.1270,
         0.0195,  0.0401, -0.0174, -0.0060,  0.0320,  0.0320,  0.0465, -0.0183,
         0.0152,  0.0359,  0.0472,  0.0140, -0.0318, -0.0239,  0.0374, -0.0394,
        -0.0323,  0.0885,  0.0140,  0.0496,  0.0258, -0.1242,  0.0012, -0.0382,
         0.0128,  0.0735,  0.0237,  0.06