In [1]:
import pickle

train_path = './phrase_model_data.pkl'
with open(train_path, 'rb') as f:
    traindata = pickle.load(f)
len(traindata)

362560

In [2]:
with open('./e3_process_data.pkl', 'rb') as f:
    process_data = pickle.load(f)
    
w2i = process_data['w2i']
# i2w = process_data['i2w']
phrase_rule_133 = process_data['sampled_phrase']
i2phrase_rule_133 = {i: p for p, i in phrase_rule_133.items()}
len(phrase_rule_133)


133

In [3]:
MAXLEN = 128 # 256 may be better

In [4]:
def prepare_data(tempdata, phrase_rule_133, w2i, max_len=1000):
    half_len = max_len // 2
    lnis = []
    labels = []
    inputs = []
    num_classes = len(phrase_rule_133)
    for lni, items in tempdata.items():
        label = [0] * num_classes
        phrase = [phrase_rule_133[p] for p in items['phrases'] if p in phrase_rule_133]
        if not phrase:
            continue

        if len(items['pred_sents']) > 6:
            sent = ' '.join(s for _, s in items['pred_sents'][:3] + items['pred_sents'][-3:])
        else:
            sent = ' '.join(s for _, s in items['pred_sents'])
        sentid = [w2i.get(w, 1) for w in sent.split()]
        if not sentid:
            continue
        if len(sentid) > max_len:
            sentid = sentid[:half_len] + sentid[-half_len:]

        inputs.append(sentid)

        for i in set(phrase):
            label[i] = 1
        assert sum(label) > 0
        labels.append(label[:])
        lnis.append(lni)

    print(len(labels), len(inputs), len(lnis))
    return lnis, inputs, labels

In [5]:
_, inputs, labels = prepare_data(traindata, phrase_rule_133, w2i, max_len=MAXLEN)

320635 320635 320635


In [9]:
from scipy import stats

lens = [len(x) for x in inputs]
stats.describe(lens)

DescribeResult(nobs=320635, minmax=(1, 128), mean=78.48248943502736, variance=2591.396846363558, skewness=-0.3131667640799144, kurtosis=-1.6436714830091887)

In [6]:
import numpy as np

labels = np.array(labels)
labels.shape

(320635, 133)

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_score, recall_score

x_train, x_val, y_train, y_val = train_test_split(inputs, labels, test_size=0.150012, random_state=0)
print(len(x_train), y_train.shape, y_val.shape)

272535 (272535, 133) (48100, 133)


In [8]:
print(y_val.sum(0))

[ 206  235  490  115   98   19   62    4 3570   85 1107  871    6   27
   12   25  687 2196    6 1543   21  380  250 1846  960   29    6   80
  180  269 1406 1538   85   27   17   53    6   47  210   67  525   13
  507  692  211    7    8  236  737   41   84    9   13 6262 1764   76
    5   10  300  562   64  119 3957   14   91  672  148  556    5  151
 2906   71 1341 1468 1075 3405 3376   64    4   69 3754   24   60  298
   30    6  144    4  377   87 1538  147 1111  153    5 1585    2   58
  134 1054   86  793   61 1335  137 2239   18  394  164  107  169 4163
   29   51   12    7   19    4  362   27  148 1981  318   25   74  121
   87  274   41  281  175    1  122]


In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

class PhraseData(Dataset):
    def __init__(self, src, tgt):
        super(PhraseData, self).__init__()
        self.src = src
        self.tgt = tgt
        
    def __len__(self):
        return len(self.src)
    
    def __getitem__(self, i):
        return self.src[i], self.tgt[i]
    
def collate_func(seqs, pad_token=0):
    seq_lens = [len(seq) for seq in seqs]
    max_len = max(seq_lens)
    seqs = [seq + [pad_token] * (max_len - len(seq)) for seq in seqs]
    return torch.LongTensor(seqs), torch.LongTensor(seq_lens)    
    
def pair_collate_func(inps):
    pairs = sorted(inps, key=lambda p: len(p[0]), reverse=True)
    seqs, tgt = zip(*pairs)
    seqs, seq_lens = collate_func(seqs)
    return seqs, seq_lens, torch.FloatTensor(tgt) 

In [11]:
batch_sz = 512
train_loader = DataLoader(PhraseData(x_train, y_train), 
                          num_workers=1, 
                          batch_size=batch_sz, 
                          collate_fn=pair_collate_func, 
                          shuffle=True,
                          drop_last=True)

val_loader = DataLoader(PhraseData(x_val, y_val), 
                        num_workers=1, 
                        batch_size=100, 
                        collate_fn=pair_collate_func, 
                        shuffle=False,
                        drop_last=True) # 48100

In [12]:
for batch in val_loader:
    x1, x2, x3 = batch
    print(x1.size(), x2.size(), x3.size())
    break

torch.Size([100, 128]) torch.Size([100]) torch.Size([100, 133])


In [13]:
class BiGRU(nn.Module):
    def __init__(self, emb_dim, hidden_size, n_layers, num_classes, dropout, weights):
        super(BiGRU, self).__init__()
        self.emb = nn.Embedding.from_pretrained(torch.FloatTensor(weights), freeze=False)
        self.dropout1 = nn.Dropout(p=dropout)
        self.gru = nn.GRU(emb_dim, hidden_size, n_layers, bidirectional=True, batch_first=True)
        self.maxpooling = nn.AdaptiveMaxPool1d(1)
        # self.dropout2 = nn.Dropout(p=dropout)
        self.linear = nn.Linear(2*hidden_size, num_classes)
        
    def forward(self, input_var, input_len):
        embeded = self.emb(input_var) # b x l x emb_dim
        embeded = self.dropout1(embeded)
        
        total_length = embeded.size(1)
        
        packed1 = torch.nn.utils.rnn.pack_padded_sequence(embeded, input_len, batch_first=True)
        self.gru.flatten_parameters()
        rnn1, hidden1 = self.gru(packed1)
        rnn1, _ = torch.nn.utils.rnn.pad_packed_sequence(rnn1, batch_first=True, total_length=total_length) # b x l x 2hs

        rnn1 = rnn1.permute(0, 2, 1) # b x 2hs x l
        out = self.maxpooling(rnn1).squeeze(2) # b x 2hs x 1 -> b x 2hs
        # out = self.dropout2(out)
        out = self.linear(out)
        return out

In [14]:
import time

# GRU
def train_epoch(model, device, epoch, train_loader, test_loader, criterion, optimizer, clip=5.):
    model.train()
    train_loss = 0
    t0 = time.time()
    for i, batch in enumerate(train_loader, 1):
        seqs, seq_lens, tgts = batch
        seqs = seqs.to(device)
        tgts = tgts.to(device)
        
        optimizer.zero_grad()
        outputs = model(seqs, seq_lens)
        loss = criterion(outputs, tgts)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        train_loss += loss.item()
        
        if i % 100 == 0:
            # print loss info every 20 Iterations
            log_str = "Epoch : {} , Iteration : {} , Time : {:.2f} , TrainLoss : {:.4f}".format \
                        (epoch, i, time.time()-t0, train_loss/i)
            print(log_str)
            t0 = time.time()
    train_loss /= len(train_loader)
    
    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            seqs, seq_lens, tgts = batch
            seqs = seqs.to(device)
            tgts = tgts.to(device)

            outputs = model(seqs, seq_lens)
            loss = criterion(outputs, tgts)
            eval_loss += loss.item()
            
    eval_loss /= len(val_loader)
    return model, optimizer, train_loss, eval_loss

In [15]:
emb_dim = 300
hidden_size = 256
n_layers = 1
num_classes = 133
dropout = 0.5
emb_weights = np.load('./e3_EmbeddingMatrix.npy')
bi_gru = BiGRU(emb_dim, hidden_size, n_layers, num_classes, dropout, emb_weights)
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
bi_gru.to(device)

BiGRU(
  (emb): Embedding(14456, 300)
  (dropout1): Dropout(p=0.5, inplace=False)
  (gru): GRU(300, 256, batch_first=True, bidirectional=True)
  (maxpooling): AdaptiveMaxPool1d(output_size=1)
  (linear): Linear(in_features=512, out_features=133, bias=True)
)

In [16]:
criterion = nn.BCEWithLogitsLoss()
# optimizer = optim.Adam(bi_gru.parameters())
optimizer = optim.Adam(bi_gru.parameters())
device

device(type='cuda', index=1)

In [18]:
n_epochs = 25
best_eval_loss = float('inf')
save_path = './bi_gru_128.pt'
for epoch in range(1, 1+n_epochs):
    bi_gru, optimizer, train_loss, eval_loss = train_epoch(bi_gru, device, epoch, train_loader, val_loader, 
                                                           criterion, optimizer, clip=5.)

    print(">> Epoch : {} , TrainLoss : {:.4f} , EvalLoss : {:.4f}\n".format \
          (epoch, train_loss, eval_loss))

    if eval_loss < best_eval_loss:
        best_eval_loss = eval_loss
        torch.save(bi_gru.state_dict(), save_path)

Epoch : 1 , Iteration : 100 , Time : 17.36 , TrainLoss : 0.0513
Epoch : 1 , Iteration : 200 , Time : 17.12 , TrainLoss : 0.0495
Epoch : 1 , Iteration : 300 , Time : 17.47 , TrainLoss : 0.0470
Epoch : 1 , Iteration : 400 , Time : 17.55 , TrainLoss : 0.0446
Epoch : 1 , Iteration : 500 , Time : 17.61 , TrainLoss : 0.0426
>> Epoch : 1 , TrainLoss : 0.0420 , EvalLoss : 0.0322

Epoch : 2 , Iteration : 100 , Time : 17.56 , TrainLoss : 0.0313
Epoch : 2 , Iteration : 200 , Time : 17.55 , TrainLoss : 0.0305
Epoch : 2 , Iteration : 300 , Time : 17.32 , TrainLoss : 0.0299
Epoch : 2 , Iteration : 400 , Time : 17.42 , TrainLoss : 0.0293
Epoch : 2 , Iteration : 500 , Time : 17.56 , TrainLoss : 0.0288
>> Epoch : 2 , TrainLoss : 0.0287 , EvalLoss : 0.0258

Epoch : 3 , Iteration : 100 , Time : 17.70 , TrainLoss : 0.0256
Epoch : 3 , Iteration : 200 , Time : 17.31 , TrainLoss : 0.0253
Epoch : 3 , Iteration : 300 , Time : 17.62 , TrainLoss : 0.0250
Epoch : 3 , Iteration : 400 , Time : 17.54 , TrainLoss : 0

In [19]:
PATH = './bi_gru_128.pt'
bi_gru.load_state_dict(torch.load(PATH))

def predict_res(model, data_loader, device):
    model.eval()
    y_true = None
    y_pred = None
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            seqs, seq_lens, tgts = batch
            seqs = seqs.to(device)
            tgts = tgts.to(device)
            outputs = model(seqs, seq_lens)
            
            if y_true is None:
                y_true = tgts
            else:
                y_true = torch.cat((y_true, tgts), 0)
            
            if y_pred is None:
                y_pred = outputs
            else:
                y_pred = torch.cat((y_pred, outputs), 0)

    print(y_true.size(), y_pred.size())
    y_pred = torch.sigmoid(y_pred)
    return y_true.cpu().numpy(), y_pred.cpu().numpy()

val_true, val_pred = predict_res(bi_gru, val_loader, device)

torch.Size([48100, 133]) torch.Size([48100, 133])


In [20]:
auc_val = roc_auc_score(val_true, val_pred)
# 0.9696475056679279 - 256
# 0.9678317424057077 - 512
auc_val

0.9683570006799356

In [21]:
def fit_active_value(s1, active_value):
    # return np.round(s1)
    return (s1 > active_value).astype(int)

def cal_avg_p_r(arr_true, arr_pred, max_num=6, active_value=0.05):
    ps, rs = [], []
    for i in range(arr_true.shape[0]):
        t1, s1 = arr_true[i], arr_pred[i]
        if sum(t1) <= 0:
            continue
        s2 = fit_active_value(s1, active_value)
        # if sum(s2) > max_num:
        #     s2 = cut_max_num(s1, max_num)
        # if sum(s2) > 4:
        #     s2 = get_maxf_adj(s1, s2, adj_phrase_map, phrase_prob, cannot_be_only)
        # s2 = fit_threshold(s1, thresholds)

        p, r = precision_score(t1, s2), recall_score(t1, s2)
        ps.append(p)
        rs.append(r)
    return np.average(ps), np.average(rs), len(ps), ps, rs

In [22]:
avg_p, avr_r, num, ps, rs = cal_avg_p_r(val_true, val_pred, active_value=0.02)
# (0.4204987318031663, 0.9493316008316008, 48100) - 256
# (0.4182258776883548, 0.9502048312048311, 48100) - 512
avg_p, avr_r, num

(0.3951539838274333, 0.9472644210144208, 48100)

# test data

In [31]:
test_path = './phrase_model_data_ftc.pkl'
with open(test_path, 'rb') as f:
    testdata = pickle.load(f)
len(testdata)

112089

In [32]:
_, test_inputs, test_labels = prepare_data(testdata, phrase_rule_133, w2i, max_len=256)

93581 93581 93581


In [33]:
test_labels = np.array(test_labels)
test_labels.shape

(93581, 133)

In [34]:
test_loader = DataLoader(PhraseData(test_inputs, test_labels), 
                        num_workers=1, 
                        batch_size=128, 
                        collate_fn=pair_collate_func, 
                        shuffle=False,
                        drop_last=True) # 93581

In [35]:
test_true, test_pred = predict_res(bi_gru, test_loader, device)

torch.Size([93568, 133]) torch.Size([93568, 133])


In [36]:
def mean_column_wise_auc(y_true, y_pred):
    assert y_true.shape[1] == y_pred.shape[1],'Arrays must have the same dimension'
    list_of_aucs = []
    for column in range(y_true.shape[1]):
        #print(sum(y_true[:,column]), sum(y_pred[:,column]))
        if sum(y_true[:,column]) == 0:
            continue
        list_of_aucs.append(roc_auc_score(y_true[:,column],y_pred[:,column]))
    # print(list_of_aucs)
    return np.array(list_of_aucs).mean(), len((list_of_aucs))


# (0.9820563869473041, 129) - 256
# (0.9819577986246634, 129) - 512
mean_column_wise_auc(test_true, test_pred)

(0.9819577986246634, 129)

In [37]:
avg_p, avr_r, num, ps, rs = cal_avg_p_r(test_true, test_pred, active_value=0.02)
# (0.4358161358395861, 0.9669629540285107, 93568) - 256
# (0.44155424302135166, 0.966464254669891, 93568) - 512
avg_p, avr_r, num 

(0.44155424302135166, 0.966464254669891, 93568)