In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, models, transforms
import os
import copy
from google.colab import drive
drive.mount('/content/drive/') 
!pip install transformers

# Folder on the mounted Drive from which create_examples reads "<split>.txt".
data_dir = "drive/My Drive/模型压缩/hotel"

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

In [None]:
# -*- coding: utf-8 -*-

import csv, random
import torch.nn.functional as F
from torch.utils.data import TensorDataset, RandomSampler, SequentialSampler, DataLoader
from transformers import BertModel, BertPreTrainedModel
from transformers import BertTokenizer
from transformers import AdamW
from torch.nn import CrossEntropyLoss
from tqdm import tqdm, trange
from sklearn.metrics import f1_score
from sklearn import metrics


class InputExample(object):
    """A single raw sample: its text and, optionally, its label."""

    def __init__(self, text, label=None):
        # Both values are stored exactly as passed in; nothing is validated.
        self.text, self.label = text, label


class InputFeatures(object):
    """Encoded form of one sample: token ids, their mask and an optional label id."""

    def __init__(self, input_ids, input_mask, label_id=None):
        # Plain attribute container; the values are kept as given.
        self.input_ids, self.input_mask = input_ids, input_mask
        self.label_id = label_id


def create_examples(set_type):
    """Load one data split and return its examples in random order.

    Reads ``<data_dir>/<set_type>.txt`` as UTF-8. Every non-blank line is
    expected to look like ``label<TAB>text``; only the first tab separates the
    two fields, so the text itself may contain tabs.

    Args:
        set_type: File stem of the split to read, e.g. ``"train"``.

    Returns:
        A shuffled list of ``InputExample`` objects. ``label`` is the raw
        string taken from the file (no conversion to int happens here).

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    data_path = data_dir + "/" + set_type + ".txt"
    examples = []
    with open(data_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # A blank line carries no sample; skipping it avoids the
            # "not enough values to unpack" error the split below would raise.
            if not line:
                continue
            label, text = line.split('\t', 1)
            examples.append(InputExample(text=text, label=label))
    random.shuffle(examples)
    return examples


def convert_examples_to_features(examples, label_list, max_seq, tokenizer):
    """Encode examples as fixed-length id/mask lists plus an integer label.

    Each text is tokenized, cut to ``max_seq - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]``, converted to ids and right-padded with zeros up to
    ``max_seq``. The mask is 1 for real tokens and 0 for padding. A label's id
    is its position in ``label_list``.

    Args:
        examples: Iterable of objects exposing ``text`` and ``label``.
        label_list: Labels in the order that defines their integer ids.
        max_seq: Length of every returned ``input_ids`` / ``input_mask`` list.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        A list of ``InputFeatures``, one per example, in input order.

    Raises:
        KeyError: If an example's label is not in ``label_list``.
    """
    label_to_id = dict(zip(label_list, range(len(label_list))))

    def encode(example):
        # Two positions are reserved for the special tokens around the text.
        body = tokenizer.tokenize(example.text)[:max_seq - 2]
        ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + body + ["[SEP]"])
        mask = [1] * len(ids)
        pad = [0] * (max_seq - len(ids))
        return InputFeatures(
            input_ids=ids + pad,
            input_mask=mask + pad,
            label_id=label_to_id[example.label])

    return [encode(example) for example in examples]


class BertClassification(BertPreTrainedModel):
    """BERT encoder followed by dropout and a linear classification layer.

    The classifier consumes the encoder's pooled output (second element of the
    tuple returned by ``BertModel``).

    Args:
        config: BERT configuration; ``hidden_dropout_prob`` and ``hidden_size``
            are read from it.
        num_labels: Number of output classes.
    """

    def __init__(self, config, num_labels=2):
        super(BertClassification, self).__init__(config)
        self.num_labels = num_labels
        self.bert       = BertModel(config)
        self.dropout    = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.init_weights()

    def forward(self, input_ids, input_mask, label_ids=None):
        """Return the cross-entropy loss if labels are given, else the logits.

        Args:
            input_ids: Token-id tensor passed to the encoder.
            input_mask: Attention mask (1 = real token, 0 = padding).
            label_ids: Optional class-id tensor; flattened before the loss.

        Returns:
            A scalar loss tensor when ``label_ids`` is not ``None``; otherwise
            the classifier logits.
        """
        # The mask must be passed by keyword: in `transformers` the second
        # positional argument of BertModel.forward is `attention_mask` and the
        # third is `token_type_ids`. The previous positional call
        # `self.bert(input_ids, None, input_mask)` therefore disabled masking
        # and fed the mask in as segment ids.
        # NOTE(review): a checkpoint fine-tuned with the old call learned on
        # those inputs and should be re-fine-tuned after this fix.
        # `return_dict=False` is requested explicitly so the tuple indexing
        # below does not depend on how the config was created.
        outputs = self.bert(input_ids, attention_mask=input_mask, return_dict=False)
        pooled_output = self.dropout(outputs[1])
        logits = self.classifier(pooled_output)
        if label_ids is None:
            return logits
        loss_fct = CrossEntropyLoss()
        return loss_fct(logits.view(-1, self.num_labels), label_ids.view(-1))


def createDataset(examples, label_list, max_seq, tokenizer, batch_size):
    """Encode examples and wrap them in a randomly-sampled ``DataLoader``.

    Args:
        examples, label_list, max_seq, tokenizer: Forwarded unchanged to
            ``convert_examples_to_features``.
        batch_size: Number of samples per batch.

    Returns:
        A ``DataLoader`` whose batches are ``(input_ids, input_mask,
        label_ids)`` long tensors, drawn through a ``RandomSampler``.
    """
    features = convert_examples_to_features(examples, label_list, max_seq, tokenizer)

    def column(attr):
        # Collect one attribute from every feature into a single long tensor.
        return torch.tensor([getattr(feat, attr) for feat in features], dtype=torch.long)

    dataset = TensorDataset(column("input_ids"), column("input_mask"), column("label_id"))
    return DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)


def compute_metrics(preds, labels):
    """Return accuracy (key ``'ac'``) and F1 score (key ``'f1'``).

    Accuracy is the mean of the element-wise comparison ``preds == labels``,
    so both arguments are expected to support that (e.g. numpy arrays).
    """
    accuracy = (preds == labels).mean()
    f1 = f1_score(y_true=labels, y_pred=preds)
    return {'ac': accuracy, 'f1': f1}


def fine_tune(bert_model='bert-base-chinese', max_seq=128, batch_size=64, num_epochs=5, lr=2e-5,
              save_path=None):
    """Fine-tune ``BertClassification`` and report the evaluation AUC after each epoch.

    Args:
        bert_model: Name or path handed to ``from_pretrained`` for both the
            tokenizer and the model.
        max_seq: Sequence length used when encoding the examples.
        batch_size: Batch size for the training and evaluation loaders.
        num_epochs: Number of passes over the training split.
        lr: Learning rate for AdamW.
        save_path: If given, the fine-tuned model object is written there with
            ``torch.save`` once training has finished. ``None`` (the default)
            saves nothing.

    Returns:
        None. The summed training loss and the AUC are printed every epoch.
    """
    train_examples  = create_examples(set_type="train")
    label_list      = ['0', '1']
    tokenizer       = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)
    model           = BertClassification.from_pretrained(bert_model, num_labels=len(label_list), return_dict=False)
    model.to(device)

    # Parameters whose name contains one of these fragments get no weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay        = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.00}]
    optimizer        = AdamW(optimizer_grouped_parameters, lr=lr)

    train_dataloader = createDataset(train_examples, label_list, max_seq, tokenizer, batch_size)

    # Evaluate on the held-out split instead of re-reading the training file.
    # NOTE(review): assumes <data_dir>/eval.txt exists - the load_data cell
    # further down reads that same file; confirm before running.
    # The loader is built once because it does not change between epochs.
    eval_examples   = create_examples(set_type="eval")
    eval_dataloader = createDataset(eval_examples, label_list, max_seq, tokenizer, batch_size)

    print('train...')
    for _ in trange(num_epochs, desc='Epoch'):
        # Switch back to training mode every epoch: the evaluation pass below
        # leaves the model in eval mode, which would silently turn dropout off
        # from the second epoch onwards.
        model.train()
        tr_loss = 0
        for batch in tqdm(train_dataloader, desc='Iteration'):
            input_ids, input_mask, label_ids = tuple(t.to(device) for t in batch)
            loss = model(input_ids, input_mask, label_ids)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            tr_loss += loss.item()
        print('tr_loss', tr_loss)

        print('eval...')
        model.eval()
        all_scores, all_labels = [], []
        with torch.no_grad():
            for batch in tqdm(eval_dataloader, desc='Evaluating'):
                input_ids, input_mask, label_ids = tuple(t.to(device) for t in batch)
                logits = model(input_ids, input_mask, None)
                # Score each sample with its probability for class index 1
                # (label '1'); ranking hard 0/1 predictions would reduce the
                # ROC curve to a single operating point.
                positive_prob = F.softmax(logits, dim=1)[:, 1]
                all_scores.extend(positive_prob.cpu().numpy())
                all_labels.extend(label_ids.cpu().numpy())
        # Compute the AUC over the whole evaluation split.
        fpr, tpr, thresholds = metrics.roc_curve(y_true=all_labels,
                                                 y_score=all_scores)
        auc = metrics.auc(fpr, tpr)
        print("AUC: ",  auc)

    if save_path is not None:
        torch.save(model, save_path)

# Run the fine-tuning loop with its default hyper-parameters.
fine_tune()

In [None]:
class Teacher(object):
    """Wraps the saved fine-tuned BERT classifier to score single texts.

    Args:
        bert_model: Name or path of the tokenizer to load.
        max_seq: Length every encoded input is truncated/padded to.
    """

    def __init__(self, bert_model='bert-base-chinese', max_seq=128):
        self.max_seq = max_seq
        self.tokenizer = BertTokenizer.from_pretrained(
            bert_model, do_lower_case=True)
        # NOTE(review): torch.load unpickles a complete module object, which
        # can execute arbitrary code - only load checkpoints you trust.
        # map_location puts the weights on the same device that predict()
        # sends its inputs to.
        self.model = torch.load('drive/My Drive/模型压缩/test_model', map_location=device)
        self.model.eval()

    def predict(self, text):
        """Return the class probabilities for one text.

        Args:
            text: Raw input string.

        Returns:
            A numpy array with one row holding the softmax over the model's
            logits.
        """
        # Encode exactly like convert_examples_to_features does for training:
        # reserve two positions and wrap the text in [CLS] ... [SEP]. Without
        # the special tokens the model sees inputs it was never fine-tuned on.
        tokens = self.tokenizer.tokenize(text)[:self.max_seq - 2]
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        padding = [0] * (self.max_seq - len(input_ids))
        input_ids = torch.tensor([input_ids + padding], dtype=torch.long).to(device)
        input_mask = torch.tensor([input_mask + padding], dtype=torch.long).to(device)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            logits = self.model(input_ids, input_mask, None)
        return F.softmax(logits, dim=1).detach().cpu().numpy()


In [None]:
import jieba, fileinput
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

def load_part_set(path, name, tokenizer):
    """Read one split of a dataset and convert its texts to index sequences.

    The file ``drive/My Drive/模型压缩/<name>/<path>.txt`` is read as UTF-8;
    every non-blank line must be ``label<TAB>text``.

    Args:
        path: File stem of the split (e.g. ``"train"``).
        name: Dataset folder name.
        tokenizer: Fitted object providing ``texts_to_sequences``.

    Returns:
        ``(X, Y, TEXT)``: the index sequences produced by the tokenizer from
        the jieba-segmented texts, the integer labels, and the stripped raw
        texts, all in file order.

    Raises:
        ValueError: If a non-blank line has no tab or a non-integer label.
    """
    data_path = "drive/My Drive/模型压缩/" + name + "/" + path + ".txt"
    X, Y, TEXT = [], [], []
    # The context manager closes the handle; it was previously left open.
    with open(data_path, encoding="utf-8") as f:
        for line in f:
            # Blank lines hold no sample and would break the unpacking below.
            if not line.strip():
                continue
            label, text = line.rstrip('\n').split('\t', 1)
            text = text.strip()
            TEXT.append(text)
            # Words are joined by spaces because the tokenizer splits on them.
            X.append(' '.join(jieba.cut(text)))
            Y.append(int(label))
    X = tokenizer.texts_to_sequences(X)
    return X, Y, TEXT


def load_data(name):
    """Load the three splits of a dataset plus a word-embedding matrix.

    A Keras ``Tokenizer`` is fitted on ``<name>/<name>.txt`` and used to index
    the ``train``, ``eval`` and ``test`` splits. Embedding rows are filled from
    the ``drive/My Drive/word2vec`` text file for every vocabulary word found
    there; all other rows stay zero.

    Args:
        name: Dataset folder name under ``drive/My Drive/模型压缩/``.

    Returns:
        ``(x_train, y_train, text_train), (x_dev, y_dev, text_dev),
        (x_test, y_test, text_test), v_size, embs`` where ``v_size`` is the
        vocabulary size plus one and ``embs`` has shape ``(v_size, 300)``.
    """
    def get_w2v():
        # Stream the vector file line by line, yielding (word, vector) pairs;
        # the handle is closed once the generator is exhausted.
        with open('drive/My Drive/word2vec', encoding="utf-8") as f:
            for line in f:
                fields = line.strip().split()
                if not fields:
                    continue
                yield fields[0], np.array(list(map(float, fields[1:])))

    tokenizer = Tokenizer(filters='', lower=True, split=' ', oov_token=1)
    corpus_path = 'drive/My Drive/模型压缩/{}/{}.txt'.format(name, name)
    with open(corpus_path, encoding="utf-8") as f:
        # Keep only the text field of each non-blank line, segmented by jieba.
        texts = [' '.join(jieba.cut(line.split('\t', 1)[1].strip()))
                 for line in f if line.strip()]
    tokenizer.fit_on_texts(texts)

    x_train, y_train, text_train = load_part_set("train", name, tokenizer)
    x_dev,   y_dev,   text_dev   = load_part_set("eval",   name, tokenizer)
    x_test,  y_test,  text_test  = load_part_set("test",  name, tokenizer)

    v_size = len(tokenizer.word_index) + 1
    embs, w2v = np.zeros((v_size, 300)), dict(get_w2v())
    for word, index in tokenizer.word_index.items():
        if word in w2v: embs[index] = w2v[word]
    return (x_train, y_train, text_train), \
           (x_dev, y_dev, text_dev), \
           (x_test, y_test, text_test), \
           v_size, embs



In [None]:
from tqdm import tqdm

# Wrapper around the saved fine-tuned model; it supplies the soft targets below.
teacher = Teacher()

# Student settings: padded sequence length, learning rate, epochs, dataset folder.
x_len, lr, epochs = 50, 0.001, 10
name = 'hotel'

(
    (x_train, y_train, T_train),
    (x_val, y_val, T_val),
    (x_test, y_test, T_test),
    vec_size,
    embs,
) = load_data(name)

# Bring every index sequence to exactly x_len entries.
x_train, x_val, x_test = (
    sequence.pad_sequences(seqs, maxlen=x_len) for seqs in (x_train, x_val, x_test)
)

# Teacher probabilities, one row per text. Only the first 2000 validation
# texts are scored.
with torch.no_grad():
    teacher_train = np.vstack(list(map(teacher.predict, tqdm(T_train))))
    teacher_val   = np.vstack(list(map(teacher.predict, tqdm(T_val[:2000]))))


In [None]:
class RNN(nn.Module):
    """Bidirectional-LSTM classifier that returns probabilities and log-probabilities.

    Args:
        x_dim: Number of rows in the embedding table (index 0 is padding).
        e_dim: Embedding size.
        h_dim: LSTM hidden size per direction.
        o_dim: Number of output classes.
    """

    def __init__(self, x_dim, e_dim, h_dim, o_dim):
        super().__init__()
        self.h_dim = h_dim
        self.dropout = nn.Dropout(p=0.2)
        self.emb = nn.Embedding(num_embeddings=x_dim, embedding_dim=e_dim, padding_idx=0)
        self.lstm = nn.LSTM(input_size=e_dim, hidden_size=h_dim,
                            batch_first=True, bidirectional=True)
        # Both directions are concatenated, hence twice the hidden size.
        self.fc = nn.Linear(in_features=h_dim * 2, out_features=o_dim)
        self.softmax = nn.Softmax(dim=1)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Map a batch of index sequences to ``(softmax, log_softmax)`` outputs."""
        embedded = self.emb(x)
        sequence_out, _state = self.lstm(self.dropout(embedded))
        # Classify from the LSTM output at the last time step.
        logits = self.fc(sequence_out[:, -1, :])
        probs = self.softmax(logits)
        log_probs = self.log_softmax(logits)
        return probs, log_probs

In [None]:
from torch.autograd import Variable
from keras.preprocessing import sequence

# Legacy tensor-type aliases. The loop below no longer uses them; they stay
# defined in case another cell still refers to them.
LTensor = torch.cuda.LongTensor
FTensor = torch.cuda.FloatTensor

# Student network, placed on the device chosen in the setup cell rather than
# unconditionally on CUDA.
model    = RNN(vec_size, 256, 256, 2).to(device)
opt      = optim.Adam(model.parameters(), lr=lr)
ce_loss  = nn.NLLLoss()   # not used by the pure-distillation loss below
mse_loss = nn.MSELoss()
batch_size = 64

for epoch in range(epochs):
    losses = []

    # --- training: regress the student's probabilities onto the teacher's ---
    model.train()
    for i in range(0, len(x_train), batch_size):
        opt.zero_grad()
        batch_x = torch.as_tensor(x_train[i:i + batch_size], dtype=torch.long, device=device)
        # Named so that the global `teacher` (the Teacher instance) is not
        # overwritten, which would break a re-run of the cell that calls
        # teacher.predict.
        batch_soft = torch.as_tensor(teacher_train[i:i + batch_size], dtype=torch.float, device=device)
        probs, _log_probs = model(batch_x)
        loss = mse_loss(probs, batch_soft)
        loss.backward()
        opt.step()
        losses.append(loss.item())

    # --- validation: collect scores for the whole split, then compute metrics once ---
    model.eval()
    val_scores, val_preds = [], []
    with torch.no_grad():
        for i in range(0, len(x_val), batch_size):
            batch_x = torch.as_tensor(x_val[i:i + batch_size], dtype=torch.long, device=device)
            probs, _log_probs = model(batch_x)
            # Probability of class index 1 is the ranking score for the ROC curve.
            val_scores.extend(probs[:, 1].cpu().tolist())
            val_preds.extend(probs.argmax(dim=1).cpu().tolist())

    val_labels = np.asarray(y_val)
    # One AUC per epoch over all validation samples. Computing it per
    # mini-batch from hard predictions is noisy and ill-defined whenever a
    # batch contains a single class.
    fpr, tpr, thresholds = metrics.roc_curve(y_true=val_labels, y_score=val_scores)
    auc = metrics.auc(fpr, tpr)
    # Accuracy over all samples, so a smaller final batch is not over-weighted.
    val_acc = float(np.mean(np.asarray(val_preds) == val_labels))
    print("AUC: ",  auc)
    print("loss: ", np.mean(losses),  "  acc",  val_acc)

In [None]:
# Write the trained student to Drive. The whole module object is pickled (not
# just a state_dict), so the RNN class must be available when loading it back.
torch.save(model, "drive/My Drive/模型压缩/BiLSTM")