In [0]:
# !pip install allennlp
!pip install transformers

In [0]:
import os
# go the working directory
os.chdir("drive")
os.chdir("My Drive") 
!ls

In [0]:
import json
import torch
import torch.utils.data as Data
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


def load_examples(path):
  """Read a labelled JSON dataset into parallel lists.

  The file is expected to hold a JSON object whose values are objects with
  a 'text' and a 'label' entry. Each text is wrapped in the BERT special
  tokens '[CLS] ... [SEP]'.

  Args:
    path: location of the JSON file.

  Returns:
    (texts, labels): two lists of equal length, in file order.
  """
  # JSON is UTF-8 by specification; do not rely on the platform default.
  with open(path, 'r', encoding='utf-8') as f:
    records = json.load(f)
  texts, labels = [], []
  for v in records.values():
    texts.append('[CLS] ' + v['text'] + ' [SEP]')
    labels.append(v['label'])
  return texts, labels


train_data, train_labels = load_examples('Colab/train_total_balance_new3.json')
print("length_dataset:",len(train_data))
print(train_data[0])

dev_data, dev_labels = load_examples('Colab/dev.json')
print("dev:",dev_data[0])

In [0]:
from keras_preprocessing.sequence import pad_sequences
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
# Lower-casing word-piece tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

# Split every sentence into word-piece tokens: training set first, then dev set.
tokenized_train = list(map(tokenizer.tokenize, train_data))
tokenized_dev = list(map(tokenizer.tokenize, dev_data))
print("tokenized_dev:", tokenized_dev[0])

In [0]:
# Pad short sequences, and shorten long ones by cutting tokens out of the middle (keeping both the head and the tail)
def truncating_from_middle(input_lists,maxlen,value=0):
  """Bring every sequence to exactly `maxlen` items.

  Sequences longer than `maxlen` keep their first quarter-of-`maxlen` items
  and their last three-quarters-of-`maxlen` items; the middle is dropped.
  (For maxlen=512 this is the 128-head / 384-tail split.)
  Shorter sequences are right-padded with `value`.

  Args:
    input_lists: iterable of sequences (e.g. lists of token ids).
    maxlen: target length of every returned list.
    value: padding item appended to short sequences.

  Returns:
    A new list of lists, each of length `maxlen`; inputs are not modified.
  """
  # Derive the split from maxlen so head + tail always equals maxlen.
  head = maxlen // 4
  tail = maxlen - head
  new_lists = []
  for seq in input_lists:
    seq = list(seq)
    if len(seq) > maxlen:
      # Index from the front for the tail slice: seq[-0:] would return
      # the whole sequence when tail == 0.
      new_lists.append(seq[:head] + seq[len(seq) - tail:])
    else:
      new_lists.append(seq + [value] * (maxlen - len(seq)))
  return new_lists

In [0]:
MAX_LEN = 256

# Map word-piece tokens to their vocabulary ids.
input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_train]
dev_input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_dev]

# Bring every sequence to MAX_LEN: pad with 0 at the end, cut from the front.
# Alternative kept for reference (cut from the middle instead):
# input_ids = truncating_from_middle(input_ids,maxlen=MAX_LEN,value=0)
# dev_input_ids = truncating_from_middle(dev_input_ids,maxlen=MAX_LEN,value=0)
# NOTE(review): truncating="pre" removes tokens from the START of over-long
# sequences, so those examples lose their leading [CLS] token -- confirm
# this is intended.
_pad_kwargs = dict(value=0, maxlen=MAX_LEN, dtype="long", truncating="pre", padding="post")
input_ids = pad_sequences(input_ids, **_pad_kwargs)
dev_input_ids = pad_sequences(dev_input_ids, **_pad_kwargs)
print("input_ids:", len(input_ids[0]))

# Attention masks: 1 for real tokens (id > 0), 0 for padding.
train_masks = [[int(token_id > 0) for token_id in row] for row in input_ids]
validation_masks = [[int(token_id > 0) for token_id in row] for row in dev_input_ids]

# Names used by the tensor / dataloader cell.
train_inputs = input_ids
validation_inputs = dev_input_ids
validation_labels = dev_labels


In [0]:
# Convert the training split to tensors ...
train_inputs = torch.tensor(train_inputs)
train_masks = torch.tensor(train_masks)
train_labels = torch.tensor(train_labels)
# ... and the validation split.
validation_inputs = torch.tensor(validation_inputs)
validation_masks = torch.tensor(validation_masks)
validation_labels = torch.tensor(validation_labels)

batch_size = 32

# Training batches are drawn in random order.
train_data = Data.TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = Data.RandomSampler(train_data)
train_dataloader = Data.DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)

# Validation batches keep the original order so predictions line up with dev_labels.
validation_data = Data.TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = Data.SequentialSampler(validation_data)
validation_dataloader = Data.DataLoader(
    validation_data,
    sampler=validation_sampler,
    batch_size=batch_size,
)
print("Create dataloader done!")

In [0]:
from transformers import BertForSequenceClassification
# Load the pre-trained BERT encoder with a 2-class classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)
# Move the model to the device selected earlier. model.cuda() would raise on
# a CPU-only runtime even though `device` already falls back to 'cpu'.
model.to(device)

In [0]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
# Number of passes over the training set.
epochs = 4

# AdamW over all model parameters.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-5)

# Linear learning-rate decay over every optimisation step, with no warm-up phase.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [0]:
import numpy as np
from tqdm import trange
from sklearn.metrics import precision_recall_fscore_support

# accuracy helper
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: array of true class indices.

    Returns:
        Number of matching examples divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    expected_classes = labels.flatten()
    matches = predicted_classes == expected_classes
    return np.sum(matches) / len(expected_classes)

# Fine-tune for `epochs` epochs; after each epoch save a checkpoint and
# score the model on the dev set.
train_loss_set = []  # per-step training loss, kept for plotting after training
for epoch in trange(epochs, desc="Epoch"):
    # ---------- training ----------
    model.train()
    total_loss = 0
    nb_tr_steps = 0
    for step, batch in enumerate(train_dataloader):
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        optimizer.zero_grad()
        # With labels supplied, BertForSequenceClassification returns
        # the loss at index 0 and the logits at index 1.
        loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)[0]
        step_loss = loss.item()
        train_loss_set.append(step_loss)

        # Backward pass to compute the gradients.
        loss.backward()

        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        total_loss += step_loss
        nb_tr_steps += 1
    print("Train loss: {}".format(total_loss / nb_tr_steps))

    # ---------- checkpoint ----------
    PATH = 'Colab/model_epoch_' + str(epoch) +'.pth'
    torch.save(model.state_dict(), PATH)

    # ---------- evaluation ----------
    print("Running evaluation...")
    model.eval()
    all_logits, all_label_ids = [], []
    for batch in validation_dataloader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            # Without labels the logits are the first output.
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]
        all_logits.append(logits.detach().cpu().numpy())
        all_label_ids.append(b_labels.to('cpu').numpy())

    # Concatenate once after the loop instead of on every batch.
    all_logits = np.concatenate(all_logits)
    all_label_ids = np.concatenate(all_label_ids)
    dev_predicts = np.argmax(all_logits, axis=1)

    # Accuracy over all dev examples at once. Averaging per-batch accuracies
    # would over-weight a final batch that is smaller than batch_size.
    eval_accuracy = flat_accuracy(all_logits, all_label_ids)

    p, r, f, _ = precision_recall_fscore_support(dev_labels, dev_predicts, pos_label=1, average="binary")
    print("scoring.py:precision:",p," recall:",r," f1_score:",f)
    print("Validation Accuracy: {}".format(eval_accuracy))
print("\nTraining complete!")

In [0]:
# Load the unlabelled test set and wrap each text in the BERT special tokens.
test_texts = []
test_labels = []
# JSON is UTF-8 by specification; the `with` block closes the file itself.
with open('Colab/test-unlabelled.json', 'r', encoding='utf-8') as f:
  data = json.load(f)
for v in data.values():
  test_texts.append('[CLS] ' + v['text'] + ' [SEP]')
  # Placeholder label: the test set has none, but TensorDataset below
  # is built with three tensors like the train/dev datasets.
  test_labels.append(1)
print("length_dataset:",len(test_texts))
print(test_texts[0])
print(test_labels[0])

# Same preprocessing as the train/dev data: tokenize, map to ids, pad/truncate.
tokenized_test = [tokenizer.tokenize(sent) for sent in test_texts]
test_input_ids = [tokenizer.convert_tokens_to_ids(sent) for sent in tokenized_test]
# test_input_ids = truncating_from_middle(test_input_ids,maxlen=MAX_LEN,value=0)
test_input_ids = pad_sequences(test_input_ids, value=0, maxlen=MAX_LEN, dtype="long", truncating="pre", padding="post")

print("generate masks")
# 1 for real tokens (id > 0), 0 for padding.
test_masks = [[int(i > 0) for i in seq] for seq in test_input_ids]

test_inputs = test_input_ids

print("transfer to tensor")
test_inputs = torch.tensor(test_inputs)
test_labels = torch.tensor(test_labels)
test_masks = torch.tensor(test_masks)

# Sequential loader so predictions stay in file order.
test_data = Data.TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = Data.SequentialSampler(test_data)
test_dataloader = Data.DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [0]:
from tqdm import tqdm
# Restore the checkpoint saved after the third epoch (index 2).
# map_location lets a checkpoint saved on the GPU be loaded on a CPU-only runtime.
model.load_state_dict(torch.load('Colab/model_epoch_2.pth', map_location=device))
model.eval()

batch_preds = []
for batch in test_dataloader:
  # Move the batch to the model's device and unpack it
  # (the labels are placeholders and are not used).
  b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
  with torch.no_grad():
    outputs = model(b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask)
  logits = outputs[0].detach().cpu().numpy()
  batch_preds.append(np.argmax(logits, axis=1))

# Concatenate once after the loop. The leading empty float array keeps the
# result a float array, as before, and makes an empty loader yield an empty result.
test_predicts = np.concatenate([np.empty(0)] + batch_preds)

print(test_predicts)


In [0]:
print("store now...")
# Convert the predictions to plain Python ints so they can be JSON-serialised.
test_predicts = [int(i) for i in test_predicts]
print(test_predicts)
# Output mapping, filled in below as "test-<index>" -> {"label": <prediction>}.
state = {}
# Running index used to build the "test-<index>" keys.
counttt = 0
# Positions in test_predicts whose prediction is overwritten with 0 before
# the results are written out.
# NOTE(review): the origin of these indices is not shown in this notebook --
# confirm where they come from and that they match the current test set.
u_r = [3, 5, 6, 8, 13, 17, 20, 22, 23, 24, 28, 30, 32, 35, 36, 39, 45, 46, 47, 49, 50, 53, 54, 55, 60, 62, 64, 65, 66, 68, 69, 73, 74, 75, 78, 79, 80, 81, 82, 88, 89, 91, 93, 94, 95, 96, 97, 98, 100, 101, 102, 103, 106, 111, 114, 119, 120, 121, 122, 123, 131, 137, 138, 140, 141, 144, 145, 147, 149, 151, 152, 154, 159, 161, 163, 165, 167, 168, 170, 172, 175, 176, 178, 184, 187, 191, 193, 194, 196, 199, 200, 202, 204, 206, 209, 210, 211, 213, 216, 218, 220, 221, 225, 227, 228, 229, 230, 232, 235, 238, 239, 240, 241, 243, 245, 246, 250, 251, 252, 254, 258, 259, 262, 263, 264, 268, 271, 272, 276, 278, 279, 280, 284, 286, 288, 292, 297, 298, 299, 300, 301, 305, 310, 312, 313, 314, 315, 316, 317, 321, 322, 323, 325, 326, 327, 328, 331, 332, 333, 334, 335, 336, 338, 339, 343, 346, 349, 350, 351, 353, 354, 355, 360, 362, 365, 367, 369, 371, 373, 374, 375, 376, 378, 382, 384, 385, 393, 394, 395, 396, 398, 405, 412, 413, 416, 418, 420, 422, 423, 424, 425, 428, 429, 430, 431, 433, 435, 437, 438, 439, 440, 441, 442, 443, 444, 446, 448, 453, 454, 457, 459, 460, 461, 462, 466, 467, 468, 470, 471, 473, 474, 481, 482, 486, 489, 490, 493, 497, 500, 503, 504, 505, 507, 512, 513, 514, 515, 519, 522, 523, 528, 529, 530, 531, 532, 533, 534, 536, 537, 541, 548, 549, 551, 559, 562, 563, 565, 573, 574, 577, 585, 587, 588, 590, 597, 599, 600, 603, 606, 607, 609, 611, 613, 614, 618, 619, 621, 628, 635, 641, 646, 647, 649, 651, 652, 654, 656, 661, 662, 668, 669, 671, 674, 675, 676, 679, 680, 683, 685, 686, 689, 690, 694, 696, 697, 698, 699, 701, 702, 704, 705, 709, 710, 711, 712, 716, 718, 721, 725, 727, 728, 732, 733, 734, 736, 737, 739, 741, 744, 746, 747, 748, 751, 753, 754, 756, 757, 759, 761, 762, 766, 767, 774, 787, 788, 789, 790, 793, 794, 795, 796, 797, 803, 804, 812, 814, 818, 821, 823, 827, 829, 830, 832, 834, 836, 842, 843, 844, 847, 848, 849, 851, 853, 854, 855, 856, 858, 860, 861, 863, 864, 865, 866, 870, 871, 872, 878, 880, 883, 884, 889, 892, 893, 894, 898, 899, 901, 907, 909, 
910, 912, 914, 915, 917, 918, 920, 921, 922, 923, 924, 925, 926, 930, 938, 945, 947, 949, 951, 952, 953, 955, 956, 957, 960, 962, 964, 966, 968, 969, 971, 972, 973, 975, 978, 980, 985, 986, 988, 989, 992, 997, 999, 1000, 1004, 1006, 1007, 1009, 1010, 1011, 1014, 1018, 1019, 1021, 1022, 1023, 1024, 1026, 1030, 1033, 1034, 1036, 1037, 1039, 1042, 1043, 1044, 1045, 1048, 1051, 1053, 1054, 1055, 1056, 1057, 1058, 1062, 1065, 1070, 1072, 1073, 1074, 1075, 1081, 1083, 1084, 1085, 1089, 1090, 1091, 1094, 1097, 1098, 1099, 1103, 1104, 1106, 1113, 1115, 1116, 1117, 1118, 1119, 1120, 1124, 1125, 1132, 1136, 1137, 1138, 1139, 1142, 1143, 1148, 1151, 1152, 1154, 1156, 1157, 1160, 1161, 1165, 1167, 1168, 1170, 1172, 1175, 1176, 1178, 1179, 1181, 1182, 1184, 1186, 1190, 1192, 1193, 1198, 1199, 1201, 1202, 1203, 1205, 1216, 1218, 1221, 1222, 1224, 1225, 1228, 1230, 1236, 1237, 1240, 1241, 1242, 1244, 1245, 1247, 1256, 1258, 1260, 1261, 1262, 1265, 1266, 1267, 1268, 1269, 1270, 1271, 1272, 1278, 1279, 1280, 1282, 1283, 1284, 1285, 1286, 1288, 1292, 1299, 1301, 1302, 1303, 1304, 1307, 1313, 1315, 1316, 1317, 1318, 1320, 1321, 1325, 1326, 1328, 1329, 1330, 1331, 1332, 1335, 1338, 1341, 1342, 1343, 1344, 1345, 1346, 1347, 1348, 1352, 1355, 1356, 1358, 1359, 1361, 1363, 1365, 1366, 1368, 1370, 1373, 1375, 1378, 1379, 1381, 1382, 1384, 1390, 1397, 1398, 1403, 1406]
# Force the prediction at every index listed in u_r to 0.
# Raises IndexError if an index lies beyond the end of test_predicts.
for i in u_r:
    test_predicts[i] = 0

# Build the output mapping "test-<index>" -> {"label": <prediction>},
# numbering the predictions in order starting from 0.
state = {
  "test-" + str(index): {"label": label}
  for index, label in enumerate(test_predicts)
}

# The `with` block closes the file; no explicit close() is needed.
with open('Colab/test-output1.json','w') as f:
  json.dump(state,f)

print("finish all tasks in prediction!!!")

In [0]:
!nvidia-smi
# The command above reports overall GPU usage.
# The commented-out commands below inspect, and if necessary kill, a specific process that is using the GPU.
# !pmap -d 125
# !sudo apt-get install psmisc
# !fuser -v /dev/nvidia*
# !kill -9 542