In [None]:
import torch
import ipywidgets as widgets
torch.manual_seed(17)
import numpy as np
import pickle

In [None]:
import pandas as pd
# Read the private test set with every column as a string, drop three columns,
# write the reduced table to 'testset.csv', and preview the first rows.
dataset = pd.read_csv('../data/task2_private_testset.csv', dtype=str)
dataset = dataset.drop(columns=['Categories', 'Created Date', 'Authors'])
dataset.to_csv('testset.csv', index=False)
dataset.head()

In [None]:
#For BERT
from tqdm import tqdm_notebook as tqdm
from multiprocessing import Pool

def label_to_onehot(labels):
    """Encode a whitespace-separated label string as a multi-hot vector.

    Args:
        labels (str): label names separated by whitespace; each must be one of
            THEORETICAL, ENGINEERING, EMPIRICAL or OTHERS.
    Return:
        list of int: four entries ordered THEORETICAL, ENGINEERING, EMPIRICAL,
        OTHERS, with 1 where the label occurs and 0 elsewhere.
    Raises:
        KeyError: if a label name is not one of the four known names.
    """
    positions = {'THEORETICAL': 0, 'ENGINEERING': 1, 'EMPIRICAL': 2, 'OTHERS': 3}
    vector = [0] * len(positions)
    for name in labels.split():
        vector[positions[name]] = 1
    return vector
    
def get_dataset(data_path, n_workers=4):
    """Read a CSV file and preprocess its rows in parallel worker processes.

    The rows are cut into ``n_workers`` contiguous slices of equal size; the
    last slice additionally takes whatever rows are left over. Each slice is
    handed to ``preprocess_samples`` in a separate process.

    Args:
        data_path (str): Path to the CSV file (every column is read as str).
        n_workers (int): Number of worker processes, and of slices.
    Returns:
        list: the per-slice results concatenated in slice order.
    """
    frame = pd.read_csv(data_path, dtype=str)

    pending = [None] * n_workers
    with Pool(processes=n_workers) as pool:
        total = len(frame)
        chunk = total // n_workers
        for worker in range(n_workers):
            lo = chunk * worker
            # The final worker also takes the remainder rows.
            hi = total if worker == n_workers - 1 else chunk * (worker + 1)
            pending[worker] = pool.apply_async(preprocess_samples, args=(frame[lo:hi],))

        # Stop accepting work and wait for every submitted slice to finish.
        pool.close()
        pool.join()

    processed = []
    for job in pending:
        processed.extend(job.get())
    return processed

def preprocess_samples(dataset):
    """Worker function: preprocess every row of a table slice.

    Args:
        dataset (pandas.DataFrame): rows to process; iterated with
            ``iterrows`` behind a progress bar.
    Returns:
        list: one ``preprocess_sample`` result per row, in row order.
    """
    rows = tqdm(dataset.iterrows(), total=len(dataset))
    # iterrows yields (index, row) pairs; only the row is needed.
    return [preprocess_sample(row) for _, row in rows]

def preprocess_sample(data):
    """Convert one record into a sentence list plus an optional label vector.

    Args:
        data (dict-like): must provide 'Title' and 'Abstract' strings; the
            abstract's sentences are separated by '$$$'. May provide 'Task 2'.
    Returns:
        dict: 'Abstract' holds the title (with a trailing period) followed by
        the abstract's sentences; 'Label' is added, via ``label_to_onehot``,
        only when 'Task 2' is present in ``data``.
    """
    sentences = [data['Title'] + "."]
    sentences.extend(data['Abstract'].split('$$$'))

    processed = {'Abstract': sentences}
    if 'Task 2' in data:
        processed['Label'] = label_to_onehot(data['Task 2'])
    return processed

In [None]:
# Preprocess the reduced test CSV with four worker processes.
print('[INFO] Start processing testset...')
test = get_dataset('testset.csv', n_workers=4)

In [None]:
from torch.utils.data import Dataset
import torch
from pytorch_transformers import *
from multiprocessing import Pool
from tqdm import tqdm_notebook as tqdm
from functools import reduce
import numpy as np
import itertools

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Alternative checkpoints noted by the author (with the number recorded next
# to each, presumably the hidden size):
#   bert-large-cased-whole-word-masking-finetuned-squad  1024
#   bert-base-uncased                                     768
#   bert-large-uncased                                    1024
#   ./scibert-scivocab-uncased/
tokenizer = BertTokenizer.from_pretrained('../scibert_scivocab_uncased/')
bert_model = BertModel.from_pretrained('../scibert_scivocab_uncased/')
bert_model.to(device)
# eval() is train(False): the encoder is only used for inference here.
bert_model.eval()

class BertDataset(Dataset):
    """Dataset holding one BERT embedding vector per document.

    At construction time every document is tokenized with the module-level
    ``tokenizer``, run once through the module-level ``bert_model`` on
    ``device``, and reduced to the hidden state at the final token position.
    Each item is a dict with 'Abstract' (a list containing that single
    embedding vector) and 'Label' (the document's label, or ``[]`` when the
    input sample has none).
    """

    # Upper bound on the token-id sequence fed to the model. Longer inputs are
    # cut to MAX_TOKENS - 1 ids and closed with a [SEP] id.
    MAX_TOKENS = 512

    def __init__(self, data, max_len = 256, n_workers=1):
        """Embed every document in ``data``.

        Args:
            data (list of dict): samples with an 'Abstract' list of sentences
                and optionally a 'Label'.
            max_len (int): stored on the instance; not read elsewhere in this
                class.
            n_workers (int): accepted but not used.
        """
        # The [SEP] id is loop-invariant, so look it up once.
        sep_id = tokenizer.convert_tokens_to_ids(['[SEP]'])

        processed_data = []
        for d in tqdm(data, total=len(data)):
            # Layout: [CLS] sentence_0 [SEP] sentence_1 [SEP] ...
            token_sent = []
            for idx, sentence in enumerate(d['Abstract']):
                tokens = tokenizer.tokenize(sentence) + ['[SEP]']
                if idx == 0:
                    tokens = ['[CLS]'] + tokens
                token_sent += tokenizer.convert_tokens_to_ids(tokens)
            if len(token_sent) > self.MAX_TOKENS - 1:
                token_sent = token_sent[:self.MAX_TOKENS - 1] + sep_id

            # Batch of one document.
            encode_sentence_tensor = torch.tensor([token_sent]).to(device)
            with torch.no_grad():
                last_hidden = bert_model(encode_sentence_tensor)[0]
            # Keep only the vector at the last token position.
            embedding = last_hidden[0, -1, :].to('cpu').tolist()

            processed_d = {'Abstract': [embedding], 'Label': []}
            if 'Label' in d:
                processed_d['Label'] = d['Label']
            processed_data.append(processed_d)

        self.data = processed_data
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the stored sample dict at ``index`` (not a copy)."""
        return self.data[index]

    def collate_fn(self, datas):
        """Stack a list of samples into a pair of float tensors.

        Args:
            datas (list of dict): samples as returned by ``__getitem__``.
        Returns:
            tuple: ``(abstracts, labels)`` as ``torch.FloatTensor``; the rows
            of ``abstracts`` are the samples' embedding vectors in order, and
            ``labels`` has one row per sample that carries a 'Label' key.
        """
        # Collect into fresh lists. Extending the first sample's own
        # 'Abstract' list in place would permanently grow the stored dataset
        # entry and corrupt every later pass over the data.
        batch_abstract = []
        batch_label = []
        for data in datas:
            batch_abstract.extend(data['Abstract'])
            if 'Label' in data:
                batch_label.append(data['Label'])
        return torch.FloatTensor(batch_abstract), torch.FloatTensor(batch_label)

In [None]:
testData = BertDataset(test)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class simpleNet(nn.Module):
    """Two-layer classifier with a residual input layer and four sigmoid outputs.

    ``forward`` adds an extra learned bias (``b0``, ``b1``, ``b2``) on top of
    each linear layer's output. ``conv1`` and ``b3`` are created in
    ``__init__`` but are not used by ``forward``.
    """

    def __init__(self, vocabulary_size):
        """Create the layers.

        Args:
            vocabulary_size (int): number of input features per sample.
        """
        super().__init__()
        in_features = vocabulary_size
        self.hidden_dim1 = 512

        # Input-sized layer whose activation is added back onto the input.
        self.l0 = nn.Linear(in_features, in_features)
        self.b0 = nn.Parameter(torch.zeros(in_features))

        # Hidden layer.
        self.l1 = nn.Linear(in_features, self.hidden_dim1)
        self.b1 = nn.Parameter(torch.zeros(self.hidden_dim1))
        self.relu1 = nn.ReLU()

        # Output layer: one score per class.
        self.l2 = nn.Linear(self.hidden_dim1, 4)
        self.b2 = nn.Parameter(torch.zeros(4))
        self.dropout = nn.Dropout(0.6)

        # Registered on the module but not referenced in forward().
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=5, kernel_size=3)
        self.b3 = nn.Parameter(torch.zeros(5, in_features - 2))

    def forward(self, x):
        """Return per-class sigmoid scores.

        Args:
            x (torch.Tensor): 2-D input; the last dimension must match the
                ``vocabulary_size`` given at construction.
        Returns:
            torch.Tensor: one row of four scores per input row.
        """
        # Unpacking into two names rejects inputs that are not 2-D.
        _rows, _features = x.shape
        residual = self.dropout(self.relu1(self.l0(x) + self.b0))
        hidden = self.dropout(self.relu1(self.l1(x + residual) + self.b1))
        logits = self.l2(hidden) + self.b2
        return torch.sigmoid(logits)

In [None]:
from torch.utils.data import DataLoader
from tqdm import trange
import scipy.io
from util import *

In [None]:
def SubmitGenerator(prediction, sampleFile, public=True, filename='prediction.csv'):
    """Write predictions to a CSV laid out like the sample submission.

    The sample file usually has more rows than ``prediction``; the missing
    rows are filled with zeros, placed after the predictions when ``public``
    is true and before them otherwise.

    Args:
        prediction (numpy array): one row per sample, columns ordered
            THEORETICAL, ENGINEERING, EMPIRICAL, OTHERS.
        sampleFile (str): path to a CSV with an ``order_id`` column.
        public (boolean): selects on which side the zero rows are placed.
        filename (str): path of the CSV to write.
    """
    sample = pd.read_csv(sampleFile)
    submit = {'order_id': list(sample.order_id.values)}

    padding = [0] * (len(sample) - prediction.shape[0])
    label_names = ('THEORETICAL', 'ENGINEERING', 'EMPIRICAL', 'OTHERS')
    for column, name in enumerate(label_names):
        values = list(prediction[:, column])
        submit[name] = values + padding if public else padding + values

    pd.DataFrame.from_dict(submit).to_csv(filename, index=False)

In [None]:
# Use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Batches of 64 in dataset order (no shuffling), assembled by the dataset's
# own collate function in four loader worker processes.
dataloader = DataLoader(
    dataset=testData,
    batch_size=64,
    shuffle=False,
    collate_fn=testData.collate_fn,
    num_workers=4,
)

In [None]:
import os

# For every saved model: load it together with the thresholds stored under
# './Results', score the test set, threshold the first three class scores,
# and save raw scores plus final labels under 'Results_Testing'.
#
# NOTE(review): torch.load unpickles arbitrary Python objects; only run this
# cell on model files from a trusted source.

# savemat does not create missing directories.
os.makedirs('Results_Testing', exist_ok=True)

cnt = 0
# Sorted so the processing order does not depend on the filesystem.
for f in sorted(os.listdir('./well-trained-model')):
    cnt += 1
    print(f)
    # map_location lets a model saved on a GPU load on a CPU-only machine.
    model = torch.load('./well-trained-model/'+f, map_location=device)
    # Inference mode: without this a model saved in training mode keeps its
    # dropout layers active and produces stochastic predictions.
    model.eval()
    # f[:-4] drops the last four characters of the file name (a dot plus a
    # three-letter extension) to find the matching results file.
    d = scipy.io.loadmat('./Results/'+f[:-4]+'.mat')
    best_weight = d['best_weight']

    prediction = []
    result = []
    # Named `progress` so it does not shadow `tqdm.trange` imported earlier.
    progress = tqdm(enumerate(dataloader), total=len(dataloader), desc='Predict')
    with torch.no_grad():
        for i, (x,y) in progress:
            o_labels = model(x.to(device))
            o_labels = o_labels.to('cpu').detach().numpy()
            # Keep the raw scores before they are overwritten below.
            result.extend(o_labels.copy())
            # Binarize the first three class scores against the stored thresholds.
            o_labels[:,:3] = o_labels[:,:3] > best_weight
            o_labels = ExtendLabel(o_labels[:,:3])
            prediction.extend(list(o_labels))

    result = np.array(result)
    prediction = np.array(prediction).astype(int)
    scipy.io.savemat('Results_Testing/'+f[:-4]+'.mat', mdict={'result': result, 'prediction': prediction, 'best_weight': best_weight})