In [1]:
import numpy as np
import pandas as pd
import json
import re
import random
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch import nn
import torch.nn.functional as F
import torch



In [2]:
# torchでGPUが使えるか確認
torch.cuda.is_available()

True

In [3]:
# json読み込み
def get_metadata():
    with open('../input/arxiv/arxiv-metadata-oai-snapshot.json', 'r') as f:
        for line in f:
            yield line

In [4]:
metadata = get_metadata()

In [5]:
# abstract text tag を取得し辞書に保存
text_tags_dict = {"text":[], "tags":[]}
for paper in metadata:
    parsed = json.loads(paper)
    text = parsed['abstract']
    text_tags_dict["text"].append(text)
    text_tags_dict["tags"].append(parsed['categories'])

In [6]:
# DataFrame化
text_tags_df = pd.DataFrame.from_records(text_tags_dict)

In [7]:
# 500000個を取得
text_tags_df = text_tags_df.sample(n=500000, random_state=33)

In [8]:
len(text_tags_df)

500000

## Label付け

In [9]:
categories = text_tags_df['tags'].apply(lambda x: x.split(' ')).explode().unique()

In [10]:
label_to_int_dict = {}
for i, key in enumerate(categories):
    label_to_int_dict[key] = i

In [11]:
int_to_label_dict = {}
for key, val in label_to_int_dict.items():
    int_to_label_dict[val] = key

In [12]:
def generate_label_array(label):
    result = np.zeros(len(label_to_int_dict))
    labels = label.split(' ')
    for l in labels:
        result[label_to_int_dict[l]] = 1
    return np.expand_dims(result, 0)

In [13]:
tag_labels = [generate_label_array(tag) for tag in text_tags_df["tags"]]

In [14]:
# リストを結合
tag_labels = np.concatenate(tag_labels, axis = 0)

In [15]:
tag_labels[1].shape

(176,)

## titleのtokenizedを行う前処理

In [16]:
stop = stopwords.words('english')

text = text_tags_df['text'].apply(lambda x : x.lower())
text = text.apply(lambda x: x.split(' '))
text = text.apply(lambda x: [item for item in x if item not in stop])
text = text.apply(lambda x: ' '.join(x))
text = text.apply(lambda x: re.sub('[^A-Za-z\s]+', ' ', x))
text = text.apply(lambda x: re.sub('\n', ' ', x))
text = text.apply(lambda x: re.sub(r'\s+', ' ', x))
text = text.apply(lambda x: re.sub(r'^\s', '', x))
text = text.apply(lambda x: re.sub(r'\s$', '', x))

In [17]:
text = list(text)

In [18]:
text

['paper motivated following question sieve theory given a subset x subset n alpha in suppose x pmod p leq alpha o p every prime p large x be one hand we have bound x ll alpha n alpha gallagher s larger sieve on the hand prove assuming truth inverse sieve conjecture that bound improved for example to x ll alpha n o alpha small alpha result follows from studying average size x pmod p p varies when x f mathbb z cap n value set polynomial f x in mathbb z x',
 'wavelets known useful non linear multi scale processes in multi resolution analysis shown q deformed algebraic structure the translation dilation operators theory associate scaling equation non linear two parameter algebra structure mapped onto the quantum group su q one limit approaches fourier series generating algebra another limit duality scaling function and corresponding non linear algebra obtained examples haar and b wavelets worked detail',
 'field quantum algorithms aims find ways speed solution of computational problems usi

## BERTで学習

In [19]:
!pip install pytorch-pretrained-bert

Collecting pytorch-pretrained-bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 864 kB/s eta 0:00:01
Installing collected packages: pytorch-pretrained-bert
Successfully installed pytorch-pretrained-bert-0.6.2
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [20]:
from transformers import BertTokenizer

In [21]:
tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=227845.0, style=ProgressStyle(descripti…




In [22]:
from transformers import *
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=385.0, style=ProgressStyle(description_…




In [23]:
text_tokens = tokenizer.batch_encode_plus(text, pad_to_max_length=True, max_length=250, return_tensors='pt')

In [24]:
text_tokens['input_ids'].shape

torch.Size([500000, 250])

## マルチラベルモデル作成

In [25]:
random.seed(33)
sample_indices = random.sample(range(text_tokens['input_ids'].shape[0]), 500000)#text_tokens["input_ids"].shape[0])

In [26]:
x_train, x_test, y_train, y_test = train_test_split(text_tokens["input_ids"][sample_indices,:], tag_labels[sample_indices, :], test_size = 0.2)

In [27]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

torch.Size([400000, 250])
(400000, 176)
torch.Size([100000, 250])
(100000, 176)


## Data

In [28]:
class arxiv_dataset(torch.utils.data.Dataset):
    def __init__(self, text, labels):
        self.text = text
        
        self.labels = labels
        
    def __len__(self):
        return self.labels.shape[0]
    
    def __getitem__(self, index):
        x = self.text[index, :]
        y = self.labels[index, :]
        return x, y

In [29]:
train_data = arxiv_dataset(x_train, y_train)

In [30]:
train_gen = torch.utils.data.DataLoader(train_data, batch_size=128)

In [31]:
test_data = arxiv_dataset(x_test, y_test)

In [32]:
test_gen = torch.utils.data.DataLoader(test_data, batch_size=128, shuffle=True)

## BERT Model作成
以下のscibertモデルを用いてArxivのtextデータをファインチューニングを行う<br>
https://huggingface.co/allenai/scibert_scivocab_uncased

In [33]:
class BERT(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = BertForSequenceClassification.from_pretrained("allenai/scibert_scivocab_uncased",
                                                                    output_hidden_states=True)
        for param in self.encoder.parameters():
            param.requires_grad = False
            
        self.dense_1 = nn.Linear(768, 384)
        self.dense_2 = nn.Linear(384, 176)
        
    def forward(self, tokens):
        hidden_states = self.encoder(tokens)[1][-1][:, 0]
        x = F.relu(self.dense_1(hidden_states))
        x = self.dense_2(x)
        return x

In [34]:
model = BERT()
model = model.cuda()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442221694.0, style=ProgressStyle(descri…




In [35]:
for toks, _ in train_gen:
    print(model(toks.cuda()).shape)
    break

torch.Size([128, 176])


In [36]:
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0006)

In [37]:
EPOCHS = 2

In [38]:
train_loss = []
for epoch in range(EPOCHS):
    running_loss = 0.0
    num_batches = 0
    for data in train_gen:
        inputs, labels = data
        inputs = inputs.cuda()
        labels = labels.cuda()
        
        #Zero the gradients from last step
        optimizer.zero_grad()
        logits = model(inputs)
        #Calculate BCE with logits
        loss = criterion(logits, labels)
        #Back prop and optimizer step
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        num_batches += 1
        
        # Check GPU memory againというエラーを避けるため
        del inputs
        del labels
        del logits
        torch.cuda.empty_cache()
        
    train_loss.append(running_loss / num_batches)
        

In [39]:
print(train_loss)

[0.03432679702266374, 0.027810324225955706]


In [40]:
torch.save(model.state_dict(), '10_26_1209.pt')