In [42]:
from typing import List

In [222]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertModel
from kobert_tokenizer import KoBERTTokenizer

In [17]:
# Load the tokenizer and the BERT encoder from the same 'skt/kobert-base-v1'
# checkpoint so the token ids produced by one match the vocabulary of the other.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
embedding_model = BertModel.from_pretrained('skt/kobert-base-v1')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [340]:
def embedding_fn(text: List[str]) -> torch.Tensor:
    """Encode a batch of strings into pooled BERT sentence embeddings.

    Args:
        text: Batch of raw strings to embed.

    Returns:
        The encoder's ``pooler_output`` for the batch, one row per input
        string, located on the encoder's device.
    """
    # Pad to the longest sequence in the batch: without padding the token-id
    # lists are ragged and cannot be stacked into a single tensor as soon as
    # two inputs tokenize to different lengths.
    inputs = tokenizer.batch_encode_plus(text, padding=True, return_tensors='pt')

    # Feed the encoder on whatever device its weights live on.
    encoder_device = next(embedding_model.parameters()).device

    # NOTE: the encoder is used as a frozen feature extractor here (its
    # parameters are never handed to an optimizer in this notebook), so skip
    # building an autograd graph through it. Remove `no_grad` if the encoder
    # is ever fine-tuned.
    with torch.no_grad():
        output = embedding_model(
            input_ids=inputs['input_ids'].to(encoder_device),
            attention_mask=inputs['attention_mask'].to(encoder_device),
        ).pooler_output
    return output

In [429]:
class Model(nn.Module):
    """MLP regressor over a text embedding concatenated with numeric features.

    Args:
        base_count: Number of numeric feature columns supplied in ``x``.
        embedding_fn: Callable mapping a batch of strings to a
            ``(batch, embedding_out)`` tensor.
        embedding_out: Width of the vectors returned by ``embedding_fn``.
    """

    def __init__(self, base_count, embedding_fn, embedding_out):
        super().__init__()
        self.base_count = base_count
        self.embedding_fn = embedding_fn
        self.embedding_out = embedding_out
        self.embedding_drop = nn.Dropout(.3)

        self.regressor = nn.Sequential(
            nn.Linear(self.embedding_out + self.base_count, 200),
            nn.ReLU(),
            nn.Dropout(.3),
            nn.Linear(200, 70),
            nn.ReLU(),
            nn.Dropout(.3),
            nn.Linear(70, 1),
        )

    def forward(self, x, x_embedding: List[str]):
        """Predict one scalar per sample.

        Implemented as ``forward`` (not ``__call__``) so that
        ``nn.Module.__call__`` keeps running registered hooks; calling
        ``model(x, x_embedding)`` works exactly as before.

        Args:
            x: Numeric features, shape ``(batch, base_count)``.
            x_embedding: Batch of strings passed to ``embedding_fn``.

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        embedding_out = self.embedding_fn(x_embedding)
        # `embedding_fn` may be a plain callable that `.to(device)` on this
        # module does not move, so align its output with `x` before the cat.
        embedding_out = embedding_out.to(x.device)
        embedding_out = self.embedding_drop(embedding_out)
        x = torch.cat([embedding_out, x], 1)
        x = self.regressor(x)
        return x

In [430]:
# Read the train and test splits from CSV files in the current working directory.
df_train = pd.read_csv('./cap-train.csv')
df_test = pd.read_csv('./cap-test.csv')

In [431]:
# Build the regression target: log(views) scaled by the log of the largest value.
# Take an explicit float copy so that (a) the masked assignment below cannot
# write through into `df_train`, and (b) `np.finfo` works even when the column
# was parsed as an integer dtype.
train_y = df_train['views'].to_numpy(dtype=np.float64, copy=True)
# Missing and non-positive view counts are clamped to 1 (log(1) == 0).
train_y[np.isnan(train_y) | (train_y <= 0)] = 1
train_y_log_max = np.log(train_y.max())
train_y = np.log(train_y + np.finfo(train_y.dtype).eps) / train_y_log_max

In [432]:
# Pull the two model inputs out of the training frame as NumPy arrays:
# the numeric subscriber count and the title text that will be embedded.
# NOTE(review): `.values` may share memory with `df_train` — confirm before
# mutating these arrays in place.
train_x_counts = df_train.subscriber_count.values
train_x_embeddings = df_train.preprocessed_title.values

In [433]:
# Clean on a float copy: `np.array` copies, so `df_train` is left untouched and
# re-running this cell is harmless; the float dtype is also what the later
# log/eps computation needs.
train_x_counts = np.array(train_x_counts, dtype=np.float64)
# Missing and non-positive counts are clamped to 1 (log(1) == 0).
train_x_counts[np.isnan(train_x_counts) | (train_x_counts <= 0)] = 1

In [434]:
# Log-scale the counts and divide by the log of the largest count.
# `np.finfo` only accepts floating dtypes, so make sure the array is float
# first (a no-op when it already is float64).
train_x_counts = np.asarray(train_x_counts, dtype=np.float64)
log_max = np.log(train_x_counts.max())
train_x_counts = np.log(train_x_counts + np.finfo(train_x_counts.dtype).eps) / log_max

In [435]:
batch_size = 1


def batch_maker(counts, embeddings, ys, size=None, drop_last=False):
    """Yield mini-batches from three parallel sequences.

    Args:
        counts: Numeric feature per sample; each is wrapped in a one-element
            list so a batch forms a ``(batch, 1)`` nested list.
        embeddings: Text input per sample.
        ys: Target per sample.
        size: Samples per batch. Defaults to the module-level ``batch_size``.
        drop_last: If true, discard a trailing batch smaller than ``size``.
            By default the remainder is yielded so no sample is skipped.

    Yields:
        ``(batch_count, batch_embedding, batch_y)`` tuples of lists.

    Raises:
        ValueError: If the effective batch size is smaller than 1.
    """
    if size is None:
        size = batch_size
    if size < 1:
        raise ValueError(f'batch size must be >= 1, got {size}')

    batch_count, batch_embedding, batch_y = [], [], []
    for count, embedding, y in zip(counts, embeddings, ys):
        batch_count.append([count])
        batch_embedding.append(embedding)
        batch_y.append(y)

        if len(batch_count) == size:
            yield batch_count, batch_embedding, batch_y
            batch_count, batch_embedding, batch_y = [], [], []

    # Emit the leftover samples when the data size is not a multiple of `size`.
    if batch_count and not drop_last:
        yield batch_count, batch_embedding, batch_y

In [436]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [437]:
# Training hyperparameters.
lr = .001  # learning rate passed to the Adam optimizer
decay = .0  # weight decay passed to the Adam optimizer (0 disables it)
epochs = 100  # number of full passes over the training data

In [438]:
# One numeric feature (base_count=1) plus a 768-wide text embedding
# (presumably the width of the encoder's pooler_output — confirm).
# `.to(device)` moves the layers registered on `Model`; `embedding_fn` is a
# plain function, so whatever it uses internally is not moved by this call.
model = Model(1, embedding_fn, 768).to(device)

In [439]:
# Adam over the parameters registered on `model`, using the lr / weight decay set above.
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=decay)

In [440]:
# Mean-squared-error loss between the prediction and the scaled log target.
criterion = torch.nn.MSELoss()

In [441]:
# Aliases read as module-level globals by `train`:
# numeric counts, title strings, and regression targets, in that order.
train_x1, train_x2, ys = train_x_counts, train_x_embeddings, train_y

In [445]:
def train(model, epochs, optim, criterion):
    """Train `model` on the module-level training arrays.

    Reads `train_x1`, `train_x2`, `ys`, and `device` from the enclosing
    module scope and iterates them with `batch_maker`. Prints the loss of
    every batch and the mean batch loss of every epoch.

    Args:
        model: Callable as ``model(x1, x2)`` returning ``(batch, 1)`` predictions.
        epochs: Number of passes over the training data.
        optim: Optimizer instance stepping the model's parameters.
        criterion: Loss callable taking ``(pred, target)``.
    """
    model.train()

    for _ in range(epochs):
        total, total_loss = 0, 0
        for x1, x2, y in batch_maker(train_x1, train_x2, ys):
            x1 = torch.from_numpy(np.array(x1, dtype=np.float32)).to(device)
            # Reshape the target from (batch,) to (batch, 1) so it matches the
            # model output. With mismatched shapes MSELoss broadcasts to
            # (batch, batch) and computes the wrong loss for any batch size > 1.
            y = torch.from_numpy(np.array(y, dtype=np.float32)).reshape(-1, 1).to(device)

            pred = model(x1, x2)
            loss = criterion(pred, y)

            optim.zero_grad()
            loss.backward()
            optim.step()

            total += 1
            total_loss += loss.item()
            print(f'loss: {loss}')

        # Guard against an empty dataset, which would otherwise divide by zero.
        if total:
            print(f'trainig loss: {total_loss / total}')
        # TODO: run a validation pass every 10 epochs once a validation
        # routine and loader exist.

In [446]:
# Run the training loop; it prints one loss line per batch and a mean loss per epoch.
train(model, epochs, optimizer, criterion)

loss: 0.021554136648774147
loss: 0.021419446915388107
loss: 0.035821333527565
loss: 0.00015687169798184186
loss: 0.0036772647872567177
loss: 0.0105270491912961
loss: 0.10175984352827072
loss: 0.02936566434800625
loss: 0.010524761863052845
loss: 0.02857932634651661
loss: 0.011014967225492
loss: 3.744450805243105e-05
loss: 0.01181840617209673
loss: 0.022864053025841713
loss: 0.0002868060255423188
loss: 0.0060671367682516575
loss: 0.019967464730143547
loss: 6.70112858642824e-05
loss: 0.025042658671736717
loss: 0.004553244449198246
loss: 1.092250386136584e-06
loss: 0.021938100457191467
loss: 0.0005671984981745481
loss: 0.01293319370597601
loss: 0.02573792263865471
loss: 0.0351872593164444
loss: 0.0013290608767420053
loss: 0.023734411224722862
loss: 0.0011931638000532985
loss: 0.006235924549400806
loss: 0.02742835506796837
loss: 0.004610045813024044
loss: 0.004695686046034098
loss: 0.0044922553934156895
loss: 0.01698998734354973
loss: 0.01159173995256424
loss: 0.00247577135451138
loss: 0.01

KeyboardInterrupt: 