In [None]:
# load necessary packages
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score

import torch
from torch import nn, optim
import torchtext

import random
import json
from tqdm import tqdm, tqdm_notebook
from nltk import word_tokenize

In [None]:
# Load the raw reviews and keep only the rating and review-text columns
with open('reviews.json', 'r') as read_file:
    json_yelp = json.load(read_file)

# `pd.json_normalize` flattens the parsed JSON into a DataFrame. Selecting
# several columns requires a list of labels, hence the double brackets.
df = pd.json_normalize(json_yelp)[['stars', 'text']]

In [None]:
# Split the reviews into train and test sets and save both to disk
test_size = 0.2

num_data = len(df)
# A locally seeded generator makes the split reproducible across re-runs
# without touching NumPy's global random state.
indices = np.random.RandomState(1234).permutation(num_data)
split = int(np.floor(test_size * num_data))
train_idx, test_idx = indices[split:], indices[:split]

# DataFrames need `.iloc` for positional row selection.
train_df = df.iloc[train_idx]
test_df = df.iloc[test_idx]

train_df.to_csv('train_dataset.csv')  # save it for future use
test_df.to_csv('test_dataset.csv')

In [None]:
# Define the torchtext fields and load the train/test CSV files
text = torchtext.data.Field(lower=True, batch_first=True, tokenize=word_tokenize, fix_length=70)
target = torchtext.data.Field(sequential=False, use_vocab=False, is_target=True)

# Read the files from where the split cell wrote them (the working directory).
train = torchtext.data.TabularDataset(path='train_dataset.csv', format='csv',
                                      fields={'text': ('text', text),
                                              'stars': ('target', target)})
# Only the text column is loaded for the test set; no target is attached.
test = torchtext.data.TabularDataset(path='test_dataset.csv', format='csv',
                                     fields={'text': ('text', text)})

In [None]:
# Build the token vocabulary from the train and test text, keeping only tokens
# that occur at least 3 times. No other field needs a vocabulary: `target` is
# declared with use_vocab=False.
text.build_vocab(train, test, min_freq=3)

In [None]:
# Load the pre-trained GloVe word vectors from disk
glove_path = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
glove = torchtext.vocab.Vectors(glove_path)
# Hook tqdm's progress-bar support into pandas
tqdm_notebook().pandas()

In [None]:
# Attach the GloVe vectors to the vocabulary built from the corpus
embedding_dim = 300
text.vocab.set_vectors(glove.stoi, glove.vectors, dim=embedding_dim)

In [None]:
# define the network for the text CNN
class TextCNN(nn.Module):
    """Convolutional text classifier that emits one logit per input sequence.

    Token ids are embedded with pre-trained vectors, convolved with several
    kernel heights in parallel, max-pooled over the sequence axis, concatenated
    and passed through dropout and a single linear unit.

    Args:
        lm: 2-D tensor of pre-trained embeddings (vocab_size x embedding_dim).
        padding_idx: index of the padding token in the vocabulary.
        static: if True the embedding weights are frozen; if False they are
            trained together with the rest of the network.
        kernel_num: number of output channels of each convolution.
        fixed_length: sequence length of every input batch. It determines the
            pooling windows, so inputs must be padded/truncated to this length.
        kernel_size: heights (in tokens) of the parallel convolution kernels.
        dropout: dropout probability applied before the final linear layer.
    """

    def __init__(self, lm, padding_idx, static=True, kernel_num=128, fixed_length=50, kernel_size=(2, 5, 10), dropout=0.2):
        super(TextCNN, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        # `from_pretrained` freezes the weights unless told otherwise, so the
        # flag must be forwarded for `static=False` to yield trainable vectors.
        self.embedding = nn.Embedding.from_pretrained(lm, freeze=static)
        self.embedding.padding_idx = padding_idx
        self.conv = nn.ModuleList(
            [nn.Conv2d(1, kernel_num, (k, self.embedding.embedding_dim)) for k in kernel_size])
        # Each pooling window covers the whole height left after its convolution
        # (fixed_length - k + 1), collapsing the sequence axis to a single value.
        # A ModuleList registers the layers as proper sub-modules.
        self.maxpools = nn.ModuleList(
            [nn.MaxPool2d((fixed_length + 1 - k, 1)) for k in kernel_size])
        self.fc = nn.Linear(len(kernel_size) * kernel_num, 1)

    def forward(self, input):
        """Return logits of shape (B, 1) for token ids of shape (B, fixed_length)."""
        x = self.embedding(input).unsqueeze(1)  # B x Ci x H x W
        pooled = [pool(torch.tanh(conv(x))).squeeze(3).squeeze(2)  # B x Kn
                  for conv, pool in zip(self.conv, self.maxpools)]
        x = torch.cat(pooled, dim=1)  # B x Kn * len(Kz)
        return self.fc(self.dropout(x))


In [None]:
# Define the training process and strategy
def search_best_f1(true, pred):
    """Scan decision thresholds and report the best F1 score found.

    Thresholds from 0.1 up to about 0.5 in steps of 0.01 are applied to
    `pred`; the first threshold reaching the highest F1 wins.

    Returns:
        (best_f1, best_threshold). Both are 0 if no threshold scores above 0.
    """
    scores = np.array(pred)
    best_f1, best_threshold = 0, 0
    for threshold in np.arange(0.1, 0.501, 0.01):
        current_f1 = f1_score(true, scores > threshold)
        if current_f1 > best_f1:
            best_f1, best_threshold = current_f1, threshold
    return best_f1, best_threshold


In [None]:
def training(epoch, model, loss_func, optimizer, train_iter, val_iter):
    """Train `model` for `epoch` epochs, validating after each one.

    Args:
        epoch: number of passes over `train_iter`.
        model: network returning one logit per example for a batch of token ids.
        loss_func: callable applied to (logits, float targets).
        optimizer: optimizer stepping the model's parameters.
        train_iter: torchtext iterator whose batches expose `text` and `target`.
        val_iter: iterable of validation batches with the same attributes.

    Returns:
        The threshold that maximised F1 on the validation data in the last
        epoch, or None when `epoch` is 0.
    """
    # Run on whatever device the model already lives on instead of assuming CUDA.
    device = next(model.parameters()).device
    alpha_val = None  # stays None if no epoch is run

    for e in range(epoch):
        train_iter.init_epoch()
        losses, preds, true = [], [], []
        model.train()
        for train_batch in tqdm(list(iter(train_iter)), 'epcoh {} training'.format(e)):
            x = train_batch.text.to(device)
            y = train_batch.target.float().to(device)
            true.append(train_batch.target.cpu().numpy())
            model.zero_grad()
            pred = model(x).view(-1)
            # Use the loss passed in by the caller, not a notebook-level global.
            loss = loss_func(pred, y)
            preds.append(torch.sigmoid(pred).detach().cpu().numpy())
            losses.append(loss.item())
            loss.backward()
            optimizer.step()

        model.eval()
        val_loss, val_preds, val_true = [], [], []
        with torch.no_grad():
            for val_batch in tqdm(val_iter, 'epcoh {} validating'.format(e)):
                val_x = val_batch.text.to(device)
                val_y = val_batch.target.float().to(device)
                val_true.append(val_batch.target.cpu().numpy())
                val_pred = model(val_x).view(-1)
                val_preds.append(torch.sigmoid(val_pred).cpu().numpy())
                val_loss.append(loss_func(val_pred, val_y).item())

        # Flatten the per-batch arrays before searching for the best threshold.
        train_f1, _ = search_best_f1([j for i in true for j in i], [j for i in preds for j in i])
        val_f1, alpha_val = search_best_f1([j for i in val_true for j in i], [j for i in val_preds for j in i])
        print('epcoh {:02} - train_loss {:.4f} - val_loss {:.4f} '
              '- train f1 {:.4f} - val f1 {:.4f}'.format(
                  e, np.mean(losses), np.mean(val_loss),
                  train_f1, val_f1))

    return alpha_val
                

In [None]:
# Split off a validation set and build the batch iterators
random.seed(1234)
# Bind the split to a new name so `train` keeps the full dataset and re-running
# this cell does not split an already-split training set again.
train_split, val = train.split(split_ratio=0.8, random_state=random.getstate())
batch_size = 512
train_iter = torchtext.data.BucketIterator(dataset=train_split,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           sort=False)
# The validation batches are materialised once and reused every epoch.
val_iter = list(torchtext.data.BucketIterator(dataset=val,
                                              batch_size=batch_size,
                                              train=False,
                                              sort=False))

In [None]:
# Define network init
def init_network(model, method='xavier', exclude='embedding', seed=123):
    """Seed the torch RNGs and re-initialise the model's weights and biases.

    Args:
        model: module whose named parameters are initialised in place.
        method: 'xavier' or 'kaiming' normal initialisation for parameters
            whose name contains 'weight'; any other value uses a standard
            normal distribution.
        exclude: parameters whose name contains this substring are left
            untouched; pass None to initialise every parameter.
        seed: seed applied to the CPU RNG and, if available, all CUDA RNGs.
    """
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    for name, w in model.named_parameters():
        if exclude is not None and exclude in name:
            continue
        if 'weight' in name:
            # Compare by value: `is` checks object identity and is not
            # guaranteed to hold for equal strings.
            if method == 'xavier':
                nn.init.xavier_normal_(w)
            elif method == 'kaiming':
                nn.init.kaiming_normal_(w)
            else:
                nn.init.normal_(w)
        elif 'bias' in name:
            nn.init.constant_(w, 0.0)


In [None]:
def print_model(model, ignore='embedding'):
    """Print each parameter's name, shape and size, then the overall total.

    Parameters whose name contains `ignore` are skipped; a falsy `ignore`
    (e.g. None) lists every parameter.
    """
    n_params = 0
    for param_name, tensor in model.named_parameters():
        if ignore and ignore in param_name:
            continue
        count = tensor.nelement()
        n_params += count
        print('{} : {}  {} parameters'.format(param_name, tensor.shape, count))
    print('-------'*4)
    print('Total {} parameters'.format(n_params))

In [None]:
# Instantiate the text CNN, move it to the GPU and initialise its weights
text.fix_length = 70
model = TextCNN(
    text.vocab.vectors,
    padding_idx=text.vocab.stoi[text.pad_token],
    kernel_size=[1, 2, 3, 5],
    kernel_num=128,
    static=False,
    fixed_length=text.fix_length,
    dropout=0.1,
)
model = model.cuda()
init_network(model)
# Adam optimizer, with a loss that applies the sigmoid to the raw logit itself
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
# NOTE(review): BCEWithLogitsLoss expects targets in [0, 1], while the target
# field is read from the 'stars' column - confirm the labels are binary.
loss_function = nn.BCEWithLogitsLoss()
print_model(model, ignore=None)

In [None]:
# Train the model and keep the threshold returned by the training loop
num_epochs = 3
alpha = training(num_epochs, model, loss_function, optimizer, train_iter, val_iter)

In [None]:
def predict(model, test_list):
    """Return the sigmoid of the model's logits for every example in `test_list`.

    Args:
        model: network returning one logit per example for a batch of inputs.
        test_list: iterable of batches exposing a `text` tensor attribute.

    Returns:
        A flat list of floats, in the order the batches are iterated.
    """
    # Follow the model's device instead of assuming CUDA is available.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None
    pred = []
    model.eval()
    with torch.no_grad():
        for test_batch in test_list:
            x = test_batch.text if device is None else test_batch.text.to(device)
            pred += torch.sigmoid(model(x).view(-1)).cpu().numpy().tolist()
    return pred

In [None]:
# Materialise the test batches as a list
test_iter = torchtext.data.BucketIterator(
    dataset=test,
    batch_size=batch_size,
    sort=False,
    train=False,
)
test_list = list(test_iter)

In [None]:
# Predict on the test set and put the thresholded labels in a dataframe
preds = predict(model, test_list)
sub = pd.DataFrame()
# `predict` returns a plain list, so convert it to an array before comparing
# it with the threshold. The test dataset carries no `qid` field; rows are
# identified by the DataFrame's default index, i.e. their position in
# `test_list` order.
sub['prediction'] = (np.asarray(preds) > alpha).astype(int)

In [None]:
# Preview the first five predictions
sub.head(n=5)