In [1]:
from __future__ import print_function

import torch 
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
import re
import operator

import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="2"

from tqdm import tqdm_notebook
import utils
# Device configuration
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [2]:
# DBpedia, YahooAnswersUpper, YahooAnswersLower, YelpReviews, IMDB, AGNews
x_train, y_train, x_test, y_test, TopicList, Idx2Topic = utils.LoadDatasets("IMDB")
Idx2Topic_list = []
for i in range(len(set(TopicList))): Idx2Topic_list.append(Idx2Topic[i])
Idx2Topic_list = np.array(Idx2Topic_list)

from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.15, random_state=42)

In [3]:
WordDict = {"<NONE>":0, "<OOV>":1}
WordCnt = {}

data_train, WordDict, WordCnt, MaxSeqLen = utils.DataProcessing(x_train, WordDict, WordCnt, TrainFlag=True)
data_valid, _, _, _ = utils.DataProcessing(x_valid, WordDict, WordCnt, TrainFlag=False)
data_test, _, _, _ = utils.DataProcessing(x_test, WordDict, WordCnt, TrainFlag=False)

WordCnt = sorted(WordCnt.items(), key=operator.itemgetter(1))
WordIdx = dict(zip(WordDict.values(), WordDict.keys()))
x_train_emb = utils.EmbeddingNumpy(data_train, WordDict, MaxSeqLen)
x_valid_emb = utils.EmbeddingNumpy(data_valid, WordDict, MaxSeqLen)
x_test_emb = utils.EmbeddingNumpy(data_test, WordDict, MaxSeqLen)
y_train = np.asarray(y_train)
y_valid = np.asarray(y_valid)
y_test = np.asarray(y_test)

NumClass = max(len(set(y_train)), max(y_train)+1)

HBox(children=(IntProgress(value=0, max=21250), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3750), HTML(value='')))




HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=21250), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3750), HTML(value='')))




HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))




In [10]:
import dataloader
import copy
import random

### Hyperparameters
Hyperparams = {
    "NumClfEpoch" : 100,
    "EmbeddingSize" : 300, # 300, 768
    "KernelSize" : [2,3,4,5],
    "BatchSize" : 2**8,
    "LearningRate" : 1e-3,
    "ChannelSize" : [32, 16],
#     "UsePreWordVector" : False,
#     "UsePreWordVector" : "../../Data/PretrainedWV/glove.42B.300d.txt",
#     "UsePreWordVector" : "../../Data/PretrainedWV/GloVeFullExtro1.txt",
    "UsePreWordVector" : "../../Data/PretrainedWV/GloVeExtro6_.txt",
    "WordVectorNorm" : False,
    "MaxSeqLen" : MaxSeqLen,
    "NumClass" : NumClass,
}

WordIdx = dict(zip(WordDict.values(), WordDict.keys()))
train_dataset = dataloader.ClassifyDataset(x=x_train_emb, y=y_train)
valid_dataset = dataloader.ClassifyDataset(x=x_valid_emb, y=y_valid)
test_dataset = dataloader.ClassifyDataset(x=x_test_emb, y=y_test)
train_loader = DataLoader(dataset=train_dataset, batch_size=Hyperparams["BatchSize"], shuffle=True, num_workers=0)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=Hyperparams["BatchSize"], shuffle=True, num_workers=0)
test_loader = DataLoader(dataset=test_dataset, batch_size=Hyperparams["BatchSize"], shuffle=True, num_workers=0)

In [14]:
import TextCNN
from imp import reload
reload(TextCNN)

for it in range(5):
    print(it+1)

    model = TextCNN.ConvNet(WordDict, Hyperparams); model = model.to(device)
    crit_crossentropy = nn.CrossEntropyLoss()
    optimizer_cnn = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=Hyperparams["LearningRate"])
    
    # Train the model
    maxacc_all = 0; Step = 0
    maxacc_val = 0; EarlyStopCnt = 5
    total_step = len(train_loader)
    pbar1 = tqdm_notebook(total = Hyperparams["NumClfEpoch"], leave=False, desc="Epoch")
    model.train() # train mode
    for epoch in range(Hyperparams["NumClfEpoch"]):
        pbar1.update(1)
        pbar2 = tqdm_notebook(total = total_step*Hyperparams["BatchSize"], leave=False, desc="Training")
        for i, (texts, labels) in enumerate(train_loader):
            pbar2.update(Hyperparams["BatchSize"])
            texts = texts.to(device); labels = labels.to(device)
            outputs = model(texts)
            loss_cnn = crit_crossentropy(outputs, labels)
            optimizer_cnn.zero_grad()
            loss_cnn.backward()
            optimizer_cnn.step()
        pbar2.close()
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, Hyperparams["NumClfEpoch"], loss_cnn.item()), end=' ')

        # Valid the model
        model.eval()  # eval mode (batchnorm uses moving mean/variance instead of mini-batch mean/variance)
        with torch.no_grad():
            correct = 0.; total = 0
            for texts, labels in valid_loader:
                texts = texts.to(device); labels = labels.to(device)
                outputs = model(texts)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

            if (correct/total) >= maxacc_val:
                maxacc_val = correct/total
                torch.save(model.state_dict(), "./save/model")
                torch.save(model.embedding.weight, "./save/emb")

            else: # (correct/total) < maxacc_val:
                EarlyStopCnt = EarlyStopCnt-1
                if EarlyStopCnt == 0: break
            print('ValidAcc: {:.4f} % , MaxAcc: {:.4f}'.format(100 * correct/total, maxacc_val))

    ### Test Acc.
    model = TextCNN.ConvNet(WordDict, Hyperparams).to(device)
    model.load_state_dict(torch.load("./save/model"))
    model.embedding.weight.data = torch.load("./save/emb")

    with torch.no_grad():
        model.eval()
        correct = 0.; total = 0
        for texts, labels in test_loader:
            texts = texts.to(device); labels = labels.to(device)
            outputs = model(texts)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        print('TestAcc: {:.4f} % , ValidAcc: {:.4f}'.format(100 * correct / total, maxacc_val))
    pbar1.close()
    #     print("MaxAcc: {:.4f}".format(maxacc))

1
Loading Pretrained Word Vectors ... 
64022 Word Vectors Loaded . 


HBox(children=(IntProgress(value=0, description='Epoch', style=ProgressStyle(description_width='initial')), HT…

HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [1/100], Loss: 0.0358 ValidAcc: 85.8667 % , MaxAcc: 0.8587


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [2/100], Loss: 0.4106 ValidAcc: 83.7333 % , MaxAcc: 0.8587


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [3/100], Loss: 0.0808 ValidAcc: 89.8133 % , MaxAcc: 0.8981


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [4/100], Loss: 0.1286 ValidAcc: 90.0000 % , MaxAcc: 0.9000


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [5/100], Loss: 0.0395 ValidAcc: 90.2400 % , MaxAcc: 0.9024


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [6/100], Loss: 0.0335 ValidAcc: 89.2267 % , MaxAcc: 0.9024


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [7/100], Loss: 0.0000 ValidAcc: 90.5333 % , MaxAcc: 0.9053


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [8/100], Loss: 0.0073 ValidAcc: 90.4000 % , MaxAcc: 0.9053


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [9/100], Loss: 0.0040 ValidAcc: 90.4800 % , MaxAcc: 0.9053


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [10/100], Loss: 0.0028 ValidAcc: 90.7467 % , MaxAcc: 0.9075


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [11/100], Loss: 0.0010 Loading Pretrained Word Vectors ... 
64022 Word Vectors Loaded . 
TestAcc: 89.8200 % , ValidAcc: 0.9075
2
Loading Pretrained Word Vectors ... 
64022 Word Vectors Loaded . 


HBox(children=(IntProgress(value=0, description='Epoch', style=ProgressStyle(description_width='initial')), HT…

HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [1/100], Loss: 0.2486 ValidAcc: 82.8800 % , MaxAcc: 0.8288


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [2/100], Loss: 0.2909 ValidAcc: 89.3600 % , MaxAcc: 0.8936


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [3/100], Loss: 0.0666 ValidAcc: 88.4000 % , MaxAcc: 0.8936


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [4/100], Loss: 0.0036 ValidAcc: 90.0533 % , MaxAcc: 0.9005


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [5/100], Loss: 0.0067 ValidAcc: 89.7600 % , MaxAcc: 0.9005


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [6/100], Loss: 0.2121 ValidAcc: 79.5733 % , MaxAcc: 0.9005


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [7/100], Loss: 0.0015 ValidAcc: 90.2133 % , MaxAcc: 0.9021


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [8/100], Loss: 0.0008 ValidAcc: 90.0800 % , MaxAcc: 0.9021


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [9/100], Loss: 0.0001 Loading Pretrained Word Vectors ... 
64022 Word Vectors Loaded . 
TestAcc: 89.4720 % , ValidAcc: 0.9021
3
Loading Pretrained Word Vectors ... 
64022 Word Vectors Loaded . 


HBox(children=(IntProgress(value=0, description='Epoch', style=ProgressStyle(description_width='initial')), HT…

HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [1/100], Loss: 0.1858 ValidAcc: 84.8800 % , MaxAcc: 0.8488


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [2/100], Loss: 0.0388 ValidAcc: 87.8400 % , MaxAcc: 0.8784


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [3/100], Loss: 1.1555 ValidAcc: 79.7333 % , MaxAcc: 0.8784


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [4/100], Loss: 0.0212 ValidAcc: 89.4133 % , MaxAcc: 0.8941


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [5/100], Loss: 0.0198 ValidAcc: 90.1600 % , MaxAcc: 0.9016


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [6/100], Loss: 0.0623 ValidAcc: 89.9467 % , MaxAcc: 0.9016


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [7/100], Loss: 0.0027 ValidAcc: 89.7333 % , MaxAcc: 0.9016


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [8/100], Loss: 0.1006 ValidAcc: 87.7333 % , MaxAcc: 0.9016


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [9/100], Loss: 0.0028 Loading Pretrained Word Vectors ... 
64022 Word Vectors Loaded . 
TestAcc: 90.1040 % , ValidAcc: 0.9016
4
Loading Pretrained Word Vectors ... 
64022 Word Vectors Loaded . 


HBox(children=(IntProgress(value=0, description='Epoch', style=ProgressStyle(description_width='initial')), HT…

HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [1/100], Loss: 1.2821 ValidAcc: 86.9067 % , MaxAcc: 0.8691


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [2/100], Loss: 0.1114 ValidAcc: 87.8667 % , MaxAcc: 0.8787


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [3/100], Loss: 0.6306 ValidAcc: 80.5867 % , MaxAcc: 0.8787


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [4/100], Loss: 0.0125 ValidAcc: 90.1067 % , MaxAcc: 0.9011


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [5/100], Loss: 0.0550 ValidAcc: 88.0533 % , MaxAcc: 0.9011


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [6/100], Loss: 0.0640 ValidAcc: 89.0400 % , MaxAcc: 0.9011


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [7/100], Loss: 0.0053 ValidAcc: 90.5067 % , MaxAcc: 0.9051


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [8/100], Loss: 0.0153 ValidAcc: 90.3733 % , MaxAcc: 0.9051


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [9/100], Loss: 0.0109 ValidAcc: 90.5600 % , MaxAcc: 0.9056


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [10/100], Loss: 0.0007 Loading Pretrained Word Vectors ... 
64022 Word Vectors Loaded . 
TestAcc: 89.6880 % , ValidAcc: 0.9056
5
Loading Pretrained Word Vectors ... 
64022 Word Vectors Loaded . 


HBox(children=(IntProgress(value=0, description='Epoch', style=ProgressStyle(description_width='initial')), HT…

HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [1/100], Loss: 0.4245 ValidAcc: 82.0800 % , MaxAcc: 0.8208


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [2/100], Loss: 0.1784 ValidAcc: 83.2533 % , MaxAcc: 0.8325


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [3/100], Loss: 0.0212 ValidAcc: 88.2133 % , MaxAcc: 0.8821


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [4/100], Loss: 0.0045 ValidAcc: 88.8533 % , MaxAcc: 0.8885


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [5/100], Loss: 0.0492 ValidAcc: 90.1333 % , MaxAcc: 0.9013


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [6/100], Loss: 0.0070 ValidAcc: 88.9600 % , MaxAcc: 0.9013


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [7/100], Loss: 0.0395 ValidAcc: 89.4133 % , MaxAcc: 0.9013


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [8/100], Loss: 0.0084 ValidAcc: 90.1600 % , MaxAcc: 0.9016


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [9/100], Loss: 0.0001 ValidAcc: 90.5067 % , MaxAcc: 0.9051


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [10/100], Loss: 0.0001 ValidAcc: 90.5067 % , MaxAcc: 0.9051


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [11/100], Loss: 0.0059 ValidAcc: 90.5867 % , MaxAcc: 0.9059


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [12/100], Loss: 0.0024 ValidAcc: 90.7733 % , MaxAcc: 0.9077


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [13/100], Loss: 0.0017 ValidAcc: 90.6400 % , MaxAcc: 0.9077


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [14/100], Loss: 0.0116 ValidAcc: 90.1600 % , MaxAcc: 0.9077


HBox(children=(IntProgress(value=0, description='Training', max=21504, style=ProgressStyle(description_width='…

Epoch [15/100], Loss: 0.0006 Loading Pretrained Word Vectors ... 
64022 Word Vectors Loaded . 
TestAcc: 89.9800 % , ValidAcc: 0.9077
