In [274]:
import numpy as np 
import pandas as pd 
import os
import spacy
import string
import re
import numpy as np
from spacy.symbols import ORTH
from collections import Counter
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import numpy as np
from collections import defaultdict
import re
from sklearn.metrics import confusion_matrix

## Read file into DataFrame

In [275]:
# read the file into DataFrame
df = pd.read_csv('./CAMEO_IDEA_labeled_data.csv')

# separate content and label
text = df['Content']
labels = df['Category Code']

## Tokenize the text

In [276]:
# funtion tokenize sentence
tokenizer = spacy.load("en_core_web_sm")
stopwords = tokenizer.Defaults.stop_words
# tokenize, lemmatize the text, drop punctuations and stopwords
tokenize = lambda t: [token.lemma_ for token in tokenizer(t) if (not token.is_punct) and (not token.is_stop)]

# only tokenize the text
# tokenize = lambda t: [token.text for token in tokenizer(t)]

In [277]:
# build dictionary <key=word : value=count>
cnt = Counter()
size = text.size
for idx in range(size):
    for word in tokenize(text[idx]):
        cnt[word] += 1

In [278]:
counter = dict(cnt)

### Optional

In [279]:
# filter out low-frequency word
min_threshold = 1
count = {x: count for x, count in cnt.items() if count >= min_threshold}

In [280]:
# filter out high-frequency word
max_threshold = 1
count = {x: count for x, count in cnt.items() if count <= max_threshold}

## Split dataset into train set and test set

In [281]:
X = np.array(text)
y = np.array(labels)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y)

## Prepare for word embedding

In [282]:
# download glove dictionary
# def download_glove():
#     ! wget http://nlp.stanford.edu/data/glove.6B.zip
#     ! unzip glove.6B.zip -C data
    
# download_glove()
# ! unzip glove.6B.zip

In [283]:
# load word embedding dictionary (<key=word : value=vector>)
def load_embedding_dict():
    embeddings_dict = {}
    with open("glove.6B.50d.txt", 'r') as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            embeddings_dict[word] = vector
    return embeddings_dict

glove_dic = load_embedding_dict()

In [284]:
# create dictionaries(<key=word : value=index number>) (<key=word : value=vector>)
def create_embedding_matrix(count,emb_size=50):
    size = len(count) + 2
    word_idx_dict = {}
    word_vec = np.zeros((size, emb_size), dtype="float32")
    
    # add padding and UNK keyword
    word_idx_dict[""] = 0
    word_vec[0] = np.zeros(emb_size, dtype='float32')
    word_idx_dict["UNK"] = 1
    word_vec[1] = np.random.uniform(-0.25, 0.25, emb_size)

    for i, word in enumerate(count.keys()):
        word_idx_dict[word] = i + 2

        if word in glove_dic:
            word_vec[i + 2] = glove_dic[word]
        else:
            word_vec[i + 2] = np.random.uniform(-0.25,0.25, emb_size)

    return word_idx_dict, word_vec
    
word_idx_dict, pretrained_weight = create_embedding_matrix(count)

## Prepare for encoding sentence

In [285]:
def encode_sentence(line, word_idx_dict, N=400, padding_start=True):
    tokens = tokenize(line)
    enc = np.zeros(N, dtype=np.int32)
    enc1 = np.array([word_idx_dict.get(word, word_idx_dict["UNK"]) for word in tokens])
    length = min(N, len(enc1))
    if padding_start:
        enc[:length] = enc1[:length]
    else:
        enc[N - length:] = enc1[:length]
    return enc, length

## Build DataSet and DataLoader for model

In [286]:
class EventDataset(Dataset):
    def __init__(self, X, y, N=40, padding_start=False):
        self.y = y
        self.X = [encode_sentence(line, word_idx_dict, N, padding_start) for line in X]
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x, s = self.X[idx]
        return x, s, self.y[idx]

In [287]:
train_ds = EventDataset(X_train, y_train)
valid_ds = EventDataset(X_val, y_val)
train_dl = DataLoader(train_ds, batch_size=30, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=30)

## Training loop

In [288]:
def update_optimizer(optimizer, lr):
    for i, param_group in enumerate(optimizer.param_groups):
        param_group["lr"] = lr

In [289]:

def train_epocs(model, optimizer, train_dl, val_dl, epochs=10):
    global max_acc
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            # s is not used in this model
            x = x.long() 
            y = y.long() 
            y_pred = model(x)           
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics(model, val_dl)
        if val_acc > max_acc:
            max_acc = val_acc
        if i % 5 == 0:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

## Evaluation

In [290]:
def val_metrics(model, val_dl):
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    for x, s, y in val_dl:
        x = x.long()  #.cuda()
        y = y.long()
        batch = y.shape[0]
#         print(y.size())
        out = model(x)
        
        loss = F.cross_entropy(out, y)
        sum_loss += batch*(loss.item())
        total += batch
        _, pred = torch.max(out, 1) 
#         print(pred.size())
        correct += (pred == y.data).float().sum().item()
    val_loss = sum_loss/total
    val_acc = correct/total
    return val_loss, val_acc

## Model



In [None]:
class GRUModel(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights=None) :
        super(GRUModel,self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        if glove_weights is not None:
            self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
            self.embeddings.weight.requires_grad = False ## freeze embeddings
            
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 3)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        x = self.embeddings(x)
        x = self.dropout(x)
        out_pack, ht = self.gru(x)
        x = self.linear(ht[-1])
        return x

### hyperparameter tuning

In [298]:
for i in range(4): 
    vocab_size = len(word_idx_dict)
    model = GRUModel(vocab_size, 50, 50*(i+1), glove_weights = pretrained_weight)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=0.01)
    max_acc = 0
    train_epocs(model, optimizer, train_dl, valid_dl, epochs=60)
    print(max_acc)
    
    vocab_size = len(word_idx_dict)
    model = GRUModel(vocab_size, 50, 50*(i+1), glove_weights = pretrained_weight)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=0.1)
    max_acc = 0
    train_epocs(model, optimizer, train_dl, valid_dl, epochs=60)
    print(max_acc)

train loss 0.677 val loss 0.609 and val accuracy 0.759
train loss 0.457 val loss 0.498 and val accuracy 0.824
train loss 0.387 val loss 0.459 and val accuracy 0.839
train loss 0.309 val loss 0.483 and val accuracy 0.819
train loss 0.267 val loss 0.537 and val accuracy 0.799
train loss 0.275 val loss 0.531 and val accuracy 0.814
train loss 0.225 val loss 0.580 and val accuracy 0.804
train loss 0.246 val loss 0.603 and val accuracy 0.799
train loss 0.229 val loss 0.665 and val accuracy 0.794
train loss 0.226 val loss 0.615 and val accuracy 0.804
train loss 0.231 val loss 0.641 and val accuracy 0.809
train loss 0.188 val loss 0.551 and val accuracy 0.819
0.8542713567839196
train loss 0.909 val loss 0.701 and val accuracy 0.704
train loss 0.802 val loss 0.655 and val accuracy 0.754
train loss 0.620 val loss 0.793 and val accuracy 0.734
train loss 0.824 val loss 0.723 and val accuracy 0.724
train loss 0.723 val loss 0.661 and val accuracy 0.729
train loss 0.765 val loss 0.953 and val accura

### decreasing learning rate

In [305]:
model = GRUModel1(vocab_size, 50, 50, 50, glove_weights = pretrained_weight)
parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.1)
max_acc = 0
train_epocs(model, optimizer, train_dl, valid_dl, epochs=30)

update_optimizer(optimizer, 0.005)
train_epocs(model, optimizer, train_dl, valid_dl, epochs=30)



train loss 2.593 val loss 1.238 and val accuracy 0.719
train loss 0.598 val loss 0.486 and val accuracy 0.814
train loss 0.608 val loss 0.690 and val accuracy 0.724
train loss 0.653 val loss 0.570 and val accuracy 0.769
train loss 0.832 val loss 0.779 and val accuracy 0.729
train loss 0.807 val loss 0.647 and val accuracy 0.754
train loss 0.694 val loss 0.774 and val accuracy 0.744
train loss 0.547 val loss 0.667 and val accuracy 0.739
train loss 0.520 val loss 0.638 and val accuracy 0.754
train loss 0.502 val loss 0.668 and val accuracy 0.714
train loss 0.523 val loss 0.849 and val accuracy 0.648
train loss 0.516 val loss 0.762 and val accuracy 0.744


In [306]:
max_acc

0.8241206030150754

## Model with two linear layer

In [301]:
class GRUModel1(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, hidden_dim1, glove_weights=None) :
        super(GRUModel1,self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        if glove_weights is not None:
            self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
            self.embeddings.weight.requires_grad = False ## freeze embeddings
            
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, hidden_dim1)
        self.linear1 = nn.Linear(hidden_dim1, 3)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        x = self.embeddings(x)
        x = self.dropout(x)
        out_pack, ht = self.gru(x)
        x = self.linear(ht[-1])
        x = self.linear1(x)
        return x

### hyperparameter tuning

In [302]:
for i in range(4):
    model1 = GRUModel1(vocab_size, 50, 50*(i+1), 50, glove_weights = pretrained_weight)
    parameters1 = filter(lambda p: p.requires_grad, model1.parameters())
    optimizer1 = torch.optim.Adam(parameters1, lr=0.01)
    max_acc = 0
    train_epocs(model1, optimizer1, train_dl, valid_dl, epochs=30)
    print(max_acc)
    
    model1 = GRUModel1(vocab_size, 50, 50*(i+1), 50, glove_weights = pretrained_weight)
    parameters1 = filter(lambda p: p.requires_grad, model1.parameters())
    optimizer1 = torch.optim.Adam(parameters1, lr=0.1)
    max_acc = 0
    train_epocs(model1, optimizer1, train_dl, valid_dl, epochs=30)
    print(max_acc)

train loss 0.704 val loss 0.526 and val accuracy 0.804
train loss 0.447 val loss 0.468 and val accuracy 0.814
train loss 0.376 val loss 0.480 and val accuracy 0.834
train loss 0.326 val loss 0.612 and val accuracy 0.819
train loss 0.309 val loss 0.519 and val accuracy 0.794
train loss 0.284 val loss 0.502 and val accuracy 0.839
0.8391959798994975
train loss 2.369 val loss 0.861 and val accuracy 0.744
train loss 0.680 val loss 0.678 and val accuracy 0.714
train loss 0.980 val loss 1.079 and val accuracy 0.653
train loss 3.440 val loss 1.982 and val accuracy 0.739
train loss 0.842 val loss 0.928 and val accuracy 0.749
train loss 0.767 val loss 0.737 and val accuracy 0.754
0.7587939698492462
train loss 0.738 val loss 0.568 and val accuracy 0.754
train loss 0.437 val loss 0.504 and val accuracy 0.794
train loss 0.360 val loss 0.509 and val accuracy 0.814
train loss 0.306 val loss 0.616 and val accuracy 0.779
train loss 0.271 val loss 0.666 and val accuracy 0.774
train loss 0.225 val loss 0