In [113]:
import numpy as np 
import pandas as pd 
import os
import spacy
import string
import re
import numpy as np
from spacy.symbols import ORTH
from collections import Counter
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import numpy as np
from collections import defaultdict
import re
from sklearn.metrics import confusion_matrix

## Read file into DataFrame

In [114]:
# read the file into DataFrame
df = pd.read_csv('./CAMEO_IDEA_labeled_data.csv')

# separate content and label
text = df['Content']
labels = df['Category Code']

## Tokenize the text

In [115]:
# funtion tokenize sentence
tokenizer = spacy.load("en_core_web_sm")
stopwords = tokenizer.Defaults.stop_words
# tokenize, lemmatize the text, drop punctuations and stopwords
tokenize = lambda t: [token.lemma_ for token in tokenizer(t) if (not token.is_punct) and (not token.is_stop)]

# only tokenize the text
# tokenize = lambda t: [token.text for token in tokenizer(t)]

In [116]:
# build dictionary <key=word : value=count>
cnt = Counter()
size = text.size
for idx in range(size):
    for word in tokenize(text[idx]):
        cnt[word] += 1

In [117]:
# filter out low-frequency word
min_threshold = 1
count = {x: count for x, count in cnt.items() if count >= min_threshold}

In [118]:
# filter out high-frequency word
min_threshold = 1
count = {x: count for x, count in cnt.items() if count <= min_threshold}

## Split dataset into train set and test set

In [119]:
X = np.array(text)
y = np.array(labels)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y)

## Prepare for word embedding

In [120]:
# download glove dictionary
# def download_glove():
#     ! wget http://nlp.stanford.edu/data/glove.6B.zip
#     ! unzip glove.6B.zip -C data
    
# download_glove()
# ! unzip glove.6B.zip

In [121]:
# load word embedding dictionary (<key=word : value=vector>)
def load_embedding_dict():
    embeddings_dict = {}
    with open("glove.6B.50d.txt", 'r') as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            embeddings_dict[word] = vector
    return embeddings_dict

glove_dic = load_embedding_dict()

In [122]:
# create dictionaries(<key=word : value=index number>) (<key=word : value=vector>)
def create_embedding_matrix(count,emb_size=50):
    size = len(count) + 2
    word_idx_dict = {}
    word_vec = np.zeros((size, emb_size), dtype="float32")
    
    # add padding and UNK keyword
    word_idx_dict[""] = 0
    word_vec[0] = np.zeros(emb_size, dtype='float32')
    word_idx_dict["UNK"] = 1
    word_vec[1] = np.random.uniform(-0.25, 0.25, emb_size)

    for i, word in enumerate(count.keys()):
        word_idx_dict[word] = i + 2

        if word in glove_dic:
            word_vec[i + 2] = glove_dic[word]
        else:
            word_vec[i + 2] = np.random.uniform(-0.25,0.25, emb_size)

    return word_idx_dict, word_vec
    
word_idx_dict, pretrained_weight = create_embedding_matrix(count)

## Prepare for encoding sentence

In [123]:
def encode_sentence(line, word_idx_dict, N=400, padding_start=True):
    tokens = tokenize(line)
    enc = np.zeros(N, dtype=np.int32)
    enc1 = np.array([word_idx_dict.get(word, word_idx_dict["UNK"]) for word in tokens])
    length = min(N, len(enc1))
    if padding_start:
        enc[:length] = enc1[:length]
    else:
        enc[N - length:] = enc1[:length]
    return enc, length

## Build DataSet and DataLoader for model

In [124]:
class EventDataset(Dataset):
    def __init__(self, X, y, N=40, padding_start=False):
        self.y = y
        self.X = [encode_sentence(line, word_idx_dict, N, padding_start) for line in X]
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x, s = self.X[idx]
        return x, s, self.y[idx]

In [125]:
train_ds = EventDataset(X_train, y_train)
valid_ds = EventDataset(X_val, y_val)
train_dl = DataLoader(train_ds, batch_size=30, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=30)

In [126]:
def update_optimizer(optimizer, lr):
    for i, param_group in enumerate(optimizer.param_groups):
        param_group["lr"] = lr

In [127]:
def train_epocs(model, optimizer, train_dl, val_dl, epochs=10):
    global max_acc
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            x = x.long() 
            y = y.long() 
            y_pred = model(x)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred.squeeze(0), y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics(model, val_dl)
        if val_acc > max_acc:
            max_acc = val_acc
        if i % 5 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [128]:
def val_metrics(model, val_dl):
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    for x, s, y in val_dl:
        x = x.long()  
        y = y.long()
        batch = y.shape[0]
        out = model(x)
        loss = F.cross_entropy(out.squeeze(0), y)
        sum_loss += batch*(loss.item())
        total += batch
        _, pred = torch.max(out.squeeze(0), 1) 
        correct += (pred == y.data).float().sum().item()
    val_loss = sum_loss/total
    val_acc = correct/total
    return val_loss, val_acc

## Model

In [139]:
class LSTMModel(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights=None) :
        super(LSTMModel,self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        if glove_weights is not None:
            self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
            self.embeddings.weight.requires_grad = False ## freeze embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 3)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        x = self.embeddings(x)
        x = self.dropout(x)
        out_pack, ht = self.lstm(x)
        x = self.linear(ht[-1])
        return x

### hyperparameter tuning

In [140]:
for i in range(4): 
    vocab_size = len(word_idx_dict)
    model = LSTMModel(vocab_size, 50, 50*(i+1), glove_weights = pretrained_weight)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=0.01)
    max_acc = 0
    train_epocs(model, optimizer, train_dl, valid_dl, epochs=30)
    print(max_acc)
    
    vocab_size = len(word_idx_dict)
    model = LSTMModel(vocab_size, 50, 50*(i+1), glove_weights = pretrained_weight)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=0.1)
    max_acc = 0
    train_epocs(model, optimizer, train_dl, valid_dl, epochs=30)
    print(max_acc)

train loss 0.521 val loss 0.483 and val accuracy 0.819
train loss 0.411 val loss 0.457 and val accuracy 0.814
train loss 0.356 val loss 0.441 and val accuracy 0.854
train loss 0.317 val loss 0.493 and val accuracy 0.834
train loss 0.235 val loss 0.584 and val accuracy 0.824
train loss 0.243 val loss 0.579 and val accuracy 0.809
0.8542713567839196
train loss 0.694 val loss 0.556 and val accuracy 0.769
train loss 0.797 val loss 1.016 and val accuracy 0.784
train loss 0.513 val loss 0.503 and val accuracy 0.809
train loss 0.511 val loss 0.511 and val accuracy 0.759
train loss 0.483 val loss 0.507 and val accuracy 0.804
train loss 0.485 val loss 0.549 and val accuracy 0.779
0.8291457286432161
train loss 0.543 val loss 0.506 and val accuracy 0.784
train loss 0.387 val loss 0.520 and val accuracy 0.849
train loss 0.342 val loss 0.501 and val accuracy 0.834
train loss 0.311 val loss 0.518 and val accuracy 0.789
train loss 0.291 val loss 0.590 and val accuracy 0.794
train loss 0.216 val loss 0

In [130]:
for i in range(4): 
    vocab_size = len(word_idx_dict)
    model = LSTMModel(vocab_size, 50, 50*(i+1), glove_weights = pretrained_weight)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=0.01)
    max_acc = 0
    train_epocs(model, optimizer, train_dl, valid_dl, epochs=30)
    print(max_acc)
    
    vocab_size = len(word_idx_dict)
    model = LSTMModel(vocab_size, 50, 50*(i+1), glove_weights = pretrained_weight)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=0.1)
    max_acc = 0
    train_epocs(model, optimizer, train_dl, valid_dl, epochs=30)
    print(max_acc)

train loss 0.560 val loss 0.512 and val accuracy 0.804
train loss 0.135 val loss 0.670 and val accuracy 0.809
train loss 0.090 val loss 0.594 and val accuracy 0.819
train loss 0.064 val loss 0.818 and val accuracy 0.829
train loss 0.060 val loss 0.754 and val accuracy 0.814
train loss 0.067 val loss 0.582 and val accuracy 0.839
0.8542713567839196
train loss 0.700 val loss 0.651 and val accuracy 0.779
train loss 0.310 val loss 0.911 and val accuracy 0.749
train loss 0.197 val loss 0.936 and val accuracy 0.784
train loss 0.158 val loss 1.180 and val accuracy 0.764
train loss 0.191 val loss 1.317 and val accuracy 0.784
train loss 0.137 val loss 1.296 and val accuracy 0.779
0.7889447236180904
train loss 0.540 val loss 0.495 and val accuracy 0.784
train loss 0.125 val loss 0.882 and val accuracy 0.819
train loss 0.081 val loss 0.714 and val accuracy 0.824
train loss 0.064 val loss 0.546 and val accuracy 0.854
train loss 0.065 val loss 0.631 and val accuracy 0.844
train loss 0.054 val loss 0

### two lenear layer model

In [133]:
class LSTMModel1(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, hidden_dim1, glove_weights=None) :
        super(LSTMModel1,self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        if glove_weights is not None:
            self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
            self.embeddings.weight.requires_grad = True ## freeze embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, hidden_dim1)
        self.linear1 = nn.Linear(hidden_dim1, 3)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        x = self.embeddings(x)
        x = self.dropout(x)
        out_pack, ht = self.lstm(x)
        x = self.linear(ht[-1])
        x = self.linear1(x)
        return x

### hyperparameter tuning

In [134]:
for i in range(4):
    model1 = LSTMModel1(vocab_size, 50, 50*(i+1), 50, glove_weights = pretrained_weight)
    parameters1 = filter(lambda p: p.requires_grad, model1.parameters())
    optimizer1 = torch.optim.Adam(parameters1, lr=0.01)
    max_acc = 0
    train_epocs(model1, optimizer1, train_dl, valid_dl, epochs=30)
    print(max_acc)
    
    model1 = LSTMModel1(vocab_size, 50, 50*(i+1), 50, glove_weights = pretrained_weight)
    parameters1 = filter(lambda p: p.requires_grad, model1.parameters())
    optimizer1 = torch.optim.Adam(parameters1, lr=0.1)
    max_acc = 0
    train_epocs(model1, optimizer1, train_dl, valid_dl, epochs=30)
    print(max_acc)

train loss 0.512 val loss 0.484 and val accuracy 0.799
train loss 0.156 val loss 0.657 and val accuracy 0.814
train loss 0.089 val loss 0.800 and val accuracy 0.794
train loss 0.080 val loss 0.682 and val accuracy 0.824
train loss 0.065 val loss 0.782 and val accuracy 0.784
train loss 0.062 val loss 0.792 and val accuracy 0.809
0.8341708542713567
train loss 1.379 val loss 0.972 and val accuracy 0.764
train loss 0.524 val loss 0.762 and val accuracy 0.754
train loss 0.693 val loss 1.230 and val accuracy 0.693
train loss 0.478 val loss 1.159 and val accuracy 0.709
train loss 0.546 val loss 2.760 and val accuracy 0.709
train loss 0.639 val loss 1.275 and val accuracy 0.683
0.7638190954773869
train loss 0.495 val loss 0.490 and val accuracy 0.809
train loss 0.158 val loss 0.493 and val accuracy 0.824
train loss 0.078 val loss 0.522 and val accuracy 0.829
train loss 0.078 val loss 0.711 and val accuracy 0.819
train loss 0.066 val loss 0.721 and val accuracy 0.829
train loss 0.066 val loss 0

In [135]:
for i in range(4):
    model1 = LSTMModel1(vocab_size, 50, 50*(i+1), 50, glove_weights = pretrained_weight)
    parameters1 = filter(lambda p: p.requires_grad, model1.parameters())
    optimizer1 = torch.optim.Adam(parameters1, lr=0.01)
    max_acc = 0
    train_epocs(model1, optimizer1, train_dl, valid_dl, epochs=60)
    print(max_acc)
    
    model1 = LSTMModel1(vocab_size, 50, 50*(i+1), 50, glove_weights = pretrained_weight)
    parameters1 = filter(lambda p: p.requires_grad, model1.parameters())
    optimizer1 = torch.optim.Adam(parameters1, lr=0.1)
    max_acc = 0
    train_epocs(model1, optimizer1, train_dl, valid_dl, epochs=60)
    print(max_acc)

train loss 0.509 val loss 0.482 and val accuracy 0.794
train loss 0.165 val loss 0.491 and val accuracy 0.804
train loss 0.093 val loss 0.587 and val accuracy 0.819
train loss 0.075 val loss 0.842 and val accuracy 0.824
train loss 0.066 val loss 0.587 and val accuracy 0.849
train loss 0.086 val loss 0.549 and val accuracy 0.834
train loss 0.053 val loss 0.587 and val accuracy 0.829
train loss 0.070 val loss 0.567 and val accuracy 0.839
train loss 0.055 val loss 0.754 and val accuracy 0.859
train loss 0.055 val loss 0.736 and val accuracy 0.864
train loss 0.061 val loss 0.554 and val accuracy 0.834
train loss 0.054 val loss 0.556 and val accuracy 0.834
0.864321608040201
train loss 0.961 val loss 1.116 and val accuracy 0.754
train loss 0.407 val loss 0.695 and val accuracy 0.744
train loss 0.311 val loss 0.868 and val accuracy 0.719
train loss 0.301 val loss 1.058 and val accuracy 0.709
train loss 0.441 val loss 3.917 and val accuracy 0.568
train loss 0.453 val loss 1.518 and val accurac

In [136]:
for i in range(4): 
    vocab_size = len(word_idx_dict)
    model = LSTMModel(vocab_size, 50, 50*(i+1), glove_weights = pretrained_weight)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=0.01)
    max_acc = 0
    train_epocs(model, optimizer, train_dl, valid_dl, epochs=60)
    print(max_acc)
    
    vocab_size = len(word_idx_dict)
    model = LSTMModel(vocab_size, 50, 50*(i+1), glove_weights = pretrained_weight)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=0.1)
    max_acc = 0
    train_epocs(model, optimizer, train_dl, valid_dl, epochs=60)
    print(max_acc)

train loss 0.521 val loss 0.539 and val accuracy 0.799
train loss 0.166 val loss 0.658 and val accuracy 0.794
train loss 0.084 val loss 1.044 and val accuracy 0.804
train loss 0.075 val loss 0.762 and val accuracy 0.794
train loss 0.061 val loss 0.856 and val accuracy 0.799
train loss 0.067 val loss 0.898 and val accuracy 0.809
train loss 0.062 val loss 0.879 and val accuracy 0.819
train loss 0.056 val loss 0.816 and val accuracy 0.814
train loss 0.067 val loss 1.071 and val accuracy 0.834
train loss 0.070 val loss 1.074 and val accuracy 0.824
train loss 0.056 val loss 0.928 and val accuracy 0.819
train loss 0.054 val loss 0.989 and val accuracy 0.829
0.8341708542713567
train loss 0.514 val loss 0.578 and val accuracy 0.739
train loss 0.278 val loss 0.722 and val accuracy 0.784
train loss 0.259 val loss 0.649 and val accuracy 0.774
train loss 0.356 val loss 0.942 and val accuracy 0.794
train loss 0.361 val loss 1.028 and val accuracy 0.784
train loss 0.316 val loss 1.000 and val accura

### decreasing learning rate

In [138]:
model = LSTMModel(vocab_size, 50, 50, glove_weights = pretrained_weight)
parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.1)
max_acc = 0
train_epocs(model, optimizer, train_dl, valid_dl, epochs=30)
update_optimizer(optimizer, 0.005)
train_epocs(model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.686 val loss 0.679 and val accuracy 0.749
train loss 0.460 val loss 0.750 and val accuracy 0.719
train loss 0.299 val loss 0.728 and val accuracy 0.774
train loss 0.301 val loss 0.813 and val accuracy 0.819
train loss 0.279 val loss 0.940 and val accuracy 0.749
train loss 0.234 val loss 1.322 and val accuracy 0.779
train loss 0.170 val loss 1.008 and val accuracy 0.749
train loss 0.156 val loss 1.004 and val accuracy 0.759
train loss 0.149 val loss 0.990 and val accuracy 0.759
train loss 0.155 val loss 0.995 and val accuracy 0.754
train loss 0.126 val loss 0.959 and val accuracy 0.749
train loss 0.166 val loss 0.994 and val accuracy 0.759
