In [34]:
import torch
import pandas as pd
import spacy
from collections import Counter
import numpy as np
from torch.utils.data import Dataset, DataLoader
import os

## Read file into DataFrame

In [35]:
# Load the labelled CAMEO/IDEA events from the CSV next to the notebook.
df = pd.read_csv('./CAMEO_IDEA_labeled_data.csv')

# Model input is the 'Content' column; the target is the 'Category Code' column.
text, labels = df['Content'], df['Category Code']

## Tokenize the text

In [44]:
# spaCy pipeline shared by every call to `tokenize`
tokenizer = spacy.load("en_core_web_sm")


def tokenize(t):
    """Return the lemma of every non-punctuation token spaCy finds in `t`."""
    lemmas = []
    for tok in tokenizer(t):
        if tok.is_punct:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

# alternative kept for reference: raw token text, punctuation included
#tokenize = lambda t: [token.text for token in tokenizer(t)]

In [45]:
# build dictionary <key=word : value=count>
cnt = Counter()
# Number of documents; kept as a module-level name because
# create_embedding_matrix reads `size`.
size = text.size
# Iterate over the Series values directly. `text[idx]` is a label lookup, so it
# only works while the index is the default 0..n-1 range and breaks (or picks
# the wrong row) once rows have been filtered or re-ordered.
for sentence in text:
    cnt.update(tokenize(sentence))

Diehard
Croat
fighter
surrender
to
serbian
force
on
Monday
after
an
86-day
siege
bosnian
Serbs
have
return
at
least
four
heavy
weapon
include
a
tank
-PRON-
have
seize
from
the
United
Nations
in
Sarajevo
NATO
on
Monday
decline
comment
on
an
estimate
that
yugoslav
army
and
special
police
troop
in
Kosovo
be
lose
90
to
100
dead
per
day
in
NATO
air
strike
accord
to
a
dutch
banking
expert
the
conflict
in
Yugoslavia
could
have
a
negative
impact
on
nearby
Bosnia
further
complicate
effort
to
sell
off
state
control
bank
an
administration
spokesman
say
the
ongoing
negotiation
in
the
Middle
East
be
likely
to
culminate
in
a
peaceful
accord
by
the
weekend
colombian
marxist
guerrilla
claim
responsibility
for
the
shoot
death
of
a
congressman
early
today
U.S.
Secretary
of
State
Madeleine
Albright
mediate
another
discussion
between
israeli
and
palestinian
negotiator
in
New
York
on
Thursday
the
European
Community
have
begin
negotiate
an
agreement
with
Israel
establish
close
link
in
trade
and
cooperation


In [None]:
# Drop rare words: keep only entries seen at least `min_threshold` times.
# With the threshold at 1 every counted word is kept, because each entry in
# `cnt` was incremented at least once.
min_threshold = 1
count = {
    word: freq
    for word, freq in cnt.items()
    if freq >= min_threshold
}

In [None]:
# filter out high-frequency word
# Upper bound on how often a word may occur. It starts at the largest observed
# count, so nothing is dropped until the value is lowered on purpose.
max_threshold = max(cnt.values(), default=0)
# Apply both bounds to the full counter. The previous version reused
# `min_threshold = 1` with `<=`, which replaced `count` with only the words
# seen exactly once and discarded the low-frequency filter's result.
# `min_threshold` is the value set in the low-frequency filter cell.
count = {
    word: freq
    for word, freq in cnt.items()
    if min_threshold <= freq <= max_threshold
}

## Split dataset into train set and test set

In [14]:
# Hold out 20% of the samples for validation, stratified on the label array.
# NOTE(review): `train_test_split` is not imported in any cell visible here
# (it is normally `from sklearn.model_selection import train_test_split`), and
# no `random_state` is passed, so the split is not reproducible -- confirm both.
X, y = np.array(text), np.array(labels)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y
)

## Prepare for word embedding

In [None]:
# download glove dictionary
# def download_glove():
#     ! wget http://nlp.stanford.edu/data/glove.6B.zip
#     ! unzip glove.6B.zip -C data
    
# download_glove()
# ! unzip glove.6B.zip

In [20]:
# load word embedding dictionary (<key=word : value=vector>)
def load_embedding_dict(path="glove.6B.50d.txt"):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each non-empty line is expected to hold a word followed by its
    whitespace-separated vector components.

    Parameters
    ----------
    path : str
        Location of the embedding file. Defaults to the 50-dimensional
        glove.6B file in the working directory.

    Returns
    -------
    dict
        Maps each word to a float32 NumPy array of its components.
    """
    embeddings_dict = {}
    # Read as UTF-8 explicitly: without it the platform's default encoding is
    # used, which can fail or mis-decode non-ASCII words on some systems.
    with open(path, 'r', encoding="utf-8") as file:
        for line in file:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            embeddings_dict[values[0]] = np.asarray(values[1:], "float32")
    return embeddings_dict

# Load the embedding table once; create_embedding_matrix looks words up in `glove_dic`.
glove_dic = load_embedding_dict()

In [23]:
# create dictionaries(<key=word : value=index number>) (<key=word : value=vector>)
def create_embedding_matrix(emb_size=50, vocab=None, embeddings=None):
    """Build the word -> row-index lookup and the matching embedding matrix.

    Row 0 is an all-zero vector registered under the key "" (padding), row 1
    is a random vector registered under "UNK", and the i-th vocabulary word is
    assigned row ``i + 2``.

    Parameters
    ----------
    emb_size : int
        Length of every embedding vector; pretrained vectors must have this
        length or NumPy raises ``ValueError`` on assignment.
    vocab : iterable of str, optional
        Words to index. Defaults to the keys of the module-level ``count``.
    embeddings : mapping of str to vector, optional
        Pretrained vectors. Defaults to the module-level ``glove_dic``.

    Returns
    -------
    tuple
        ``(word_idx_dict, word_vec)`` where ``word_vec`` is a float32 array of
        shape ``(len(vocab) + 2, emb_size)``.
    """
    words = list(count.keys() if vocab is None else vocab)
    vectors = glove_dic if embeddings is None else embeddings

    # add padding and UNK keyword
    word_idx_dict = {"": 0, "UNK": 1}
    # One row per vocabulary word plus the two special rows. The matrix used
    # to be sized by the number of documents, which is unrelated to the
    # vocabulary size.
    word_vec = np.zeros((len(words) + 2, emb_size), dtype="float32")
    word_vec[1] = np.random.uniform(-0.25, 0.25, emb_size)

    # Start at 2 so the row written always equals the index stored for the
    # word and the padding / UNK rows are never overwritten.
    for row, word in enumerate(words, start=2):
        word_idx_dict[word] = row

        vector = vectors.get(word)
        if vector is None:
            # The glove.6B vectors are uncased, so retry capitalised lemmas
            # (e.g. proper nouns) in lower case before giving up.
            vector = vectors.get(word.lower())
        if vector is None:
            vector = np.random.uniform(-0.25, 0.25, emb_size)
        word_vec[row] = vector

    # Return only after every word is processed; the return used to sit
    # inside the loop and exited after the first word.
    return word_idx_dict, word_vec
    
# Build the lookup and weight matrix once; EventDataset reads `word_idx_dict` as a global.
word_idx_dict, pretrained_weight = create_embedding_matrix()

## Prepare for encoding sentence

In [24]:
def encode_sentence(line, word_idx_dict, N=400, padding_start=True):
    """Turn a sentence into a fixed-length array of word indices.

    The sentence is tokenized with `tokenize`; each token is replaced by its
    index in `word_idx_dict`, falling back to the "UNK" index. At most the
    first `N` ids are kept and unused positions stay 0.

    Parameters
    ----------
    line : str
        Raw sentence.
    word_idx_dict : dict
        Token -> index mapping; must contain "UNK" whenever the sentence
        yields at least one token.
    N : int
        Length of the returned array.
    padding_start : bool
        True writes the ids at the front of the array (zeros after them);
        False writes them at the back (zeros before them).
        NOTE(review): the name reads as the opposite of what the code does --
        confirm the intended meaning before relying on it.

    Returns
    -------
    tuple
        ``(enc, length)``: the int32 array of length `N` and the number of
        token ids actually stored.
    """
    ids = np.array(
        [word_idx_dict.get(tok, word_idx_dict["UNK"]) for tok in tokenize(line)]
    )
    length = min(N, len(ids))
    enc = np.zeros(N, dtype=np.int32)
    # Where the stored ids begin: the front of the array, or far enough from
    # the end that they finish exactly at position N.
    offset = 0 if padding_start else N - length
    enc[offset:offset + length] = ids[:length]
    return enc, length

## Build DataSet and DataLoader for model

In [30]:
class EventDataset(Dataset):
    """Dataset of pre-encoded sentences paired with their labels.

    Every sentence in `X` is encoded once, at construction time, with
    `encode_sentence` and the module-level `word_idx_dict`. Items are
    ``(token_ids, n_tokens, label)`` triples.
    """

    def __init__(self, X, y, N=40, padding_start=False):
        """Encode each sentence in `X` to length `N` and keep `y` as the labels."""
        self.y = y
        encoded = []
        for sentence in X:
            encoded.append(encode_sentence(sentence, word_idx_dict, N, padding_start))
        self.X = encoded

    def __len__(self):
        """Number of samples, taken from the label container."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the encoded ids, the stored token count and the label at `idx`."""
        token_ids, n_tokens = self.X[idx]
        return token_ids, n_tokens, self.y[idx]

In [26]:
# Encode both splits with EventDataset's default settings.
train_ds, valid_ds = EventDataset(X_train, y_train), EventDataset(X_val, y_val)

# Batches of 30; only the training loader shuffles.
train_dl = DataLoader(train_ds, batch_size=30, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=30)