In [144]:
import torch
import pandas as pd
import spacy
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split

## Read file into DataFrame

In [145]:
# Load the labelled dataset from the CSV file next to the notebook.
df = pd.read_csv('./labeled_data.csv')

# Split the frame into the text column and the label column.
text, labels = df['Content'], df['Category Code']

## Tokenize the text

In [146]:
# spaCy English pipeline, loaded once and reused for every document.
tokenizer = spacy.load("en_core_web_sm")


def tokenize(i):
    """Tokenize one document of `text` and lowercase its tokens.

    Args:
        i: Index label of the document in the module-level `text` Series
            (callers in this notebook pass 0 .. text.size - 1).

    Returns:
        list[str]: the lowercased token texts, in document order.
    """
    return [token.text.lower() for token in tokenizer(text[i])]

In [147]:
# Count how often each lowercased token occurs across all documents
# (<key=word : value=count>).
cnt = Counter()
size = text.size
for idx in range(size):
    cnt.update(tokenize(idx))

# Keep only the words that occur at least `min_threshold` times.
min_threshold = 2
count = {word: freq for word, freq in cnt.items() if freq >= min_threshold}

In [148]:
# Tokenize every document and keep the tokens of each one.
size = text.size
# data: list of list of word (one inner list per document)
data = [tokenize(idx) for idx in range(size)]

# Rebuild the word frequencies (<key=word : value=count>) from the
# tokenized documents.
cnt = Counter()
for sentence in data:
    cnt.update(sentence)

# The low-frequency filter is commented out in this cell:
# min_threshold = 2
# count = {x: count for x, count in cnt.items() if count >= min_threshold}

## Word embedding

In [149]:
# load embedding dictionary (<key=word : value=vector>)
def load_embedding_dict(path="glove.6B.50d.txt"):
    """Read a GloVe-style text file into a word -> vector dictionary.

    Each non-empty line is expected to hold a word followed by its
    whitespace-separated vector components.

    Args:
        path: Location of the embedding file. Defaults to the file name that
            was previously hard-coded, so existing calls are unchanged.

    Returns:
        dict[str, np.ndarray]: maps each word to its float32 vector.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component cannot be parsed as a float.
    """
    embeddings_dict = {}
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which cannot decode non-ASCII words on some systems.
    with open(path, "r", encoding="utf-8") as file:
        for line in file:
            values = line.split()
            if not values:
                # Skip blank lines rather than failing on values[0].
                continue
            embeddings_dict[values[0]] = np.asarray(values[1:], dtype="float32")
    return embeddings_dict

# Load the pretrained vectors once; `word_dic` maps each word to its float32 vector.
word_dic = load_embedding_dict()

In [150]:
# create the word -> index mapping and the embedding matrix for the vocabulary
def create_embedding_matrix(emb_size=50, vocab=None, embeddings=None):
    """Build the word -> row index mapping and the matching embedding matrix.

    Row 0 is the all-zero vector for the empty token "" and row 1 is a random
    vector for the unknown token "UNK". Vocabulary words occupy rows 2
    onwards, in iteration order, so `word_vec[word_idx_dict[w]]` is always
    the vector of `w`.

    Args:
        emb_size: Length of every embedding vector.
        vocab: Iterable of vocabulary words. Defaults to the keys of the
            module-level `count` dictionary.
        embeddings: Mapping from word to pretrained vector of length
            `emb_size`. Defaults to the module-level `word_dic`.

    Returns:
        tuple[dict, np.ndarray]: `word_idx_dict` mapping each word to its row,
        and `word_vec`, a float32 array of shape (len(vocab) + 2, emb_size).
    """
    if vocab is None:
        vocab = count.keys()
    if embeddings is None:
        embeddings = word_dic
    words = list(vocab)

    word_idx_dict = {"": 0, "UNK": 1}
    # One row per vocabulary word plus the two reserved rows; row 0 stays zero.
    word_vec = np.zeros((len(words) + 2, emb_size), dtype="float32")
    word_vec[1] = np.random.uniform(-0.25, 0.25, emb_size)

    # Start at 2 so the reserved rows are never overwritten and the index
    # stored in word_idx_dict matches the row that is filled.
    for row, word in enumerate(words, start=2):
        word_idx_dict[word] = row
        if word in embeddings:
            word_vec[row] = embeddings[word]
        else:
            # Words without a pretrained vector get a small random one.
            word_vec[row] = np.random.uniform(-0.25, 0.25, emb_size)

    return word_idx_dict, word_vec
    
# Build the vocabulary index and the embedding matrix with the default embedding size (50).
word_idx_dict, word_vec = create_embedding_matrix()

In [151]:
# Hold out 20% of the samples for validation; the fixed random_state keeps
# the split the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    text,
    labels,
    test_size=0.2,
    random_state=42,
)