In [22]:
import pandas as pd
import numpy as np
import spacy
import itertools
import pickle
import re
import importlib

### Character-level vocabulary

In [17]:
# Read the raw splits and stack them, train rows first, into a single frame.
df1 = pd.read_csv("./data/train.csv")  # training split
df2 = pd.read_csv("./data/test.csv")   # test split
df = pd.concat([df1, df2])

In [18]:
def splitter(x):
    """Split a string into a list of its individual characters."""
    return list(x)


def _char_set(texts):
    """Collect every distinct character that occurs in an iterable of strings."""
    seen = set()
    for text in texts:
        seen.update(splitter(text))
    return seen


# Distinct characters per split, plus the sorted union used as the vocabulary.
vocab1 = _char_set(df1["text"])
vocab2 = _char_set(df2["text"])
vocab = sorted(vocab1 | vocab2)

print("train vocab", len(vocab1))
print("test vocab", len(vocab2))
print("Combined vocab", len(vocab))
# Characters that appear only in the test split.
print("test vocab not in train", len(vocab2 - vocab1))

train vocab 43
test vocab 43
Combined vocab 45
test vocab not in train 2


In [19]:
# Persist the sorted character vocabulary as a NumPy array.
vocab_array = np.array(vocab)
np.save("./data/character_level/vocab.npy", vocab_array)

In [10]:
# Filter the pretrained fastText vectors down to the tokens present in `vocab`
# and write them to a smaller .vec file: a "<count> <dim>" header line followed
# by one "<token> <values...>" line per kept token.
vocab_lookup = set(vocab)  # O(1) membership test instead of scanning per line
d = {}           # token -> [token, v1, v2, ...] (all fields kept as strings)
kept_lines = []  # raw lines to copy into the output file, in source order

with open("../fasttext/crawl-300d-2M-subword.vec", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i == 0:
            # Skip the source file's own "<count> <dim>" header line.
            continue
        if i % 100000 == 0:
            print(i, end=",")
        word = line.split(" ")[0]
        if word in vocab_lookup and word not in d:
            d[word] = line.strip().split(' ')
            kept_lines.append(line)
        if len(d) == len(vocab_lookup):
            # Every vocabulary token has been found; no need to read further.
            break

# The header is written only after the scan so the declared row count matches
# the rows actually written; tokens missing from fastText would otherwise make
# the header overstate the count. The output is written as UTF-8 to match the
# encoding the source file was read with.
with open("../fasttext/hw1_vocab_character.vec", "w", encoding="utf-8") as f_out:
    f_out.write("{} {}\n".format(len(kept_lines), 300))
    f_out.writelines(kept_lines)

100000,200000,300000,400000,500000,600000,700000,800000,900000,1000000,1100000,1200000,1300000,1400000,1500000,1600000,1700000,1800000,1900000,2000000,

In [11]:
# Persist the token -> fastText row mapping. The context manager ensures the
# file is flushed and closed even if pickling fails.
with open("./data/character_level/vocab_ft.pkl", "wb") as fh:
    pickle.dump(d, fh)

In [49]:
### save grams

# Reload the character-level splits (indexed by ID) and the saved vocabulary.
df1 = pd.read_csv("./data/character_level/train.csv", index_col="ID")
df2 = pd.read_csv("./data/character_level/test.csv", index_col="ID")
df = pd.concat([df1, df2])
for frame in (df1, df2, df):
    print(frame.shape)
vocab = np.load("./data/character_level/vocab.npy")

def get_ngram(sent, gram):
    """Return the character n-grams of ``sent`` in left-to-right order.

    Each n-gram is the ``gram`` consecutive characters joined with "_".
    An input shorter than ``gram`` yields an empty list.
    """
    symbols = list(sent)
    # Number of sliding-window positions; zero or negative means no n-grams.
    window_count = len(symbols) - gram + 1
    return ["_".join(symbols[start:start + gram]) for start in range(window_count)]

def save_ngram(df, n=2):
    """Collect the distinct character n-grams of ``df.raw_text`` and save them.

    The sorted unique n-grams are written to
    ``./data/character_level/{n}grams.npy``, their count is printed, and the
    array is returned.
    """
    unique_ngrams = set()
    for text in df.raw_text:
        unique_ngrams.update(get_ngram(text, n))
    all_ngrams = np.array(sorted(unique_ngrams))
    np.save("./data/character_level/{}grams.npy".format(n), all_ngrams)
    print("number of {} grams: {}".format(n, len(all_ngrams)))
    return all_ngrams

# Build and save the 1-gram through 5-gram vocabularies.
for n in (1, 2, 3, 4, 5):
    save_ngram(df, n)

(3338, 4)
(1084, 4)
(4422, 4)
number of 1 grams: 45
number of 2 grams: 570
number of 3 grams: 3393
number of 4 grams: 9393
number of 5 grams: 16732


In [50]:
import data_utils
importlib.reload(data_utils)  # pick up on-disk edits without restarting the kernel

# Arguments for the character-level data preparation.
prep_kwargs = dict(
    path="./data/character_level/",
    char_level=True,
    train_file="train_val.csv",
    val_file="holdout_test.csv",
    test_file="test.csv",
)
train_val_data, holdout_test_data, test_data = data_utils.prep_all_data(**prep_kwargs)

In [53]:
import torch

# Build the embedding matrix aligned with the text field's index-to-token list:
# tokens found in the saved fastText mapping get their pretrained vector, the
# rest get a random normal vector.
emb_dim = 300  # dimensionality of the fastText vectors (crawl-300d)
rng = np.random.RandomState(0)  # fixed seed so the saved matrix is reproducible

vocab = train_val_data.text_field.vocab.itos
# NOTE: pickle executes arbitrary code on load; only load files written by this
# notebook. The context manager closes the handle deterministically.
with open("./data/character_level/vocab_ft.pkl", "rb") as fh:
    ft_emb = pickle.load(fh)

emb_matrix = []
num_unknown = 0
for word in vocab:
    if word in ft_emb:
        # Stored rows are [token, v1, v2, ...] as strings; drop the token.
        emb_matrix.append([float(i) for i in ft_emb[word][1:]])
    else:
        num_unknown += 1
        emb_matrix.append(rng.normal(size=emb_dim).tolist())
emb_matrix = torch.tensor(emb_matrix)
torch.save(emb_matrix, "./data/character_level/emb_matrix_ft.pt")
num_unknown

3

In [54]:
import pandas as pd
import numpy as np
import datetime
import importlib
import pickle

import data_utils
import model_utils
import train_utils
import evaluation
import submission
# Re-import the project modules, in this order, so on-disk edits are picked up
# without restarting the kernel.
for _module in (data_utils, model_utils, train_utils, evaluation, submission):
    importlib.reload(_module)

import torch
import torch.nn as nn

In [64]:
# Data directory passed to prep_all_data and to the model constructors below.
PATH = "./data/character_level/"
# n-gram order passed to prep_all_data and BaseModelNGram.
NGRAM = 3

In [65]:
# Prepare the splits for the n-gram setting; the split file names are not
# passed here, so they come from prep_all_data's defaults (not visible here).
train_data, val_data, test_data = data_utils.prep_all_data(
    path=PATH,
    ngram=NGRAM,
)

In [66]:
# Reload the modules (model_utils first, then data_utils) to pick up edits.
for _module in (model_utils, data_utils):
    importlib.reload(_module)

# Train the n-gram baseline on the prepared splits.
m = model_utils.BaseModelNGram(path=PATH, ngram=NGRAM)
train_kwargs = dict(lr=1e-2, print_freq=5, max_epoch=100)
result = train_utils.train(train_data, val_data, m, **train_kwargs)

Epoch: 0, LR: 0.01, Train Loss: 384.6906, Val Loss: 513.0171, Val f1 0.108
Epoch: 5, LR: 0.01, Train Loss: 274.7169, Val Loss: 266.3980, Val f1 0.124
Epoch: 10, LR: 0.01, Train Loss: 267.0656, Val Loss: 266.5909, Val f1 0.121


In [67]:
# Reload the modules (model_utils first, then data_utils) to pick up edits.
for _module in (model_utils, data_utils):
    importlib.reload(_module)

# Train the GRU model on the same splits.
# NOTE(review): train_data/val_data were prepared with ngram=NGRAM above;
# confirm that is the input format the GRU expects.
m = model_utils.GRU(path=PATH)
train_kwargs = dict(lr=1e-2, print_freq=5, max_epoch=100)
result = train_utils.train(train_data, val_data, m, **train_kwargs)

Epoch: 0, LR: 0.01, Train Loss: 366.7863, Val Loss: 297.4173, Val f1 0.102
Epoch: 5, LR: 0.01, Train Loss: 279.2007, Val Loss: 285.6728, Val f1 0.124
Epoch: 10, LR: 0.01, Train Loss: 275.5213, Val Loss: 281.4061, Val f1 0.121
Epoch: 15, LR: 0.01, Train Loss: 277.9197, Val Loss: 293.3754, Val f1 0.121
Epoch: 20, LR: 0.01, Train Loss: 273.6257, Val Loss: 294.2516, Val f1 0.117
Epoch: 25, LR: 0.001, Train Loss: 259.8817, Val Loss: 282.6400, Val f1 0.114
