In [1]:
import pandas as pd
import numpy as np
import spacy
import itertools
import pickle
import re

### Take a look at the data

In [120]:
# Load the training CSV into a DataFrame.
df_train = pd.read_csv("./data/hw2_train.csv")

In [121]:
# (rows, columns) of the training frame.
df_train.shape

(2312, 2)

In [122]:
# Eyeball five random rows; no random_state is passed, so the rows shown vary between runs.
df_train.sample(5)

Unnamed: 0,utterances,IOB Slot tags
556,who was the director of the movie jaws,O O O O O O O B_movie
557,look up info about jurassic park 's director,O O O O B_movie I_movie O O
530,goodfellas director,B_movie O
130,who is in movie,O O O O
1179,find me pg-13 movies,O O B_mpaa_rating O


In [14]:
# Dev-set utterances: the file is read with header=None, so the single column is named 0.
df_val = pd.read_csv("./data/original_data/hw2_utterance_dev.txt", header=None)
df_val.head(5)

Unnamed: 0,0
0,show me ones by david fincher
1,who is director of the words
2,what rating did the campaign movie get
3,how much did looper gross
4,what is the budget for epic


In [15]:
# Dev-set tag sequences, read with header=None (single column named 0).
# presumably row i here lines up with row i of the dev utterance file — TODO confirm
dev_tags = pd.read_csv("./data/original_data/hw2_tags_dev.txt", header=None)
dev_tags.head(5)

Unnamed: 0,0
0,O O O O B_director I_director
1,O O O O B_movie I_movie
2,O O O B_movie I_movie O O
3,O O O B_movie O
4,O O O O O B_movie


In [16]:
# Test-set utterances, read with header=None (single column named 0).
test = pd.read_csv("./data/original_data/hw2_utterance_test.txt", header=None)
test.head(5)

Unnamed: 0,0
0,find out what language the father of my childr...
1,search for zombie movies
2,summary of star wars four
3,spain has how many movies
4,who stars in house at the end of the street


### Hold out 10% of the training data and keep it out of training

In [47]:
# Reproducibly pick 10% of the training rows (without replacement) as a holdout
# set; every remaining row, in original index order, forms the real training set.
np.random.seed(0)
holdout_size = df_train.shape[0] // 10
holdout_idx = np.random.choice(df_train.index, size=holdout_size, replace=False)
# Set membership is O(1); testing `i not in holdout_idx` against the NumPy array
# rescans the whole array for every row.
holdout_lookup = set(holdout_idx.tolist())
train_real_idx = [i for i in df_train.index if i not in holdout_lookup]

In [48]:
# Write both partitions to disk. `index=False` is the documented way to keep the
# row index out of the CSV (the previous `index=None` only worked because None is falsy).
df_train.loc[holdout_idx].to_csv("./data/holdout.csv", index=False)
df_train.loc[train_real_idx].to_csv("./data/train_real.csv", index=False)

### Use the provided evaluation script 

In [53]:
# Run the evaluation script as a shell command through IPython's `!` escape.
!python ./evaluation.py

F1 Score is 100.00%


### Get vocabulary from train

In [126]:
# Build the sorted, de-duplicated vocabulary of lower-cased tokens in the
# training split. Utterances are split on single spaces only.
df = pd.read_csv("./data/train_real.csv")


def splitter(x):
    """Lower-case an utterance and split it on single space characters."""
    return x.lower().split(" ")


vocab = sorted(set(itertools.chain.from_iterable(df["utterances"].apply(splitter))))
print("train vocab", len(vocab))

train vocab 1097


In [127]:
# Persist the vocabulary as a NumPy string array for later cells to reload.
np.save("./data/vocab.npy", np.array(vocab))

### Turn labels into index

In [87]:
# Reload the full training file (not the train_real split) plus the dev tag file.
# NOTE(review): the dev tags are read from "./hw2_tags_dev.txt" here but from
# "./data/original_data/hw2_tags_dev.txt" in an earlier cell — confirm which path is current.
# NOTE(review): `df_val` held the dev utterances earlier; from here on it holds dev tags.
df = pd.read_csv("./data/hw2_train.csv")
df_val = pd.read_csv("./hw2_tags_dev.txt", header=None)

In [93]:
# Sorted unique tags seen in the training rows and in the dev rows respectively;
# each row is a space-separated tag sequence.
labels = sorted({tag for row in df["IOB Slot tags"] for tag in row.split(" ")})
labels_val = sorted({tag for row in df_val[0] for tag in row.split(" ")})

In [94]:
# Tags that appear in the dev set but never in training.
[i for i in labels_val if i not in labels]

['B_gross_rev', 'B_org', 'I_gross_rev', 'I_location']

In [95]:
# Tags that appear in training but never in the dev set.
# NOTE(review): the recorded output includes 'I-movie' (hyphen, not underscore) —
# possibly an annotation typo for 'I_movie' in the training file; confirm.
[i for i in labels if i not in labels_val]

['I-movie', 'I_char', 'I_language']

In [116]:
# Build forward (tag -> index) and reverse (index -> tag) lookups over the
# sorted training tags, then pickle both for later use.
label2idx = {text: i for i, text in enumerate(labels)}
idx2label = dict(enumerate(labels))
print("number of classes:", len(label2idx))

# The `with` block closes each file on exit, so no explicit close() is needed.
with open("data/label2idx.pkl", "wb") as f:
    pickle.dump(label2idx, f)

with open("data/idx2label.pkl", "wb") as f:
    pickle.dump(idx2label, f)

number of classes: 27


### fasttext vocab filtering

In [6]:
# Reload the saved vocabulary array.
vocab = np.load("./data/vocab.npy")

In [7]:
# Scan the full fastText table and keep only the rows whose word is in `vocab`.
# `d` maps word -> the whole split row (word followed by its vector components);
# the matching raw lines are also written to a smaller .vec file.
vocab_lookup = set(vocab)  # O(1) membership instead of scanning the array per line
d = {}
matched_lines = []
with open("../fasttext/crawl-300d-2M-subword.vec", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i == 0:
            # Skip the first line of the source table (its header row).
            continue
        if i % 10000 == 0:
            print(i, end=",")
        word = line.split(" ")[0]
        if word in vocab_lookup:
            d[word] = line.strip().split(' ')
            matched_lines.append(line)
        if len(d) == len(vocab_lookup):
            # Every vocabulary word has been found; stop reading early.
            break

# Write the header after the scan so the row count is the number of vectors
# actually written, which can be smaller than len(vocab) when words are missing.
with open("../fasttext/hw1_vocab.vec", "w", encoding="utf-8") as f_out:
    f_out.write("{} {}\n".format(len(matched_lines), 300))
    f_out.writelines(matched_lines)

10000,20000,30000,40000,50000,60000,70000,80000,90000,100000,110000,120000,130000,140000,150000,160000,170000,180000,190000,200000,210000,220000,230000,240000,250000,260000,270000,280000,290000,300000,310000,320000,330000,340000,350000,360000,370000,380000,390000,400000,410000,420000,430000,440000,450000,460000,470000,480000,490000,500000,510000,520000,530000,540000,550000,560000,570000,580000,590000,600000,610000,620000,630000,640000,650000,660000,670000,680000,690000,700000,710000,720000,730000,740000,750000,760000,770000,780000,790000,800000,810000,820000,830000,840000,850000,860000,870000,880000,890000,900000,910000,920000,930000,940000,950000,960000,970000,980000,990000,1000000,1010000,1020000,1030000,1040000,1050000,1060000,1070000,1080000,1090000,1100000,1110000,1120000,1130000,1140000,1150000,1160000,1170000,1180000,1190000,1200000,1210000,1220000,1230000,1240000,1250000,1260000,1270000,1280000,1290000,1300000,1310000,1320000,1330000,1340000,1350000,1360000,1370000,1380000,1390

In [9]:
# Pickle the word -> row mapping; the context manager guarantees the file is
# flushed and closed (a bare open() inside the call leaves that to the GC).
with open("./data/vocab_ft.pkl", "wb") as pkl_file:
    pickle.dump(d, pkl_file)

In [None]:
# Build the embedding matrix aligned with the model's text vocabulary.
# NOTE(review): `data_utils`, `torch`, `BATCH_SIZE`, `data_path` and the *_file
# names are not imported or defined in the visible notebook cells, and this cell
# has no execution count — confirm where they come from.
train_val_data, holdout_test_data, test_data = data_utils.prep_all_data(
    data_path, train_val_file, holdout_test_file, test_file, 
    batch_size=BATCH_SIZE)
vocab = train_val_data.text_field.vocab.itos
# Context manager so the pickle file handle is closed promptly.
with open("./data/vocab_ft.pkl", "rb") as pkl_file:
    ft_emb = pickle.load(pkl_file)
emb_matrix = []
for word in vocab:
    if word in ft_emb:
        # Stored rows start with the word itself; drop it and parse the numbers.
        vec = ft_emb[word][1:]
        emb_matrix.append([float(i) for i in vec])
    else:
        # Words without a stored vector get a random standard-normal vector.
        # NOTE(review): no seed is set in this cell, so these rows vary between runs.
        vec = [np.random.normal() for i in range(300)]
        emb_matrix.append(vec)
emb_matrix = torch.tensor(emb_matrix)
torch.save(emb_matrix, "./data/emb_matrix_ft.pt")

In [13]:
# same for label vocab: read the filtered label-word vectors back into a dict
# mapping word -> list of floats.
label_vocab = np.load("./data/label_vocab.npy")
d = {}
with open("../fasttext/hw1_label_vocab.vec", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i == 0:
            # First line is the "<count> <dim>" header, not a vector.
            continue
        # Split once; strip the newline so the last component is a clean number.
        word, *vec = line.rstrip("\n").split(" ")
        d[word] = [float(i) for i in vec]

In [15]:
# Number of label words expected.
len(label_vocab)

48

In [16]:
# Number of label words for which a vector was loaded; compare with len(label_vocab).
len(d)

48

In [14]:
# Pickle the label-word -> vector mapping; the context manager guarantees the
# file is flushed and closed.
with open("./data/label_vocab_ft.pkl", "wb") as pkl_file:
    pickle.dump(d, pkl_file)

### Add hand-crafted features

In [3]:
# Load the hw1 training CSV (rebinds `df`).
df = pd.read_csv("./data/original_data/hw1_train.csv")

In [5]:
# Load the saved label array (rebinds `labels`).
labels = np.load("./data/labels.npy")

In [15]:
# Collect, in first-seen order, the distinct dot-separated parts of every label,
# leaving out a fixed set of tokens.
excluded_parts = ["NO_REL", "other", "gr", "rt"]
label_words = []
for label in labels:
    for part in label.split("."):
        if part in label_words or part in excluded_parts:
            continue
        label_words.append(part)

In [16]:
# max cosine similarity between word and potential label

### create a feature to link label meaning to text

In [4]:
# Load the training split written earlier.
train_df = pd.read_csv("./data/train_real.csv")

In [3]:
# Load the saved label array.
labels = np.load("./data/labels.npy")

In [55]:
# Break every label into its parts on "." or "_" and collect the sorted unique parts.
# Raw string: "\." in a normal string literal is an invalid escape sequence.
label_words = [re.split(r"\.|_", i) for i in labels]
words = []
for i in label_words:
    words.extend(i)
words = sorted(set(words))

# Remove these tokens from the word list (list.remove raises ValueError if one is absent).
for fragment in ("REL", "NO", "gr", "rt"):
    words.remove(fragment)
# Swap "nom" for "nomination"; the new word is appended, so it sits at the end
# of the list rather than in sorted position.
words.remove("nom")
words.append("nomination")

(None, None)

In [67]:
# Save the label words; np.save appends ".npy" because the path has no such extension.
np.save("./data/label_vocab", np.array(words))

In [66]:
# Keep only the label words that also occur in the text vocabulary and save
# them (np.save adds the ".npy" suffix to the given path).
vocab = np.load("./data/vocab.npy")
words_in_vocab = [word for word in words if word in vocab]
np.save("./data/label_vocab_restrict", np.array(words_in_vocab))

In [60]:
# Scan the full fastText table and keep only the rows whose word is one of the
# label words. `d` maps word -> the whole split row (word followed by its vector
# components); the matching raw lines are also written to a smaller .vec file.
words_lookup = set(words)
d = {}
matched_lines = []
with open("../fasttext/crawl-300d-2M-subword.vec", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i == 0:
            # Skip the first line of the source table (its header row).
            continue
        if i % 10000 == 0:
            print(i, end=",")
        word = line.split(" ")[0]
        if word in words_lookup:
            d[word] = line.strip().split(' ')
            matched_lines.append(line)
        if len(d) == len(words_lookup):
            # Every label word has been found; stop reading early.
            break

# Write the header after the scan so the row count is the number of vectors
# actually written, even if some label words are missing from the table.
with open("../fasttext/hw1_label_vocab.vec", "w", encoding="utf-8") as f_out:
    f_out.write("{} {}\n".format(len(matched_lines), 300))
    f_out.writelines(matched_lines)

10000,

In [65]:
# Reload the restricted label vocabulary. It is saved as "./data/label_vocab_restrict"
# (np.save adds ".npy"), so the file on disk is "label_vocab_restrict.npy".
np.load("./data/label_vocab_restrict.npy")

array(['actor', 'amount', 'award', 'budget', 'by', 'category',
       'character', 'companies', 'country', 'date', 'description',
       'directed', 'director', 'genre', 'gross', 'language', 'locations',
       'media', 'movie', 'music', 'of', 'other', 'person', 'picture',
       'produced', 'production', 'rating', 'release', 'revenue', 'review',
       'showing', 'star', 'starring', 'synopsis', 'trailer', 'winning',
       'work', 'written'], dtype='<U11')

### see what labels co-occur

In [8]:
# Reload the training split for the label co-occurrence checks below.
train_df = pd.read_csv("./data/train_real.csv")

In [11]:
# Count the space-separated labels in each row's raw_label string.
train_df["num_label"] = train_df.raw_label.apply(lambda x: len(x.split(" ")))

In [18]:
# Distribution of label counts among rows whose raw_label contains the substring "other".
train_df[train_df.raw_label.apply(lambda x: "other" in x)].num_label.value_counts()

1    224
Name: num_label, dtype: int64

In [19]:
# Distribution of label counts among rows whose raw_label contains the substring "NO_REL".
train_df[train_df.raw_label.apply(lambda x: "NO_REL" in x)].num_label.value_counts()

1    263
Name: num_label, dtype: int64

In [20]:
# Load the saved label array.
labels = np.load("./data/labels.npy")

In [21]:
# Position(s) of the "NO_REL" label in the label array.
np.where(labels=="NO_REL")

(array([0]),)

In [22]:
# Position(s) of the "other" label in the label array.
np.where(labels=="other")

(array([37]),)

In [28]:
# Scratch check of np.delete: drops the entries at positions 0 and 1.
np.delete(np.arange(5), [0, 1])

array([2, 3, 4])

### N GRAM

In [3]:
# Unigrams are the same as the saved vocabulary; show its size.
# NOTE(review): the recorded size here differs from the "train vocab" count printed
# earlier — presumably vocab.npy came from a different dataset at that point; confirm.
vocab = np.load("./data/vocab.npy"); len(vocab)

1977

In [36]:
# bigram (and higher n-gram) corpus: stack the train and test frames, both
# indexed by their "ID" column, and print each shape as a sanity check.
df1 = pd.read_csv("./data/train.csv", index_col="ID")
df2 = pd.read_csv("./data/test.csv", index_col="ID")
df = pd.concat([df1, df2])
print(df1.shape)
print(df2.shape)
print(df.shape)

(3338, 4)
(1084, 4)
(4422, 4)


In [43]:
def get_ngram(sent, gram):
    """Return the word n-grams of a sentence, in order.

    The sentence is lower-cased and split on apostrophes and single spaces;
    each run of `gram` consecutive tokens is joined with "_".

    Args:
        sent: The sentence text.
        gram: Number of tokens per n-gram.

    Returns:
        A list of "_"-joined n-gram strings; empty when the sentence has
        fewer than `gram` tokens.
    """
    tokens = re.split("'| ", sent.lower())
    window_count = len(tokens) - gram + 1
    if window_count <= 0:
        return []
    return ["_".join(tokens[start:start + gram]) for start in range(window_count)]

In [44]:
def save_ngram(df, n=2):
    """Collect, save and return the sorted unique n-grams of `df.raw_text`.

    Args:
        df: Frame with a `raw_text` column of sentence strings.
        n: Number of tokens per n-gram.

    Returns:
        NumPy array of the sorted unique n-grams; the same array is written
        to "./data/{n}grams.npy" and its size is printed.
    """
    unique_ngrams = set()
    for sentence in df.raw_text:
        unique_ngrams.update(get_ngram(sentence, n))
    all_ngrams = np.array(sorted(unique_ngrams))
    np.save("./data/{}grams.npy".format(n), all_ngrams)
    print("number of {} grams: {}".format(n, len(all_ngrams)))
    return all_ngrams

In [45]:
# Extract and save the 1- to 4-grams of the combined frame.
for n in range(1, 5):
    save_ngram(df, n)

number of 1 grams: 1977
number of 2 grams: 6950
number of 3 grams: 10113
number of 4 grams: 10512
