In [28]:
import pandas as pd
import numpy as np
import spacy
import itertools
import pickle

### Hold out ~10% for the test set and ~10% for validation; train on the rest

In [15]:
# Load the prepared training table, indexed by ID.
# The many-hot "label" column is a digit-only string (e.g. "0100..."), so pin it
# to str: default dtype inference may parse it as a number and drop leading zeros.
df = pd.read_csv("./data/train.csv", index_col="ID", dtype={"label": str})
# Size of each held-out split: one tenth of the rows, rounded down.
split_size = len(df) // 10

In [16]:
# Fixed seed so the split is reproducible.
np.random.seed(0)
# Sample WITHOUT replacement: np.random.choice defaults to replace=True, which
# draws duplicate IDs and leaves the held-out sets smaller than split_size.
test_idx = np.random.choice(df.index, size=split_size, replace=False)
# Vectorised membership tests instead of a Python-level scan per element;
# the original row order of the remaining IDs is preserved.
rest = df.index[~df.index.isin(test_idx)].to_numpy()
val_idx = np.random.choice(rest, size=split_size, replace=False)
train_idx = rest[~np.isin(rest, val_idx)]
# Sanity check: the three splits together account for every row exactly once.
assert len(test_idx) + len(val_idx) + len(train_idx) == len(df)

In [17]:
# Write each split to its own CSV, selecting the rows by their ID labels.
for split_path, split_ids in (
    ("./data/train_real.csv", train_idx),
    ("./data/val.csv", val_idx),
    ("./data/holdout_test.csv", test_idx),
):
    df.loc[split_ids].to_csv(split_path)

### Vocabulary exploration

In [4]:
# Original train / test tables as distributed with the assignment.
df1 = pd.read_csv("./data/original_data/hw1_train.csv")
df2 = pd.read_csv("./data/original_data/hw1_test.csv")

# Distinct tokens per split; utterances are split on single spaces.
vocab1 = {token for utterance in df1["UTTERANCE"] for token in utterance.split(" ")}
vocab2 = {token for utterance in df2["UTTERANCE"] for token in utterance.split(" ")}
# Sorted union of both vocabularies.
vocab = sorted(vocab1 | vocab2)
# Tokens that occur in the test split but never in the train split.
test_only_tokens = vocab2 - vocab1
print("train vocab", len(vocab1))
print("test vocab", len(vocab2))
print("Combined vocab", len(vocab))
print("test vocab not in train", len(test_only_tokens))

train vocab 1785
test vocab 1061
Combined vocab 2043
test vocab not in train 258


In [5]:
# Persist the sorted combined vocabulary as a NumPy array for later cells.
np.save("./data/vocab.npy", np.array(vocab))

### Turn labels into a many-hot representation

In [40]:
# Re-bind `df` to the original training table (replaces any earlier `df`).
df = pd.read_csv("./data/original_data/hw1_train.csv")

In [41]:
# Every relation name that appears in any row: rows are joined with single
# spaces and split on single spaces again, then de-duplicated and sorted.
relation_tokens = " ".join(df["CORE RELATIONS"]).split(" ")
labels = sorted(set(relation_tokens))
# Persist the label inventory so other cells / scripts can reload it.
np.save("./data/labels.npy", labels)

In [42]:
# Two-way lookup between a relation name and its position in `labels`.
idx2label = dict(enumerate(labels))
label2idx = {name: position for position, name in idx2label.items()}
print("number of classes:", len(label2idx))

number of classes: 46


In [9]:
def lable_mapper(labels, mapping=None):
    """Translate a space-separated label string into comma-separated class indices.

    Args:
        labels: relation names separated by single spaces.
        mapping: optional dict from relation name to class index. When omitted,
            the notebook-level ``label2idx`` table is used.

    Returns:
        The indices joined by "," in the order the names appear.

    Raises:
        KeyError: if a name is not present in the mapping.
    """
    if mapping is None:
        mapping = label2idx
    return ",".join(str(mapping[name]) for name in labels.split(" "))


def label2manyhot(core_relations, mapping=None):
    """Encode a space-separated relation string as a many-hot string of 0/1 digits.

    Args:
        core_relations: relation names separated by single spaces.
        mapping: optional dict from relation name to class index. When omitted,
            the notebook-level ``label2idx`` table is used.

    Returns:
        A string with one character per class in the mapping: "1" at the index of
        every relation present, "0" elsewhere.

    Raises:
        KeyError: if a name is not present in the mapping.
    """
    if mapping is None:
        mapping = label2idx
    # One slot per class; label2idx is built by enumerating `labels`, so its
    # size equals the number of labels.
    manyhot = ["0"] * len(mapping)
    for each_label in core_relations.split(" "):
        manyhot[mapping[each_label]] = "1"
    return "".join(manyhot)

In [10]:
# Encode each row's relation string as a many-hot digit string.
df["label"] = df["CORE RELATIONS"].apply(label2manyhot)

### rename text and label columns and save to final version

In [12]:
# Expose the utterance under the generic column name "text".
df["text"] = df["UTTERANCE"]
# Index by ID, keep only the model-facing columns, and write the final table.
final_train = df.set_index("ID")[["text", "label"]]
final_train.to_csv("./data/train.csv")

In [13]:
# Load the original (unlabelled) test table.
df_test = pd.read_csv("./data/original_data/hw1_test.csv")

In [14]:
# Expose the test utterance under the generic column name "text".
df_test["text"] = df_test["UTTERANCE"]
# Export from df_test; selecting from the training frame `df` here would
# silently write training utterances into test.csv.
df_test[["ID", "text"]].set_index("ID").to_csv("./data/test.csv")

### fasttext vocab filtering

In [72]:
# Reload the vocabulary array written earlier with np.save.
vocab = np.load("./data/vocab.npy")

In [None]:
# Keep only the fastText vectors whose word occurs in the task vocabulary.
# A set gives O(1) membership per line; testing against the array/list directly
# would rescan the whole vocabulary for every line of the vector file.
vocab_words = {str(word) for word in vocab}

d = {}           # word -> the split line (the word itself, then its components as strings)
kept_lines = []  # raw matching lines, in file order, for the filtered .vec file

# `with` guarantees the handle is closed even if parsing fails midway.
with open("../fasttext/crawl-300d-2M-subword.vec", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i == 0:
            # First line is the header of the source file; show it and move on.
            print(line)
            continue
        word = line.split(" ")[0]
        if word in vocab_words:
            d[word] = line.strip().split(' ')
            kept_lines.append(line)

# Write the header only once the number of matches is known: not every
# vocabulary word necessarily has a vector, so len(vocab) would overstate it.
with open("../fasttext/hw1_vocab.vec", "w", encoding="utf-8") as f_out:
    f_out.write("{} {}\n".format(len(kept_lines), 300))
    f_out.writelines(kept_lines)

# One summary line instead of printing every matched word.
print("kept {} of {} vocabulary words".format(len(d), len(vocab_words)))

In [29]:
# Persist the filtered word -> vector-line mapping; the context manager makes
# sure the file is flushed and closed.
with open("./data/vocab_ft.pkl", "wb") as pkl_out:
    pickle.dump(d, pkl_out)

In [31]:
# Reload the pickled mapping; the context manager closes the file handle.
# NOTE: pickle.load executes arbitrary code -- only use on files this notebook wrote.
with open("./data/vocab_ft.pkl", "rb") as pkl_in:
    x = pickle.load(pkl_in)

### Add hand-crafted features

In [3]:
# Start again from the original training table for feature engineering.
df = pd.read_csv("./data/original_data/hw1_train.csv")

In [5]:
# Reload the label inventory written earlier with np.save.
labels = np.load("./data/labels.npy")

In [15]:
# Collect the distinct dot-separated components of all label names, in
# first-seen order, leaving out components on the exclusion list.
excluded_parts = ["NO_REL", "other", "gr", "rt"]
label_words = list(
    dict.fromkeys(
        part
        for label in labels
        for part in label.split(".")
        if part not in excluded_parts
    )
)

In [16]:
# max cosine similarity between each word and the potential labels

In [17]:
# Display the training frame as the cell output.
df

Unnamed: 0,ID,UTTERANCE,CORE RELATIONS
0,0,who plays luke on star wars new hope,movie.starring.actor movie.starring.character
1,1,show credits for the godfather,movie.starring.actor
2,2,who was the main actor in the exorcist,movie.starring.actor
3,3,find the female actress from the movie she s t...,movie.starring.actor actor.gender
4,4,who played dory on finding nemo,movie.starring.actor movie.starring.character
5,5,who was the female lead in resident evil,movie.starring.actor actor.gender
6,6,who played guido in life is beautiful,movie.starring.actor movie.starring.character
7,7,who was the co-star in shoot to kill,movie.starring.actor
8,8,find the guy who plays charlie on charlie's an...,movie.starring.actor movie.starring.character
9,9,cast and crew of movie the campaign,movie.starring.actor
