In [168]:
import pandas as pd
import numpy as np
import spacy
import itertools
import pickle
import re

### Create dataset with movie and person names condensed into single tokens

In [169]:
# Read the raw train and test CSVs, then stack them so later name filtering
# can look at every utterance from both files.
df1, df2 = pd.read_csv("./data/train.csv"), pd.read_csv("./data/test.csv")
df = pd.concat([df1, df2])

In [170]:
# Movie titles read from the project's CSV, flattened into a plain Python list.
# `.values` is used instead of `DataFrame.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0; both return the same ndarray.
movie_names = list(
    pd.read_csv("./data/movie_name_condensed_data/movie_names.csv").values.flatten())

# Hand-added titles (several are deliberate misspellings, kept exactly as typed
# so that they can match text by substring later in the notebook).
movie_names.extend([
    "apollo thirteen",
    "the amazing spider man",
    "the hobbist", "rocky 2", "i love lucy", "appolo thirteen", "pretty girl",
    "modern family", "pretty women", "passion of the christ", "the hulk",
    "lion king", "les miserable", "scooby do", "amazing spiderman", "forest gump", "e t",
    "shindler's list", "life is beatiful", "downton abbey", "edward scissor hands",
    "mr and mrs smith", "marley and me", "harry potter", "dark knight", "a mom for christmas",
    "caddy shack", "star wars new hope", 'the house at the end of the street', "the king is back",
    "she s the man", "lord of the rings", "the land of blood and honey", "the santa claus",
    "the father of my children", "lord of the flys", "wizard of oz", "one life to live",
    "i am legend 2", "toy story 4",
])

In [171]:
# Hand-curated person names (misspellings are intentional: they mirror how the
# names are written in the list's source and are matched by substring later).
# Fix: a comma was missing after "robert wise", so Python's implicit string
# concatenation produced the single bogus entry "robert wisewoody allen" and
# "woody allen" was never in the list.
person_names = ["homer simpson", "kristen stewart", "steven spielberg", "richard lester", "robert wise",
                "woody allen", "charles vidor", "ray stark", "will smith", "kevin james", "tom hanks",
                "tom cruise", "victor fleming", "angelina jolie", "albert ruddy", "alfred hitchcock",
                "ed harris", "bette midler", "sandra bullock", "bruce lee", "charles vidor", "noah baumbach",
                "todd solondz", "brad pitt", "frances ford copolla", "clint eastwood", "karan johar", "will ferrell",
                "james brown", "larry clark", "jennifer aniston", "robert wise", "james cameron", "david selsnic",
                "arthur rudy", 'alfred hitchock', "patrick swayze", "dustin hoffman", "julia roberts", "ridley scott",
                "miranda july", "oliver stone", "yash chopra", 'penny marshall', "kevin spacey", 'hugh jackman',
                "quentin tarantino", "gwyneth paltrow", "spike lee", "robert redford", "george lucas", "jj abrams",
                "robert deniro", "chris columbus", "martin scorcese", "tony scott", "niel abramson", "roger rabbit",
                "the zucker brothers", "von sudow", "lee unkridge",
               ]

In [172]:
# Hand-curated production company names.
producer_names = [
    "searchlight films",
    "warner bros.",
    "7 arts",
    "castle rock entertainment",
]

In [173]:
# Names read from a header-less CSV and flattened into a plain list.
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
actoress_names = list(pd.read_csv(
    "./data/movie_name_condensed_data/actoress_names.csv", header=None).values.flatten())
# Keep only multi-word names; single words are dropped here.
actoress_names = [name for name in actoress_names if len(name.split(" ")) > 1]

In [174]:
# Pool every candidate name, then keep only those that occur (as a plain
# substring) somewhere in the concatenated train+test text.
all_names = movie_names + person_names + actoress_names + producer_names
all_text = " ".join(df.text)
filtered_names = []
for position, candidate in enumerate(all_names):
    # Lightweight progress marker every 10000 candidates.
    if not position % 10000:
        print(position, end=",")
    if candidate in all_text:
        filtered_names.append(candidate)
len(filtered_names)

0,10000,20000,30000,40000,50000,60000,70000,80000,90000,100000,110000,120000,130000,140000,150000,160000,170000,180000,190000,200000,210000,220000,230000,

665

In [175]:
def detect_names(text, names):
    """Return True if any entry of `names` occurs as a substring of `text`.

    Args:
        text: string to search in.
        names: iterable of strings to look for.

    Returns:
        bool: True on the first substring hit, False if there is none
        (including when `names` is empty).
    """
    return any(candidate in text for candidate in names)

# Spelling/variant -> canonical form, applied by plain `str.replace` in
# insertion order. (The original literal listed "apollo thirteen" twice; the
# duplicate key was redundant and has been removed.)
NAME_MAPPER = {"apollo thirteen": "apollo 13",
               "appolo thirteen": "apollo 13",
               "steven spielberg": "spielberg",
               "stephen spielberg": "spielberg",
               "spider man": "spiderman",
               "childs vidor": "charles vidor",
               " e t": " et",
               "alfred hitchock": "hitchcock",
               "alfred hitchcock": "hitchcock",
               "warner brothers": "warner bros.",
               "albert rudy": "albert ruddy",
               "will ferell": "will ferrell",
               "the god father": "the godfather"}

# Candidate phrases: names found in the data plus the multi-word canonical
# forms from NAME_MAPPER (" et" splits into two pieces, so it is excluded
# explicitly).
NAMES = filtered_names + [i for i in NAME_MAPPER.values()
                          if len(i.split(" "))>1 and i!=" et"]
# Phrases that must not be condensed: generic word pairs and fragments that
# substring matching would otherwise hit, plus every NAME_MAPPER key (those
# variants are rewritten by NAME_MAPPER instead).
remove = ["show me", "a movie", "the movies", "the company", "tell me the", "me the",
          "are the", "tell me", "e t", "i am", "the actors", "ng ho", "ed ma", "al lang", "d day",
          "in view", "the pass", "the house", "the life", "the box", "take me", "the star",
          "the passion", "the king", "the man", "the giant", "the end", "the ring", "love stories",
          "the land", "exhibit a", "de palma", "blind side", "santa claus", "the sixties", "ma ma",
          "in orange", "the sin", "parental guidance", "the sand", "the boy", "about love", "all in",
          "the kids", "the green", "the car", "the mother", "the father", "the voices", "the fly",
          "made in france", "bad guy", "to live", "beautiful people", "top five", "i do", "i come",
          "about time", "in bar", "tin man", 'new york', "my boy", "one life", "time please",
          "love life", "our time", "i am i", "i want you", "the last movie", "the first star",
          "you get me"] + list(NAME_MAPPER.keys())
# Two-word names whose text starts with "the" are dropped wholesale.
# NOTE(review): `startswith("the")` also matches two-word names beginning with
# "they", "then", "there", ... - confirm whether "the " (with a space) was meant.
the_words = [i for i in NAMES if len(i.split(" "))==2 and i.startswith("the")]
NAMES = [i for i in NAMES if i not in remove and i not in the_words]
# Names with more words come first so that longer phrases are condensed before
# the shorter ones they contain. The extra tie-breakers (longer string first,
# then alphabetical) make the order deterministic: the previous stable sort
# over a `set` left equal-word-count names in hash-dependent order, which can
# differ from one kernel run to the next.
NAMES = sorted(set(NAMES), key=lambda x: (-len(x.split(" ")), -len(x), x))



def map_names(text):
    """Rewrite known name variants in `text` to their canonical form.

    Every key of NAME_MAPPER is replaced by its value with `str.replace`,
    one key after another in the dict's iteration order, so a later
    replacement sees the result of the earlier ones.

    Args:
        text: the string to normalise.

    Returns:
        The string after all replacements.
    """
    for variant in NAME_MAPPER:
        text = text.replace(variant, NAME_MAPPER[variant])
    return text

def phrasify(text):
    """Condense every known name found in `text` into one "|"-joined token.

    For each entry of NAMES that occurs in `text` (substring match), spaces
    and apostrophes inside the name are replaced with "|" and all occurrences
    of the name in `text` are swapped for that joined form.

    Args:
        text: the string to process.

    Returns:
        The string with matched names condensed.
    """
    for name in NAMES:
        if name not in text:
            continue
        condensed = name.replace(" ", "|").replace("'", "|")
        text = text.replace(name, condensed)
    return text

In [176]:
# Re-read both CSVs, this time keyed by their "ID" column, and stack them.
df1, df2 = (pd.read_csv("./data/train.csv", index_col="ID"),
            pd.read_csv("./data/test.csv", index_col="ID"))
df = pd.concat([df1, df2])

In [177]:
# Normalise name variants, then condense names into single tokens, for both
# frames. "raw_text" is filled from the already-processed "text" column, so at
# this point the two columns hold the same values.
for frame in (df1, df2):
    frame["text"] = frame["text"].apply(map_names).apply(phrasify)
    frame["raw_text"] = frame["text"]

In [178]:
# Write the processed frames next to the other condensed-name artefacts.
for frame, out_path in (
    (df1, "./data/movie_name_condensed_data/train.csv"),
    (df2, "./data/movie_name_condensed_data/test.csv"),
):
    frame.to_csv(out_path)

In [180]:
# Split the condensed training set into hold-out test / validation / train.
# Each of the hold-out and validation parts gets one tenth of the rows
# (rounded down); the seed makes the draw repeatable.
df = pd.read_csv("./data/movie_name_condensed_data/train.csv",
                 index_col="ID")
split_size = len(df) // 10
np.random.seed(0)

# `Index.isin` keeps the original row order (so the random draws are the same
# as with the previous per-element `i not in array` scans) while avoiding an
# accidental O(n^2) membership test against NumPy arrays.
test_idx = np.random.choice(df.index, size=split_size, replace=False)
rest_index = df.index[~df.index.isin(test_idx)]
rest = rest_index.values
val_idx = np.random.choice(rest, size=split_size, replace=False)
train_idx = rest_index[~rest_index.isin(val_idx)].values

# train_real = rest minus validation; train_val = everything but the hold-out.
df.loc[train_idx].to_csv("./data/movie_name_condensed_data/train_real.csv")
df.loc[rest].to_csv("./data/movie_name_condensed_data/train_val.csv")
df.loc[val_idx].to_csv("./data/movie_name_condensed_data/val.csv")
df.loc[test_idx].to_csv("./data/movie_name_condensed_data/holdout_test.csv")

In [77]:
# vocabulary
import re

df1 = pd.read_csv("./data/movie_name_condensed_data/train.csv")
df2 = pd.read_csv("./data/movie_name_condensed_data/test.csv")


def splitter(x):
    """Lower-case `x` and split it on spaces and apostrophes."""
    return re.split(" |'", x.lower())


# Distinct tokens per file, then the sorted union of both.
vocab1 = {token for tokens in df1["text"].apply(splitter) for token in tokens}
vocab2 = {token for tokens in df2["text"].apply(splitter) for token in tokens}
vocab = sorted(vocab1 | vocab2)
print("train vocab", len(vocab1))
print("test vocab", len(vocab2))
print("Combined vocab", len(vocab))
print("test vocab not in train", len(vocab2 - vocab1))

train vocab 1664
test vocab 957
Combined vocab 1910
test vocab not in train 246


In [35]:
# Persist the combined vocabulary as a NumPy array so later cells can reload it.
np.save("./data/movie_name_condensed_data/vocab.npy", np.array(vocab))

In [38]:
# Scan the fastText text-format vector file once and keep only the rows whose
# word is in `vocab`:
#   * `d` maps word -> the stripped line split on spaces (entry 0 is the word),
#   * the matching raw lines are written to a smaller .vec file.
# Changes from the earlier version of this cell:
#   * both files are opened in `with` blocks, so they are closed on error too;
#   * membership is tested against a set instead of rescanning `vocab` for
#     every line of the ~2M-line input;
#   * the output header now states the number of vectors actually written
#     (previously it always said len(vocab), even when fewer words matched);
#   * the output is written as UTF-8, matching the input encoding.
vocab_lookup = set(vocab)
d = {}
matched_lines = []
with open("../fasttext/crawl-300d-2M-subword.vec", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i == 0:
            # The first line of the input is skipped: it is the file's header,
            # not a word vector.
            continue
        if i % 100000 == 0:
            print(i, end=",")
        word = line.split(" ", 1)[0]
        if word in vocab_lookup:
            d[word] = line.strip().split(' ')
            matched_lines.append(line)
        if len(d) == len(vocab_lookup):
            # Every vocabulary word has been found; no need to read further.
            break

with open("../fasttext/hw1_vocab_phrase.vec", "w", encoding="utf-8") as f_out:
    f_out.write("{} {}\n".format(len(matched_lines), 300))
    f_out.writelines(matched_lines)

100000,200000,300000,400000,500000,600000,700000,800000,900000,1000000,1100000,1200000,1300000,1400000,1500000,1600000,1700000,1800000,1900000,2000000,

In [42]:
# Persist the word -> fastText-row mapping. The file is opened in a `with`
# block so the handle is flushed and closed deterministically (the previous
# inline `open(...)` was never closed).
with open("./data/movie_name_condensed_data/vocab_ft.pkl", "wb") as pkl_file:
    pickle.dump(d, pkl_file)

In [47]:
import data_utils

# Build the dataset objects through the project helper. Note that the hold-out
# split file is passed in the `val_file` slot, and the second returned value is
# named `holdout_test_data` accordingly.
# NOTE(review): the returned types are defined in data_utils, not visible here.
train_val_data, holdout_test_data, test_data = data_utils.prep_all_data(
    path="./data/movie_name_condensed_data/",
    train_file="train_val.csv",
    val_file="holdout_test.csv",
    test_file="test.csv"
    )

In [95]:
import torch

# Build an embedding matrix whose rows follow the dataset's own vocabulary
# order (`text_field.vocab.itos`), using the saved fastText rows where
# available and random normal vectors otherwise.
vocab = train_val_data.text_field.vocab.itos
# The pickle is produced by an earlier cell of this notebook; `pickle.load`
# must never be pointed at untrusted files. Opened via `with` so the handle
# is closed (the previous inline `open(...)` leaked it).
with open("./data/movie_name_condensed_data/vocab_ft.pkl", "rb") as pkl_file:
    ft_emb = pickle.load(pkl_file)
emb_matrix = []
num_unknown = 0
for word in vocab:
    if word in ft_emb:
        # Entry 0 of the stored row is the word itself; the rest are the
        # vector components as strings.
        vec = ft_emb[word][1:]
        emb_matrix.append([float(component) for component in vec])
    else:
        num_unknown += 1
        # NOTE(review): these draws use the global NumPy RNG without seeding
        # it in this cell, so the fallback vectors depend on whatever random
        # state earlier cells left behind.
        vec = [np.random.normal() for _ in range(300)]
        emb_matrix.append(vec)
emb_matrix = torch.tensor(emb_matrix)
torch.save(emb_matrix, "./data/movie_name_condensed_data/emb_matrix_ft.pt")

In [96]:
# Number of vocabulary entries that had no saved fastText row and received a random vector.
num_unknown

445

### Save n-grams

In [80]:
# Reload the condensed train/test CSVs keyed by "ID", stack them, and show
# the shape of each of the three frames.
df1 = pd.read_csv("./data/movie_name_condensed_data/train.csv", index_col="ID")
df2 = pd.read_csv("./data/movie_name_condensed_data/test.csv", index_col="ID")
df = pd.concat([df1, df2])
for frame in (df1, df2, df):
    print(frame.shape)

(3338, 4)
(1084, 4)
(4422, 4)


In [81]:
# Reload the saved vocabulary array and display its size.
vocab = np.load("./data/movie_name_condensed_data/vocab.npy")
len(vocab)

1910

In [82]:
def get_ngram(sent, gram):
    """Return the word n-grams of a sentence as "_"-joined strings.

    The sentence is lower-cased and split on apostrophes and spaces; every
    run of `gram` consecutive tokens becomes one entry, in order.

    Args:
        sent: the sentence string.
        gram: the n-gram size.

    Returns:
        list of str; empty when the sentence has fewer than `gram` tokens.
    """
    tokens = re.split("'| ", sent.lower())
    if gram > len(tokens):
        return []
    window_count = len(tokens) - (gram - 1)
    return ["_".join(tokens[start:start + gram]) for start in range(window_count)]

def save_ngram(df, n=2):
    """Collect the distinct n-grams of `df.raw_text`, save and return them.

    The sorted unique n-grams are written to
    "./data/movie_name_condensed_data/{n}grams.npy" and their count is printed.

    Args:
        df: frame with a `raw_text` column of strings.
        n: n-gram size (default 2).

    Returns:
        numpy array of the sorted, de-duplicated n-gram strings.
    """
    unique_ngrams = set()
    for sentence in df.raw_text:
        unique_ngrams.update(get_ngram(sentence, n))
    all_ngrams = np.array(sorted(unique_ngrams))
    np.save("./data/movie_name_condensed_data/{}grams.npy".format(n), all_ngrams)
    print("number of {} grams: {}".format(n, len(all_ngrams)))
    return all_ngrams

In [83]:
# Save unigrams and bigrams of the combined train+test frame.
for n in (1, 2):
    save_ngram(df, n)

number of 1 grams: 1910
number of 2 grams: 6660


In [84]:
import pandas as pd
import numpy as np
import datetime
import importlib
import pickle

import data_utils
import model_utils
import train_utils
import evaluation
import submission
# Re-import the project modules so that on-disk edits are picked up without
# restarting the kernel.
for module in (data_utils, model_utils, train_utils, evaluation, submission):
    importlib.reload(module)

import torch
import torch.nn as nn

In [66]:
# Directory holding the name-condensed CSVs and derived artefacts used below.
PATH = "./data/movie_name_condensed_data/"

In [93]:
# Load the three datasets from PATH using prep_all_data's default file names
# (the defaults are defined in data_utils and are not visible here).
train_data, val_data, test_data = data_utils.prep_all_data(path=PATH)

In [105]:
# Pick up the latest on-disk versions of the model/data code before training.
importlib.reload(model_utils)
importlib.reload(data_utils)

# Train the n-gram model on the condensed data; `result` is later indexed with
# "trained_model" when the checkpoint is saved.
m = model_utils.BaseModelNGram(path=PATH)
result = train_utils.train(train_data, val_data, m,
                          lr=1e-2, print_freq=5, max_epoch=100)

Epoch: 0, LR: 0.01, Train Loss: 233.9552, Val Loss: 99.2596, Val f1 0.755
Epoch: 5, LR: 0.01, Train Loss: 3.4232, Val Loss: 69.5909, Val f1 0.828
Epoch: 10, LR: 0.01, Train Loss: 1.0634, Val Loss: 82.7617, Val f1 0.837
Epoch: 15, LR: 0.001, Train Loss: 0.4305, Val Loss: 87.2950, Val f1 0.844
Epoch: 20, LR: 0.001, Train Loss: 0.3631, Val Loss: 86.7176, Val f1 0.843
Epoch: 25, LR: 0.0001, Train Loss: 0.3160, Val Loss: 87.8480, Val f1 0.840
Epoch: 30, LR: 0.0001, Train Loss: 0.2400, Val Loss: 88.2805, Val f1 0.838
Epoch: 35, LR: 1e-05, Train Loss: 0.2772, Val Loss: 88.1830, Val f1 0.842


In [106]:
# Save the trained n-gram model object taken from the training result.
torch.save(result["trained_model"], "./data/model_checkpoints/ngram_phrase_MLP_Jan31.mdl")

In [107]:
# Pick up the latest on-disk versions of the model/data code before training.
importlib.reload(model_utils)
importlib.reload(data_utils)

# Train the GRU model with the same data and hyper-parameters as the n-gram
# run; `m` and `result` are rebound, replacing the previous model/result.
m = model_utils.GRU(path=PATH)
result = train_utils.train(train_data, val_data, m,
                          lr=1e-2, print_freq=5, max_epoch=100)

Epoch: 0, LR: 0.01, Train Loss: 318.1742, Val Loss: 313.7753, Val f1 0.404
Epoch: 5, LR: 0.01, Train Loss: 17.0051, Val Loss: 108.0352, Val f1 0.774
Epoch: 10, LR: 0.01, Train Loss: 2.8036, Val Loss: 113.0202, Val f1 0.791
Epoch: 15, LR: 0.01, Train Loss: 1.0752, Val Loss: 120.3200, Val f1 0.791
Epoch: 20, LR: 0.001, Train Loss: 0.7764, Val Loss: 117.6829, Val f1 0.790
Epoch: 25, LR: 0.001, Train Loss: 0.8133, Val Loss: 118.0313, Val f1 0.795


In [108]:
# Save the trained GRU model object taken from the training result.
torch.save(result["trained_model"], "./data/model_checkpoints/GRU_phrase_Jan31.mdl")

In [164]:
# Two dataset triples: one read from PATH (the name-condensed data) and one
# from prep_all_data's default path. This rebinds train_data/val_data/test_data.
# NOTE(review): the default path is defined in data_utils - presumably the
# un-condensed data; confirm.
train_data_phrase, val_data_phrase, test_data_phrase = data_utils.prep_all_data(
    path=PATH, train_file="train.csv", val_file="val.csv")
train_data, val_data, test_data = data_utils.prep_all_data(train_file="train.csv", val_file="val.csv")

# NOTE(review): none of these "*_all_train" checkpoints is written by a cell
# visible in this notebook (the cells above save names without "_all_train").
m_gram = torch.load("./data/model_checkpoints/ngram_MLP_Jan31_all_train.mdl")
m_gram_phrase = torch.load("./data/model_checkpoints/ngram_phrase_MLP_Jan31_all_train.mdl")
m_GRU = torch.load("./data/model_checkpoints/GRU_Jan31_all_train.mdl")
m_GRU_phrase = torch.load("./data/model_checkpoints/GRU_phrase_Jan31_all_train.mdl")

In [165]:
# The three lists are given in matching order: each "phrase" model is listed at
# the same position as the "phrase" datasets, each plain model at the same
# position as the plain datasets.
# NOTE(review): presumably Ensemble pairs the lists element-wise - confirm in submission.py.
ensemble = submission.Ensemble(models=[m_gram, m_gram_phrase, m_GRU_phrase, m_GRU], 
                               val_data=[val_data, val_data_phrase, val_data_phrase, val_data],
                               test_data=[test_data, test_data_phrase, test_data_phrase, test_data])
# NOTE(review): the checkpoints are named "*_all_train"; if they were trained
# on rows that are also in val.csv, the validation score reported here is not
# an unbiased estimate - confirm.
df_val = ensemble.get_ensemble_result(submission=False)
# Rebinds `df` (previously the combined train+test frame) to the ensemble's result.
df = ensemble.get_ensemble_result(submission=True)

ensemble f1 val score: 1.0


In [167]:
# Write the submission through the project helper; "all_data" is the argument
# it receives (its exact use is defined in submission.py, not visible here).
ensemble.save_submission_file("all_data")

In [166]:
# Display the result returned by get_ensemble_result(submission=True).
df

Unnamed: 0_level_0,CORE RELATIONS,text,raw_label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,movie.starring.actor,star of thor,dummy
1,movie.starring.actor,who is in the movie the campaign,dummy
2,movie.starring.actor,list the cast of the movie the campaign,dummy
3,movie.starring.actor,who was in twilight,dummy
4,movie.starring.actor,who is in vulguria,dummy
5,movie.starring.actor,actor from lost,dummy
6,movie.starring.actor movie.starring.character,who played in the movie rocky,dummy
7,movie.starring.actor,who played in the movie captain america,dummy
8,movie.starring.actor,cast and crew for in july,dummy
9,movie.starring.actor,who is in movie in july,dummy
