In [3]:
import pandas as pd
import numpy as np
import spacy
import itertools
import pickle
import re

### take a look at data

In [22]:
df_train = pd.read_csv("./data/hw2_train.csv")

In [23]:
df_train.shape

(2312, 2)

In [24]:
df_train.sample(5)

Unnamed: 0,utterances,IOB Slot tags
2263,list movie ratings for us movie looper,O O O O B_country O B_movie
309,origin of spanglish,O O B_movie
1144,ed harris films,B_person I_person O
1626,show me scifi fantasy movies,O O B_genre I_genre O
2058,show my spielberg info,O O B_person O


In [7]:
df_val = pd.read_csv("hw2_utterance_dev.txt", header=None)
df_val.head(5)

Unnamed: 0,0
0,show me ones by david fincher
1,who is director of the words
2,what rating did the campaign movie get
3,how much did looper gross
4,what is the budget for epic


In [8]:
dev_tags = pd.read_csv("hw2_tags_dev.txt", header=None)
dev_tags.head(5)

Unnamed: 0,0
0,O O O O B_director I_director
1,O O O O B_movie I_movie
2,O O O B_movie I_movie O O
3,O O O B_movie O
4,O O O O O B_movie


In [9]:
test = pd.read_csv("hw2_utterance_test.txt", header=None)
test.head(5)

Unnamed: 0,0
0,find out what language the father of my childr...
1,search for zombie movies
2,summary of star wars four
3,spain has how many movies
4,who stars in house at the end of the street


### split 10% from the train and not use it

In [4]:
df_train = pd.read_csv("./data/hw2_train.csv")
np.random.seed(0)
holdout_size = int(df_train.shape[0]/10)
holdout_idx = np.random.choice(df_train.index, size=holdout_size, replace=False)
train_real_idx = [i for i in df_train.index if i not in holdout_idx]

In [5]:
df_train.loc[holdout_idx].to_csv("./data/holdout_uncorrected.csv", index=None)
df_train.loc[train_real_idx].to_csv("./data/train_real_uncorrected.csv", index=None)

### look at training data label distribution

In [179]:
# correct I-movie to I_movie
df_train = pd.read_csv("./data/hw2_train_corrected.csv")
df_train[df_train["IOB Slot tags"].str.contains("-")]

Unnamed: 0,utterances,IOB Slot tags
702,i want to know in what language was ju on filmed,O O O O O O O O B_movie I-movie O


In [180]:
df_train.loc[702, "IOB Slot tags"] = df_train.loc[702, "IOB Slot tags"].replace("-", "_")
df_train.to_csv("./data/hw2_train_corrected.csv", index=None)

In [181]:
all_y_true = []
for each_y in df_train["IOB Slot tags"]:
    all_y_true.extend(each_y.split(" "))
frequent_label = pd.DataFrame(all_y_true)[0].value_counts(); frequent_label

O                 10519
I_movie            1138
B_movie            1018
B_person            195
B_director          185
I_person            176
I_director          168
B_producer          164
B_country           153
B_mpaa_rating       141
B_language          119
I_producer          114
B_cast              106
I_cast              105
B_subject            95
B_genre              71
I_subject            33
I_language           17
B_char               15
I_mpaa_rating        12
I_country            12
I_genre               5
I_char                5
B_release_year        5
I_release_year        3
B_location            2
Name: 0, dtype: int64

In [182]:
len(list(frequent_label.index))

26

### Experiment with evaluation.py vs seqeval.metric

In [184]:
import seqeval

In [185]:
y_true = [["O", "O", "O"], ["O", "B"]]
y_pred = [["O", "O", "O"], ["O", "B"]]

In [186]:
evaluation.f1_score(y_true, y_pred)

1.0

In [187]:
seqeval.metrics.f1_score(y_true, y_pred)

1.0

### Turn labels into index

In [87]:
df = pd.read_csv("./data/hw2_train.csv")
df_val = pd.read_csv("./hw2_tags_dev.txt", header=None)

In [93]:
labels = sorted(list(set(" ".join(df["IOB Slot tags"]).split(" "))))
labels_val = sorted(list(set(" ".join(df_val[0]).split(" "))))

In [94]:
[i for i in labels_val if i not in labels]

['B_gross_rev', 'B_org', 'I_gross_rev', 'I_location']

In [95]:
[i for i in labels if i not in labels_val]

['I-movie', 'I_char', 'I_language']

In [35]:
# the validation label that is not found in train are saved to <UNK>

### Find out cases where number of text and number of tags differ 

In [87]:
df_train = pd.read_csv("./data/hw2_train.csv")

In [88]:
count = lambda x:len(x.split(" "))
def replacer(text):
    text = text.replace("I 'd", "I'd").replace("i 'd", "i'd")
    text = text.replace("i 'm", "i'm")
    text = text.replace("charlie 's angels", "charlie's angels")
    return text

df_train["utterances"] = df_train.utterances.apply(replacer)
df_train.loc[1881, "IOB Slot tags"] = "O O O O O B_person I_person"
df_train.loc[1059, "IOB Slot tags"] = "O O O O O B_movie I_movie I_movie"
df_train.loc[614, "IOB Slot tags"] = "B_movie O O O"
df_train.loc[21, "IOB Slot tags"] = "O O O O O B_movie I_movie I_movie"
df_train.loc[25, "IOB Slot tags"] = "O O B_person I_person O O O B_movie I_movie I_movie"
df_train.loc[62, "IOB Slot tags"] = "O O O O O O O O B_movie I_movie"
df_train.loc[612, "IOB Slot tags"] = "O O O O O O O B_movie I_movie I_movie I_movie"

In [91]:
df_train.to_csv("./data/hw2_train_corrected.csv", index=None)

In [92]:
df_val = pd.read_csv("./hw2_utterance_dev.txt", names=["utterances"]).join(
        pd.read_csv("./hw2_tags_dev.txt", names=["IOB Slot tags"]))

In [80]:
df_val[df_val.utterances.str.contains("lead")]

Unnamed: 0,utterances,IOB Slot tags
31,who played the female lead in hitch,O O O O O O B_movie


### look for errors in train labeling and correct them


In [116]:
df_train = pd.read_csv("./data/hw2_train.csv")
df_train[df_train["IOB Slot tags"].str.contains("O I")]

Unnamed: 0,utterances,IOB Slot tags
428,when was the first hunger games shown,O O B_movie O I_movie I_movie O
470,who directed the movie,O O I_movie O
612,what is the director 's name of lord of the rings,O O I_movie O O I_movie I_movie I_movie I_movi...
672,language of sound of music,O I_movie I_movie I_movie I_movie
1703,run a search for star wars four,O I_movie O O B_movie I_movie I_movie
2092,can you show me the name of the producer for c...,O O O O O O O O O O I_movie I_movie I_movie I_...


In [111]:
df_train.loc[428, "IOB Slot tags"] = "O O O O B_movie I_movie O"
df_train.loc[470, "IOB Slot tags"] = "O O O O"
df_train.loc[1703, "IOB Slot tags"] = "O O O O B_movie I_movie I_movie"
df_train.loc[2092, "IOB Slot tags"] = "O O O O O O O O O O B_movie I_movie I_movie I_movie"
df_train.loc[672, "IOB Slot tags"] = "O O B_movie I_movie I_movie"

In [114]:
df_train.to_csv("./data/hw2_train_corrected.csv", index=None)

In [118]:
df_val[df_val["IOB Slot tags"].str.contains("O I")]

Unnamed: 0,utterances,IOB Slot tags
86,how many scorsese films were filmed in france,O O I_producer O O O O B_location
282,who played mother in tarkovsky 's mirror,O O B_char O B_director O I_movie
