# Process _Stories_ dataset

Prepare dataset from COLING 2018 paper:
> Jahan, Labiba, Geeticka Chauhan, and Mark Finlayson. "A new approach to animacy detection." In _Proceedings of the 27th International Conference on Computational Linguistics_, pp. 1-12. 2018.

Download data from https://dspace.mit.edu/handle/1721.1/116172, unzip it, and store it in `../resources/`.

In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import glob
import pandas as pd
import xml.etree.ElementTree as ET

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

from tools import animacy_evaluation,processing

import spacy
import operator
nlp = spacy.load("en_core_web_lg")

### Parse corpus into sentence-level dataframe

In [2]:
def parse_animacy_xml(animacy_file):
    tree = ET.parse(animacy_file)
    story = tree.getroot()
    fulltext = ""
    storyNumber = animacy_file.split("/")[-1].split(".sty")[0]
    
    dTokens = dict()
    dRefExp = dict()
    dCorExp = dict()
    
    dAnimacyExpressions = dict()
    processed_annotations = []
    
    # ------------------------------------------------------
    # Capture relevant data from xml: token, refering expression, and coreferent
    # expression annotated with animacy.
    for section in story:
        if section.attrib['id'] == 'edu.mit.parsing.token':
            for child in section.findall('./desc'):
                dTokens[child.attrib['id']] = child.text.strip()
                
        if section.attrib['id'] == 'edu.mit.discourse.rep.refexp':
            for child in section.findall('./desc'):
                reftokens = []
                offset = int(child.attrib['off'])
                grouped = child.text.split(",")
                for gr in grouped:
                    reftokens.append(gr.split("~"))
                dRefExp[child.attrib['id']] = (child.text, reftokens)
                        
        if section.attrib['id'] == 'edu.mit.discourse.rep.coref':
            for child in section.findall('./desc'):
                offset = int(child.attrib['off'])
                text2mask = child.text.split("|")[0]
                coref_ani = 0
                if 'ani' in child.attrib:
                    if child.attrib['ani'] == '1':
                        coref_ani = 1
                
                dCorExp[child.attrib['id']] = (child.text, coref_ani)
    
    # ------------------------------------------------------
    # Capture referring expressions whose coreferent is annotated with animacy
    for corefExp in dCorExp:
        coref_ani = dCorExp[corefExp][1]
        coreftext, refids = dCorExp[corefExp][0].split("|")
        refids = refids.split(",")
        for refid in refids:
            for sq in dRefExp[refid][1]:
                dAnimacyExpressions[tuple(sq)] = ([dTokens[tk] for tk in sq], coref_ani)
    
    # ------------------------------------------------------
    # Add sentence index to each token
    fulltext = [dTokens[t] for t in dTokens]
    
    tokens_in_sentence = dict()
    sIndex = 0
    sentences = sent_tokenize(" ".join(fulltext))
    sentences_indices = []
    dSentences = dict()
    
    for s in sentences:
        sIndex += 1
        sentences_indices.append((s, sIndex))
        dSentences[sIndex] = s
    
    current_sentence = sentences_indices[0][0].split(" ")
    for t in dTokens:
        if dTokens[t] == current_sentence[0]:
            tokens_in_sentence[t] = (dTokens[t], sentences_indices[0][1])
            del current_sentence[0]
            if len(current_sentence) == 0:
                del sentences_indices[0]
                if sentences_indices:
                    current_sentence = sentences_indices[0][0].split(" ")
        else:
            tokens_in_sentence[t] = (dTokens[t], None)
    
    dSentenceIndices = dict()
    for tk in tokens_in_sentence:
        word_index = tk
        sent_index = tokens_in_sentence[tk][1]
        if sent_index in dSentenceIndices:
            dSentenceIndices[sent_index].append(word_index)
        else:
            dSentenceIndices[sent_index] = [word_index]
        
    # ------------------------------------------------------
    # Recover sentence where animated expression occurs, and its context
    for expression in dAnimacyExpressions:
        sentences_involved = list(set([tokens_in_sentence[e][1] for e in expression]))
        sentence = 0
        if len(sentences_involved) > 1: # Discard multisentence expressions, often annotation errors it seems
            continue
        else:
            sentence = sentences_involved[0]
        
        token_expression = dAnimacyExpressions[expression][0]
        animacy = dAnimacyExpressions[expression][1]
        
        words_sentence = dSentences[sentence].split()
        index_sentence = dSentenceIndices[sentence]
        
        # ------------------------------------------------------
        # Find each item of the target expression in the current sentence, and mask it
        masked_sentence = []
        if len(words_sentence) == len(index_sentence):
            zipped_sentence = list(zip(dSentenceIndices[sentence], dSentences[sentence].split()))
            indexExp = 0
            for z in zipped_sentence:
                if indexExp < len(expression):
                    if z[0] == expression[indexExp] and z[1] == token_expression[indexExp]:
                        masked_sentence.append("[MASK]")
                        indexExp += 1
                    else:
                        masked_sentence.append(z[1])
                else:
                    masked_sentence.append(z[1])
        
        # ------------------------------------------------------
        # Relevant outputs:
        prev_sentence = dSentences.get(sentence - 1, "")
        current_sentence = dSentences[sentence]
        target_expression = " ".join(dAnimacyExpressions[expression][0]).strip()
        masked_sentence = " ".join(masked_sentence)
        next_sentence = dSentences.get(sentence + 1, "")
        # Replace a multi-token masked expression to just one mask, instead of
        # having consecutive masked elements. E.g. Instead of "[MASK] [MASK] [MASK]
        # drank" for "A little man drank", replace to "[MASK] drank":
        regex_mask = r"( ?\[MASK\] ?\-?)+" 
        masked_sentence = re.sub(regex_mask, " [MASK] ", masked_sentence).strip()
        
        # ------------------------------------------------------
        # Append relevant outputs to list to return:
        if masked_sentence.count("[MASK]") == 1: # This line is to filter out some inconsistencies in the data
            processed_annotations.append((storyNumber, prev_sentence, current_sentence, masked_sentence, next_sentence, target_expression, animacy))
    
    return processed_annotations

In [3]:
# ------------------------------------------------------
# Process Jahan animacy corpus, turn it to Stories corpus:
storiesdf = pd.DataFrame()
for i in glob.glob("../resources/jahan_animacy_v1.0.0/jahan_animacy_corpus_v1.0.0/data/*"):
    processed_annotations = parse_animacy_xml(i)
    localdf = pd.DataFrame(processed_annotations, columns=["storyNumber", "prevSentence", "currentSentence", "maskedSentence", "nextSentence", "targetExpression", "animated"])
    storiesdf = pd.concat([storiesdf, localdf], ignore_index=True)
    
storiesdf[['maskedSentence', 'targetExpression', 'targetIsPRP']] = storiesdf.apply(lambda row: pd.Series(processing.process_expression(row['targetExpression'], row['maskedSentence'], nlp)), axis=1)
storiesdf[['context3wmasked', 'context3w']] = storiesdf.apply(lambda row: pd.Series(processing.ngram_context(row['maskedSentence'], row['targetExpression'], 3)), axis=1)

# ------------------------------------------------------
# Store stories dataframe:
storiesdf.to_pickle("../data/stories/all.pkl")

# ------------------------------------------------------
# Split stories into training (0.7) and test set (0.3), and store:
trainsplit = 0.7
train_set = storiesdf.sample(frac=trainsplit, random_state=0)
test_set = storiesdf.drop(train_set.index)
train_set.to_pickle("../data/stories/all_train.pkl")
test_set.to_pickle("../data/stories/all_test.pkl")

In [4]:
storiesdf.head()

Unnamed: 0,storyNumber,prevSentence,currentSentence,maskedSentence,nextSentence,targetExpression,animated,targetIsPRP,context3wmasked,context3w
0,story32,,Once upon a time there lived a king and queen .,Once upon a time there lived a [MASK] and queen .,"They had three sons , two of them with their w...",king,1,False,there lived a [MASK] and queen .,there lived a king and queen .
1,story32,"They had three sons , two of them with their w...",Now the King had a deer-park in which were qua...,Now the [MASK] had a deer-park in which were q...,Into that park there used to come a huge beast...,King,1,False,Now the [MASK] had a deer-park,Now the King had a deer-park
2,story32,Into that park there used to come a huge beast...,"The King did all he could , but he was unable ...","The [MASK] did all he could , but he was unabl...",So at last he called his sons together and sai...,King,1,False,The [MASK] did all he,The King did all he
3,story32,Into that park there used to come a huge beast...,"The King did all he could , but he was unable ...","The King did all [MASK] could , but he was una...",So at last he called his sons together and sai...,he,0,True,"King did all [MASK] could , but","King did all he could , but"
4,story32,"The King did all he could , but he was unable ...",So at last he called his sons together and sai...,So at last he called [MASK] sons together and ...,"Well , the eldest son undertook the task .",his,0,True,last he called [MASK] sons together and,last he called his sons together and


In [8]:
storiesdf.tail(50)

Unnamed: 0,storyNumber,prevSentence,currentSentence,maskedSentence,nextSentence,targetExpression,animated,targetIsPRP,context3wmasked,context3w
18658,story17,"Gather my bones , tie them in a handkerchief ,...",Water my bones every morning . '',Water my bones every [MASK] . '',Havroshechka did everything as the cow instruc...,morning,0,False,my bones every [MASK] . '',my bones every morning . ''
18659,story17,Havroshechka did everything as the cow instruc...,"She endured great hunger , but still did n't e...","She endured [MASK] , but still did n't eat any...",She watered the bones in the garden every day ...,great hunger,0,False,"She endured [MASK] , but still","She endured great hunger , but still"
18660,story17,"She endured great hunger , but still did n't e...",She watered the bones in the garden every day ...,She watered the bones in the garden every day ...,"Goodness , what a tree it was !",apple tree,1,False,from which an [MASK] grew .,from which an apple tree grew .
18661,story17,She watered the bones in the garden every day ...,"Goodness , what a tree it was !","Goodness , [MASK] !",The apples were plump .,what a tree it was,1,False,"Goodness , [MASK] !","Goodness , what a tree it was !"
18662,story17,She watered the bones in the garden every day ...,"Goodness , what a tree it was !","Goodness , what a tree [MASK] was !",The apples were plump .,it,1,True,what a tree [MASK] was !,what a tree it was !
18663,story17,The bowing branches were silver .,Whoever rode by stopped to look at the tree .,Whoever rode by stopped to look at the [MASK] .,Whoever walked by would stare in wonder .,tree,1,False,look at the [MASK] .,look at the tree .
18664,story17,"he said , `` whoever can bring me an apple wil...",And so the three sisters pushed themselves in ...,And so the three sisters pushed themselves in ...,While before the apples hung lower than arm 's...,apple tree,1,False,get to the [MASK] .,get to the apple tree .
18665,story17,While before the apples hung lower than arm 's...,The sisters wanted to knock them off the tree ...,The sisters wanted to knock them off the [MASK...,"They wanted to pick the apples , but the branc...",tree,1,False,"them off the [MASK] , but the","them off the tree , but the"
18666,story17,No matter how they struggled and thrashed abou...,Havroshechka came to the tree .,Havroshechka came to the [MASK] .,The branches bowed down and apples drooped int...,tree,1,False,came to the [MASK] .,came to the tree .
18667,story17,"She endured great hunger , but still did n't e...",She watered the bones in the garden every day ...,She watered the bones in the garden every day ...,"Goodness , what a tree it was !",apple,0,False,from which an [MASK] tree grew .,from which an apple tree grew .


In [5]:
storiesdf_uni = processing.drop_target_duplicates(storiesdf)
storiesdf_uni.to_pickle("../data/stories/unique.pkl")

# ------------------------------------------------------
# Split stories into training (0.7) and test set (0.3), and store:
trainsplit = 0.7
train_set_uni = storiesdf_uni.sample(frac=trainsplit, random_state=0)
test_set_uni = storiesdf_uni.drop(train_set_uni.index)
train_set_uni.to_pickle("../data/stories/unique_train.pkl")
test_set_uni.to_pickle("../data/stories/unique_test.pkl")

In [27]:
# ------------------------------------------------------
# Annotation counts:

print("Number of rows")
print("==============")
print("All rows:")
print("* All:", storiesdf.animated.count())
print("* Train:", train_set.animated.count())
print("* Test:", test_set.animated.count())
print("Rows without duplicates:")
print("* All:", storiesdf_uni.animated.count())
print("* Train:", train_set_uni.animated.count())
print("* Test:", test_set_uni.animated.count())

Number of rows
All rows:
* All: 18708
* Train: 13096
* Test: 5612
Rows without duplicates:
* All: 11891
* Train: 8324
* Test: 3567


In [29]:
# ------------------------------------------
# Class 1 proportion per dataset:

all_vc = storiesdf.animated.value_counts().to_dict()
all_train_vc = train_set.animated.value_counts().to_dict()
all_test_vc = test_set.animated.value_counts().to_dict()
uni_vc = storiesdf_uni.animated.value_counts().to_dict()
uni_train_vc = train_set_uni.animated.value_counts().to_dict()
uni_test_vc = test_set_uni.animated.value_counts().to_dict()

print("Proportion of class 1 labels")
print("============================")
print("All rows:")
print("* All:", round(all_vc[1] / (all_vc[0] + all_vc[1]),2))
print("* Train:", round(all_train_vc[1] / (all_train_vc[0] + all_train_vc[1]),2))
print("* Test:", round(all_test_vc[1] / (all_test_vc[0] + all_test_vc[1]),2))
print("Rows without duplicates:")
print("* All:", round(uni_vc[1] / (uni_vc[0] + uni_vc[1]),2))
print("* Train:", round(uni_train_vc[1] / (uni_train_vc[0] + uni_train_vc[1]),2))
print("* Test:", round(uni_test_vc[1] / (uni_test_vc[0] + uni_test_vc[1]),2))

Proportion of class 1 labels
All rows:
* All: 0.62
* Train: 0.63
* Test: 0.61
Rows without duplicates:
* All: 0.68
* Train: 0.69
* Test: 0.68


### Most frequent class baseline

In [24]:
dataset_df = pd.read_pickle("../data/stories/all_test.pkl")
classes_by_frequency = dataset_df.animated.value_counts(normalize=True).to_dict()
most_frequent_class = max(classes_by_frequency.items(), key=operator.itemgetter(1))[0]

y_pred = [int(most_frequent_class) for x in dataset_df['animated'].tolist()]
y_true = [int(x) for x in dataset_df['animated'].tolist()]

precision, recall, fscore, micro_fscore, map_ = animacy_evaluation.results(y_true,y_pred,0.5)
print("p:", round(precision,3), "r:", round(recall,3), "macro_f1:", round(fscore,3), "micro_f1:", round(micro_fscore,3), "map:", round(map_,3))

p: 0.307 r: 0.5 macro_f1: 0.38 micro_f1: 0.613 map: 0.61


In [25]:
dataset_df = pd.read_pickle("../data/stories/unique_test.pkl")
classes_by_frequency = dataset_df.animated.value_counts(normalize=True).to_dict()
most_frequent_class = max(classes_by_frequency.items(), key=operator.itemgetter(1))[0]

y_pred = [int(most_frequent_class) for x in dataset_df['animated'].tolist()]
y_true = [int(x) for x in dataset_df['animated'].tolist()]

precision, recall, fscore, micro_fscore, map_ = animacy_evaluation.results(y_true,y_pred,0.5)
print("p:", round(precision,3), "r:", round(recall,3), "macro_f1:", round(fscore,3), "micro_f1:", round(micro_fscore,3), "map:", round(map_,3))

p: 0.341 r: 0.5 macro_f1: 0.405 micro_f1: 0.682 map: 0.671
