# Process _machines19thC_ dataset

In [1]:
import pandas as pd
import numpy as np
from tools import animacy_evaluation,processing
import operator
import spacy
from pathlib import Path
# Load the `en_core_web_lg` spaCy pipeline once at start-up; the resulting
# object is handed to `processing.processMachines19thC` further down.
nlp = spacy.load("en_core_web_lg")

In [2]:
# Annotation sheet holding the GT and MCA columns: read the TSV, discard the
# stray "Unnamed: 0" column and keep a single row per SentenceId.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
master_all = (
    pd.read_csv("/Users/mcollardanuy/Downloads/GT_MCA_annotations_DSH - Sheet1.tsv", sep="\t")
    .drop("Unnamed: 0", axis=1)
    .drop_duplicates(subset=["SentenceId"])
)

In [3]:
# Annotation sheet holding the DW columns: read the TSV, discard the stray
# "Unnamed: 0" column and keep a single row per SentenceId.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
master_all_dw = (
    pd.read_csv("/Users/mcollardanuy/Downloads/DW_annotations_DSH - Sheet1.tsv", sep="\t")
    .drop("Unnamed: 0", axis=1)
    .drop_duplicates(subset=["SentenceId"])
)

In [4]:
# One SentenceId -> label lookup per annotation column. Each dictionary is
# built by pairing the SentenceId column with the relevant label column.

# Columns coming from the GT/MCA sheet.
dAnimacyGT = dict(zip(master_all["SentenceId"], master_all["Animacy_GT"]))
dHumannessGT = dict(zip(master_all["SentenceId"], master_all["Humanness_GT"]))
dAnimacyMCA = dict(zip(master_all["SentenceId"], master_all["Animacy_MCA"]))
dHumannessMCA = dict(zip(master_all["SentenceId"], master_all["Humanness_MCA"]))

# Columns coming from the DW sheet.
dAnimacyDW = dict(zip(master_all_dw["SentenceId"], master_all_dw["Animacy_DW"]))
dHumannessDW = dict(zip(master_all_dw["SentenceId"], master_all_dw["Humanness_DW"]))

In [5]:
def _majority_vote(sentence_id, annotation_maps):
    """Return the most frequent label given to ``sentence_id``.

    Parameters
    ----------
    sentence_id
        Key to look up in every annotation dictionary.
    annotation_maps
        Iterable of ``{SentenceId: label}`` dictionaries, one per annotation
        column.

    Returns
    -------
    int or None
        The label with the highest count among the annotations, or ``None``
        when the sentence is absent from any dictionary or any of its labels
        is missing (NaN / None).
    """
    votes = []
    for annotations in annotation_maps:
        label = annotations.get(sentence_id)
        # pd.isna covers both NaN and None, so an absent key and an empty
        # annotation are treated the same way: no vote for this sentence.
        if pd.isna(label):
            return None
        votes.append(int(label))
    # Same tie-breaking as before: the first element of set(votes) that reaches
    # the highest count wins.
    return max(set(votes), key=votes.count)


# Majority label per sentence, computed for every SentenceId of the GT dictionaries.
majority_vote_animacy = {
    s: _majority_vote(s, (dAnimacyGT, dAnimacyMCA, dAnimacyDW))
    for s in dAnimacyGT
}

majority_vote_humanness = {
    s: _majority_vote(s, (dHumannessGT, dHumannessMCA, dHumannessDW))
    for s in dHumannessGT
}

In [6]:
def _label_frame(source, label_column):
    """Build the per-task dataframe for one majority-vote label column.

    Parameters
    ----------
    source : pandas.DataFrame
        Processed master frame; must contain "Date", "SentenceId", the sentence
        and context columns selected below, and ``label_column``.
    label_column : str
        Name of the column in ``source`` holding the majority label.

    Returns
    -------
    pandas.DataFrame
        Rows of ``source`` whose label is not null, with "Date" renamed to
        "date" and the label stored as int64 in a column named "animated".
        ``reset_index()`` keeps the previous index as an "index" column.
    """
    selected = [
        "Date",
        "SentenceId",
        "prevSentence",
        "currentSentence",
        "maskedSentence",
        "nextSentence",
        "targetExpression",
        "context3wmasked",
        "context3w",
        label_column,
    ]
    frame = source[selected].rename(columns={"Date": "date", label_column: "animated"})
    # Sentences without a majority label are dropped.
    frame = frame[frame["animated"].notnull()].reset_index()
    frame["animated"] = frame["animated"].astype('int64')
    return frame


# Attach the majority-vote labels to every annotated sentence.
master_all["animacy_majority"] = master_all['SentenceId'].map(majority_vote_animacy)
master_all["humanness_majority"] = master_all['SentenceId'].map(majority_vote_humanness)

# processMachines19thC is a project helper; its four return values are stored
# as the previous / current / masked / next sentence columns.
master_all[["prevSentence", "currentSentence", "maskedSentence", "nextSentence"]] = master_all.apply(lambda row: pd.Series(processing.processMachines19thC(row['Sentence'], row['SentenceCtxt'], row['TargetExpression'], nlp)), axis=1)
master_all = master_all.rename(columns={"TargetExpression": "targetExpression"})
# Keep only rows whose masked sentence contains the literal "[MASK]" token.
# The explicit .copy() makes the filtered frame independent, so the column
# assignment below does not raise SettingWithCopyWarning.
master_all = master_all[master_all["maskedSentence"].str.contains("[MASK]", regex=False)].copy()
# ngram_context is a project helper; its two return values are stored as the
# masked and unmasked context columns (called with a window argument of 3).
master_all[['context3wmasked', 'context3w']] = master_all.apply(lambda row: pd.Series(processing.ngram_context(row['maskedSentence'], row['targetExpression'], 3)), axis=1)

# Both frames store their label in a column named "animated".
animacy_all = _label_frame(master_all, "animacy_majority")
humanness_all = _label_frame(master_all, "humanness_majority")

In [7]:
pathdf = "../data/machines19thC/"
Path(pathdf).mkdir(parents=True, exist_ok=True)

#-----------------------------------------------------
# Store machines19thC animacy dataframe:
# errors="ignore" keeps this cell re-runnable once the "index" column is gone.
animacy_all = animacy_all.drop(columns="index", errors="ignore")
animacy_all.to_pickle(pathdf + "animacy.pkl")

# ------------------------------------------------------
# Store machines19thC humanness dataframe:
humanness_all = humanness_all.drop(columns="index", errors="ignore")
humanness_all.to_pickle(pathdf + "humanness.pkl")

# ------------------------------------------------------
# Split into train and test set and store:

# Fraction of rows assigned to the train set; the remaining rows form the test set.
trainsplit = 0.3
# Seed shared by the shuffle and the train sampling so that the split is
# identical on every run.
random_seed = 0

animacy_all = animacy_all.sample(frac=1, random_state=random_seed).reset_index(drop=True)
animacy_all_train_set = animacy_all.sample(frac=trainsplit, random_state=random_seed)
# Every row whose index was not drawn for training goes to the test set.
animacy_all_test_set = animacy_all[~animacy_all.index.isin(animacy_all_train_set.index)]

animacy_all_train_set.to_pickle(pathdf + "train.pkl")
animacy_all_test_set.to_pickle(pathdf + "test.pkl")

### Explore the data

In [8]:
# Class distribution of the "animated" label in each split.
print("Train and test sets")
print("====================")
print("Animacy train set:", animacy_all_train_set.animated.value_counts().to_dict())
print("Animacy test set:", animacy_all_test_set.animated.value_counts().to_dict())

# ------------------------------------------------------
# Annotation counts:

print("\nNumber of rows")
print("==============")
print("All rows:")
# .count() reports the number of non-null "animated" values in each frame.
for split_name, split_frame in (
    ("All", animacy_all),
    ("Train", animacy_all_train_set),
    ("Test", animacy_all_test_set),
):
    print("* " + split_name + ":", split_frame.animated.count())

Train annd test sets
Animacy train set: {0: 120, 1: 58}
Animacy test set: {0: 272, 1: 143}

Number of rows
All rows:
* All: 593
* Train: 178
* Test: 415


In [9]:
# Reload the stored train split and print every field of one example row.
df = pd.read_pickle(pathdf + "train.pkl")

# Positional index of the row to display.
sentInd = 1
for field in (
    "prevSentence",
    "currentSentence",
    "maskedSentence",
    "nextSentence",
    "targetExpression",
    "context3wmasked",
    "context3w",
    "animated",
):
    print("***" + field + "***", df.iloc[sentInd][field])

***prevSentence*** These machines are chain-boats.
***currentSentence*** Instead of having either screw or paddle wheels, both of which would, by their wash, damage the banks of the canal, the engine turns a large " sprocket" wheel amidships.
***maskedSentence*** Instead of having either screw or paddle wheels, both of which would, by their wash, damage the banks of the canal, the [MASK] turns a large " sprocket" wheel amidships.
***nextSentence*** A chain, which is laid the whole length of the canal, is brought to this wheel, and the tug winds herself along, much after the fashion of the floating bridge at Portsmouth.
***targetExpression*** engine
***context3wmasked*** the canal, the [MASK] turns a large
***context3w*** the canal, the engine turns a large
***animated*** 0


### Most frequent class baseline

In [10]:
# Baseline: predict the most frequent test-set class for every test example.
dataset_df = pd.read_pickle(pathdf + "test.pkl")

# Relative frequency of each label, then the label with the highest share.
classes_by_frequency = dataset_df.animated.value_counts(normalize=True).to_dict()
most_frequent_class = max(classes_by_frequency, key=classes_by_frequency.get)

y_true = [int(label) for label in dataset_df['animated']]
y_pred = [int(most_frequent_class)] * len(y_true)

# animacy_evaluation.results is a project helper; it is called with the gold
# labels, the predictions and 0.5, and its five return values are unpacked below.
scores = animacy_evaluation.results(y_true, y_pred, 0.5)
precision, recall, fscore, micro_fscore, map_ = scores
print(
    "p:", round(precision,3),
    "r:", round(recall,3),
    "macro_f1:", round(fscore,3),
    "micro_f1:", round(micro_fscore,3),
    "map:", round(map_,3),
)

p: 0.328 r: 0.5 macro_f1: 0.396 micro_f1: 0.655 map: 0.316


### Create combined training set (`stories` and `machines19thC`)

Make sure you have run `process_stories_dataset.ipynb` before you run the next cells.

In [11]:
# Load both train splits and drop the columns that are removed before merging:
# "storyNumber" from stories, "date" and "SentenceId" from machines19thC.
tmpstories = pd.read_pickle("../data/stories/train.pkl").drop(columns="storyNumber")
tmpmachines = pd.read_pickle("../data/machines19thC/train.pkl").drop(columns=["date", "SentenceId"])

# Stack the two frames, then shuffle the rows with a fixed seed.
combineddf = (
    pd.concat([tmpstories, tmpmachines], ignore_index=True, sort=True)
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)

# Write the combined training set, creating the directory if necessary.
pathcomb = "../data/combined/"
Path(pathcomb).mkdir(parents=True, exist_ok=True)
combineddf.to_pickle(pathcomb + "train.pkl")