## Process _machines19thC_ dataset

This notebook prepares the _19thC Machines_ dataset for the experiments in the Living Machines paper.

In [None]:
import pandas as pd
import numpy as np
from tools import animacy_evaluation,processing
import operator
import spacy
from pathlib import Path
# Load the spaCy "en_core_web_lg" pipeline once; it is handed to
# processing.processMachines19thC when the annotations are processed below.
nlp = spacy.load("en_core_web_lg")

In [None]:
# --------------------------------------------
# Load and process machines19thC annotations:

master_all = pd.read_csv("../resources/machines19thC.tsv", sep="\t", index_col=0)

# processing.processMachines19thC yields four values per row, which are stored
# as the previous / current / masked / next sentence columns.
master_all[["prevSentence", "currentSentence", "maskedSentence", "nextSentence"]] = master_all.apply(
    lambda row: pd.Series(processing.processMachines19thC(row['Sentence'], row['SentenceCtxt'], row['TargetExpression'], nlp)),
    axis=1,
)
master_all = master_all.rename(columns={"TargetExpression": "targetExpression"})

# Keep only rows whose masked sentence contains the literal "[MASK]" token.
# na=False treats a missing masked sentence as "no mask" instead of producing
# a NaN entry that boolean indexing rejects; .copy() makes the filtered frame
# independent so the column assignment below cannot raise
# SettingWithCopyWarning.
has_mask = master_all["maskedSentence"].str.contains("[MASK]", regex=False, na=False)
master_all = master_all[has_mask].copy()
master_all[['context3wmasked', 'context3w']] = master_all.apply(
    lambda row: pd.Series(processing.ngram_context(row['maskedSentence'], row['targetExpression'], 3)),
    axis=1,
)

# --------------------------------------------
# Separate animacy and humanness annotations:

cols = ["Date","SentenceId","prevSentence","currentSentence","maskedSentence","nextSentence","targetExpression","context3wmasked","context3w"]


def _annotation_subset(master, shared_columns, label_column):
    """Return the rows of ``master`` that carry a ``label_column`` annotation.

    Args:
        master: dataframe holding the shared columns and the label column.
        shared_columns: list of column names kept next to the label.
        label_column: name of the annotation column to extract.

    Returns:
        A new dataframe with ``shared_columns`` plus the label, where the
        label column is renamed to "animated" and cast to int64, "Date" is
        renamed to "date", and rows without an annotation are removed.
    """
    subset = master[shared_columns + [label_column]].rename(
        columns={label_column: "animated", "Date": "date"}
    )
    subset = subset[subset["animated"].notnull()]
    # reset_index() without drop=True keeps the former index as a column
    # (named "index" when the index itself is unnamed); the cells that store
    # the dataframes drop that column before pickling.
    subset = subset.reset_index()
    subset["animated"] = subset["animated"].astype('int64')
    return subset


# Animacy:
animacy_all = _annotation_subset(master_all, cols, "animacy")

# Humanness:
humanness_all = _annotation_subset(master_all, cols, "humanness")

In [None]:
pathdf_animacy = "../data/machines19thC_animacy/"
Path(pathdf_animacy).mkdir(parents=True, exist_ok=True)

#-----------------------------------------------------
# Store machines19thC animacy dataframe:
# Derive new frames instead of overwriting `animacy_all`, so that re-running
# this cell neither fails on an already-dropped "index" column nor shuffles an
# already shuffled frame (which would write different pickles).
animacy_stored = animacy_all.drop("index", axis=1)
animacy_stored.to_pickle(pathdf_animacy + "animacy.pkl")

# ------------------------------------------------------
# Split into train and test set and store:

# Fraction of rows sampled into the train set; all remaining rows form the
# test set.
trainsplit = 0.3

# Shuffle with a fixed seed and renumber the rows, so that the index-based
# complement below is well defined.
animacy_shuffled = animacy_stored.sample(frac=1, random_state=0).reset_index(drop=True)
animacy_all_train_set = animacy_shuffled.sample(frac=trainsplit, random_state=0)
animacy_all_test_set = animacy_shuffled[~animacy_shuffled.index.isin(animacy_all_train_set.index)]

# Train and test must partition the stored dataframe.
assert len(animacy_all_train_set) + len(animacy_all_test_set) == len(animacy_stored)

animacy_all_train_set.to_pickle(pathdf_animacy + "train.pkl")
animacy_all_test_set.to_pickle(pathdf_animacy + "test.pkl")

In [None]:
pathdf_humanness = "../data/machines19thC_humanness/"
Path(pathdf_humanness).mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------
# Store machines19thC humanness dataframe:
# Derive new frames instead of overwriting `humanness_all`, so that re-running
# this cell neither fails on an already-dropped "index" column nor shuffles an
# already shuffled frame (which would write different pickles).
humanness_stored = humanness_all.drop("index", axis=1)
humanness_stored.to_pickle(pathdf_humanness + "humanness.pkl")

# ------------------------------------------------------
# Split into train and test set and store:

# Fraction of rows sampled into the train set; all remaining rows form the
# test set.
trainsplit = 0.3

# Shuffle with a fixed seed and renumber the rows, so that the index-based
# complement below is well defined.
humanness_shuffled = humanness_stored.sample(frac=1, random_state=0).reset_index(drop=True)
humanness_all_train_set = humanness_shuffled.sample(frac=trainsplit, random_state=0)
humanness_all_test_set = humanness_shuffled[~humanness_shuffled.index.isin(humanness_all_train_set.index)]

# Train and test must partition the stored dataframe.
assert len(humanness_all_train_set) + len(humanness_all_test_set) == len(humanness_stored)

humanness_all_train_set.to_pickle(pathdf_humanness + "train.pkl")
humanness_all_test_set.to_pickle(pathdf_humanness + "test.pkl")

### Explore the data

In [None]:
# Class distribution of the "animated" label in the animacy train / test sets.
print("Train and test sets")
print("===================")
print("Animacy train set:", animacy_all_train_set.animated.value_counts().to_dict())
print("Animacy test set:", animacy_all_test_set.animated.value_counts().to_dict())

# ------------------------------------------------------
# Annotation counts:

# Number of non-null "animated" labels in the full animacy dataframe and in
# each split.
print("\nNumber of rows")
print("==============")
print("All rows:")
print("* All:", animacy_all.animated.count())
print("* Train:", animacy_all_train_set.animated.count())
print("* Test:", animacy_all_test_set.animated.count())

In [None]:
df = pd.read_pickle(pathdf_animacy + "train.pkl")

# Print every field of a single training example, selected by row position.
sentInd = 1
example_row = df.iloc[sentInd]
fields_to_show = (
    "prevSentence",
    "currentSentence",
    "maskedSentence",
    "nextSentence",
    "targetExpression",
    "context3wmasked",
    "context3w",
    "animated",
)
for field in fields_to_show:
    print("***" + field + "***", example_row[field])

### Most frequent class baseline

In [None]:
# Most-frequent-class baseline on the animacy test set: every instance is
# predicted as the label that occurs most often in that same set.
dataset_df = pd.read_pickle(pathdf_animacy + "test.pkl")
classes_by_frequency = dataset_df.animated.value_counts(normalize=True).to_dict()
most_frequent_class = max(classes_by_frequency, key=classes_by_frequency.get)

y_true = list(map(int, dataset_df['animated'].tolist()))
y_pred = [int(most_frequent_class)] * len(y_true)

# The third argument (0.5) is forwarded unchanged to animacy_evaluation.results;
# presumably a decision threshold - confirm in tools/animacy_evaluation.
precision, recall, fscore, micro_fscore, map_ = animacy_evaluation.results(y_true, y_pred, 0.5)
report = (
    ("p:", precision),
    ("r:", recall),
    ("macro_f1:", fscore),
    ("micro_f1:", micro_fscore),
    ("map:", map_),
)
print(*(part for label, value in report for part in (label, round(value, 3))))

In [None]:
# Most-frequent-class baseline on the humanness test set: every instance is
# predicted as the label that occurs most often in that same set.
dataset_df = pd.read_pickle(pathdf_humanness + "test.pkl")
classes_by_frequency = dataset_df.animated.value_counts(normalize=True).to_dict()
most_frequent_class = max(classes_by_frequency, key=classes_by_frequency.get)

y_true = list(map(int, dataset_df['animated'].tolist()))
y_pred = [int(most_frequent_class)] * len(y_true)

# The third argument (0.5) is forwarded unchanged to animacy_evaluation.results;
# presumably a decision threshold - confirm in tools/animacy_evaluation.
precision, recall, fscore, micro_fscore, map_ = animacy_evaluation.results(y_true, y_pred, 0.5)
report = (
    ("p:", precision),
    ("r:", recall),
    ("macro_f1:", fscore),
    ("micro_f1:", micro_fscore),
    ("map:", map_),
)
print(*(part for label, value in report for part in (label, round(value, 3))))

### Create combined training sets (`stories` and `machines19thC`)

Make sure you have run `process_stories_dataset.ipynb` before you run the following cell.

In [None]:
# Stories training set, without its "storyNumber" column.
tmpstories = pd.read_pickle("../data/stories/train.pkl").drop(columns="storyNumber")
# machines19thC animacy training set, without the "date" and "SentenceId" columns.
tmpmachines = pd.read_pickle("../data/machines19thC_animacy/train.pkl").drop(columns=["date", "SentenceId"])

# Stack both training sets, then shuffle the rows with a fixed seed and
# renumber them.
combineddf = (
    pd.concat([tmpstories, tmpmachines], ignore_index=True, sort=True)
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)

pathcomb = "../data/combined_animacy/"
Path(pathcomb).mkdir(parents=True, exist_ok=True)
combineddf.to_pickle(pathcomb + "train.pkl")

In [None]:
# Stories training set, without its "storyNumber" column.
tmpstories = pd.read_pickle("../data/stories/train.pkl").drop(columns="storyNumber")
# machines19thC humanness training set, without the "date" and "SentenceId" columns.
tmpmachines = pd.read_pickle("../data/machines19thC_humanness/train.pkl").drop(columns=["date", "SentenceId"])

# Stack both training sets, then shuffle the rows with a fixed seed and
# renumber them.
combineddf = (
    pd.concat([tmpstories, tmpmachines], ignore_index=True, sort=True)
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)

pathcomb = "../data/combined_humanness/"
Path(pathcomb).mkdir(parents=True, exist_ok=True)
combineddf.to_pickle(pathcomb + "train.pkl")