In [1]:
import pandas as pd
import re

In [2]:
# Read the two tab-separated JSA metadata files (their names indicate a
# pre-1854 part and an 1854-1908 part) and stack them into one frame.
# The row labels of each part are kept as they were read.
before_df, after_df = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "jsa_sentences/jsa_metadataJSA_before1854.tsv",
        "jsa_sentences/jsa_metadataJSA_1854-1908.tsv",
    )
)
both_df = pd.concat([before_df, after_df])

In [3]:
def process_content(query_tokens, article_id):
    r"""Find query-term mentions in one article and build context tuples.

    The article text is read from
    ``jsa_sentences/JSA_content/<article_id>.txt``; every line of that file is
    treated as one sentence (newline removed, surrounding whitespace stripped).

    Args:
        query_tokens: Iterable of regular-expression strings (for example
            ``r"\bmachine\b"``). They are OR-ed together and searched
            case-insensitively in every line.
        article_id: Value used to build the file name (converted with ``str``).

    Returns:
        A list of 6-tuples ``(prevSentence, currentSentence, markedSentence,
        maskedSentence, nextSentence, targetExpression)``. There is one tuple
        per distinct matched surface form per line, in order of first
        appearance. ``prevSentence`` / ``nextSentence`` are ``""`` at the
        start / end of the file. ``markedSentence`` wraps the first occurrence
        of the target in ``$***...***$``; ``maskedSentence`` replaces it with
        ``[MASK]`` and collapses runs of spaces. An empty ``query_tokens``
        yields an empty list.

    Raises:
        OSError: If the article file cannot be opened.
    """
    query_tokens = list(query_tokens)
    if not query_tokens:
        # An empty alternation would "match" the empty string at every
        # position and produce meaningless rows.
        return []

    # Zero-width lookahead with a capture group, so every position where any
    # token matches is reported. Compiled once per article, not once per line.
    finder = re.compile(r"(?=(" + "|".join(query_tokens) + r"))", re.IGNORECASE)

    # NOTE(review): the file is opened with the platform default encoding.
    with open("jsa_sentences/JSA_content/" + str(article_id) + ".txt") as fr:
        lines = [line.replace("\n", "").strip() for line in fr]

    lSentenceTuples = []
    last_index = len(lines) - 1
    for iline, currentSentence in enumerate(lines):
        prevSentence = lines[iline - 1] if iline > 0 else ""
        nextSentence = lines[iline + 1] if iline < last_index else ""

        # dict.fromkeys removes duplicates but keeps first-appearance order,
        # so the output order is the same on every run (a set is not ordered).
        matches = [m for m in dict.fromkeys(finder.findall(currentSentence)) if m]

        for match in matches:
            # `match` is the exact surface form found in the sentence, so it
            # is searched literally (escaped) and case-sensitively. With
            # IGNORECASE the first occurrence of *any* casing would be
            # marked, e.g. "Machine" when the target is "machine".
            target = re.compile(r"\b" + re.escape(match) + r"\b")
            markedSentence = target.sub(
                lambda found: "$***" + found.group(0) + "***$",
                currentSentence,
                count=1,
            )
            maskedSentence = target.sub(" [MASK] ", currentSentence, count=1)
            # The padding spaces around [MASK] may double up with existing ones.
            maskedSentence = re.sub(" +", " ", maskedSentence)

            lSentenceTuples.append(
                (prevSentence, currentSentence, markedSentence, maskedSentence, nextSentence, match)
            )

    return lSentenceTuples

In [4]:
# Target vocabulary searched for in the article sentences.
# One group per line: Python silently concatenates adjacent string literals,
# so a missing comma ("motor" "motors") turns two tokens into the single
# token "motormotors" and neither word is ever searched for.
query_tokens = [
    "machine", "machines", "machinery",
    "engines", "engine",
    "locomotive", "locomotives",
    "turbine", "turbines",
    "apparatus", "apparatuses",
    "motor", "motors",
    "compressor", "compressors",
    "accumulator", "accumulators",
    "dynamo", "dynamos",
]
# Wrap each token in word boundaries so it only matches as a whole word.
query_tokens_re = [r"\b" + x + r"\b" for x in query_tokens]

# Output columns: every metadata column, followed by the six sentence fields
# in the same order as the tuples returned by process_content.
sentence_cols = ["prevSentence", "currentSentence", "markedSentence", "maskedSentence", "nextSentence", "targetExpression"]
df_cols = list(both_df.columns) + sentence_cols
df_rows = []

# One output row per (article, matched sentence): the article's metadata
# values are repeated in front of each sentence tuple.
for _, meta_row in both_df.iterrows():
    meta_values = list(meta_row)
    for sentence_tuple in process_content(query_tokens_re, meta_row["article_id"]):
        df_rows.append(meta_values + list(sentence_tuple))

In [5]:
# Turn the collected rows into a DataFrame, labelled with the combined
# metadata + sentence column names.
df_sentences = pd.DataFrame(data=df_rows, columns=df_cols)

In [6]:
# Keep only rows whose masked sentence is shorter than 510 characters.
# NOTE(review): this cell was previously described as a 512-character limit,
# but the threshold in the code is 510 - confirm which one is intended.
is_short_enough = df_sentences["maskedSentence"].str.len() < 510
df_sentences = df_sentences[is_short_enough]

In [7]:
# Persist the sentence table; later cells reload it from this path.
animacy_pickle_path = "../data/jsa_animacy.pkl"
df_sentences.to_pickle(animacy_pickle_path)

In [11]:
# Round-trip check: reload the pickle and display its first five rows.
pd.read_pickle("../data/jsa_animacy.pkl").head(n=5)

Unnamed: 0,filename,journal_id,journal_id_text,journal_title,publisher_name,volume,issue,issue_id,article_id,article_type,...,title_group,date,contributors,subjects,prevSentence,currentSentence,markedSentence,maskedSentence,nextSentence,targetExpression
0,journal-article-10.2307_41326898,j50003411,transoclon,"Transactions of the Society, Instituted at Lon...",The Society,54,,i40060851,41326898,research-article,...,NO. IV. TEMPORARY DIVING-BELL,1844-01-01,"[{'given-names': 'Thomas', 'surname': 'Dickins...",['MECHANICS'],LIY.,H 98 MECHANICS. being able to procure a diving...,H 98 MECHANICS. being able to procure a diving...,H 98 MECHANICS. being able to procure a diving...,He next had an air-pump constructed under his ...,machine
1,journal-article-10.2307_41326898,j50003411,transoclon,"Transactions of the Society, Instituted at Lon...",The Society,54,,i40060851,41326898,research-article,...,NO. IV. TEMPORARY DIVING-BELL,1844-01-01,"[{'given-names': 'Thomas', 'surname': 'Dickins...",['MECHANICS'],These were used throughout the whole of the op...,A more powerful air-pump was also constructed ...,A more powerful air-pump was also constructed ...,A more powerful air-pump was also constructed ...,The first diving-bell used in the operations w...,apparatus
3,journal-article-10.2307_41325349,j50003411,transoclon,"Transactions of the Society, Instituted at Lon...",The Society,19,,i40060557,41325349,misc,...,PREFACE,1801-01-01,[],[],Mr. Palmer's method of Housing Corn in Wet Wea...,The more general introduction of Thrashing-Mac...,The more general introduction of Thrashing-$**...,The more general introduction of Thrashing- [M...,Immense tracts of Land lie unculti- vated in d...,Machines
4,journal-article-10.2307_41325349,j50003411,transoclon,"Transactions of the Society, Instituted at Lon...",The Society,19,,i40060557,41325349,misc,...,PREFACE,1801-01-01,[],[],The Drill Husbandry continues to gain advocate...,The Public are under great obligations to Thom...,The Public are under great obligations to Thom...,The Public are under great obligations to Thom...,This very ingeni- ous mid useful implement pos...,Machine
5,journal-article-10.2307_41325349,j50003411,transoclon,"Transactions of the Society, Instituted at Lon...",The Society,19,,i40060557,41325349,misc,...,PREFACE,1801-01-01,[],[],The advantages of the Drill over the Broad-cas...,The fnethod which he suggests for the preserva...,The fnethod which he suggests for the preserva...,The fnethod which he suggests for the preserva...,"Mr. EcclestOn, - of Scarisbrkk, who has for ma...",Machine


In [18]:
# Show the full, untruncated text of the first stored sentence.
pd.read_pickle("../data/jsa_animacy.pkl").iloc[0]["currentSentence"]

"H 98 MECHANICS. being able to procure a diving-bell at Rio de Janeiro, nor the means of casting one, it occurred to him that it was possible to make such a machine of iron water-tanks, strengthened with bars of iron, &amp;c ; and he obtained the Admiral's order to be furnished with two two-ton tanks from the Warspite (flag-ship)."

In [12]:
import spacy

In [13]:
# Load the `en_core_web_lg` spaCy pipeline once; the cells below reuse `nlp`.
nlp = spacy.load(name="en_core_web_lg")

In [19]:
# Parse one hand-pasted sample sentence with spaCy.
# NOTE(review): the text contains the HTML entity "&amp;" as it appears in
# the stored data; it is passed to spaCy unchanged.
sample_sentence = "being able to procure a diving-bell at Rio de Janeiro, nor the means of casting one, it occurred to him that it was possible to make such a machine of iron water-tanks, strengthened with bars of iron, &amp;c ; and he obtained the Admiral's order to be furnished with two two-ton tanks from the Warspite (flag-ship)."
doc = nlp(sample_sentence)

In [30]:
# Spot-check spaCy's analysis on rows 40-49 of the sentences whose target
# expression is exactly "engine" (the comparison is case-sensitive).
dfplay = pd.read_pickle("../data/jsa_animacy.pkl")
dfplay = dfplay.loc[dfplay["targetExpression"] == "engine"]

for sentence in dfplay.iloc[40:50]["currentSentence"]:
    doc = nlp(sentence)
    print(sentence)
    # One line per token: text, POS tag, dependency label, head token text.
    for tok in doc:
        print(" ".join((tok.text, tok.pos_, tok.dep_, tok.head.text)))
    # Two blank lines separate consecutive sentences.
    print("\n")

118 TowER-engine, account of a machine so called, in Cornwall, by Mr. Smea- ton, v. 190 - , its cost and complexity, v. 190 Towns, great.
118 NUM nummod engine
TowER PROPN compound engine
- PUNCT punct engine
engine NOUN ROOT engine
, PUNCT punct engine
account NOUN appos engine
of ADP prep account
a DET det machine
machine NOUN pobj of
so ADV advmod called
called VERB acl machine
, PUNCT punct called
in ADP prep called
Cornwall PROPN pobj in
, PUNCT punct account
by ADP prep engine
Mr. PROPN pobj by
Smea- ADJ amod ton
ton NOUN npadvmod engine
, PUNCT punct engine
v. CCONJ prep engine
190 NUM conj engine
- PUNCT punct engine
, PUNCT punct engine
its PRON poss cost
cost NOUN conj engine
and CCONJ cc cost
complexity NOUN conj cost
, PUNCT punct cost
v. CCONJ cc cost
190 NUM nummod Towns
Towns NOUN pobj v.
, PUNCT punct cost
great ADJ amod cost
. PUNCT punct engine


370 Water, Paris supplied with it by an English steam-engine, iv. introd.
370 NUM nummod Water
Water NOUN ROOT Water
, PUNC

given VERB acl signal
at ADP prep given
9h NOUN pobj at
. PUNCT punct are


Engines Nos. 5 and 6 are turned into the engine-sheds as before, and prepared to make the next exchange ; at 9h.
Engines PROPN compound Nos
Nos PROPN nsubjpass turned
. PUNCT punct Nos
5 NUM appos Nos
and CCONJ cc Nos
6 NUM conj Nos
are AUX auxpass turned
turned VERB ROOT turned
into ADP prep turned
the DET det sheds
engine NOUN compound sheds
- PUNCT punct sheds
sheds NOUN pobj into
as ADP prep turned
before ADV pcomp as
, PUNCT punct turned
and CCONJ cc turned
prepared VERB conj turned
to PART aux make
make VERB xcomp prepared
the DET det exchange
next ADJ amod exchange
exchange NOUN dobj make
; PUNCT punct prepared
at ADP prep prepared
9h NOUN pobj at
. PUNCT punct turned


54m. engine No. 1 arrives at the terminal station a, as before, and engine No. 3 is again ready to start with the 10 o'clock train, and so the reciprocating process is continued throughout the 24 hours at each of the intermediate exchange