# Explore datasets

In [None]:
import json
import glob
import csv
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(style="white")
sns.set_context("notebook", font_scale=1.2, rc={"lines.linewidth": 2.5})

from tqdm.auto import tqdm
tqdm.pandas()

## 1. BLBooks

In [None]:
# Load the pre-extracted BLBooks hits and keep the rows whose `hits` value
# matches "machine" (case-insensitive regular expression).
blb_df = pd.read_csv("data/blb_processed/bl_books_machine_words.csv")
# na=False: a missing `hits` value counts as "no match" instead of putting NaN
# into the mask, which cannot be used for boolean indexing.
# .copy(): a `year` column is added to this frame further down; an independent
# copy keeps that assignment from acting on a slice of `blb_df`.
blb_df_machine = blb_df[blb_df.hits.str.contains("(?i)machine", na=False)].copy()

print("Mentions of machines in the corpus:", blb_df_machine.shape[0])

In [None]:
# BLBooks metadata table; add_blb_year reads its `identifier` and `date` columns.
blb_metadata_df = pd.read_csv(filepath_or_buffer="data/blb_processed/BLB_metadata.csv")

# Add year to blb machine df:
def add_blb_year(bookpath, blb_metadata):
    """Look up the publication year of a BLBooks text.

    Args:
        bookpath: File name whose part before the first underscore is the
            numeric book identifier.
        blb_metadata: DataFrame with an ``identifier`` column and a ``date``
            column.

    Returns:
        The ``date`` of the first metadata row matching the identifier,
        converted to ``int``. Returns 0 when no row matches or when the date
        cannot be converted (for example a missing or non-numeric value).

    Raises:
        ValueError: If the identifier part of ``bookpath`` is not an integer.
    """
    bookid = int(bookpath.split("_")[0])
    matching_dates = blb_metadata.loc[blb_metadata["identifier"] == bookid, "date"]
    try:
        # IndexError: no metadata row for this book.
        # ValueError: the date is NaN or not a plain integer string.
        return int(matching_dates.iloc[0])
    except (IndexError, ValueError):
        # 0 is the "unknown year" sentinel.
        return 0

# Annotate every BLBooks mention with its publication year, but only when the
# pickled result does not exist yet; later cells read the pickle instead.
if not Path("data/blb_processed/blb_machine_with_year.pkl").exists():
    # Detach from the frame this one was filtered from, so the new column is
    # written to an independent object rather than to a slice.
    blb_df_machine = blb_df_machine.copy()
    # Only `article_path` is needed, so apply over that single column instead
    # of row-wise over the whole frame; this also yields a (possibly empty)
    # Series, which can always be assigned as a column.
    blb_df_machine["year"] = blb_df_machine["article_path"].progress_apply(
        lambda bookpath: add_blb_year(bookpath, blb_metadata_df)
    )
    blb_df_machine.to_pickle("data/blb_processed/blb_machine_with_year.pkl")

In [None]:
# Reload the BLBooks machine mentions from the cached pickle.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
blb_df_machine = pd.read_pickle(filepath_or_buffer="data/blb_processed/blb_machine_with_year.pkl")
print("Mentions of machines in the corpus:", len(blb_df_machine))

In [None]:
# Drop rows without a usable year (add_blb_year stores 0 in that case).
blb_df_machine = blb_df_machine.loc[blb_df_machine["year"].gt(0)]
print("Mentions of machines in the corpus:", len(blb_df_machine))

In [None]:
# Load the BLBooks sentences that were kept after filtering.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
blb_df_machine_synparsed = pd.read_pickle(filepath_or_buffer="data/blb_processed/BLB_machine_synparsed.pkl")
print("Sentences with machines we keep:", len(blb_df_machine_synparsed))

## 2. HMD

In [None]:
# Gather every HMD hit that matches "machine" from the per-file CSVs into one
# frame. The empty template fixes the expected columns and is what remains
# when no input file is found.
main_hmd_machine_df = pd.DataFrame(columns = ['Unnamed: 0', 'article_path', 'prev_sentence', 'target_sentence', 'next_sentence', 'hits'])

count_machines = 0
hmd_machine_frames = []
# sorted(): glob returns files in arbitrary order; sorting makes the row order
# of the combined frame reproducible.
for i in sorted(glob.glob("data/hmd_processed/hmd_data_machine_words/*")):
    hmd_df_tmp = pd.read_csv(i)
    # na=False: a missing `hits` value counts as "no match" instead of putting
    # NaN into the boolean mask.
    hmd_df_machine = hmd_df_tmp[hmd_df_tmp.hits.str.contains("(?i)machine", na=False)]
    count_machines += hmd_df_machine.shape[0]
    hmd_machine_frames.append(hmd_df_machine)

if hmd_machine_frames:
    # Concatenate once instead of once per file (repeated concatenation copies
    # the growing frame every time). Row labels of the individual files are
    # kept, as before.
    hmd_combined = pd.concat(hmd_machine_frames, ignore_index=False)
    # Expected columns first (created as NaN if absent), then any extras.
    hmd_expected_cols = list(main_hmd_machine_df.columns)
    hmd_extra_cols = [c for c in hmd_combined.columns if c not in hmd_expected_cols]
    main_hmd_machine_df = hmd_combined.reindex(columns=hmd_expected_cols + hmd_extra_cols)

# Both numbers should agree: the running total and the combined frame's rows.
print("Mentions of machines in the corpus:", count_machines)
print("Mentions of machines in the corpus:", main_hmd_machine_df.shape[0])

In [None]:
# HMD metadata table; add_hmd_year reads its `issue_code`, `publication_code`
# and `issue_date` columns.
hmd_metadata = pd.read_csv(filepath_or_buffer="data/hmd_processed/HMD_metadata_all.csv")

# Add year to hmd machine df:
def add_hmd_year(artpath, hmd_metadata):
    """Return the issue year of an HMD article, looked up via its file name.

    The file name is the last ``/``-separated component of ``artpath``; the
    text before ``.txt`` must consist of exactly three ``_``-separated parts,
    of which the first two are read as the integer publication code and issue
    code. The year is the text before the first ``-`` in ``issue_date`` of the
    first metadata row matching both codes, converted to ``int``.

    Raises:
        ValueError: If the name does not split into three parts or a code or
            the year is not an integer.
        IndexError: If no metadata row matches both codes.
    """
    filename = artpath.rsplit("/", 1)[-1]
    stem = filename.partition(".txt")[0]
    publ_part, issue_part, _art_part = stem.split("_")
    publ_id = int(publ_part)
    issue_id = int(issue_part)
    same_issue = hmd_metadata["issue_code"] == issue_id
    same_publication = hmd_metadata["publication_code"] == publ_id
    first_match = hmd_metadata[same_issue & same_publication].iloc[0]
    return int(first_match.issue_date.partition("-")[0])

# Annotate every HMD mention with its issue year, but only when the pickled
# result does not exist yet; later cells read the pickle instead.
if not Path("data/hmd_processed/hmd_machine_with_year.pkl").exists():
    # Only `article_path` is needed, so apply over that single column instead
    # of row-wise over the whole frame; this also yields a (possibly empty)
    # Series, which can always be assigned as a column.
    main_hmd_machine_df["year"] = main_hmd_machine_df["article_path"].progress_apply(
        lambda artpath: add_hmd_year(artpath, hmd_metadata)
    )
    main_hmd_machine_df.to_pickle("data/hmd_processed/hmd_machine_with_year.pkl")

In [None]:
# Reload the HMD machine mentions from the cached pickle and show the row count.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
hmd_df_machine = pd.read_pickle(filepath_or_buffer="data/hmd_processed/hmd_machine_with_year.pkl")
len(hmd_df_machine)

In [None]:
# Load the HMD sentences that were kept after filtering.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
hmd_df_machine_synparsed = pd.read_pickle(filepath_or_buffer="data/hmd_processed/HMD_machine_synparsed.pkl")
print("Sentences with machines we keep:", len(hmd_df_machine_synparsed))

In [None]:
# Preview the first five HMD rows.
hmd_df_machine.head(n=5)

## 3. JSA

In [None]:
# Summarise the JSA metadata: year span, number of distinct articles and
# number of distinct volumes for each of the two journal titles, followed by
# the number of distinct articles overall.
jsa_metadata = pd.read_csv("data/jsa_processed/JSA_metadata.tsv", sep="\t")

for position, journal in enumerate((
    "The Journal of the Society of Arts",
    "Transactions of the Society, Instituted at London, for the Encouragement of Arts, Manufactures, and Commerce",
)):
    if position > 0:
        # Blank line between the two journal reports.
        print()
    print(journal)
    journal_rows = jsa_metadata[jsa_metadata["journal_title"] == journal]
    print("First article:", journal_rows.year.min())
    print("Last article:", journal_rows.year.max())
    print("Number articles:", len(journal_rows.article_id.unique()))
    print("Number volumes:", len(journal_rows.volume.unique()))

print("\nJointly:", len(jsa_metadata.article_id.unique()))

In [None]:
# Load the JSA hits and keep the rows whose `targetExpression` matches
# "machine" (case-insensitive regular expression).
jsa_df = pd.read_csv("data/jsa_processed/JSA_machine.tsv", sep="\t")
# na=False: a missing `targetExpression` counts as "no match" instead of
# putting NaN into the mask, which cannot be used for boolean indexing.
jsa_df_machine = jsa_df[jsa_df.targetExpression.str.contains("(?i)machine", na=False)]

print("Mentions of machines in the corpus:", jsa_df_machine.shape[0])

In [None]:
# Load the JSA sentences that were kept after filtering.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
jsa_df_machine_synparsed = pd.read_pickle(filepath_or_buffer="data/jsa_processed/JSA_machine_synparsed.pkl")
print("Sentences with machines we keep:", len(jsa_df_machine_synparsed))

## ALL

In [None]:
# Load the three `*_synparsed_pred_bert` pickles, one per corpus.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
jsa_df_machine_bert = pd.read_pickle(filepath_or_buffer="data/jsa_processed/JSA_machine_synparsed_pred_bert.pkl")
hmd_df_machine_bert = pd.read_pickle(filepath_or_buffer="data/hmd_processed/HMD_machine_synparsed_pred_bert.pkl")
blb_df_machine_bert = pd.read_pickle(filepath_or_buffer="data/blb_processed/BLB_machine_synparsed_pred_bert.pkl")

In [None]:
# Corpus key -> frame lookups. The dicts hold references to the frames loaded
# above, not copies.

# Filtered sentences.
dict_synparsed = {
    "jsa": jsa_df_machine_synparsed,
    "hmd": hmd_df_machine_synparsed,
    "blb": blb_df_machine_synparsed,
}

# All machine mentions.
dict_machines = {
    "jsa": jsa_df_machine,
    "hmd": hmd_df_machine,
    "blb": blb_df_machine,
}

# Frames loaded from the `*_pred_bert` pickles.
dict_bert = {
    "jsa": jsa_df_machine_bert,
    "hmd": hmd_df_machine_bert,
    "blb": blb_df_machine_bert,
}

In [None]:
# Mirror `date` into a `year` column so this frame can be read through `.year`
# like the other frames when sentences per year are tallied below.
# The assignment mutates the frame in place on purpose: the same object is
# already stored under dict_synparsed["blb"], so it sees the new column too.
blb_df_machine_synparsed["year"] = blb_df_machine_synparsed["date"]

In [None]:
# Sentences per year for each corpus, restricted to the years 1783-1908.
dys_all = {}   # corpus key -> {year: count}, using all data
dys_filt = {}  # corpus key -> {year: count}, using filtered data

year_window = range(1783, 1909)
for kd in dict_synparsed:
    # Same tally for both sources: all mentions feed dys_all, the filtered
    # sentences feed dys_filt.
    for frames, tallies in ((dict_machines, dys_all), (dict_synparsed, dys_filt)):
        per_year = {}
        for y in frames[kd].year:
            if y not in year_window:
                continue
            per_year[y] = per_year.get(y, 0) + 1
        # A corpus without any in-range year gets no entry at all.
        if per_year:
            tallies[kd] = per_year

In [None]:
# Plot titles, keyed by upper-case corpus key.
corpus_to_title = {
    "JSA": "Journal of the Society of Arts",
    "HMD": "Heritage Made Digital newspapers",
    "BLB": "Microsoft British Library books",
}

In [None]:
# For every corpus, plot all vs. filtered sentence counts per year and export
# the figure (PNG) and the underlying numbers (TSV) to `figures/`.
Path("figures").mkdir(parents=True, exist_ok=True)  # savefig/to_csv need the directory
for k in dys_all:
    # A corpus may have no filtered sentences in range, in which case it has
    # no entry in dys_filt; use an empty column then instead of failing.
    filtered_counts = dys_filt.get(k)
    comb_df = pd.DataFrame({
        'all machine sentences': pd.Series(dys_all[k]),
        'filtered sentences': pd.Series(filtered_counts) if filtered_counts else pd.Series(dtype="float64"),
    })
    # Sort by year explicitly so the line plot and the TSV run chronologically
    # regardless of the order in which years were first encountered.
    comb_df = comb_df.sort_index()
    comb_df.plot(title=corpus_to_title[k.upper()], xlabel="Year", ylabel="Sentences")
    plt.tight_layout()
    plt.savefig(f"figures/{k}.png", dpi=300)
    comb_df.to_csv(f"figures/{k}_numbers.tsv", sep="\t")