# Create a dataset that we can analyse, which includes the results of the BTM

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dropbox
import io

# Connect to dropbox for raw data DO NOT COMMIT TOKEN
access_token = 'XXX'
dbx = dropbox.Dropbox(access_token)

## Load in outputs and original data

In [2]:
# Load tdm
md, response = dbx.files_download("/ClassifyingESTC/results/BTM19_NONUMBERS_DEDUPED_AdStpwds__TOPIC_DOC_MAT.csv")
tdm = pd.read_csv(io.BytesIO(response.content), encoding = "ISO-8859-1")

tdm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94680,94681,94682,94683,94684,94685,94686,94687,94688,94689
0,0.1387308,0.69516,0.021509,2.122421e-06,1.160653e-06,0.001511,1.595839e-08,0.012673,0.004342,0.002427,...,0.004498,0.313115,0.04631,0.01074485,0.0009475545,0.04411096,0.03866,0.047853,0.081622,0.002592158
1,0.03118734,0.046276,0.289836,0.0001397354,3.617359e-07,0.02472,0.001963455,0.043833,0.164854,0.031784,...,0.158796,0.056213,0.064663,0.08404406,0.03723422,0.1438186,0.08504,0.002602,0.130082,0.1110681
2,0.1798107,0.002929,0.017221,1.416502e-06,7.746189e-07,0.019313,3.368735e-05,0.021223,0.006268,0.058344,...,0.017532,0.081998,0.018546,0.0161317,0.03079085,0.01382835,0.07804,0.00883,0.021913,0.01356243
3,0.04027834,0.015272,0.014795,6.765009e-07,0.05619319,0.128679,5.08658e-09,0.038688,0.256291,0.074795,...,0.279345,0.02593,0.026698,0.02473841,0.041277,0.02492005,0.136009,0.098291,0.011968,0.002658302
4,1.863697e-07,3e-06,1e-06,2.308252e-05,1.262276e-05,6.4e-05,1.735564e-07,5e-06,6.7e-05,0.000706,...,3e-06,3e-06,5.6e-05,1.599264e-10,1.821761e-10,1.316764e-07,0.002013,0.000105,0.000128,2.346361e-08


In [3]:
# Load processed data
md, response = dbx.files_download("/ClassifyingESTC/intermediate_output/estc_btm_prepped.csv")
processed = pd.read_csv(io.BytesIO(response.content), encoding = "ISO-8859-1")

processed.head()

Unnamed: 0,system_number,date,pages,format,title,clean_title
0,6172522,1600.0,1.0,broadside,By the Queene. Whereas the Earle of Essex : ac...,queen wherea earl essex accompani earl rutland...
1,6180391,1600.0,584.0,octavo,Essayes : By Sir William Corne-Waleys the youn...,essay sir william cornewaley younger knight
2,6206783,1600.0,576.0,octavo,"The booke of common prayer, and administration...",book common prayer administr sacrament rite ce...
3,6205242,1600.0,1.0,half_sheet,Immortalitas animarum naturaliter innotescit,immortalita animarum naturalit innotescit
4,6205278,1600.0,1.0,half_sheet,Plantæ viuaciores sunt animalibus,plant viuacior sunt animalibu


In [4]:
# Must equal TRUE!
processed.shape[0] == tdm.shape[1]

True

## Topic prevalence and system number

In [19]:
transposed = pd.DataFrame(tdm.T)
transposed['system_number'] = processed['system_number'].tolist()
transposed.head()

# Save to dropbox
df_string = transposed.to_csv(index=False)
db_bytes = bytes(df_string, "ISO-8859-1")
dbx.files_upload(
    f=db_bytes,
    path="/ClassifyingESTC/results/prevalence_with_system_number.csv",
    mode=dropbox.files.WriteMode.overwrite
)

FileMetadata(client_modified=datetime.datetime(2025, 3, 8, 15, 29, 43), content_hash='057c95c09ce977c8187eceac44f180fab89289971288f201c66be25c967d91c2', export_info=NOT_SET, file_lock_info=NOT_SET, has_explicit_shared_members=NOT_SET, id='id:wwr8ZQ2NbWIAAAAAAAAiiw', is_downloadable=True, media_info=NOT_SET, name='prevalence_with_system_number.csv', parent_shared_folder_id='3961543809', path_display='/ClassifyingESTC/Results/prevalence_with_system_number.csv', path_lower='/classifyingestc/results/prevalence_with_system_number.csv', preview_url=NOT_SET, property_groups=NOT_SET, rev='62fd6688a98e2ec205c81', server_modified=datetime.datetime(2025, 3, 8, 15, 29, 43), sharing_info=FileSharingInfo(modified_by='dbid:AAAZrg3IGwGPrkL3cB_LnN3I7Tdm7nHQJgk', parent_shared_folder_id='3961543809', read_only=False), size=35318563, symlink_info=NOT_SET)

## Document and top topic

In [18]:
# Returns most prev topic for each doc
most_prev_topic = tdm.idxmax(axis = 0).to_list()

# Add to dataframe three times
processed['top_topic_no'] = most_prev_topic

processed.head(3)

Unnamed: 0,system_number,date,pages,format,title,clean_title,top_topic_no
0,6172522,1600.0,1.0,broadside,By the Queene. Whereas the Earle of Essex : ac...,queen wherea earl essex accompani earl rutland...,2
1,6180391,1600.0,584.0,octavo,Essayes : By Sir William Corne-Waleys the youn...,essay sir william cornewaley younger knight,0
2,6206783,1600.0,576.0,octavo,"The booke of common prayer, and administration...",book common prayer administr sacrament rite ce...,7


## Full dataset with topic (And slim version while I'm there)

In [7]:
# load original data
md, response = dbx.files_download("/ClassifyingESTC/input_files/estc_with_pages.csv")
raw = pd.read_csv(io.BytesIO(response.content), encoding = "ISO-8859-1")

raw = raw.set_index('system_number')

raw.head(3)

Unnamed: 0_level_0,citation_number,name,name_dates,name_type,role,all_names,title,date,publisher,country,city,place,physical_description,pages,format,language
system_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
6180992,S109298,"Lefèvre, Raoul",active 1460,person,,"Mansion, Colard, printer [person] ; Caxton, Wi...",hEre begynneth the volume intituled and named ...,1473.0,"Printed by William Caxton and, probably, Colar...",Belgium,Bruges,"[Bruges : Printed by William Caxton and, proba...",[704] p. ; 2?.,704.0,folio,eng
6178187,S106478,"Cessolis, Jacobus de",,person,,"Mansion, Colard, printer [person] ; Caxton, Wi...","[T] o the right noble, right excellent [and] v...",1474.0,"Printed by William Caxton and Colard Mansion,",Belgium,Bruges,[Bruges : Printed by William Caxton and Colard...,[148] p. ; 2?.,148.0,folio,eng
6194414,S123061,"Lefèvre, Raoul",active 1460,person,,"Caxton, William, approximately 1422-1491 or 14...",Cy commence le volume intitule le recueil des ...,1474.0,"printed by William Caxton and, probably, Colar...",Belgium,Bruges,"[Ghent?] : [David Aubert?, for William Caxton]...",[572] p. ; 2?.,572.0,folio,fre


In [8]:
# Join
processed = processed[['system_number','top_topic_no']]

final_results = raw.merge(processed.set_index('system_number'), how = 'inner', left_index = True, right_index = True)

final_results.head(20)

Unnamed: 0_level_0,citation_number,name,name_dates,name_type,role,all_names,title,date,publisher,country,city,place,physical_description,pages,format,language,top_topic_no
system_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
6172522,S100840,"England and Wales, Sovereign (1558-1603 : Eliz...",,organisation,,"Barker, Robert, -1645, printer [person] ; Engl...",By the Queene. Whereas the Earle of Essex : ac...,1600.0,"by Robert Barker, printer to the Queenes most ...",England,Imprinted at London,"Imprinted at London : by Robert Barker, printe...",1 sheet ([1] p.) ; 1?.,1.0,broadside,eng,2
6180391,S108699,"Cornwallis, William, Sir",approximately 1579-1614,person,,"Olney, Henry, editor [person] ; Mattes, Edmund...",Essayes : By Sir William Corne-Waleys the youn...,1600.0,Printed [by S. Stafford and R. Read] for Edmun...,England,London,[London] : Printed [by S. Stafford and R. Read...,[584] p. ; 8?.,584.0,octavo,eng,0
6206783,S96096,Church of England,,organisation,,"Church of England [organisation] ; Barker, Rob...","The booke of common prayer, and administration...",1600.0,"Imprinted at London by Robert Barker, printer ...",England,London,[London] : Imprinted at London by Robert Barke...,[576] p. ; 8?.,576.0,octavo,eng,7
6205242,S91321,University of Cambridge,,organisation,,University of Cambridge [organisation],Immortalitas animarum naturaliter innotescit,1600.0,"s.n.,",England,Cambridge,"[Cambridge : s.n., between 1600 and 1607?]",1 sheet ([1] p.) ; 1/2?.,1.0,half_sheet,eng,13
6205278,S91286,University of Cambridge,,organisation,,University of Cambridge [organisation],Plantæ viuaciores sunt animalibus,1600.0,"s.n.,",England,Cambridge,"[Cambridge : s.n., between 1600 and 1607?]",1 sheet ([1] p.) ; 1/2?.,1.0,half_sheet,eng,13
6209331,T301210,"White, John",-1671,person,,"White, John, -1671 [person]","Art's treasury, or, A profitable and pleasing ...",1600.0,"[s.n.],",England,England,"[England?] : [s.n.], [16--?].","259, [27] p. ; 12?.",286.0,12mo,eng,16
6476680,R477519,,,,,"Bellamie, John, -1654, bookseller [person] ; S...",The cobler of Pragve : with all his merry conc...,1600.0,"s.n.,",England,London,[London] : [William Sheffard and John Bellamy]...,"[2], 39, [1] p. ; 4?.",42.0,quarto,eng,10
6478143,R479028,"Capon, John",,person,,"Capon, John [person]",The nevv booke of Mr. Iohn Capons wits : dedic...,1600.0,"s.n.,",England,London,"[London? : s.n., 16--]",4 p. ; 8?.,4.0,octavo,eng,14
6479648,R491169,"Biddle, John",1615-1662,person,,"Biddle, John, 1615-1662 [person]",A brief scripture-catechism for children : Whe...,1600.0,"[s.n.],",England,London,"London : [s.n.], printed in the year 16--.","[6], 34 p. ; 8?.",40.0,octavo,eng,3
6197728,S2474,"Watson, Robert",,person,,"Watson, Robert [person]",A doble almanacke or kalender drawne for this ...,1600.0,"By Richard Watkins and Iames Robertes,",Michigan,Imprinted at London,Imprinted at London : By Richard Watkins and I...,[1+] p.,1.0,unknown,eng,18


In [9]:
# Save to dropbox
df_string = final_results.to_csv(index=False)
db_bytes = bytes(df_string, "ISO-8859-1")
dbx.files_upload(
    f=db_bytes,
    path="/ClassifyingESTC/results/BTM19_RESULTS_NoLabel.csv",
    mode=dropbox.files.WriteMode.overwrite
)

FileMetadata(client_modified=datetime.datetime(2024, 8, 2, 19, 40, 38), content_hash='0621126885fae826dbb6e6146f30e75f1460e0df3e2c89e31810c6318aa9794c', export_info=NOT_SET, file_lock_info=NOT_SET, has_explicit_shared_members=NOT_SET, id='id:wwr8ZQ2NbWIAAAAAAAAifQ', is_downloadable=True, media_info=NOT_SET, name='BTM19_RESULTS_NoLabel.csv', parent_shared_folder_id='3961543809', path_display='/ClassifyingESTC/Results/BTM19_RESULTS_NoLabel.csv', path_lower='/classifyingestc/results/btm19_results_nolabel.csv', preview_url=NOT_SET, property_groups=NOT_SET, rev='61eb8807adaeaec205c81', server_modified=datetime.datetime(2024, 8, 2, 19, 40, 38), sharing_info=FileSharingInfo(modified_by='dbid:AAAZrg3IGwGPrkL3cB_LnN3I7Tdm7nHQJgk', parent_shared_folder_id='3961543809', read_only=False), size=57111548, symlink_info=NOT_SET)

In [10]:
processed.head()

Unnamed: 0,system_number,top_topic_no
0,6172522,2
1,6180391,0
2,6206783,7
3,6205242,13
4,6205278,13


In [11]:
# Save slim version to dropbox
df_string = processed.to_csv(index=False)
db_bytes = bytes(df_string, "ISO-8859-1")
dbx.files_upload(
    f=db_bytes,
    path="/ClassifyingESTC/results/BTM19_topic_system_number.csv",
    mode=dropbox.files.WriteMode.overwrite
)

FileMetadata(client_modified=datetime.datetime(2025, 1, 19, 15, 7, 8), content_hash='406a6d9f7b188db198c18f239698f525122762e670fe9e11ea221c1059607039', export_info=NOT_SET, file_lock_info=NOT_SET, has_explicit_shared_members=NOT_SET, id='id:wwr8ZQ2NbWIAAAAAAAAigA', is_downloadable=True, media_info=NOT_SET, name='BTM19_topic_system_number.csv', parent_shared_folder_id='3961543809', path_display='/ClassifyingESTC/Results/BTM19_topic_system_number.csv', path_lower='/classifyingestc/results/btm19_topic_system_number.csv', preview_url=NOT_SET, property_groups=NOT_SET, rev='62c107f68cb28ec205c81', server_modified=datetime.datetime(2025, 1, 19, 15, 7, 8), sharing_info=FileSharingInfo(modified_by='dbid:AAAZrg3IGwGPrkL3cB_LnN3I7Tdm7nHQJgk', parent_shared_folder_id='3961543809', read_only=False), size=986956, symlink_info=NOT_SET)