# Explore harvested text files

In [1]:
import os
import pandas as pd
import fileinput
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from operator import itemgetter
import nltk
nltk.download('stopwords')
nltk.download('punkt')
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /Users/tim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/tim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def get_latest_harvest():
    '''
    Get the timestamp of the most recent harvest.
    '''
    harvests = sorted([d for d in os.listdir('data') if os.path.isdir(os.path.join('data', d))])
    return harvests[-1]

def open_harvest_data(timestamp=None):
    '''
    Open the results of the specified harvest (most recent by default).
    
    Returns a DataFrame.
    '''
    if not timestamp:
        timestamp = get_latest_harvest()
    print(timestamp)
    df = pd.read_csv(os.path.join('data', timestamp, 'results.csv'), parse_dates=['date'])
    return df 

## Aggregate the text files

In [3]:
def aggregate_texts(timestamp=None):
    '''
    Aggregate all individual article texts creating one big file.
    '''
    if not timestamp:
        timestamp = get_latest_harvest()
    output_file = os.path.join('data', timestamp, 'all-texts.txt')
    data_dir = os.path.join('data', timestamp, 'text')
    files = [os.path.join(data_dir, file) for file in os.listdir(data_dir) if file[-4:] == '.txt']                                                                                   
    with open(output_file, 'w') as fout, fileinput.input(files) as fin:
        for line in fin:
            fout.write(line)
    
def aggregate_years(timestamp=None):
    '''
    Aggregate individual article text by year, creating one file per year.
    '''
    if not timestamp:
        timestamp = get_latest_harvest()
    output_dir = os.path.join('data', timestamp, 'years')
    os.makedirs(output_dir, exist_ok=True)
    data_dir = os.path.join('data', timestamp, 'text')
    df = open_harvest_data(timestamp=timestamp)
    df['year'] = df['date'].dt.year
    years = list(df['year'].unique())
    for year in years:
        output_file = os.path.join(output_dir, '{}.txt'.format(year))
        files = [os.path.join(data_dir, file) for file in os.listdir(data_dir) if file[-4:] == '.txt' and file[:4] == str(year)]                                                                                   
        with open(output_file, 'w') as fout, fileinput.input(files) as fin:
            for line in fin:
                fout.write(line)
                
                
def aggregate_newspapers(timestamp=None):
    '''
    Aggregate individual article text by newspaper, creating one file per newspaper.
    '''
    if not timestamp:
        timestamp = get_latest_harvest()
    output_dir = os.path.join('data', timestamp, 'newspapers')
    os.makedirs(output_dir, exist_ok=True)
    data_dir = os.path.join('data', timestamp, 'text')
    df = open_harvest_data(timestamp=timestamp)
    newspapers = list(df['newspaper_id'].unique())
    for newspaper in newspapers:
        output_file = os.path.join(output_dir, '{}.txt'.format(newspaper))
        files = [os.path.join(data_dir, file) for file in os.listdir(data_dir) if file[-4:] == '.txt' and '-{}-'.format(newspaper) in file]                                                                                   
        with open(output_file, 'w') as fout, fileinput.input(files) as fin:
            for line in fin:
                fout.write(line)

## Display word frequencies

In [4]:
def show_word_frequencies(text_file):
    with open(text_file, 'r') as text:
        blob = TextBlob(text.read())
    word_counts = [[word, count] for word, count in blob.lower().word_counts.items() if word not in stopwords]
    word_counts = sorted(word_counts, key=itemgetter(1), reverse=True)[:25]
    return pd.DataFrame(word_counts).style.format({1: '{:,}'}).bar(subset=[1], color='#d65f5f').set_properties(subset=[1], **{'width': '300px'})

def word_frequency_all(timestamp=None):
    if not timestamp:
        timestamp = get_latest_harvest()
    data_dir = os.path.join('data', timestamp)
    return show_word_frequencies(os.path.join(data_dir, 'all-texts.txt'))

def word_frequency_by_year(year, timestamp=None):
    if not timestamp:
        timestamp = get_latest_harvest()
    data_dir = os.path.join('data', timestamp, 'years')
    return show_word_frequencies(os.path.join(data_dir, '{}.txt'.format(year)))

def word_frequency_by_newspaper(newspaper_id, timestamp=None):
    if not timestamp:
        timestamp = get_latest_harvest()
    data_dir = os.path.join('data', timestamp, 'newspapers')
    return show_word_frequencies(os.path.join(data_dir, '{}.txt'.format(year)))   

In [121]:
aggregate_texts()

In [7]:
aggregate_years('1543809004')

1543809004


In [9]:
def calculate_tfidf(names, files, ngram_size=1):
    # Chomp chomp -- getting trigrams
    tf = TfidfVectorizer(input='filename', analyzer='word', ngram_range=(ngram_size, ngram_size), min_df=0, smooth_idf=False, sublinear_tf=True)
    tfidf_matrix = tf.fit_transform(files)
    # These are the actual phrases
    feature_names = tf.get_feature_names()
    # These are the scores
    texts = tfidf_matrix.todense()
    for index, row in enumerate(texts):
        name = names[index]
        print('\n\n{}\n'.format(name.upper()))
        text = row.tolist()[0]
        # If the score is not 0 save it with an index (which will let us get the feature_name)
        scores = [pair for pair in zip(range(0, len(text)), text) if pair[1] > 0]
        sorted_scores = sorted(scores, key=lambda t: t[1] * -1)
        # Print the top 20 results for each file
        for phrase, score in [(feature_names[word_id], score) for (word_id, score) in sorted_scores][:20]:
            print('{0: <40} {1}'.format(phrase, score))

def calculate_tfidf_by_year(timestamp=None, ngram_size=1):
    if not timestamp:
        timestamp = get_latest_harvest()
    data_dir = os.path.join('data', timestamp, 'years')
    if not os.path.exists(data_dir):
        aggregate_years(timestamp)
    # Get a list of the file names in the directory.
    names = [file[:-4] for file in os.listdir(data_dir) if file[-4:] == '.txt']
    # Get a list of files to feed to scikit-learn.
    files = [os.path.join(data_dir, file) for file in os.listdir(data_dir) if file[-4:] == '.txt']
    calculate_tfidf(names, files, ngram_size)
    
    
def calculate_tfidf_by_newspaper(timestamp=None, ngram_size=1):
    if not timestamp:
        timestamp = get_latest_harvest()
    data_dir = os.path.join('data', timestamp, 'newspapers')
    if not os.path.exists(data_dir):
        aggregate_newspapers(timestamp)
    df = open_harvest_data(timestamp=timestamp)
    newspapers = df[['newspaper_id', 'newspaper_title']].drop_duplicates().set_index('newspaper_id')
    # Get a list of the file names in the directory.
    names = [newspapers.loc[int(file[:-4])]['newspaper_title'] for file in os.listdir(data_dir) if file[-4:] == '.txt']
    # Get a list of files to feed to scikit-learn.
    files = [os.path.join(data_dir, file) for file in os.listdir(data_dir) if file[-4:] == '.txt']
    calculate_tfidf(names, files, ngram_size)

In [12]:
calculate_tfidf_by_year('1543809004', ngram_size=3)
    



1912

of oil fuel                              0.06625768146926414
tons of oil                              0.061930588190567584
1000 tons of                             0.05663465402866593
use of oil                               0.0498070074025766
22 knots 1000                            0.04646577806598459
against drivers of                       0.04646577806598459
at 22 knots                              0.04646577806598459
days steaming from                       0.04646577806598459
enforcement of the                       0.04646577806598459
has to be                                0.04646577806598459
in the motor                             0.04646577806598459
internal combustion engine               0.04646577806598459
knots 1000 tons                          0.04646577806598459
motor liner selandia                     0.04646577806598459
of oil as                                0.04646577806598459
oil as fuel                              0.04646577806598459
the evidence of 

aman victedof the                        0.16222142113076254
an ex sydney                             0.16222142113076254
andrew munaltan whowas                   0.16222142113076254
as suspectcd personand                   0.16222142113076254
associate et aman                        0.16222142113076254
at liutv street                          0.16222142113076254
bee sentenced to                         0.16222142113076254
cauli london friday                      0.16222142113076254
days an ex                               0.16222142113076254
duey has bee                             0.16222142113076254
et aman victedof                         0.16222142113076254
evil days an                             0.16222142113076254
ex sydney publican                       0.16222142113076254
formerl hotel proprietor                 0.16222142113076254
friday andrew munaltan                   0.16222142113076254
has bee sentenced                        0.16222142113076254
hotel proprietor in     

the oceanic company                      0.06101253262404019
29 days this                             0.04979581881465393
by the oceanic                           0.04979581881465393
steamers of the                          0.04979581881465393
mails in london                          0.04553757768235742
with the advent                          0.04553757768235742
an important change                      0.04004775005021797
and hitherto has                         0.04004775005021797
announced by the                         0.04004775005021797
arrangements with these                  0.04004775005021797
by connecting at                         0.04004775005021797
change in its                            0.04004775005021797
claim to the                             0.04004775005021797
company lays claim                       0.04004775005021797
concluded the oceanic                    0.04004775005021797
date instead of                          0.04004775005021797
days steamer it         

india if she                             0.04622921206627537
on april 15                              0.04622921206627537
pakistan and ceylon                      0.04622921206627537
the commonwealth as                      0.04622921206627537
the prime ministers                      0.04622921206627537
in 17 days                               0.039979395775773606
in the commonwealth                      0.039979395775773606
london in 17                             0.039979395775773606
the commonwealth and                     0.039979395775773606
to london on                             0.039979395775773606
members of the                           0.03727554043730819
london on april                          0.03632348760943387
secretary to the                         0.03632348760943387
that he would                            0.03632348760943387
to attend the                            0.03372957948527184
of the commonwealth                      0.030073671318932103
the com monwealth 

in l8 days                               0.04774491824044844
overseas ah ways                         0.04774491824044844
route across africa                      0.04774491824044844
sent into hospital                       0.04774491824044844
sydney in l8                             0.04774491824044844
they were sent                           0.04774491824044844
were sent into                           0.04774491824044844
to be sent                               0.04129019071059047
the empire air                           0.03483546318073251
000 letters which                        0.028198917842841394
1914 like piany                          0.028198917842841394
1916 when the                            0.028198917842841394
200 it has                               0.028198917842841394
40 pa _engeis                            0.028198917842841394
50 000 letters                           0.028198917842841394
500 miles lagos                          0.028198917842841394
5d for postcaids 

men to reach                             0.0441824967669746
pacific aboard the                       0.0441824967669746
reach australia well                     0.0441824967669746
to reach australia                       0.0441824967669746
27 days is                               0.04255485114302273
the atlantic three                       0.04255485114302273
the pacific aboard                       0.04255485114302273
27 days london                           0.0407095787262074
atlantic three days                      0.0407095787262074
britain begins her                       0.0407095787262074
business men to                          0.0407095787262074
full sized tennis                        0.0407095787262074


1925

bagdad to colombo                        0.08535359630665311
slr kelth smith                          0.08535359630665311
be extended to                           0.08312475412772886
london to bagdad                         0.0738144790946731
service to india          

average daily flight                     0.022277610243645392
229 metres and                           0.02086668139669627


1920

all that is                              0.06051880053437344
as good as                               0.0532228967918637
said sir ross                            0.0532228967918637
that is wanted                           0.0532228967918637
am about to                              0.04293989801306663
an idle fan                              0.04293989801306663
and bull story                           0.04293989801306663
and comfort be                           0.04293989801306663
and durability of                        0.04293989801306663
and people are                           0.04293989801306663
and want it                              0.04293989801306663
and would willingly                      0.04293989801306663
are prone to                             0.04293989801306663
as number of                             0.04293989801306663
as pioneers in    

council of women                         0.03233568132245159
due singapore nov                        0.03233568132245159
leaves alexandria nov                    0.03233568132245159
leaves darwin nov                        0.03233568132245159
to imperial airways                      0.03233568132245159
airlines of australia                    0.029570526907366945
darwin nov due                           0.029570526907366945
due london nov                           0.029570526907366945
due sydney nov                           0.029570526907366945
imperial airways no                      0.029570526907366945
karachi nov leaves                       0.029570526907366945
leaves calcutta nov                      0.029570526907366945
leaves karachi nov                       0.029570526907366945
nov due singapore                        0.029570526907366945
nov leaves darwin                        0.029570526907366945
nov leaves karachi                       0.029570526907366945
nov tranships

In [106]:
calculate_tfidf_by_newspaper(ngram_size=1)

1536200911


THE MERCURY (HOBART, TAS. : 1860 - 1954)

rometch                                  0.0258397352435745
strahan                                  0.025216227810368565
brownell                                 0.02342578177640727
mongolia                                 0.02342578177640727
pedder                                   0.02342578177640727
launceston                               0.022807321553748865
kingsmill                                0.022763180065568576
giblin                                   0.022245718285806303
agst                                     0.022166077209062728
bellerive                                0.021895913229468662
fingal                                   0.021895913229468662
mathinna                                 0.02174154644119813
bennison                                 0.021081666287104856
wherrett                                 0.020798784925900708
oonah                                    0.020668396741229236
corinna              

ibsen                                    0.16188636859243138
outis                                    0.12731560062574437
gong                                     0.09021494307751633
gissing                                  0.08835636408992066
persevering                              0.07536903817968268
clocks                                   0.07278484803546735
socialistic                              0.0682063210311603
ashton                                   0.06703786476798426
classical                                0.06554450539271246
tho                                      0.06422839406726136
fisheries                                0.06182580553980686
acclimatised                             0.057987219306251454
ainuso                                   0.057987219306251454
antago                                   0.057987219306251454
bescecliod                               0.057987219306251454
btrngglers                               0.057987219306251454
caricaturist        

cools                                    0.12324843963459717
aainrt                                   0.11351331819157744
atuiwphere                               0.11351331819157744
eanpci                                   0.11351331819157744
enargy                                   0.11351331819157744
ensrgy                                   0.11351331819157744
ffiond                                   0.11351331819157744
grsat                                    0.11351331819157744
jify                                     0.11351331819157744
lozy                                     0.11351331819157744
prohably                                 0.11351331819157744
raiulfthpf                               0.11351331819157744
remember1                                0.11351331819157744
simnly                                   0.11351331819157744
sotur                                    0.11351331819157744
suspsnse                                 0.11351331819157744
tasuiuuia               

spluttered                               0.0899607828502847
cooled                                   0.0853612253183565
condenser                                0.08492100387451332
changing                                 0.08485137854972304
moods                                    0.08330256490799366
energies                                 0.08051592687679222
milky                                    0.07511733464163328
suns                                     0.07448748190159223
cracked                                  0.07390867808464079
earth                                    0.07284907680806862
sun                                      0.07202042540353049
860000                                   0.07167117586835571
betweeri                                 0.07167117586835571
coinuni                                  0.07167117586835571
cotiipared                               0.07167117586835571
fiftth                                   0.07167117586835571
greatey                   



VICTORIAN EXPRESS (GERALDTON, WA : 1878 - 1894)

balmaceda                                0.14663624975044467
insurgents                               0.12075743124496181
loyalists                                0.10870645404634782
santiago                                 0.09291831949256939
chilian                                  0.09274658566658935
placilla                                 0.08893199989807862
paraiso                                  0.08003296376577419
surgents                                 0.08003296376577419
valparaiso                               0.07926344334867201
8st                                      0.07113392763346979
lltli                                    0.07113392763346979
subsiding                                0.07068633384798571
kintore                                  0.06394924701048535
fled                                     0.061040829373574565
rifles                                   0.05965620793868388
bedford                          

anguished                                0.05457952329365339
arnfield                                 0.05457952329365339
breakfasts                               0.05457952329365339
kux                                      0.05457952329365339
critic                                   0.05372384355044573
silverton                                0.0501511876929479
hemphill                                 0.049117989195344346


BORDER CHRONICLE (BORDERTOWN, SA : 1908 - 1950)

bordertown                               0.10220001431759156
tatiara                                  0.09996235355510545
hutley                                   0.09768633775944745
commonage                                0.0821932589110303
mundalla                                 0.07881272222112302
staude                                   0.07881272222112302
5oo                                      0.07092627793183634
rums                                     0.07092627793183634
peppermint                         

rogions                                  0.07130768732709708
ico                                      0.06894712234661161
continont                                0.06666960505500763
oarth                                    0.06305117607404179
woro                                     0.058911458337383635
thu                                      0.057572875867552796
1825                                     0.057530593145792064
poriods                                  0.057530593145792064
wraggu                                   0.057530593145792064
wraggc                                   0.05460386576007689
tho                                      0.05407251480378611
hoat                                     0.05378861757241104
locturo                                  0.05378861757241104
timo                                     0.05159775577370929
havu                                     0.05113364365113585
inoro                                    0.05113364365113585
thon               

hordern                                  0.05694088228998958
oyster                                   0.05148889141309569
bellbrook                                0.050509859329393766
biddy                                    0.050509859329393766
fzanes                                   0.050509859329393766
mercier                                  0.050509859329393766
scurr                                    0.050509859329393766
themometer                               0.050509859329393766
tipsy                                    0.04545555869829326
timbers                                  0.04335724268839011
casual                                   0.043143118982294795
hickey                                   0.0424989823617282
macleay                                  0.04093345271943855
alien                                    0.040927883293897134
littler                                  0.04040125806719276
swamps                                   0.04014704249611124
dreamt           

amlbarely                                0.1685938617050279
and5the                                  0.1685938617050279
anticlones                               0.1685938617050279
decembetr                                0.1685938617050279
egradually                               0.1685938617050279
energyi                                  0.1685938617050279
ireach                                   0.1685938617050279
iseksons                                 0.1685938617050279
jostabout                                0.1685938617050279
lmow                                     0.1685938617050279
religio                                  0.1685938617050279
s4oken                                   0.1685938617050279
sittakes                                 0.1685938617050279
sunpot                                   0.1685938617050279
withini1912                              0.1685938617050279
yoiinselvei                              0.1685938617050279
autarctia                               

morwell                                  0.05403619527036103
thle                                     0.05069687689010098
leongatha                                0.043445764648217734
grubb                                    0.04213998586321034
tihe                                     0.04151859560189294
anotlher                                 0.03894925044133928
arragul                                  0.03894925044133928
exhillaration                            0.03894925044133928
tihere                                   0.03894925044133928
solly                                    0.03861493730897801
dougall                                  0.0374818989581347
cornell                                  0.03705975826692679
tlhe                                     0.03644737789107438
lhas                                     0.035789084468064225
fromn                                    0.0352020784475149
locals                                   0.03505176936140305
nicolson                

tlhe                                     0.049343356603005456
thll                                     0.046043291130028044
tlih                                     0.046043291130028044
lthe                                     0.0443086828701587
withl                                    0.04242876416849641
liit                                     0.04143594043639787
anid                                     0.041037532072302346
inl                                      0.038918569411992364
trl                                      0.03874081305295261
willt                                    0.03874081305295261
thie                                     0.03774123982879303
tihe                                     0.03695237037068609
lti                                      0.036828589742767696
revived                                  0.03659685436071564
soils                                    0.03659685436071564
tle                                      0.03611827077066447
aind               

sutcliffe                                0.09547414651316065
decorations                              0.0683628841233015
roast                                    0.055389272518043625
warship                                  0.05489080587374399
albany                                   0.05386841980907323
brilliance                               0.051423020219643824
clara                                    0.0503631638262137
statistical                              0.0503631638262137
styria                                   0.04939559074647605
wben                                     0.04850550979552805
tbe                                      0.04602717560126801
fulfilment                               0.045522394016047324
_uit                                     0.04549394236786518
aastralia                                0.04549394236786518
aianospheric                             0.04549394236786518
aoffote                                  0.04549394236786518
apparcas                

ihis                                     0.10186301958211297
11113                                    0.09718984873173399
111t4                                    0.09718984873173399
alonlo                                   0.09718984873173399
autuman                                  0.09718984873173399
awill                                    0.09718984873173399
bhromleter                               0.09718984873173399
btut                                     0.09718984873173399
careavou                                 0.09718984873173399
ccutrate                                 0.09718984873173399
climtax                                  0.09718984873173399
comnmon                                  0.09718984873173399
condn                                    0.09718984873173399
continellt                               0.09718984873173399
couhl                                    0.09718984873173399
countinent                               0.09718984873173399
cpesaek                 

1930                                     0.12160241917333182
aence                                    0.11275756037272744
aheaid                                   0.11275756037272744
fearfil                                  0.11275756037272744
haigs                                    0.11275756037272744
lntervening                              0.11275756037272744
mniuimui                                 0.11275756037272744
onil                                     0.11275756037272744
predictih                                0.11275756037272744
prohets                                  0.11275756037272744
reasdns                                  0.11275756037272744
regionl                                  0.11275756037272744
seiptled                                 0.11275756037272744
tespt                                    0.11275756037272744
tutil                                    0.11275756037272744
uninitiatel                              0.11275756037272744
zoalanul                

calloway                                 0.023469541509742814
gorrie                                   0.021462566438504166
jessamine                                0.021193167185368306
higgs                                    0.02012364550499711
epigram                                  0.019314901318557173
goondl                                   0.019314901318557173
tractors                                 0.01927532074219672
bast                                     0.019052611438173193
fbok                                     0.018875125553690076
plutarch                                 0.018875125553690076
visor                                    0.018875125553690076
morgans                                  0.018614424768311228
oub                                      0.018108128548447993
armenians                                0.01805859775928138
farrar                                   0.01805859775928138
fte                                      0.01805859775928138
herberton    

muldoon                                  0.02667121515146099
zaimes                                   0.02567703949765178
jessamine                                0.02535473995890708
higgs                                    0.02486857087643301
barber                                   0.02392917605431614
georgetown                               0.023848772173247348
millchester                              0.02310765049794316
yez                                      0.02310765049794316
czech                                    0.022581518652669327
finition                                 0.022581518652669327
havard                                   0.022581518652669327
impostors                                0.022581518652669327
amd                                      0.022493591140607973
armenians                                0.021604654283348183
t3ie                                     0.021604654283348183
taie                                     0.021604654283348183
arawatta       

disuse                                   0.04560460767440469
torbens                                  0.04560460767440469
fos                                      0.04521316960899957


KERANG NEW TIMES (VIC. : 1901 - 1918)

salinity                                 0.07765745382323656
boort                                    0.07605758015505687
teal                                     0.0648780279904556
tbe                                      0.061238906405485814
gregory                                  0.05951766326514096
frequency                                0.05642102325030453
atlantic                                 0.05014691229440217
153p                                     0.049915675297204753
18s6                                     0.049915675297204753
223p                                     0.049915675297204753
242p                                     0.049915675297204753
250p                                     0.049915675297204753
317p                                   

newspader                                0.10182413710309879
petersflield                             0.10182413710309879
plactd                                   0.10182413710309879
rvay                                     0.10182413710309879
toells                                   0.10182413710309879


OVENS AND MURRAY ADVERTISER (BEECHWORTH, VIC. : 1855 - 1918)

yackandandah                             0.06906241049855795
alonday                                  0.058521673439761786
rocket                                   0.054715234803364945
kierath                                  0.05229872953813243
millthorpe                               0.05229872953813243
alessrs                                  0.05201452361100495
beechworth                               0.04991968926348663
merino                                   0.04932112773399025
rutherglen                               0.04756655209071319
chloroform                               0.04408561736116842
bonegilla          

fawkner                                  0.10447595485584621
geil                                     0.09051683009334269
carolin                                  0.07136564449285972
comforters                               0.07136564449285972
rattles                                  0.06422439668920077
lien                                     0.05783508949391871
sliall                                   0.05708314888554185
conceit                                  0.05131762732162475
epithets                                 0.05131762732162475
maxim                                    0.050855802778323585
nbsp                                     0.04851937167300996
hales                                    0.04764293278133243
heroes                                   0.04764293278133243
garden                                   0.04649965920069723
wharves                                  0.045455979914863276
corpse                                   0.044176379517965825
bendigo              

fl                                       0.08074294072813162
harrold                                  0.07567404777525469
tbe                                      0.07293699249524173
ash                                      0.07006406766740059
11907                                    0.060289026117045015
accountauts                              0.060289026117045015
adkiees                                  0.060289026117045015
aitn                                     0.060289026117045015
amm                                      0.060289026117045015
antaj                                    0.060289026117045015
apile                                    0.060289026117045015
asjd                                     0.060289026117045015
atber                                    0.060289026117045015
auvtijt                                  0.060289026117045015
ayhat                                    0.060289026117045015
baiket                                   0.060289026117045015
basy        

haddy                                    0.11951391636347752
zingara                                  0.11951391636347752
banner                                   0.10801589791139012
kadina                                   0.10755468165324165
conservation                             0.08677522598167249
predominate                              0.08594009993565471
1913                                     0.0841789482526967
hunt                                     0.07238616088661111
0allar8of                                0.07058684427183273
actton                                   0.07058684427183273
adrt                                     0.07058684427183273
ahti                                     0.07058684427183273
apb1l                                    0.07058684427183273
changcs                                  0.07058684427183273
clemjbnt                                 0.07058684427183273
coalesoence                              0.07058684427183273
conijpctiod              

redcliffe                                0.030862708066692533
humpybong                                0.02940301639065208
tubbs                                    0.0266068970399986
moxley                                   0.022545985994112894
routgen                                  0.022545985994112894
bth                                      0.021904949648879346
bft                                      0.021707564688461366
woody                                    0.02138834425419176
israelites                               0.021206128354203074
glaisber                                 0.02061798787672759
mutinous                                 0.02061798787672759
oholera                                  0.02061798787672759
stenograms                               0.02061798787672759
spode                                    0.020289907819440307
divisional                               0.019734122156082248
nbsp                                     0.019663085613602825
traffio         

narandera                                0.10324052603790979
sovs                                     0.0980155537023641
currajong                                0.08705997407434256
ninon                                    0.08705997407434256
waddell                                  0.07367993115010987
coolamon                                 0.07325223929983982
bridegroom                               0.07203861073614368
junee                                    0.06919838077709657
cubic                                    0.06795146461240766
gorman                                   0.0667970005631943
tributaries                              0.0667970005631943
slender                                  0.06260311016492577
irrigation                               0.06206870066248207
handicap                                 0.06050392651168798
recreation                               0.059755914955586
brooch                                   0.059444504525337054
wagga                       

1016                                     0.2239246874439654
1033                                     0.13225352764070436
1vh                                      0.13225352764070436
becin                                    0.13225352764070436
birkcnbrad                               0.13225352764070436
cnlly                                    0.13225352764070436
cqod                                     0.13225352764070436
govrrnnicnt                              0.13225352764070436
hirn                                     0.13225352764070436
igcc                                     0.13225352764070436
iiomc                                    0.13225352764070436
innd                                     0.13225352764070436
intcrviouid                              0.13225352764070436
loictrllrr                               0.13225352764070436
nictrninlo                               0.13225352764070436
sck                                      0.13225352764070436
sea8on8                  

nbsp                                     0.16046871523107417
unpaid                                   0.09350173920473542
consultative                             0.09295898717959483
haynes                                   0.09295898717959483
ragged                                   0.08837058729410367
salaried                                 0.08837058729410367
honorary                                 0.08335340968793897
hopetoun                                 0.07979887283757199
leagues                                  0.06625981468457413
administrators                           0.06525207091984503
bacca                                    0.06525207091984503
benignantly                              0.06525207091984503
bluntest                                 0.06525207091984503
dubi                                     0.06525207091984503
explan                                   0.06525207091984503
explicitly                               0.06525207091984503
forrarder               

predictors                               0.11708238136115925
tanco                                    0.09851308489868178
rulo                                     0.08987885374526866
reputa                                   0.08419163107567071
tating                                   0.08419163107567071
cream                                    0.08245286605589673
elaboration                              0.08193461643796049
boldest                                  0.07994378843620428
hazarding                                0.07994378843620428
insignificance                           0.07994378843620428
aching                                   0.0781629321042024
gowen                                    0.0781629321042024
overshadowed                             0.0781629321042024
devas                                    0.07655195159457769
plagues                                  0.07655195159457769
disasters                                0.07597483005644252
predictions                

declina                                  0.14541059077438434
184g                                     0.12272285475937714
ajusd                                    0.12272285475937714
cjrtain                                  0.12272285475937714
eaye                                     0.12272285475937714
eraisphere                               0.12272285475937714
forerunnera                              0.12272285475937714
i1i                                      0.12272285475937714
ifitteqmltn                              0.12272285475937714
juty                                     0.12272285475937714
nrjon                                    0.12272285475937714
oxoess                                   0.12272285475937714
rtspecfc                                 0.12272285475937714
tiden                                    0.12272285475937714
vap                                      0.12272285475937714
declination                              0.11947462551040336
atmo                    

berrigan                                 0.10178478524817022
nbsp                                     0.08841559630321176
tbinks                                   0.07742397625013027
olsen                                    0.06881503901055705
agoin                                    0.06604358024601636
skipper                                  0.06490811383032624
sleet                                    0.06302007836759066
ranzo                                    0.0618645711565026
memories                                 0.06051820379334046
steward                                  0.05836665842470684
aft                                      0.05708305209029802
adds                                     0.05423438208828612
afore                                    0.05417606019740278
tier                                     0.05325563391692937
1bti                                     0.05081242462409637
1uuuu                                    0.05081242462409637
8atiflfying              

bwords                                   0.11449497706998135
drinlts                                  0.11449497706998135
nampeun                                  0.11449497706998135
nionsoop                                 0.11449497706998135
armor                                    0.11001832924840166
6o                                       0.1030379656558577
wos                                      0.10267580688774303
marry                                    0.09775333334066194
hoys                                     0.09633604360826112
petersfleld                              0.09633604360826112
thingb                                   0.09633604360826112
curves                                   0.09356854048200682
weapons                                  0.09280175320233036
isked                                    0.09158095424173406
fools                                    0.09132846646852229
ov                                       0.0906199226749351
the                       

limbo                                    0.09780974603860032
planted                                  0.09655807562271713
av                                       0.09292940659073565
balls                                    0.09280888299768465
divil                                    0.09233025279618111
fowls                                    0.08246242002242514
cask                                     0.07912784758715731
fust                                     0.07912784758715731
em                                       0.07829580739636482
3912                                     0.07792439425119504


HAMILTON SPECTATOR (VIC. : 1870 - 1918)

extravaganza                             0.07695190051292182
wbat                                     0.0738527375655335
eome                                     0.07262636588790398
panels                                   0.0691795920391679
verandahs                                0.06767402684769859
roof                                     0.0

aneut                                    0.11665119337290233
tcountry                                 0.11665119337290233
weaponsi                                 0.11665119337290233
armor                                    0.11209023948600569
chis                                     0.10497841882727654
wos                                      0.1046094397369081
marry                                    0.09959426415184656
curves                                   0.09533066156007393
fools                                    0.09304840155528017
ov                                       0.09232651417476748
the                                      0.08981447896171314
bitten                                   0.08960893230204646
strongest                                0.08896816312274987
monsoon                                  0.08738381010444818
tower                                    0.08712797443046363
bigger                                   0.08596401573744343
placid                   

cadman                                   0.03709758619698994
kuranda                                  0.03590596606861889
mulgrave                                 0.034525333844111265
wolfram                                  0.03385296582864127
mourilyan                                0.032835502217480425


TRAFALGAR AND YARRAGON TIMES (VIC. : 1914 - 1918)

ijionsoon                                0.1688439794967968
paonsoons                                0.1688439794967968
doesn                                    0.13849452489841488
loved                                    0.13575270440906265
placid                                   0.12442655827991629
summer                                   0.11107644080330789
stimson                                  0.1103952276611491
blease                                   0.10827432336718892
elbert                                   0.10827432336718892
gillian                                  0.10827432336718892
hubbard                          

hodgman                                  0.06523154566126327
cootamundra                              0.06194461480471167
reardon                                  0.06098867526603269
farrant                                  0.05727166114132523
muttama                                  0.05727166114132523
wallendbeen                              0.05237413060578616
cyclonet                                 0.04713328050466309
wallendoon                               0.047036693813519034
harney                                   0.046060115586925784
jindalee                                 0.046060115586925784
pinkstone                                0.046060115586925784
inglis                                   0.040905349275727525
balcony                                  0.040626270713572304
falconer                                 0.04020525351489524
coota                                    0.03882672962248236
drivers                                  0.037172873234395935
allmau           

cudgegong                                0.09205384950494107
mudgee                                   0.08714755930622202
gulgong                                  0.06170392460079563
hargraves                                0.05780896564255072
farthest                                 0.05569673848005877
mcewen                                   0.054376576890825985
avould                                   0.05083974872057047
bagnall                                  0.04874875442088478
cobbora                                  0.04874875442088478
crudine                                  0.04874875442088478
loneragan                                0.04874875442088478
mattick                                  0.04874875442088478
mooy                                     0.04874875442088478
rheinberger                              0.04874875442088478
turkington                               0.04874875442088478
lithgow                                  0.045758520371458285
bathurst              

pilotlcss                                0.10697911856608021
cattlo                                   0.09053774911209396
conferenco                               0.09001220223884637
greenwood                                0.08212303541844596
abattoirs                                0.07486432155445265
tho                                      0.0716762288690161
prico                                    0.07141810308123676
infants                                  0.0662216417417132
3733                                     0.06318359076775644
administratiou                           0.06318359076775644
agamoi                                   0.06318359076775644
aifcted                                  0.06318359076775644
anxwen                                   0.06318359076775644
arkes                                    0.06318359076775644
bcccnt                                   0.06318359076775644
berviofa                                 0.06318359076775644
bfflom                    

grandma                                  0.052990045672805325
steerage                                 0.038728055575502074
kater                                    0.03729028105367372
xenia                                    0.03729028105367372
borchgrevink                             0.034101438849698934
winefred                                 0.034101438849698934
brindisi                                 0.03356827764009917
ultimo                                   0.031780117564354275
taiyuan                                  0.03137603267569797
biver                                    0.03099573740983303
neely                                    0.030689057065472383
zain                                     0.030689057065472383
1119                                     0.029990306224273355
brambletye                               0.029990306224273355
conil                                    0.029990306224273355
creugniet                                0.029990306224273355
hurstville   

yaekandandah                             0.1226321631154757
zwar                                     0.1009234938856109
beechworth                               0.08608895060632422
poole                                    0.08261867151050342
chargers                                 0.07957050835950046
burgess                                  0.07722309034526485
testimonials                             0.07311451729049119
remounts                                 0.0716082357268311
draughts                                 0.06734150382931307
broadford                                0.06695060481620121
aird                                     0.06364596309416175
youngest                                 0.062105837274483634
roach                                    0.061050687738139514
chiltern                                 0.060068367020882515
costume                                  0.05828629380504347
easter                                   0.057877416692414585
sluicing               

vvragge                                  0.16214673709443836
theor                                    0.14460081352005846
antarctia                                0.13958275488051047
lasia                                    0.1352359147682175
austpa                                   0.1331791153261443
bdl7                                     0.1331791153261443
brld                                     0.1331791153261443
jisy                                     0.1331791153261443
lfes                                     0.1331791153261443
pluhis                                   0.1331791153261443
qaarrcl                                  0.1331791153261443
riptins                                  0.1331791153261443
flc                                      0.1198524639440325
natter                                   0.1198524639440325
predic                                   0.11587644685427083
stippled                                 0.11267193255428962
luy                                

onslaught                                0.15537526607292607
4ursus                                   0.12761737207887566
barograpn                                0.12761737207887566
biay                                     0.12761737207887566
btrland                                  0.12761737207887566
doivu                                    0.12761737207887566
hiut                                     0.12761737207887566
ijrsus                                   0.12761737207887566
kinr                                     0.12761737207887566
luiometi                                 0.12761737207887566
njune                                    0.12761737207887566
stxetches                                0.12761737207887566
tirror                                   0.12761737207887566
tisinan                                  0.12761737207887566
urdus                                    0.12761737207887566
vceterday                                0.12761737207887566
westeriv                

pulped                                   0.09608533863516892
limitation                               0.08084617866873274
oomes                                    0.08084617866873274
package                                  0.06909307180214783
refrigerating                            0.06909307180214783
anticyclones                             0.06766612930414195
packing                                  0.06560701870229653
barometrical                             0.0637528611777364
rabbits                                  0.06015089587320204
gape                                     0.056785039690344624
601b                                     0.05674954885102917
aaeni                                    0.05674954885102917
alwa                                     0.05674954885102917
arrangemainmj                            0.05674954885102917
baraoohi                                 0.05674954885102917
bnow                                     0.05674954885102917
btcrnns                 

anxiety                                  0.13054595406060576
floods                                   0.1291867365291782
reat                                     0.12855080680387934
calamities                               0.12408972895099725
lengthened                               0.12408972895099725
resembles                                0.12209309441560574
biver                                    0.11525206884456102
neighbouring                             0.11376765515368348
rising                                   0.11376419980867974
continuously                             0.11100940502623836
improbable                               0.11100940502623836
slips                                    0.11100940502623836


ADELAIDE OBSERVER (SA : 1843 - 1904)

pengelly                                 0.03433108374407372
bernhardt                                0.03417164638895632
dawkins                                  0.03208902216834137
yds                                      0.031

rawei                                    0.05717170654888568
phial                                    0.045979728184335024
gympie                                   0.03948477222223159
woro                                     0.039430397521742
lindley                                  0.037380566242578885
acrobatisms                              0.03709615518721064
dolivorod                                0.03709615518721064
evei                                     0.03709615518721064
interments                               0.03709615518721064
justioo                                  0.03709615518721064
sorios                                   0.03709615518721064
syson                                    0.03709615518721064
talleyrand                               0.03709615518721064
tieis                                    0.03709615518721064
vanneck                                  0.03709615518721064
ergy                                     0.03658761334741014
myles                   

likened                                  0.04904722459675186
installation                             0.04839026646641298
messina                                  0.04752067156870505
planets                                  0.04661501451010947
outbursts                                0.04533315937572297
wrugge                                   0.04533315937572297
solar                                    0.0451175226729276


TASMANIAN NEWS (HOBART, TAS. : 1883 - 1911)

pills                                    0.05259199523924706
obart                                    0.0502980139648862
rth                                      0.04745127982358123
ellington                                0.04509231126400948
kingsmiil                                0.04509231126400948
onowai                                   0.04509231126400948
rifted                                   0.04509231126400948
ith                                      0.04442976010516395
pink                                    

cnristian                                0.1431319215205729
cojirse                                  0.1431319215205729
dardanelia                               0.1431319215205729
ftreat                                   0.1431319215205729
hifetorv                                 0.1431319215205729
jafter                                   0.1431319215205729
jiassen                                  0.1431319215205729
jjiat                                    0.1431319215205729
kunberley                                0.1431319215205729
leadens                                  0.1431319215205729
mafcking                                 0.1431319215205729
pfenoa                                   0.1431319215205729
recorfla                                 0.1431319215205729
rrfjftntfid                              0.1431319215205729
tearie                                   0.1431319215205729
ueii                                     0.1431319215205729
ueonta                                  

1914                                     0.09830407222272677
aailed                                   0.09541764692288762
atarvatioo                               0.09541764692288762
basod                                    0.09541764692288762
bbokiuhd                                 0.09541764692288762
bcllevu                                  0.09541764692288762
bnrwood                                  0.09541764692288762
demolishrd                               0.09541764692288762
drewned                                  0.09541764692288762
earlli                                   0.09541764692288762
ebu                                      0.09541764692288762
fallb                                    0.09541764692288762
fuuetionary                              0.09541764692288762
goast                                    0.09541764692288762
iniuiuium                                0.09541764692288762
inufct                                   0.09541764692288762
neariug                 

leoturer                                 0.07751157615785978
wbioh                                    0.05960955440901772
wae                                      0.05688002049428971
aod                                      0.05352083713102781
glblin                                   0.052946762686165626
inhabitated                              0.052946762686165626
thamae                                   0.052946762686165626
zino                                     0.052946762686165626
iters                                    0.052492304695795314
oot                                      0.04879029392474487
davonport                                0.04764861179815309
obarge                                   0.04764861179815309
scientifio                               0.04764861179815309
ulver                                    0.04764861179815309
comet                                    0.04623913229078214
beiog                                    0.04454939220550328
formby             

accomodation                             0.13222959004453994
tiiany                                   0.13222959004453994
fdr                                      0.09802630576141422
drama                                    0.0955072624968795
colombo                                  0.07814576735194953
6olumns                                  0.0780969259865583
7l0                                      0.0780969259865583
anetan                                   0.0780969259865583
asocial                                  0.0780969259865583
attaioing                                0.0780969259865583
aused                                    0.0780969259865583
baxaar                                   0.0780969259865583
bitnddfy                                 0.0780969259865583
botelkeepert                             0.0780969259865583
brfr                                     0.0780969259865583
bubgay                                   0.0780969259865583
cdntafos                            

In [74]:
aggregate_newspapers()

1536200911


In [75]:
df = open_harvest_data()

1536200911


In [99]:
newspapers = df[['newspaper_id', 'newspaper_title']].drop_duplicates().set_index('newspaper_id')
newspapers.loc['10']

KeyError: 'the label [10] is not in the [index]'

In [85]:
n

Unnamed: 0_level_0,Unnamed: 1_level_0,article_id,words,corrections
newspaper_id,newspaper_title,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7,"The South Australian Advertiser (Adelaide, SA : 1858 - 1889)",686068440,33488,16
8,The Maitland Mercury and Hunter River General Advertiser (NSW : 1843 - 1893),75959376,5528,0
9,"Northern Territory Times and Gazette (Darwin, NT : 1873 - 1927)",31281505,19437,13
10,"The Mercury (Hobart, Tas. : 1860 - 1954)",322427246,88715,31
11,The Canberra Times (ACT : 1926 - 1995),713157939,5589,4
12,"The Courier-Mail (Brisbane, Qld. : 1933 - 1954)",76516085,1271,1
13,"The Argus (Melbourne, Vic. : 1848 - 1957)",196460675,122907,80
16,The Brisbane Courier (Qld. : 1864 - 1933),2020246731,401596,397
30,"The West Australian (Perth, WA : 1879 - 1954)",420072286,23005,9
34,"The Advertiser (Adelaide, SA : 1889 - 1931)",558887120,74751,60
