# Preprocess county dataset

This notebook processes a dataset mapping British Library (BL) newspaper title IDs to Counties created by Yann Ryan, previous Curator of Newspaper Data at the BL. Read more about how Yann created the original dataset at https://bookdown.org/yann_ryan/r-for-newspaper-data/mapping-with-r-geocode-and-map-the-british-librarys-newspaper-collection.html#get-county-information-from-the-title-list

In this notebook, we make some small additions. The processed data is used in Step 2 of Press Picker for filtering the titles by county in the visualisation.

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Read in counties dataset
path2counties = os.path.join("datasets", "title_list_with_corrected_counties.csv")
titles_counties = pd.read_csv(path2counties, encoding = "ISO-8859-1", dtype=str)

titles_counties.head()

Unnamed: 0,title_id,nid,nlp,publication_title,edition,preceding_titles,succeeding_titles,place_of_publication,country_of_publication,general_area_of_coverage,...,current_publication_frequency,publisher,holdings_more_information,free_text_information_about_dates_of_publication,online_status,link_to_british_newspaper_archive,explore_link,wikititle,geometry,G_NAME
0,2841953,,,"Corante, or, Newes from Italy, Germany, Hungar...",,,,London,England,,...,,N. B|Nathaniel Butter,,,,,http://primocat.bl.uk/F?func=direct&local_base...,London,"c(-0.1275, 51.5072)",LONDON
1,2852602,,,A Relation of the late Occurrents which haue h...,,,,London,England,London,...,,,,,BURNEY,,http://primocat.bl.uk/F?func=direct&local_base...,London,"c(-0.1275, 51.5072)",LONDON
2,2852624,,,A Relation of the late Occurrents which haue h...,,,,London,England,London,...,,,,,BURNEY,,http://primocat.bl.uk/F?func=direct&local_base...,London,"c(-0.1275, 51.5072)",LONDON
3,2852630,,,A Relation of the late Occurrents which haue h...,,,,London,England,London,...,,,,,BURNEY,,http://primocat.bl.uk/F?func=direct&local_base...,London,"c(-0.1275, 51.5072)",LONDON
4,2852631,,,A Relation of the late Occurrents which haue h...,,,,London,England,London,...,,,,,BURNEY,,http://primocat.bl.uk/F?func=direct&local_base...,London,"c(-0.1275, 51.5072)",LONDON


In [3]:
len(titles_counties)

22486

In [4]:
# Add the leading zeros back onto ids
titles_counties['title_id'] = titles_counties['title_id'].astype(int)
titles_counties['title_id'] = titles_counties['title_id'].apply(lambda x: '{0:0>9}'.format(x))
titles_counties.head()

Unnamed: 0,title_id,nid,nlp,publication_title,edition,preceding_titles,succeeding_titles,place_of_publication,country_of_publication,general_area_of_coverage,...,current_publication_frequency,publisher,holdings_more_information,free_text_information_about_dates_of_publication,online_status,link_to_british_newspaper_archive,explore_link,wikititle,geometry,G_NAME
0,2841953,,,"Corante, or, Newes from Italy, Germany, Hungar...",,,,London,England,,...,,N. B|Nathaniel Butter,,,,,http://primocat.bl.uk/F?func=direct&local_base...,London,"c(-0.1275, 51.5072)",LONDON
1,2852602,,,A Relation of the late Occurrents which haue h...,,,,London,England,London,...,,,,,BURNEY,,http://primocat.bl.uk/F?func=direct&local_base...,London,"c(-0.1275, 51.5072)",LONDON
2,2852624,,,A Relation of the late Occurrents which haue h...,,,,London,England,London,...,,,,,BURNEY,,http://primocat.bl.uk/F?func=direct&local_base...,London,"c(-0.1275, 51.5072)",LONDON
3,2852630,,,A Relation of the late Occurrents which haue h...,,,,London,England,London,...,,,,,BURNEY,,http://primocat.bl.uk/F?func=direct&local_base...,London,"c(-0.1275, 51.5072)",LONDON
4,2852631,,,A Relation of the late Occurrents which haue h...,,,,London,England,London,...,,,,,BURNEY,,http://primocat.bl.uk/F?func=direct&local_base...,London,"c(-0.1275, 51.5072)",LONDON


In [5]:
# Get list of unique counties with counts
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(titles_counties['G_NAME'].value_counts())

LONDON                5789
LANCASHIRE            1575
WEST RIDING            958
KENT                   804
ESSEX                  631
WARWICKSHIRE           598
CHESHIRE               522
HAMPSHIRE              513
STAFFORDSHIRE          508
SUSSEX                 508
DEVON                  472
LINCOLNSHIRE           464
COUNTY DURHAM          439
HERTFORDSHIRE          434
SURREY                 409
LANARKSHIRE            397
GLOUCESTERSHIRE        359
GLAMORGAN              348
NOTTINGHAMSHIRE        319
SOMERSET               298
BERKSHIRE              279
NORFOLK                260
CORNWALL               255
DERBYSHIRE             252
EAST RIDING            249
SUFFOLK                223
BUCKINGHAMSHIRE        222
LEICESTERSHIRE         212
WILTSHIRE              204
NORTHAMPTONSHIRE       186
BEDFORDSHIRE           183
MIDLOTHIAN             180
FORFARSHIRE            180
WORCESTERSHIRE         168
CAMBRIDGESHIRE         160
OXFORDSHIRE            156
DORSET                 155
N

### Check how many titles have a missing county

In [6]:
# Check how many titles have a missing county
print('%s titles are missing county data' % len(titles_counties[pd.isna(titles_counties['G_NAME'])]['title_id'].unique()))

235 titles are missing county data


In [7]:
# Where titles are missing a county - do they have a 'general_area_of_coverage'?
titles_counties[pd.isna(titles_counties['G_NAME'])]['general_area_of_coverage'].value_counts()

Avon                                       88
Jersey                                     33
Isle of Man                                31
Guernsey                                   29
Devon                                      22
Ireland                                    10
Antrim (Northern Ireland : County)          6
Highland Region                             2
Cornwall                                    2
Londonderry (Northern Ireland : County)     1
Belfast (Northern Ireland)                  1
Aberdeenshire                               1
Kildare (Ireland : County)                  1
Name: general_area_of_coverage, dtype: int64

In [8]:
# What titles have neither county nor 'general_area_of_coverage'?
# print('%s titles are missing county AND general_area_of_coverage data % len(titles_counties[pd.isna(titles_counties['G_NAME'] & pd.isna(titles_counties['general_area_of_coverage'])]['title_id'].unique()))

titles_counties[pd.isna(titles_counties['G_NAME']) & pd.isna(titles_counties['general_area_of_coverage'])]

Unnamed: 0,title_id,nid,nlp,publication_title,edition,preceding_titles,succeeding_titles,place_of_publication,country_of_publication,general_area_of_coverage,...,current_publication_frequency,publisher,holdings_more_information,free_text_information_about_dates_of_publication,online_status,link_to_british_newspaper_archive,explore_link,wikititle,geometry,G_NAME
178,2834200,,,Haerlemsche Courant,,,,Haerlem,England,,...,,A. Casteleyn,,,BURNEY,,http://primocat.bl.uk/F?func=direct&local_base...,Haarlem,"c(4.63333, 52.3833)",
253,2824144,,,Amsterdamse Courant,,,,Amsterdam,England,,...,,,,,BURNEY,,http://primocat.bl.uk/F?func=direct&local_base...,Amsterdam,"c(4.9, 52.3667)",
467,2831494,,,"The London Chronicle. January 1-July 7, 1758",,,,Dublin,Ireland,,...,,,,,BURNEY,,http://primocat.bl.uk/F?func=direct&local_base...,Dublin,"c(-6.26667, 53.35)",
21050,13372506,,,Totally Dublin,,,,Dublin,Ireland,,...,Monthly,,,2004 October (issue 1) -,,,http://primocat.bl.uk/F?func=direct&local_base...,Dublin,"c(-6.26667, 53.35)",
21373,14495511,,,The Carrick biz,,Continues: Belfast biz,,Belfast,Northern Ireland,,...,Monthly,Belfast Biz,,2007 February (issue 1) -,,,http://primocat.bl.uk/F?func=direct&local_base...,Belfast,"c(-5.93, 54.5964)",
21462,14672658,,,Dealer : 'free-ads' paper serving Northern Ire...,,,,Derry,Northern Ireland,,...,Weekly,Dealer,,,,,http://primocat.bl.uk/F?func=direct&local_base...,Derry,"c(-7.3074, 54.9958)",
22399,14895880,,,"Alive (Dublin, Ireland)|Alive! : Catholic mont...",,,,Dublin,Ireland,,...,Monthly,Alive Group,,,,,http://primocat.bl.uk/F?func=direct&local_base...,Dublin,"c(-6.26667, 53.35)",
22473,15553870,,,Sveiks!|Sveiks! (Dublin),,,,Dublin,Ireland,,...,Bi-weekly|Fortnightly,Sveiks,,,,,http://primocat.bl.uk/F?func=direct&local_base...,Dublin,"c(-6.26667, 53.35)",


None of these are Great Britain newspapers, so we will not worry they are excluded

In [9]:
# Where 'general_area_of_coverage' is the County - set that as the County facet
for index, row in titles_counties.iterrows():     
        # if the title's 'G_NAME' is empty
        if pd.isna(row['G_NAME']):
            # if the title's 'general_area_of_coverage' matches one of these 
            if (row['general_area_of_coverage'] == 'Avon') | (row['general_area_of_coverage'] == 'Jersey') | (row['general_area_of_coverage'] == 'Isle of Man') | \
            (row['general_area_of_coverage'] == 'Guernsey') | (row['general_area_of_coverage'] == 'Devon')| (row['general_area_of_coverage'] == 'Cornwall')| \
            (row['general_area_of_coverage'] == 'Aberdeenshire'):
                row['G_NAME'] = row['general_area_of_coverage'].upper()

titles_counties[pd.isna(titles_counties['G_NAME'])]['general_area_of_coverage'].value_counts()                

Ireland                                    10
Antrim (Northern Ireland : County)          6
Highland Region                             2
Belfast (Northern Ireland)                  1
Londonderry (Northern Ireland : County)     1
Kildare (Ireland : County)                  1
Name: general_area_of_coverage, dtype: int64

In [10]:
# Where are the Highland newspaper titles from?
titles_counties[(titles_counties['general_area_of_coverage'] == "Highland Region") & pd.isna(titles_counties['G_NAME'])]

Unnamed: 0,title_id,nid,nlp,publication_title,edition,preceding_titles,succeeding_titles,place_of_publication,country_of_publication,general_area_of_coverage,...,current_publication_frequency,publisher,holdings_more_information,free_text_information_about_dates_of_publication,online_status,link_to_british_newspaper_archive,explore_link,wikititle,geometry,G_NAME
3020,13908552,16336,,The Invergordon Times and General Advertiser,,,,,Scotland,Highland Region,...,,,no 142-3209 (17 March 1858 - 3 January 1917),,,,http://primocat.bl.uk/F?func=direct&local_base...,Invergordon,"c(-4.15704, 57.6879)",
16141,13895085,5854,,Ullapool News,,,,,Scotland,Highland Region,...,,,,,,,http://primocat.bl.uk/F?func=direct&local_base...,Ullapool,"c(-5.166, 57.9)",


These are both in ROSS AND CROMARTY

In [11]:
titles_counties.loc[(pd.isna(titles_counties['G_NAME']) & (titles_counties['general_area_of_coverage'] == "Highland Region")), ['G_NAME']] = "ROSS AND CROMARTY"

In [12]:
titles_counties[pd.isna(titles_counties['G_NAME'])]['general_area_of_coverage'].value_counts()                

Ireland                                    10
Antrim (Northern Ireland : County)          6
Belfast (Northern Ireland)                  1
Londonderry (Northern Ireland : County)     1
Kildare (Ireland : County)                  1
Name: general_area_of_coverage, dtype: int64

### What newspaper titles are missing from titles_counties?

In [13]:
# Read in the master newspaper title dataset
path2newspaper_titles = os.path.join("datasets", "BritishAndIrishNewspapers_20191118.xlsx")
sheet_name = "Title List"

titles_orig_read = pd.read_excel(path2newspaper_titles, sheet_name=sheet_name, dtype='str')
titles = titles_orig_read
titles.head()
len(titles)

24927

In [14]:
titles.head()

Unnamed: 0,Title.ID,NID,NLP,Publication title,Edition,Preceding titles,Succeeding titles,Place of publication,Country of publication,General area of coverage,...,Last date held,Publication date one,Publication date two,Current publication frequency,Publisher,Holdings: more information,Free text information about dates of publication,Online status,Link to British Newspaper Archive,Explore link
0,2824188,,,"Corante, or, Newes from Italy and Germanie (It...",,,,Amsterdam,The Netherlands,,...,1621,1621,,,Broer Ianson,The earliest English-language serialised news ...,,,,http://primocat.bl.uk/F?func=direct&local_base...
1,2824189,,,"Corante, or, Newes from Italy, Germanie, Hunga...",,,,Amsterdam,The Netherlands,,...,1621,1621,,,Broer Ionson,The earliest English-language serialised news ...,,,,http://primocat.bl.uk/F?func=direct&local_base...
2,2834274,,,"Corante, or, Newes from Italy, Germany, Hungar...",,,,The Hague,The Netherlands,,...,1621,1621,,,Adrian Clarke,The earliest English-language serialised news ...,,,,http://primocat.bl.uk/F?func=direct&local_base...
3,2841953,,,"Corante, or, Newes from Italy, Germany, Hungar...",,,,London,England,,...,1621,1621,1621.0,,N. B|Nathaniel Butter,,,,,http://primocat.bl.uk/F?func=direct&local_base...
4,2824187,,,"Courant Newes out of Italy, Germany, Bohemia, ...",,,,Amsterdam,The Netherlands,,...,1621,1621,,,George Veseler,The earliest English-language serialised news ...,,,,http://primocat.bl.uk/F?func=direct&local_base...


In [15]:
"%s titles out of %s from master sheet missing from titles_counties" % (len(titles[~titles['Title.ID'].isin(list(titles_counties['title_id']))]), len(titles))

'2441 titles out of 24927 from master sheet missing from titles_counties'

In [16]:
# Get list of unique 'General area of coverage's with counts for titles missing from titles_counties
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(titles[~titles['Title.ID'].isin(list(titles_counties['title_id']))]['General area of coverage'].value_counts())

Dublin (Ireland : County)                                                                                                                                                                                                                                                                               426
Belfast (Northern Ireland)                                                                                                                                                                                                                                                                              119
London                                                                                                                                                                                                                                                                                                   91
Cork (Ireland : County)                                                                             

The majority of missing titles seem to be from Ireland, which we are not visualising in Press Picker, so we don't need to worry about that here. 
The following code adds the low hanging fruit missing ones from Britain, using their 'General area of coverage'. 

In [17]:
titles_missingFrom_titles_counties = titles[~titles['Title.ID'].isin(list(titles_counties['title_id']))]
titles_missingFrom_titles_counties.head()

Unnamed: 0,Title.ID,NID,NLP,Publication title,Edition,Preceding titles,Succeeding titles,Place of publication,Country of publication,General area of coverage,...,Last date held,Publication date one,Publication date two,Current publication frequency,Publisher,Holdings: more information,Free text information about dates of publication,Online status,Link to British Newspaper Archive,Explore link
0,2824188,,,"Corante, or, Newes from Italy and Germanie (It...",,,,Amsterdam,The Netherlands,,...,1621,1621,,,Broer Ianson,The earliest English-language serialised news ...,,,,http://primocat.bl.uk/F?func=direct&local_base...
1,2824189,,,"Corante, or, Newes from Italy, Germanie, Hunga...",,,,Amsterdam,The Netherlands,,...,1621,1621,,,Broer Ionson,The earliest English-language serialised news ...,,,,http://primocat.bl.uk/F?func=direct&local_base...
2,2834274,,,"Corante, or, Newes from Italy, Germany, Hungar...",,,,The Hague,The Netherlands,,...,1621,1621,,,Adrian Clarke,The earliest English-language serialised news ...,,,,http://primocat.bl.uk/F?func=direct&local_base...
4,2824187,,,"Courant Newes out of Italy, Germany, Bohemia, ...",,,,Amsterdam,The Netherlands,,...,1621,1621,,,George Veseler,The earliest English-language serialised news ...,,,,http://primocat.bl.uk/F?func=direct&local_base...
111,2830572,,,The Irish Monthly Mercury. 21 Dec. 1649. no. 1,,,,Cork|London,England|Ireland,London,...,1649,1649,,,G. Calvert|T. N,,,BURNEY,,http://primocat.bl.uk/F?func=direct&local_base...


In [18]:
# Where the title does not appear in corrected_county, 
# if its 'General area of coverage' from titles is one of county options, use that
for index, row in titles_missingFrom_titles_counties.iterrows():    
    # and the title's 'General area of coverage' isn't empty
    if ~pd.isna(row['General area of coverage']) & (row['General area of coverage']  != 'nan'):
        # if the title's 'General area of coverage' matches an entry from the existing list of counties 
        if (row['General area of coverage'] == 'London') | \
        (row['General area of coverage'] == 'Gwent') | \
        (row['General area of coverage'] == 'Shropshire') | \
        (row['General area of coverage'] == 'Worcestershire') | \
        (row['General area of coverage'] == 'Cornwall'):
            if titles_counties['G_NAME'].str.match(row['General area of coverage']).any():
                county_to_add = row['General area of coverage']
            elif row['General area of coverage'] == 'London':
                county_to_add = 'LONDON'
            elif row['General area of coverage'] == 'Gwent':
                county_to_add = 'MONMOUTHSHIRE'
            elif row['General area of coverage'] == 'Shropshire':
                county_to_add = 'SHROPSHIRE'
            elif row['General area of coverage'] == 'Worcestershire':
                county_to_add = 'WORCESTERSHIRE'
            elif row['General area of coverage'] == 'Cornwall':
                county_to_add = 'CORNWALL'
        else:
            county_to_add = np.nan
        # Add new row to titles_counties
        new_row = pd.DataFrame(([[row['Title.ID'], county_to_add, row['Publication title'], row['General area of coverage']]]), columns=['title_id', 'G_NAME', 'publication_title', 'General area of coverage'])
        titles_counties = titles_counties.append(new_row, ignore_index=True)
    else:
        new_row = pd.DataFrame(([[row['Title.ID'], np.nan, row['Publication title'], row['General area of coverage']]]), columns=['title_id', 'G_NAME', 'publication_title', 'General area of coverage'])
        titles_counties = titles_counties.append(new_row, ignore_index=True)
        
len(titles_counties)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


24927

In [19]:
"%s titles out of %s from master sheet missing from titles_counties" % (len(titles[~titles['Title.ID'].isin(list(titles_counties['title_id']))]), len(titles))

'0 titles out of 24927 from master sheet missing from titles_counties'

In [20]:
print('%s titles are missing county data out of %s titles' % (len(titles_counties[pd.isna(titles_counties['G_NAME'])]['title_id'].unique()), len(titles_counties[~pd.isna(titles_counties['G_NAME'])]['title_id'].unique())))

2324 titles are missing county data out of 22597 titles


While there's room for tidying up the remainder, we have county data for most of the Great Britain titles

## Check against filtered title list from Step 01

In [21]:
path2titles = os.path.join("datasets", "dynamic_io", "titles.csv")
titles_filtered = pd.read_csv(path2titles, encoding = "ISO-8859-1", dtype=str)

# Add the leading zeros back onto ids
titles_filtered['Title.ID'] = titles_filtered['Title.ID'].astype(int)
titles_filtered['Title.ID'] = titles_filtered['Title.ID'].apply(lambda x: '{0:0>9}'.format(x))
titles_filtered.head()

Unnamed: 0.1,Unnamed: 0,Title.ID,NID,NLP,Publication title,Edition,Preceding titles,Succeeding titles,Place of publication,Country of publication,...,Holdings: more information,Free text information about dates of publication,Online status,Link to British Newspaper Archive,Explore link,connectivity,publication_title_str,preceding_title_str,succeeding_title_str,general_area_of_coverage_str
0,0,13901436,11147,,The York Courant,,,,,England,...,17 December 1728-5 January 1733; 8 November 17...,,,,http://primocat.bl.uk/F?func=direct&local_base...,,yorkcourant,,,northyorkshire
1,1,13904179,12991,2593.0,General evening post (London)|General evening ...,,,"Continued in part by: St. James's chronicle, a...",London,England,...,,1733 October 2 - 1822 February 2|Issue numberi...,,,http://primocat.bl.uk/F?func=direct&local_base...,,generaleveningpostlondon|generaleveningpostlon...,,"['stjamesschronicleandlondoneveningpost', 'stj...",london
2,2,13890724,2313,,Adam's Weekly Courant,,,Continued by: Chester Courant and Anglo-Welsh ...,,England,...,"no 249-253, 322 etc (24 August -21 September 1...",,,,http://primocat.bl.uk/F?func=direct&local_base...,,adamsweeklycourant,,['chestercourantandanglowelshgazette'],cheshire
3,3,13941263,31871,,The Newcastle Journal,,,,,England,...,no 1-1577 (7 April 1739-26 April 1788),,,,http://primocat.bl.uk/F?func=direct&local_base...,,newcastlejournal,,,tyneandwear
4,4,13920536,24283,1039.0,Glasgow Journal,,Continues: Glasgow Weekly Journal. 14 July 1741,,,Scotland,...,"no 1, etc (27 July 1741-25 July 1743; 15, 22 F...",,,,http://primocat.bl.uk/F?func=direct&local_base...,,glasgowjournal,['glasgowweeklyjournal'],,strathclyde


In [22]:
len(titles_filtered)

9325

In [23]:
# Are some of our filtered titles missing county information in titles_counties?
titles_counties_filtered = titles_counties[(titles_counties['title_id'].isin(list(titles_filtered['Title.ID']))) & pd.isna(titles_counties['G_NAME'])]
titles_counties_filtered

Unnamed: 0,G_NAME,General area of coverage,country_of_publication,coverage_city,current_publication_frequency,edition,explore_link,first_date_held,first_geographical_subject_heading,free_text_information_about_dates_of_publication,...,place_of_publication,preceding_titles,publication_date_one,publication_date_two,publication_title,publisher,subsequent_geographical_subject_headings,succeeding_titles,title_id,wikititle
22663,,Lewes,,,,,,,,,...,,,,,The Lewes and Brighthelmston pacquet and weekl...,,,,016309664,
22701,,Trinidad and Tobago,,,,,,,,,...,,,,,The Tobago Gazette,,,,013917428,
22760,,,,,,,,,,,...,,,,,"Pierce Egan's Life in London, and Sporting guide",,,,013912646,
22881,,Trinidad and Tobago,,,,,,,,,...,,,,,The Tobago Chronicle and Royal Gazette,,,,013917427,
22882,,Trinidad and Tobago,,,,,,,,,...,,,,,Tobago Gazette and West India News,,,,013917429,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24879,,"Isleworth (London, England)|London",,,,,,,,,...,,,,,Brentford & Isleworth chronicle|Brentford and ...,,,,014758274,
24884,,"Merton (London, England)",,,,,,,,,...,,,,,Wimbledon guardian|Wimbledon guardian.co.uk|Wi...,,,,015470517,
24892,,Strathclyde,,,,,,,,,...,,,,,Scottish daily express,,,,011451004,
24905,,,,,,,,,,,...,,,,,County news,,,,016161325,


In [24]:
titles_counties_filtered[pd.isna(titles_counties_filtered['G_NAME'])]['General area of coverage'].value_counts()

Dyfed                                                                                                           11
Strathclyde                                                                                                      5
Trinidad and Tobago                                                                                              5
Castle Cary  |Glastonbury  |Shepton Mallet  |Somerset                                                            2
Pimlico (London, England)                                                                                        2
Calder Valley (West Yorkshire, England)|West Yorkshire                                                           1
Essex  |Leigh-on-Sea  |Shoeburyness (Southend-on-Sea, England)                                                   1
Merton (London, England)                                                                                         1
Dumfries and Galloway                                                           

In [25]:
# Manually add corrected_county to this small number of titles without it, based on 'General area of coverage'
for index, row in titles_counties_filtered.iterrows():    
    if row['General area of coverage'] == 'Dyfed':
        # Dyfed covers more than 1 modern Welsh county - this is a compromise
        county_to_add = 'PEMBROKESHIRE'
    elif row['General area of coverage'] == 'Strathclyde':
        # Strathclyde covers many counties - this is a compromise
        county_to_add = 'ARGYLL'
    elif row['General area of coverage'] == ('Pimlico (London, England)' or '(Docklands (London, England)' or 'Isleworth (London, England)|London' or 'Merton (London, England)' or 'Crystal Palace (London, England)|Lambeth (London, England)|West Norwood (London, England)' or 'Clapham (London, England)|Lambeth (London, England)|Wandsworth (London, England)'):
        county_to_add = 'LONDON'
    elif row['General area of coverage'] == 'Castle Cary  |Glastonbury  |Shepton Mallet  |Somerset':
        county_to_add = 'SOMERSET'
    elif row['General area of coverage'] == 'Highland':
        # Highland covers many counties - this is a compromise
        county_to_add = 'INVERNESS'
    elif row['General area of coverage'] == 'Beaconsfield  |Buckinghamshire  |Chalfont St. Giles  |Chalfont St. Peter  |Marlow (Buckinghamshire, England)':
        county_to_add = 'BUCKINGHAMSHIRE'
    elif row['General area of coverage'] == ('East Riding of Yorkshire' or 'East Riding of Yorkshire  |Humberside'):
        county_to_add = 'EAST RIDING'
    elif row['General area of coverage'] == 'Essex  |Leigh-on-Sea  |Shoeburyness (Southend-on-Sea, England)':
        county_to_add = 'ESSEX'
    elif row['General area of coverage'] == ('Lewes' or 'East Sussex  |Newhaven  |Peacehaven (East Sussex, England)'):
        county_to_add = 'SUSSEX'
    elif row['General area of coverage'] == 'Grove (Oxfordshire, England)|Oxfordshire':
        county_to_add = 'OXFORDSHIRE'
    elif row['General area of coverage'] == ('Moffat' or 'Canonbie (Scotland : Parish)|Dumfriesshire (Scotland)|Newcastleton (Scotland)|Roxburghshire (Scotland)' or 'Dumfries and Galloway'):
        county_to_add = 'DUMFRIES SHIRE'
    elif row['General area of coverage'] == 'Powys':
        county_to_add = 'BRECKNOCKSHIRE'
    elif row['General area of coverage'] == 'Aberdeenshire':
        county_to_add = 'ABERDEENSHIRE'
    elif row['General area of coverage'] == 'Coningsby  |Lincolnshire  |Woodhall Spa':
        county_to_add = 'LINCOLNSHIRE'
    elif row['General area of coverage'] == 'West Midlands':
        # West Midlands covers many counties - this is a compromise
        county_to_add = 'WARWICKSHIRE'
    elif row['General area of coverage'] == 'Banffshire':
        county_to_add = 'BANFFSHIRE'
    elif row['General area of coverage'] == 'Calder Valley (West Yorkshire, England)|West Yorkshire':
        county_to_add = 'WEST RIDING'
    elif row['General area of coverage'] == 'Clwyd':
        county_to_add = 'FLINTSHIRE'
    # Set the new corrected_county
    titles_counties.loc[titles_counties['title_id'] == row['title_id'], ['G_NAME']] = county_to_add

# Check setting new corrected_county has been successful  
titles_counties[(titles_counties['title_id'].isin(list(titles_filtered['Title.ID']))) & pd.isna(titles_counties['G_NAME'])]

Unnamed: 0,G_NAME,General area of coverage,country_of_publication,coverage_city,current_publication_frequency,edition,explore_link,first_date_held,first_geographical_subject_heading,free_text_information_about_dates_of_publication,...,place_of_publication,preceding_titles,publication_date_one,publication_date_two,publication_title,publisher,subsequent_geographical_subject_headings,succeeding_titles,title_id,wikititle


## Save processed dataset

In [26]:
# Rename some columns
titles_counties = titles_counties.rename(columns={"G_NAME": "corrected_county", "title_id": "Title.ID"})
# Change counties from uppercase to titlecase
titles_counties['corrected_county'] = titles_counties['corrected_county'].str.title() 
# only keep title_id and corrected_county
titles_counties = titles_counties[['Title.ID', 'corrected_county']]
titles_counties.head()

Unnamed: 0,Title.ID,corrected_county
0,2841953,London
1,2852602,London
2,2852624,London
3,2852630,London
4,2852631,London


In [27]:
# Save as csv
parent_path = os.path.join("datasets", "dynamic_io")
titles_counties.to_csv(os.path.join(parent_path, "counties.csv"))