In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the books CSV; the path is relative to the current working directory.
books_csv_path = './Data/british_library_books.csv'
books = pd.read_csv(books_csv_path)

In [3]:
# Preview the first five rows of the raw frame.
books.head(n=5)

Unnamed: 0,Identifier,Edition Statement,Place of Publication,Date of Publication,Publisher,Title,Author,Contributors,Corporate Author,Corporate Contributors,Former owner,Engraver,Issuance type,Flickr URL,Shelfmarks
0,206,,London,1879 [1878],S. Tinsley & Co.,Walter Forbes. [A novel.] By A. A,A. A.,"FORBES, Walter.",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12641.b.30.
1,216,,London; Virtue & Yorston,1868,Virtue & Co.,All for Greed. [A novel. The dedication signed...,"A., A. A.","BLAZE DE BURY, Marie Pauline Rose - Baroness",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12626.cc.2.
2,218,,London,1869,"Bradbury, Evans & Co.",Love the Avenger. By the author of “All for Gr...,"A., A. A.","BLAZE DE BURY, Marie Pauline Rose - Baroness",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12625.dd.1.
3,472,,London,1851,James Darling,"Welsh Sketches, chiefly ecclesiastical, to the...","A., E. S.","Appleyard, Ernest Silvanus.",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 10369.bbb.15.
4,480,"A new edition, revised, etc.",London,1857,Wertheim & Macintosh,"[The World in which I live, and my place in it...","A., E. S.","BROOME, John Henry.",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 9007.d.28.


In [4]:
# Show the column labels, followed by how many there are.
column_labels = books.columns
print(column_labels)
print(len(column_labels))


Index(['Identifier', 'Edition Statement', 'Place of Publication',
       'Date of Publication', 'Publisher', 'Title', 'Author', 'Contributors',
       'Corporate Author', 'Corporate Contributors', 'Former owner',
       'Engraver', 'Issuance type', 'Flickr URL', 'Shelfmarks'],
      dtype='object')
15


## Removing Columns

In [5]:
# Columns to discard before cleaning the remaining fields.
unneeded_columns = [
    'Edition Statement',
    'Corporate Author',
    'Corporate Contributors',
    'Former owner',
    'Engraver',
    'Contributors',
    'Issuance type',
    'Shelfmarks',
]

# Rebind `books` to the trimmed frame instead of mutating it with
# inplace=True: the result is the same, but the statement reads as a plain
# transformation and can be chained with further steps.
books = books.drop(columns=unneeded_columns)
len(books.columns)

7

## Changing the Index

In [6]:
# Use the 'Identifier' column as the row index so records can be looked up
# by identifier with .loc. Rebinding (rather than inplace=True) keeps the
# step a plain transformation that can be chained.
books = books.set_index('Identifier')
books.head()

Unnamed: 0_level_0,Place of Publication,Date of Publication,Publisher,Title,Author,Flickr URL
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
206,London,1879 [1878],S. Tinsley & Co.,Walter Forbes. [A novel.] By A. A,A. A.,http://www.flickr.com/photos/britishlibrary/ta...
216,London; Virtue & Yorston,1868,Virtue & Co.,All for Greed. [A novel. The dedication signed...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
218,London,1869,"Bradbury, Evans & Co.",Love the Avenger. By the author of “All for Gr...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
472,London,1851,James Darling,"Welsh Sketches, chiefly ecclesiastical, to the...","A., E. S.",http://www.flickr.com/photos/britishlibrary/ta...
480,London,1857,Wertheim & Macintosh,"[The World in which I live, and my place in it...","A., E. S.",http://www.flickr.com/photos/britishlibrary/ta...


In [7]:
# Look up a single record by its identifier (label-based, not positional).
books.loc[480, :]

Place of Publication                                               London
Date of Publication                                                  1857
Publisher                                            Wertheim & Macintosh
Title                   [The World in which I live, and my place in it...
Author                                                          A., E. S.
Flickr URL              http://www.flickr.com/photos/britishlibrary/ta...
Name: 480, dtype: object

## Cleaning Columns Data

In [8]:
# Inspect the first 20 raw publication dates from identifier 1905 onwards.
dates_from_1905 = books.loc[1905:, 'Date of Publication']
dates_from_1905.head(20)

Identifier
1905                  1888
1929           1839, 38-54
2836                  1897
2854                  1865
2956               1860-63
2957                  1873
3017                  1866
3131                  1899
4598                  1814
4884                  1820
4976                  1800
5382    1847, 48 [1846-48]
5385               [1897?]
5389               [1897?]
5432                  1893
6036                  1805
6821                  1837
7521                  1896
7630                  1898
8239                  1899
Name: Date of Publication, dtype: object

Cleaning steps needed:
* Remove extra dates in brackets
* Convert ranges to their start date
* Remove dates marked with ?
* Convert missing values (nan) to NumPy NaN

In [34]:
# Capture the four digits at the very start of the string (^ anchors the
# match to the beginning; the parentheses form the single capture group).
# Values that do not start with four digits, e.g. "[1897?]", do not match.
regex = r'^(\d{4})'

In [35]:
# Pull the leading four-digit year out of each raw date string.
# expand=False returns a Series rather than a one-column DataFrame.
raw_dates = books['Date of Publication']
date_reformat = raw_dates.str.extract(regex, expand=False)
date_reformat.head()

Identifier
206    1879
216    1868
218    1869
472    1851
480    1857
Name: Date of Publication, dtype: object

In [37]:
# Convert the extracted year strings to numbers and store them back in the
# frame; entries where no year was extracted stay missing (NaN).
publication_year = pd.to_numeric(date_reformat)
books['Date of Publication'] = publication_year

In [43]:
print(books.shape)  # (rows, columns)
# Count the rows left without a usable publication year. In the recorded
# output this is 971 of 8287 rows, i.e. roughly 12%.
missing_years = books['Date of Publication'].isnull()
missing_years.sum()

(8287, 6)


971

## Cleaning Field Text

In [48]:
# The same city shows up in several spellings, e.g. 'London', 'London]'
# and 'London; Virtue & Yorston'.
places = books['Place of Publication']
places.head(10)

Identifier
206                                  London
216                London; Virtue & Yorston
218                                  London
472                                  London
480                                  London
481                                  London
519                                  London
667     pp. 40. G. Bryan & Co: Oxford, 1898
874                                 London]
1143                                 London
Name: Place of Publication, dtype: object

In [44]:
# A record published in Newcastle, spelled with hyphens.
books.loc[4157862, :]

Place of Publication                                  Newcastle-upon-Tyne
Date of Publication                                                  1867
Publisher                                                      T. Fordyce
Title                   Local Records; or, Historical Register of rema...
Author                      FORDYCE, T. - Printer, of Newcastle-upon-Tyne
Flickr URL              http://www.flickr.com/photos/britishlibrary/ta...
Name: 4157862, dtype: object

In [46]:
# Also published in Newcastle, but here the place is spelled with spaces
# ("Newcastle upon Tyne") where record 4157862 uses hyphens.
books.loc[4159587, :]

Place of Publication                                  Newcastle upon Tyne
Date of Publication                                                  1834
Publisher                                                Mackenzie & Dent
Title                   An historical, topographical and descriptive v...
Author                                              Mackenzie, E. (Eneas)
Flickr URL              http://www.flickr.com/photos/britishlibrary/ta...
Name: 4159587, dtype: object

In [53]:
# Flag the rows whose place of publication mentions each city.
# na=False makes a missing place count as "no match". Without it the mask
# would contain NaN for such rows, and np.where treats NaN as truthy, so
# they would be mislabelled when the masks are applied.
published = books["Place of Publication"]
london_based = published.str.contains('London', na=False)
oxford_based = published.str.contains('Oxford', na=False)
london_based[:5]

Identifier
206    True
216    True
218    True
472    True
480    True
Name: Place of Publication, dtype: bool

In [59]:
# Normalise the place names in three steps, from lowest to highest priority:
#   1. every other place: replace hyphens with spaces;
#   2. anything mentioning Oxford becomes 'Oxford';
#   3. anything mentioning London becomes 'London' (wins over Oxford).
dehyphenated = published.str.replace('-', ' ')
oxford_or_other = np.where(oxford_based, 'Oxford', dehyphenated)
books['Place of Publication'] = np.where(london_based, 'London', oxford_or_other)

In [60]:
# Re-check the first ten places after normalisation.
books['Place of Publication'].head(n=10)

Identifier
206     London
216     London
218     London
472     London
480     London
481     London
519     London
667     Oxford
874     London
1143    London
Name: Place of Publication, dtype: object