In [1]:
import pandas as pd

In [3]:
# Load the metadata frame stored under key 'pg' in the merged HDF5 file.
df = pd.read_hdf('pg-text-1-gb-merged.hdf', key='pg')

In [8]:
# List the frame's column labels to see which fields are available.
df.columns

Index(['LCC', 'author', 'authoryearofbirth', 'authoryearofdeath', 'downloads',
       'formats', 'id', 'language_GB', 'subjects_GB', 'title_GB', 'type',
       '_repo', '_version', 'alternative_title', 'contributor', 'covers',
       'creator', 'description', 'edition_identifiers', 'edition_note',
       'gutenberg_bookshelf', 'gutenberg_issued', 'gutenberg_type',
       'identifiers', 'jauthor', 'jbookid', 'jlang', 'jloc', 'jmdate', 'jsubj',
       'jtitle', 'language_PG', 'language_note', 'production_note',
       'publication_date', 'publication_note', 'publisher', 'rights',
       'rights_url', 'series_note', 'subjects_PG', 'summary',
       'tableOfContents', 'title_PG', 'titlepage_image', 'url', 'wikipedia'],
      dtype='object')

In [12]:
# jauthor seems to just be the author plus birth and death dates, 
# and that information is already recorded in 
# columns 'authoryearofbirth' and 'authoryearofdeath'. Drop it.  

In [11]:
# Drop the redundant 'jauthor' column.
# The axis is passed by keyword because the positional form
# (`df.drop(label, 1)`) was removed in pandas 2.0.
df = df.drop('jauthor', axis='columns')

In [15]:
# 'jtitle' doesn't add anything new either, so drop it.
# The axis is passed by keyword because the positional form was removed in pandas 2.0.
df = df.drop('jtitle', axis='columns')


In [24]:
# It looks like GITenberg titles (title_GB) replace line breaks with colons,
# which is easier to work with.
# The PG titles also seem to be 'nan' for quite a lot of rows.
# Both are good reasons to drop title_PG.
# The axis is passed by keyword because the positional form was removed in pandas 2.0.
df = df.drop('title_PG', axis='columns')

In [42]:
# 'title_GB' becomes the canonical 'title' column.
df = df.rename(columns=dict(title_GB='title'))

In [35]:
# 'jloc' is a subset of 'LCC', so it carries no extra information.
# The axis is passed by keyword because the positional form was removed in pandas 2.0.
df = df.drop('jloc', axis='columns')

In [54]:
# Re-check the column labels after the drops/renames so far.
# NOTE(review): the recorded output already lists 'languages' and has no
# 'jlang' / 'language_PG', so this cell was last run after the language
# cleanup cells that appear further down; a fresh top-to-bottom run will
# show different columns here.
df.columns

Index(['LCC', 'author', 'authoryearofbirth', 'authoryearofdeath', 'downloads',
       'formats', 'id', 'languages', 'subjects_GB', 'title', 'type', '_repo',
       '_version', 'alternative_title', 'contributor', 'covers', 'creator',
       'description', 'edition_identifiers', 'edition_note',
       'gutenberg_bookshelf', 'gutenberg_issued', 'gutenberg_type',
       'identifiers', 'jbookid', 'jmdate', 'jsubj', 'language_note',
       'production_note', 'publication_date', 'publication_note', 'publisher',
       'rights', 'rights_url', 'series_note', 'subjects_PG', 'summary',
       'tableOfContents', 'titlepage_image', 'url', 'wikipedia'],
      dtype='object')

In [48]:
# GITenberg has far more language data than PG, so keep language_GB and
# drop language_PG.
# The axis is passed by keyword because the positional form was removed in pandas 2.0.
df = df.drop('language_PG', axis='columns')

In [51]:
# Drop 'jlang' as well; only the GITenberg language column is kept.
# The axis is passed by keyword because the positional form was removed in pandas 2.0.
df = df.drop('jlang', axis='columns')

In [49]:
# The remaining language column gets the plain name 'languages'.
df = df.rename(columns=dict(language_GB='languages'))

In [52]:
# Reconcile the subject columns: compare subjects_GB, subjects_PG and jsubj, then keep only what is not redundant.

In [55]:
# Preview the title next to the three candidate subject columns.
# .head(10) bounds the output instead of rendering the whole frame.
df[['title', 'subjects_GB', 'subjects_PG', 'jsubj']].head(10)

Unnamed: 0,title,subjects_GB,subjects_PG,jsubj
0,,{},,
1,The Declaration of Independence of the United ...,"{United States -- History -- Revolution, 1775-...","['!lcc E201', '!lcsh United States -- History ...",
2,The United States Bill of Rights: The Ten Orig...,"{Civil rights -- United States -- Sources, Uni...",['!lcsh Civil rights -- United States -- Sourc...,['Government -- United States']
3,John F. Kennedy's Inaugural Address,{Presidents -- United States -- Inaugural addr...,['!lcsh Presidents -- United States -- Inaugur...,
4,Lincoln's Gettysburg Address: Given November 1...,"{Lincoln, Abraham, 1809-1865. Gettysburg addre...","['!lcc E456', ""!lcsh Soldiers' National Cemete...",
5,The United States Constitution,"{United States. Constitution, United States --...",['!lcsh United States -- Politics and governme...,
6,Give Me Liberty or Give Me Death,{Virginia -- Politics and government -- 1775-1...,['!lcsh United States -- Politics and governme...,
7,The Mayflower Compact,"{Massachusetts -- History -- New Plymouth, 162...","['!lcsh Mayflower Compact (1620)', '!lcsh Mass...",
8,Abraham Lincoln's Second Inaugural Address,{Presidents -- United States -- Inaugural addr...,['!lcsh United States -- Politics and governme...,
9,Abraham Lincoln's First Inaugural Address,{Presidents -- United States -- Inaugural addr...,['!lcsh Presidents -- United States -- Inaugur...,


In [65]:
# Look at one row's GITenberg subjects (single label-based lookup).
df.loc[26, 'subjects_GB']

{'Adam (Biblical figure) -- Poetry',
 'Bible. Genesis -- History of Biblical events -- Poetry',
 'Eve (Biblical figure) -- Poetry',
 'Fall of man -- Poetry'}

In [64]:
# Same row, PG subjects, for comparison with the other subject columns.
df.loc[26, 'subjects_PG']

"['!lcsh Eve (Biblical figure) -- Poetry', '!lcsh Adam (Biblical figure) -- Poetry', '!lcsh Bible. Genesis -- History of Biblical events -- Poetry', '!lcsh Fall of man -- Poetry', '!lcc PR', 'GITenberg']"

In [63]:
# Same row, 'jsubj' value, for comparison with the other subject columns.
df.loc[26, 'jsubj']

"['Poetry', 'Religion']"

In [66]:
# Drop 'subjects_PG'; the subjects_GB and jsubj columns are kept.
# The axis is passed by keyword because the positional form was removed in pandas 2.0.
df = df.drop('subjects_PG', axis='columns')

In [None]:
# Quick look at the first rows; avoids rendering the entire frame.
df.head()

In [67]:
# 'subjects_GB' is renamed to 'lcsh'.
df = df.rename(columns=dict(subjects_GB='lcsh'))

In [68]:
# 'jsubj' is renamed to 'subjects'.
df = df.rename(columns=dict(jsubj='subjects'))

In [74]:
# Show only the rows that actually have a production note.
df['production_note'].dropna()

4231                          EBook produced by Col Choat.
6585     Produced by Steve Schulze, Charles Franks and ...
12272    EBook produced by David Starner and Heather Ma...
Name: production_note, dtype: object

In [77]:
# Can't identify what 'jbookid' refers to, so drop it.
# The axis is passed by keyword because the positional form was removed in pandas 2.0.
df = df.drop('jbookid', axis='columns')

In [94]:
# Inspect the 'formats' value of a single row.
df.loc[53158, 'formats']

{'application/epub+zip': 'http://www.gutenberg.org/ebooks/54409.epub.noimages',
 'application/rdf+xml': 'http://www.gutenberg.org/ebooks/54409.rdf',
 'application/x-mobipocket-ebook': 'http://www.gutenberg.org/ebooks/54409.kindle.images',
 'image/jpeg': 'http://www.gutenberg.org/cache/epub/54409/pg54409.cover.small.jpg',
 'text/html; charset=utf-8': 'http://www.gutenberg.org/files/54409/54409-h/54409-h.htm',
 'text/plain; charset=utf-8': 'http://www.gutenberg.org/files/54409/54409-0.txt'}

In [95]:
# Final check of the column labels before the frame is written out.
df.columns

Index(['LCC', 'author', 'authoryearofbirth', 'authoryearofdeath', 'downloads',
       'formats', 'id', 'languages', 'lcsh', 'title', 'type', '_repo',
       '_version', 'alternative_title', 'contributor', 'covers', 'creator',
       'description', 'edition_identifiers', 'edition_note',
       'gutenberg_bookshelf', 'gutenberg_issued', 'gutenberg_type',
       'identifiers', 'jmdate', 'subjects', 'language_note', 'production_note',
       'publication_date', 'publication_note', 'publisher', 'rights',
       'rights_url', 'series_note', 'summary', 'tableOfContents',
       'titlepage_image', 'url', 'wikipedia'],
      dtype='object')

In [97]:
# Persist the cleaned frame under key 'pg'.
# `key` is passed by keyword: recent pandas deprecates positional arguments
# to to_hdf other than the path.
df.to_hdf('pg-text-2-cleaned.hdf', key='pg')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_values] [items->['LCC', 'author', 'authoryearofbirth', 'authoryearofdeath', 'downloads', 'formats', 'id', 'languages', 'lcsh', 'title', 'type', '_repo', '_version', 'alternative_title', 'contributor', 'covers', 'creator', 'description', 'edition_identifiers', 'edition_note', 'gutenberg_bookshelf', 'gutenberg_issued', 'gutenberg_type', 'identifiers', 'jmdate', 'subjects', 'language_note', 'production_note', 'publication_date', 'publication_note', 'publisher', 'rights', 'rights_url', 'series_note', 'summary', 'tableOfContents', 'titlepage_image', 'url', 'wikipedia']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [98]:
# Show the first rows of the cleaned frame rather than rendering all of it.
df.head(10)

Unnamed: 0,LCC,author,authoryearofbirth,authoryearofdeath,downloads,formats,id,languages,lcsh,title,...,publication_note,publisher,rights,rights_url,series_note,summary,tableOfContents,titlepage_image,url,wikipedia
0,{},,,,,{},0,,{},,...,,,,,,,,,,
1,"{E201, JK}","Jefferson, Thomas",1743,1826,619,{'text/html': 'http://www.gutenberg.org/ebooks...,1,[en],"{United States -- History -- Revolution, 1775-...",The Declaration of Independence of the United ...,...,,Project Gutenberg,Public domain in the USA.,http://creativecommons.org/about/pdm,,,,,http://www.gutenberg.org/ebooks/1,
2,"{KF, JK}",United States,,,189,{'text/html': 'http://www.gutenberg.org/files/...,2,[en],"{Civil rights -- United States -- Sources, Uni...",The United States Bill of Rights: The Ten Orig...,...,,Project Gutenberg,Public domain in the USA.,http://creativecommons.org/about/pdm,,,,,http://www.gutenberg.org/ebooks/2,
3,{E838},"Kennedy, John F. (John Fitzgerald)",1917,1963,36,{'text/html': 'http://www.gutenberg.org/files/...,3,[en],{Presidents -- United States -- Inaugural addr...,John F. Kennedy's Inaugural Address,...,,Project Gutenberg,Public domain in the USA.,http://creativecommons.org/about/pdm,,,,,http://www.gutenberg.org/ebooks/3,
4,{E456},"Lincoln, Abraham",1809,1865,66,{'text/html': 'http://www.gutenberg.org/files/...,4,[en],"{Lincoln, Abraham, 1809-1865. Gettysburg addre...",Lincoln's Gettysburg Address: Given November 1...,...,,Project Gutenberg,Public domain in the USA.,http://creativecommons.org/about/pdm,,,,,http://www.gutenberg.org/ebooks/4,
5,"{KF, JK}",United States,,,425,{'text/plain': 'http://www.gutenberg.org/ebook...,5,[en],"{United States. Constitution, United States --...",The United States Constitution,...,,Project Gutenberg,Public domain in the USA.,http://creativecommons.org/about/pdm,,,,,http://www.gutenberg.org/ebooks/5,
6,{E201},"Henry, Patrick",1736,1799,90,{'text/html': 'http://www.gutenberg.org/files/...,6,[en],{Virginia -- Politics and government -- 1775-1...,Give Me Liberty or Give Me Death,...,,Project Gutenberg,Public domain in the USA.,http://creativecommons.org/about/pdm,,,,,http://www.gutenberg.org/ebooks/6,
7,{F001},,,,24,{'text/html': 'http://www.gutenberg.org/files/...,7,[en],"{Massachusetts -- History -- New Plymouth, 162...",The Mayflower Compact,...,,Project Gutenberg,Public domain in the USA.,http://creativecommons.org/about/pdm,,,,,http://www.gutenberg.org/ebooks/7,
8,{E456},"Lincoln, Abraham",1809,1865,29,{'text/html; charset=us-ascii': 'http://www.gu...,8,[en],{Presidents -- United States -- Inaugural addr...,Abraham Lincoln's Second Inaugural Address,...,,Project Gutenberg,Public domain in the USA.,http://creativecommons.org/about/pdm,,,,,http://www.gutenberg.org/ebooks/8,
9,{E456},"Lincoln, Abraham",1809,1865,39,{'text/plain; charset=us-ascii': 'http://www.g...,9,[en],{Presidents -- United States -- Inaugural addr...,Abraham Lincoln's First Inaugural Address,...,,Project Gutenberg,Public domain in the USA.,http://creativecommons.org/about/pdm,,,,,http://www.gutenberg.org/ebooks/9,
