## Downloading and cleaning some books:

In [1]:
import urllib.request
import re
import numpy as np
import pandas as pd
import feather

In [2]:
# Load the books table from the local CSV file, decoding it as latin-1.
# NOTE(review): the preview below suggests one row per line of book text, keyed by
# gutenberg_id -- confirm against the source of books.csv.
books = pd.read_csv('books.csv', encoding='latin-1')

In [3]:
books.head()

Unnamed: 0,gutenberg_id,text,title,author
0,11,ALICE'S ADVENTURES IN WONDERLAND,Alice's Adventures in Wonderland,"Carroll, Lewis"
1,11,,Alice's Adventures in Wonderland,"Carroll, Lewis"
2,11,Lewis Carroll,Alice's Adventures in Wonderland,"Carroll, Lewis"
3,11,,Alice's Adventures in Wonderland,"Carroll, Lewis"
4,11,THE MILLENNIUM FULCRUM EDITION 3.0,Alice's Adventures in Wonderland,"Carroll, Lewis"


In [4]:
# Replace every missing value in the frame with an empty string so that the
# per-book string joins performed later never encounter NaN.
books = books.fillna('')
# Preview only the text/title columns after the fill.
books[['text','title']].head()

In [8]:
# Fill missing values *before* casting to str. astype(str) turns NaN into the
# literal string 'nan', which a later fillna('') can no longer catch and which
# would then leak into the joined book text. Doing the fill first makes this
# cell correct on its own, regardless of which earlier cells were executed.
text_cols = ['text', 'title', 'author']
books = books.fillna('')
books[text_cols] = books[text_cols].astype(str)

In [9]:
books.head()

Unnamed: 0,gutenberg_id,text,title,author
0,11,ALICE'S ADVENTURES IN WONDERLAND,Alice's Adventures in Wonderland,"Carroll, Lewis"
1,11,,Alice's Adventures in Wonderland,"Carroll, Lewis"
2,11,Lewis Carroll,Alice's Adventures in Wonderland,"Carroll, Lewis"
3,11,,Alice's Adventures in Wonderland,"Carroll, Lewis"
4,11,THE MILLENNIUM FULCRUM EDITION 3.0,Alice's Adventures in Wonderland,"Carroll, Lewis"


In [10]:
# Distinct titles in order of first appearance in the frame.
book_titles = books['title'].unique()

In [11]:
# Convert the result of unique() into a plain Python list (this is the object
# that gets pickled at the end of the notebook).
book_titles = list(book_titles)

In [12]:
len(book_titles)

1745

In [13]:
book_titles[0:5]

["Alice's Adventures in Wonderland",
 'Through the Looking-Glass',
 'The Hunting of the Snark: An Agony in Eight Fits',
 'Moby Dick',
 'The Song of Hiawatha']

In [14]:
# Make a list holding the complete text of each distinct title: all `text` rows
# that share a title are joined with single spaces.
#
# A single groupby pass replaces re-scanning the whole frame once per title.
# sort=False keeps the groups in first-appearance order, i.e. the same order as
# books['title'].unique(), so list positions are unchanged.
# (Relies on `books` having no NaN titles, which the earlier fillna('') ensures.)
#
# NOTE(review): grouping by title merges different books that happen to share a
# title; the notebook later regroups by gutenberg_id for that reason.
concat_books_list = (
    books.groupby('title', sort=False)['text']
    .apply(' '.join)
    .tolist()
)

In [15]:
len(books['gutenberg_id'].unique())

1756

In [16]:
concat_books_list[6][:1000]

"The War of the Worlds  by H. G. Wells [1898]        But who shall dwell in these worlds if they be      inhabited? .  .  .  Are we or they Lords of the      World? .  .  .  And how are all things made for man?--           KEPLER (quoted in The Anatomy of Melancholy)    BOOK ONE  THE COMING OF THE MARTIANS    CHAPTER ONE  THE EVE OF THE WAR   No one would have believed in the last years of the nineteenth century that this world was being watched keenly and closely by intelligences greater than man's and yet as mortal as his own; that as men busied themselves about their various concerns they were scrutinised and studied, perhaps almost as narrowly as a man with a microscope might scrutinise the transient creatures that swarm and multiply in a drop of water.  With infinite complacency men went to and fro over this globe about their little affairs, serene in their assurance of their empire over matter.  It is possible that the infusoria under the microscope do the same.  No one gave a th

In [17]:
# Distinct author names in first-appearance order. Note this yields one entry
# per *author*, not one per book, so it is not positionally aligned with the
# per-book lists built elsewhere in this notebook.
book_author = pd.unique(books['author'])
book_author_list = [author_name for author_name in book_author]

In [18]:
book_author_list[1]

'Melville, Herman'

In [19]:
# Keep only the per-book metadata columns; the rows are still one per text line
# here, so duplicates are dropped in the cells that follow.
concat_df = books[['gutenberg_id','author','title']]

In [20]:
len(concat_df.drop_duplicates())

1756

In [21]:
len(book_titles)

1745

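#### These two counts should match, but they don't (1756 distinct books vs. 1745 distinct titles): some different books share the same title, so the text has to be grouped by `gutenberg_id` rather than by `title`.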

In [22]:
# Make a list holding the complete text of each book: all `text` rows that share
# a gutenberg_id are joined with single spaces.
#
# A single groupby pass replaces re-scanning the whole frame once per book id.
# sort=False keeps the groups in first-appearance order, i.e. the same order as
# books['gutenberg_id'].unique(), so list positions are unchanged.
# (Relies on `books` having no NaN ids, which the earlier fillna('') ensures.)
concat_books_list = (
    books.groupby('gutenberg_id', sort=False)['text']
    .apply(' '.join)
    .tolist()
)

In [23]:
concat_books_list[6][:1000]

"The War of the Worlds  by H. G. Wells [1898]        But who shall dwell in these worlds if they be      inhabited? .  .  .  Are we or they Lords of the      World? .  .  .  And how are all things made for man?--           KEPLER (quoted in The Anatomy of Melancholy)    BOOK ONE  THE COMING OF THE MARTIANS    CHAPTER ONE  THE EVE OF THE WAR   No one would have believed in the last years of the nineteenth century that this world was being watched keenly and closely by intelligences greater than man's and yet as mortal as his own; that as men busied themselves about their various concerns they were scrutinised and studied, perhaps almost as narrowly as a man with a microscope might scrutinise the transient creatures that swarm and multiply in a drop of water.  With infinite complacency men went to and fro over this globe about their little affairs, serene in their assurance of their empire over matter.  It is possible that the infusoria under the microscope do the same.  No one gave a th

In [24]:
# One row per distinct (gutenberg_id, author, title) combination, keeping the
# original row label of each combination's first occurrence.
metadata_cols = ['gutenberg_id', 'author', 'title']
concat_df = books.loc[:, metadata_cols].drop_duplicates()
concat_df.head(7)

Unnamed: 0,gutenberg_id,author,title
0,11,"Carroll, Lewis",Alice's Adventures in Wonderland
3339,12,"Carroll, Lewis",Through the Looking-Glass
7246,13,"Carroll, Lewis",The Hunting of the Snark: An Agony in Eight Fits
8086,15,"Melville, Herman",Moby Dick
8248,19,"Longfellow, Henry Wadsworth",The Song of Hiawatha
14663,35,"Wells, H. G. (Herbert George)",The Time Machine
17884,36,"Wells, H. G. (Herbert George)",The War of the Worlds


In [25]:
# Attach each book's full text by gutenberg_id instead of by list position.
# The positional form (`concat_df['text'] = concat_books_list`) silently relies on
# two independently built orderings agreeing, and it mutates a frame derived from
# a slice of `books` (which can trigger SettingWithCopyWarning on some pandas
# versions). Mapping on the key makes the alignment explicit, and `assign`
# returns a new frame, so re-running this cell is safe.
book_ids = books['gutenberg_id'].unique()  # same order concat_books_list was built in
assert len(book_ids) == len(concat_books_list), "expected exactly one text per gutenberg_id"
text_by_id = dict(zip(book_ids, concat_books_list))
concat_df = concat_df.assign(text=concat_df['gutenberg_id'].map(text_by_id))
concat_df.head(7)

Unnamed: 0,gutenberg_id,author,title,text
0,11,"Carroll, Lewis",Alice's Adventures in Wonderland,ALICE'S ADVENTURES IN WONDERLAND Lewis Carrol...
3339,12,"Carroll, Lewis",Through the Looking-Glass,THROUGH THE LOOKING-GLASS By Lewis Carroll ...
7246,13,"Carroll, Lewis",The Hunting of the Snark: An Agony in Eight Fits,THE HUNTING OF THE SNARK ...
8086,15,"Melville, Herman",Moby Dick,The reader will find a complete text and html ...
8248,19,"Longfellow, Henry Wadsworth",The Song of Hiawatha,THE SONG OF HIAWATHA ...
14663,35,"Wells, H. G. (Herbert George)",The Time Machine,"The Time Machine, by H. G. Wells [1898] I ..."
17884,36,"Wells, H. G. (Herbert George)",The War of the Worlds,The War of the Worlds by H. G. Wells [1898] ...


In [26]:
import pickle

# Persist the combined frame and the helper lists to disk as pickles.
# `with` guarantees every file handle is closed even if pickle.dump raises,
# and the mapping keeps filename -> object pairs in one place instead of four
# copy-pasted open/dump/close stanzas.
#
# NOTE: these objects are not all positionally aligned with each other:
# book_author_list holds distinct authors and book_titles holds distinct titles,
# whereas concat_df / concat_books_list have one entry per gutenberg_id.
pickle_targets = {
    "R_books_df.pkl": concat_df,
    "R_book_authors_list.pkl": book_author_list,
    "R_books_list.pkl": concat_books_list,
    "R_book_titles_list.pkl": book_titles,
}

for pickle_path, payload in pickle_targets.items():
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(payload, pickle_file)