# Import Important Libraries

In [132]:
from urllib.request import urlopen 
from urllib.error import HTTPError
import json 
import numpy as np
import csv
import re
import random

# Pull Data From Website

In [1]:
def pull_data(min_yr = 1500, max_yr = 2000, num_pgs = 20):
    data = []

    for yr in range(min_yr, max_yr, 100):
        for pg in range(1, num_pgs + 1, 1):
            print(f"Processing Year {yr}, Page {pg}/{num_pgs}", end='\r')
            url = f"http://gutendex.com/books?author_year_start={yr}&author_year_start={yr + 99}&languages=en&page={pg}"
            response = urlopen(url)
            data_json = json.loads(response.read()) 
            data_pg = np.array([(x['id'], x['authors'][0]['birth_year']) for x in data_json['results']])

            data.extend(data_pg)
    return np.array(data)

## Save Data if data isn't saved

In [6]:
try:
    data = np.genfromtxt('data/date_data.csv', delimiter=',')[1:]
except FileNotFoundError:
    data = pull_data()
    data_csv = {'book_id': data[:,0], 'birth_yr': data[:,1]}
    with open('data/date_data.csv', 'w') as f:
        w = csv.writer(f)
        w.writerow(data_csv.keys())
        w.writerows(zip(*data_csv.values()))

In [7]:
data[:5]

array([[   84.,  1797.],
       [ 1513.,  1564.],
       [ 1342.,  1775.],
       [25344.,  1804.],
       [  345.,  1847.]])

# Get Text Data

In [155]:
def get_text(book_id):
    url = f"https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt"
    text = urlopen(url).read()
    return text

def get_text_samples(text, num_samples = 3):
    # Get rid of Guttenberg Header and footer
    book_text = [x.strip() for x in text.decode("utf-8").split('***')][2]
    # Remove '\r' symbol
    book_text = re.sub(r"[\r]+", "", book_text)
    # split by paragraph breaks
    book_text = re.split(r"\n{2,}", book_text)
    # remove paragraphs 
    book_text = list(filter(lambda x: len(x) >= (50 * 8), np.array(book_text)))
    # Randomly sample remaining paragraphs
    paragraphs = random.sample(book_text, min(num_samples, len(book_text)))
    # Replace \n with ' ' and return paragraphs
    return np.array([re.sub(r"\n", " ", p) for p in paragraphs])

In [160]:
book_ids = data[:,0].astype(int)
book_samples = []
invalid_ids = []

for i in range(book_ids.shape[0]):
    try:
        text = get_text(book_ids[i])
    except HTTPError as err:
        print(f"HTTP {err.code} Error: book_id = {book_ids[i]}")
        invalid_ids.append(book_ids[i])
        
    text_samples = get_text_samples(text)
    ids = np.full(len(text_samples), book_ids[i])
    samples = np.array(list(zip(ids, text_samples)))
    book_samples.extend(samples)
    
    print(f"Progress: {i/book_ids.shape[0]}", end='\r')


HTTP 404 Error: book_id = 33283
HTTP 404 Error: book_id = 5740
HTTP 404 Error: book_id = 38769
HTTP 404 Error: book_id = 21076
HTTP 404 Error: book_id = 3201
HTTP 404 Error: book_id = 41568
HTTP 404 Error: book_id = 51155
HTTP 404 Error: book_id = 114
HTTP 404 Error: book_id = 26471
HTTP 404 Error: book_id = 33283
HTTP 404 Error: book_id = 5740
HTTP 404 Error: book_id = 38769
HTTP 404 Error: book_id = 21076
HTTP 404 Error: book_id = 3201
HTTP 404 Error: book_id = 41568
HTTP 404 Error: book_id = 51155
HTTP 404 Error: book_id = 114
HTTP 404 Error: book_id = 26471
HTTP 404 Error: book_id = 29785
HTTP 404 Error: book_id = 5001
HTTP 404 Error: book_id = 19797
HTTP 404 Error: book_id = 33283
HTTP 404 Error: book_id = 5740
HTTP 404 Error: book_id = 38769
HTTP 404 Error: book_id = 21076
HTTP 404 Error: book_id = 3201
HTTP 404 Error: book_id = 41568
HTTP 404 Error: book_id = 51155
HTTP 404 Error: book_id = 114
HTTP 404 Error: book_id = 26471
HTTP 404 Error: book_id = 29785
HTTP 404 Error: book_

In [162]:
book_samples = np.array(book_samples)

In [163]:
book_samples[:,0]

array(['84', '84', '84', ..., '60946', '60946', '60946'], dtype='<U57303')

In [164]:
book_data_csv = {'book_id': book_samples[:,0], 'text': book_samples[:,1]}
with open('data/excerpts.csv', 'w') as f:
    w = csv.writer(f)
    w.writerow(book_data_csv.keys())
    w.writerows(zip(*book_data_csv.values()))