In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# define data locations

data_dir = 'Gutenberg_English_Fiction_1k/'
target_file = 'master996.csv'
corpus_dir = 'Gutenberg_19th_century_English_Fiction/'

In [3]:
# import target

data = pd.read_csv(data_dir + target_file, sep=';', engine='python')
data.loc[:]['book_id'] = data['book_id'].apply(lambda book_id: book_id[:-5]) # remove '.epub' ending
data.set_index('book_id', inplace=True)
data

Unnamed: 0_level_0,Book_Name,guten_genre,Author_Name
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
pg10067,The Mystery of the Boule Cabinet: A Detective ...,Detective and Mystery,Stevenson| Burton Egbert
pg1032,The Pupil,Literary,James| Henry
pg10379,At Love's Cost,Literary,Garvice| Charles
pg10473,The Heart of the Range,Western Stories,White| William Patterson
pg10812,The Worshipper of the Image,Literary,Gallienne| Richard Le
...,...,...,...
pg766DickensDavidCopfld,David Copperfield,Literary,Dickens| Charles
pg786DickensHardTimes,Hard Times,Literary,Dickens| Charles
pg834DoyleMemoirsSherlk,Memoirs of Shelock Holmes,Detective and Mystery,Connan| Doyle
pg863Agatha1,The Mysterious Affair at Styles,Detective and Mystery,Christie| Agatha


In [4]:
# import corpus

def get_book_content(book_id):      
    filename = data_dir + corpus_dir + book_id + '-content.html'
    
    with open(filename, encoding='utf-8') as file:        
        try:
            content = file.read()
            
        except UnicodeDecodeError:
            print('UnicodeDecodeError trying to read {}. Returning None.'.format(book_id))
            return None
        
        content = content.replace('<p>','')   
        
    return content

data['content'] = [get_book_content(book_id) for book_id in data.index]
data

Unnamed: 0_level_0,Book_Name,guten_genre,Author_Name,content
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
pg10067,The Mystery of the Boule Cabinet: A Detective ...,Detective and Mystery,Stevenson| Burton Egbert,A Detective Story\nA.B.M. Fellow-Sherlockian\n...
pg1032,The Pupil,Literary,James| Henry,This edition first published 1916\nThe text fo...
pg10379,At Love's Cost,Literary,Garvice| Charles,"""Until this moment I have never fully realised..."
pg10473,The Heart of the Range,Western Stories,White| William Patterson,"""The Rider of Golden Bar,"" ""Hidden Trails,"" ""L..."
pg10812,The Worshipper of the Image,Literary,Gallienne| Richard Le,The Worshipper of the Image\nEvening was in th...
...,...,...,...,...
pg766DickensDavidCopfld,David Copperfield,Literary,Dickens| Charles,I do not find it easy to get sufficiently far ...
pg786DickensHardTimes,Hard Times,Literary,Dickens| Charles,The One Thing Needful\nMurdering the Innocents...
pg834DoyleMemoirsSherlk,Memoirs of Shelock Holmes,Detective and Mystery,Connan| Doyle,"""I am afraid, Watson, that I shall have to go,..."
pg863Agatha1,The Mysterious Affair at Styles,Detective and Mystery,Christie| Agatha,The intense interest aroused in the public by ...


In [5]:
#to faster debug, i use a smaller subset of the texts right now:
data_content = data['content'][:5]

In [6]:
import maPrepro
data_content_filtered = maPrepro.prepare_texts(data_content, use_stemming=True)

Text Count:  5  Progress: 
0

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Marcu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Marcu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Marcu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


1234

In [7]:
#--------------------------
#Tokenization to integers:

In [8]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential

#import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=None, #max number of words, only the X most relevant words will be taken
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', #signs that are filtered out
    lower=True, #when true, everything will be converted tp lower case
    split=" ", #word seperator
    char_level=False, #if true, every char will be treated as token
    oov_token=None, #out-of-vocabulary replacement
)

In [9]:
tokenizer.fit_on_texts(data_content_filtered) #tokenizer gets fit to our texts

In [10]:
sequences = tokenizer.texts_to_sequences(data_content[0]) 
sequencesFiltered = tokenizer.texts_to_sequences(data_content_filtered[0]) 
#return a list of tokens for the X (see num_words) most common words the tokenizer knows, for every input text
print("unfiltered length: ",len(sequences)," filtered length",len(sequencesFiltered))

unfiltered length:  371350  filtered length 245848
