In [1]:
%load_ext autoreload 
%autoreload 2

# imports and setup
import pandas as pd
import numpy as np
import os 

# import and instanciate config file to get access to paths
os.chdir(os.path.dirname(os.getcwd()))
from config.config import Config
config = Config()

# import utils functions
from src.pipelines.preprocessor import Preprocessor
from src.pipelines.models import Models
from sklearn import set_config; set_config(display='diagram')

In [2]:
from src.pipelines.preproc.pipeline_00 import build_preprocessor_pipeline_00

build_preprocessor_pipeline_00()

In [3]:
with open(os.path.join(config.DATA_PROCESSED_PATH, 'books_processed.csv'), 'r') as f:
    books = pd.read_csv(f)

books.head()


Unnamed: 0,Author,Book
0,Twain Mark,﻿The Project Gutenberg EBook of Chapters from ...
1,Plato,"﻿The Project Gutenberg EBook of Sophist, by Pl..."
2,Twain Mark,﻿The Project Gutenberg EBook of On the Decay o...
3,Shakespeare William,"﻿The Project Gutenberg eBook of Pericles, by W..."
4,Shakespeare William,﻿The Project Gutenberg eBook of As You Like It...


In [4]:
books.shape

(436, 2)

In [5]:
books.value_counts

<bound method DataFrame.value_counts of                   Author                                               Book
0             Twain Mark  ﻿The Project Gutenberg EBook of Chapters from ...
1                  Plato  ﻿The Project Gutenberg EBook of Sophist, by Pl...
2             Twain Mark  ﻿The Project Gutenberg EBook of On the Decay o...
3    Shakespeare William  ﻿The Project Gutenberg eBook of Pericles, by W...
4    Shakespeare William  ﻿The Project Gutenberg eBook of As You Like It...
..                   ...                                                ...
431           Twain Mark  ﻿The Project Gutenberg EBook of Life On The Mi...
432           Twain Mark  ﻿The Project Gutenberg EBook of A Tramp Abroad...
433          Austen Jane  ﻿The Project Gutenberg eBook of Sense and Sens...
434  Shakespeare William  ﻿\nProject Gutenberg Etext of The Rape of Lucr...
435                Plato  ﻿The Project Gutenberg EBook of Statesman, by ...

[436 rows x 2 columns]>

In [6]:
pipe00 = build_preprocessor_pipeline_00()
books_chunked = pipe00.fit_transform(books)
books_chunked 

Unnamed: 0,Author,Book
0,0,solitary rambles did not speak a mind at ease ...
1,0,poverty while i am in them—when we arrived at ...
2,0,flow of spirits on his side or any such expres...
3,0,brightening “dear me that’s a very good though...
4,0,was in a reverie of sweet remembrances “the sc...
...,...,...
9995,9,the stars all about it you will see them twink...
9996,9,the garden of death an eton kitcat mrs erlynne...
9997,9,carlyle in a book he had given her years ago a...
9998,9,alone are heard above no pinion cleaves its wa...


In [7]:
# check the number of chunks per book
books_chunked['Book'].apply(len)


0       1501
1       1327
2       1407
3       1234
4       1319
        ... 
9995    1280
9996    1580
9997    1348
9998    1386
9999    1361
Name: Book, Length: 10000, dtype: int64

In [8]:

# check the number of words in each chunk
books_chunked['Book'].apply(lambda x: len(x.split(' ')))

0       256
1       256
2       256
3       256
4       256
       ... 
9995    256
9996    256
9997    256
9998    256
9999    256
Name: Book, Length: 10000, dtype: int64

In [9]:
# it is working

In [10]:
books_chunked

Unnamed: 0,Author,Book
0,0,solitary rambles did not speak a mind at ease ...
1,0,poverty while i am in them—when we arrived at ...
2,0,flow of spirits on his side or any such expres...
3,0,brightening “dear me that’s a very good though...
4,0,was in a reverie of sweet remembrances “the sc...
...,...,...
9995,9,the stars all about it you will see them twink...
9996,9,the garden of death an eton kitcat mrs erlynne...
9997,9,carlyle in a book he had given her years ago a...
9998,9,alone are heard above no pinion cleaves its wa...


In [11]:
from keras.preprocessing.text import text_to_word_sequence

In [12]:
# only keep 5 rows 
books_chunked = books_chunked.iloc[:5, :]
books_chunked

Unnamed: 0,Author,Book
0,0,solitary rambles did not speak a mind at ease ...
1,0,poverty while i am in them—when we arrived at ...
2,0,flow of spirits on his side or any such expres...
3,0,brightening “dear me that’s a very good though...
4,0,was in a reverie of sweet remembrances “the sc...


In [13]:
books_chunked['Book'][3]

'brightening “dear me that’s a very good thought very good indeed to be sure i may just as well go as not for i am of no use at home—am i and it only harasses me you who have not a mother’s feelings are a great deal the properest person you can make little charles do anything he always minds you at a word it will be a great deal better than leaving him only with jemima oh i shall certainly go i am sure i ought if i can quite as much as charles for they want me excessively to be acquainted with captain wentworth and i know you do not mind being left alone an excellent thought of yours indeed anne i will go and tell charles and get ready directly you can send for us you know at a moment’s notice if anything is the matter but i dare say there will be nothing to alarm you i should not go you may be sure if i did not feel quite at ease about my dear child” the next moment she was tapping at her husband’s dressingroom door and as anne followed her up stairs she was in time for the whole conv

In [14]:
list_of_strings = books_chunked['Book'].apply(lambda x: text_to_word_sequence(x))

# check the number of words in each chunk
books_chunked['Book'].apply(lambda x: len(x.split(' ')))

len(list_of_strings[3])

256

In [15]:
from src.pipelines.preproc.pipeline_04 import build_preprocessor_pipeline_04

In [16]:
pipe04 = build_preprocessor_pipeline_04()
books_chunked_vec = pipe04.fit_transform(books)
books_chunked_vec

Unnamed: 0,Author,Book
0,0,"[[-0.14131084084510803, -0.2975656986236572, 0..."
1,0,"[[0.5318660736083984, 0.15655338764190674, -0...."
2,0,"[[0.17099054157733917, 0.09196417778730392, 0...."
3,0,"[[-0.040306948125362396, 0.08470198512077332, ..."
4,0,"[[0.28656548261642456, -3.0210437774658203, 1...."
...,...,...
9995,9,"[[-1.3644824028015137, -1.5537986755371094, -0..."
9996,9,"[[-1.3644824028015137, -1.5537986755371094, -0..."
9997,9,"[[-0.17829836905002594, 0.15979209542274475, -..."
9998,9,"[[-0.9533970355987549, 0.8526332378387451, -0...."


In [17]:
# check the number of vectors in each chunk
books_chunked_vec['Book'].apply(lambda x: len(x))

0       256
1       256
2       256
3       256
4       256
       ... 
9995    256
9996    256
9997    256
9998    256
9999    256
Name: Book, Length: 10000, dtype: int64

In [18]:
len(books_chunked_vec['Book'][0])

256

In [19]:
from src.pipelines.preproc.pipeline_045 import build_preprocessor_pipeline_045


In [20]:
pipe045 = build_preprocessor_pipeline_045()
books_chunked_vec = pipe045.fit_transform(books)
books_chunked_vec

Unnamed: 0,Author,Book
0,0,"[[0.03220482, -0.31926122, 0.38597348, 0.11726..."
1,0,"[[0.13064584, 0.30756617, 0.110088, -0.1859102..."
2,0,"[[-0.18071216, -0.11658524, -0.089549296, -0.1..."
3,0,"[[-0.052913915, 0.078841336, 0.10659341, -0.01..."
4,0,"[[3.9120626, -0.1962506, 4.615563, 1.5494909, ..."
...,...,...
9995,9,"[[-2.2707212, -0.5011498, -0.27457333, 0.53589..."
9996,9,"[[-2.2707212, -0.5011498, -0.27457333, 0.53589..."
9997,9,"[[0.02026872, -0.061411813, 0.1602562, 0.11160..."
9998,9,"[[-0.08958607, -0.35306755, -0.66861784, -1.98..."


In [21]:
books_chunked_vec['Book'][0]

[array([ 0.03220482, -0.31926122,  0.38597348,  0.11726458, -0.38244295,
        -0.13545091,  0.34602222,  0.4347486 , -0.12894857, -0.13351214,
         1.0814227 ,  0.36355594,  0.20494783, -0.37452537,  0.01451374,
        -0.0570275 ,  0.69027936, -0.15213162, -0.34706584, -0.63723856,
         0.665111  , -0.42415172,  0.39391953,  0.39469227,  0.17533726,
         0.1405706 ,  0.26380384,  0.23801777, -0.25330865, -0.20604701,
        -0.03214763, -0.15438172,  0.3162301 , -0.16124095,  0.47280222,
         0.22145271,  0.34552008, -0.19350004, -0.35969788,  0.02042077,
        -0.16266319,  0.18627748, -0.02532198, -0.18339932,  0.02978458,
        -0.04819441,  0.19083285,  0.16302113,  0.38396266,  0.3677184 ,
        -0.37274903,  0.06671879, -0.5827627 ,  0.09395032,  1.0991595 ,
         0.88995266,  0.1983406 ,  0.3186445 ,  0.39615956, -0.77914923],
       dtype=float32),
 array([ 0.02414412, -0.00600148,  0.02851906, -0.0138047 , -0.02958647,
         0.0289786 , -0.010

In [22]:
# check the number of vectors in each chunk
books_chunked_vec['Book'].apply(lambda x: len(x))

0       256
1       256
2       256
3       256
4       256
       ... 
9995    256
9996    256
9997    256
9998    256
9999    256
Name: Book, Length: 10000, dtype: int64

In [23]:
import numpy as np

X = books_chunked_vec['Book']
y = books_chunked_vec['Author']
y = np.array(y)


In [24]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
y = ohe.fit_transform(y.reshape(-1, 1)).toarray()
y

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [25]:
X2 = np.array([np.vstack(X[i]) for i in range(len(X))])

In [26]:
X2

array([[[ 3.22048217e-02, -3.19261223e-01,  3.85973483e-01, ...,
          3.18644494e-01,  3.96159559e-01, -7.79149234e-01],
        [ 2.41441205e-02, -6.00148458e-03,  2.85190642e-02, ...,
          3.74112627e-03, -2.39114463e-02, -2.35900488e-02],
        [ 5.27041578e+00,  1.85271025e+00, -2.91739166e-01, ...,
         -8.21276784e-01, -1.80513597e+00,  2.38452151e-01],
        ...,
        [ 1.50449443e+00,  3.27678537e+00,  2.24258685e+00, ...,
         -1.55149508e+00, -3.09588820e-01,  4.01383907e-01],
        [-2.24289107e+00,  3.05972099e-01,  4.15899706e+00, ...,
         -1.53969014e+00, -9.16175127e-01, -1.38785124e+00],
        [ 3.77141178e-01,  1.91585481e+00,  1.82066894e+00, ...,
         -2.62934685e+00, -1.79007494e+00, -2.14513946e+00]],

       [[ 1.30645841e-01,  3.07566166e-01,  1.10087998e-01, ...,
          3.32213581e-01,  6.82375669e-01, -6.45470738e-01],
        [ 7.14770138e-01, -1.19012880e+00, -5.74089587e-01, ...,
          1.35301995e+00, -1.97245777e

In [27]:
from keras import Input, layers, Sequential, optimizers

model = Sequential()
model.add(Input(shape=(None, 100)))
model.add(layers.Masking())
model.add(layers.LSTM(100, activation='relu'))
# ou bien  model.add(layers.Dense(100, activation='relu'))

model.add(layers.Dense(10, activation="softmax"))
print("Model initialized")
optimizer = optimizers.Adam(learning_rate=0.001)



Model initialized


In [28]:
model.compile(loss='categorical_crossentropy',
            optimizer=optimizer,
            metrics=['accuracy'])
print("Model compiled")



Model compiled


In [29]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking (Masking)           (None, None, 60)          0         
                                                                 
 lstm (LSTM)                 (None, 60)                29040     
                                                                 
 dense (Dense)               (None, 10)                610       
                                                                 
Total params: 29650 (115.82 KB)
Trainable params: 29650 (115.82 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [30]:
from keras.callbacks import EarlyStopping

es = EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True,
    verbose=1)

history = model.fit(
    X2,
    y,
    validation_split=0.2,
    epochs=30,
    batch_size=64,
    callbacks=[es],
    verbose=1)

print(history.history.keys())
# print(f"Model trained on {len(X)} rows with min val MAE: {round(np.min(history.history['val_mae']), 2)}")


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 4: early stopping
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])
