#### This module generates the tokenizer, then defines, trains, and saves the model

In [1]:
%load_ext memory_profiler

# Imports
import scrape_data_ as sc
import pandas as pd
import numpy as np
import ast
from tensorflow.keras import Sequential, layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from keras.optimizers import SGD
import time

In [2]:
# Parameter definitions shared by the cells below
# %memit (memory_profiler magic) prints the "peak memory" line shown in the cell output.
%memit
start_time = time.time()

# Passed as `input_length` to the Embedding layer in create_model().
MAX_LENGTH = 1000
# Passed as `oov_token` to the Keras Tokenizer in generate_tokenizer().
OOV_TOKEN = "<OOV>"
# Passed as `output_dim` to the Embedding layer.
EMBEDDING_DIM = 64
# Used as the Tokenizer's `num_words` and as the Embedding layer's `input_dim`.
VOCAB_SIZE = 50000
# Upper bound on training epochs passed to model.fit(); early stopping may end sooner.
MAX_EPOCHS = 50
# Batch size passed to model.fit().
BATCH_SIZE = 64
# `patience` of the EarlyStopping callback.
PATIENCE = 5

# Default dataset location read by get_data() (tab-separated, no header row).
FILE_PATH = 'dataset.tsv'
# File the fitted tokenizer's JSON is written to.
TOKENIZER_PATH = 'recipe_tokenizer.json'
# Path the trained model is saved to.
MODEL_PATH = 'classifier'

# Print the wall-clock time this cell took to run
end_time =   time.time() 
print(f'Time elapsed: {end_time - start_time :.5f} (seconds)')

peak memory: 291.48 MiB, increment: 0.03 MiB
Time elapsed: 0.00000 (seconds)


In [3]:

# %memit (memory_profiler magic) prints the "peak memory" line shown in the cell output.
%memit
# Start the wall-clock timer for this cell.
start_time = time.time()

def generate_tokenizer(texts_to_fit, vocab_size=VOCAB_SIZE, tokenizer_path=TOKENIZER_PATH):
    """Fit a Keras ``Tokenizer`` on the given texts and save it as JSON.

    Args:
        texts_to_fit: Texts handed to ``Tokenizer.fit_on_texts``.
        vocab_size: Value passed to the tokenizer as ``num_words``.
        tokenizer_path: File that receives the tokenizer's JSON
            representation (overwritten if it already exists).

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    # Honour the caller-supplied vocab_size instead of the module constant,
    # so a non-default value actually takes effect.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=OOV_TOKEN)

    tokenizer.fit_on_texts(texts_to_fit)

    # Write-only access is enough; the file is never read back here.
    with open(tokenizer_path, 'w', encoding='utf-8') as tokenizer_file:
        tokenizer_file.write(tokenizer.to_json())

    return tokenizer

# Print the wall-clock time for this cell. The cell only defines a function,
# so this does not measure a call to it.
end_time =   time.time() 
print(f'Time elapsed: {end_time - start_time :.5f} (seconds)')

peak memory: 291.58 MiB, increment: 0.10 MiB
Time elapsed: 0.00000 (seconds)


In [4]:

# %memit (memory_profiler magic) prints the "peak memory" line shown in the cell output.
%memit
# Start the wall-clock timer for this cell.
start_time = time.time()

def get_data(file_path=FILE_PATH):
    """Load the TSV dataset and return an ordered 80/20 train/test split.

    Column 0 of the file is read as text and column 1 as a string that is
    parsed into a Python literal per row. The first 80% of rows (in file
    order, no shuffling) form the training set.

    Side effect: a tokenizer is fitted on the raw training texts and written
    to ``TOKENIZER_PATH`` via ``generate_tokenizer``.

    Args:
        file_path: Path of the tab-separated dataset without a header row.

    Returns:
        ``(X_train, y_train, X_test, y_test)``.
    """
    frame = pd.read_csv(file_path, sep='\t', header=None)
    texts = frame[0].astype(str).tolist()
    raw_labels = frame[1].to_list()

    # Index of the first test row.
    split_at = int(0.8 * len(texts))

    # Called for its side effect (the saved tokenizer file); the returned
    # tokenizer object is not used further in this function.
    generate_tokenizer(texts[:split_at], vocab_size=VOCAB_SIZE, tokenizer_path=TOKENIZER_PATH)

    # NOTE(review): what sc.preprocess_text returns is not visible here;
    # presumably model-ready sequences - confirm in scrape_data_.
    features = sc.preprocess_text(texts)

    # Each label is stored as the string form of a Python literal.
    targets = np.array([ast.literal_eval(item) for item in np.array(raw_labels)])

    train_part = (features[:split_at], targets[:split_at])
    test_part = (features[split_at:], targets[split_at:])
    return train_part[0], train_part[1], test_part[0], test_part[1]

# Print the wall-clock time for this cell. The cell only defines a function,
# so this does not measure a call to it.
end_time =   time.time() 
print(f'Time elapsed: {end_time - start_time :.5f} (seconds)')

peak memory: 291.67 MiB, increment: 0.00 MiB
Time elapsed: 0.00000 (seconds)


In [5]:

# %memit (memory_profiler magic) prints the "peak memory" line shown in the cell output.
%memit
# Start the wall-clock timer for this cell.
start_time = time.time()

def create_model():
    """Build and compile the feed-forward text classifier.

    Architecture: Embedding (``VOCAB_SIZE`` -> ``EMBEDDING_DIM`` over
    sequences of ``MAX_LENGTH``) -> Flatten -> Dense(32, selu) ->
    Dense(64, selu) -> Dense(3, softmax).

    Side effect: prints the model summary.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential([
        layers.Embedding(input_dim=VOCAB_SIZE,
                         output_dim=EMBEDDING_DIM,
                         input_length=MAX_LENGTH),
        # Collapse the (MAX_LENGTH, EMBEDDING_DIM) embeddings into one vector.
        layers.Flatten(),
        layers.Dense(32, activation='selu'),
        layers.Dense(64, activation='selu'),
        # One probability per class; softmax makes them sum to 1.
        layers.Dense(3, activation='softmax'),
    ])

    # A softmax output models mutually exclusive classes, so it is paired
    # with categorical cross-entropy. With binary cross-entropy the loss did
    # not match the activation and Keras resolved 'accuracy' to per-output
    # binary accuracy, which overstates performance.
    # NOTE(review): assumes each label is a one-hot vector of length 3 -
    # confirm against the dataset.
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    model.summary()

    return model

# Print the wall-clock time for this cell. The cell only defines a function,
# so this does not measure a call to it.
end_time =   time.time() 
print(f'Time elapsed: {end_time - start_time :.5f} (seconds)')

peak memory: 291.68 MiB, increment: 0.00 MiB
Time elapsed: 0.00000 (seconds)


In [6]:

# %memit (memory_profiler magic) prints the "peak memory" line shown in the cell output.
%memit
# Start the wall-clock timer for this cell.
start_time = time.time()

def train_model(X_train, y_train, X_val, y_val):
    """Create a fresh model and train it with early stopping.

    Training runs for at most ``MAX_EPOCHS`` epochs with batches of
    ``BATCH_SIZE`` and stops once ``val_loss`` has not improved for
    ``PATIENCE`` consecutive epochs.

    Args:
        X_train: Training inputs.
        y_train: Training targets.
        X_val: Validation inputs evaluated after each epoch.
        y_val: Validation targets evaluated after each epoch.

    Returns:
        ``(model, history)``: the trained model and the ``History`` object
        returned by ``model.fit``.
    """
    model = create_model()

    early_stop = EarlyStopping(monitor='val_loss', patience=PATIENCE, mode='min')

    # Validate on the data passed in by the caller rather than on the
    # notebook-level X_test / y_test globals, so the function works with any
    # validation set and does not fail when those globals are undefined.
    history = model.fit(
        X_train,
        y_train,
        epochs=MAX_EPOCHS,
        batch_size=BATCH_SIZE,
        validation_data=(X_val, y_val),
        callbacks=[early_stop],
    )

    return model, history

# Print the wall-clock time for this cell. The cell only defines a function,
# so this does not measure a call to it.
end_time =   time.time() 
print(f'Time elapsed: {end_time - start_time :.5f} (seconds)')
    

peak memory: 291.69 MiB, increment: 0.00 MiB
Time elapsed: 0.00000 (seconds)


In [7]:

# %memit (memory_profiler magic) prints the "peak memory" line shown in the cell output.
%memit
# Start the wall-clock timer for this cell.
start_time = time.time()

# Save the model
def save_model(model, model_path=None):
    """Save ``model`` by calling its ``save`` method.

    Args:
        model: Object exposing ``save(path)``, e.g. a trained Keras model.
        model_path: Destination path. Defaults to the module constant
            ``MODEL_PATH`` when omitted.
    """
    # The lowercase name `model_path` was previously undefined at this point
    # (NameError); fall back to the MODEL_PATH constant instead.
    if model_path is None:
        model_path = MODEL_PATH
    model.save(model_path)
    
# Print the wall-clock time for this cell. The cell only defines a function,
# so this does not measure a call to it.
end_time =   time.time() 
print(f'Time elapsed: {end_time - start_time :.5f} (seconds)')    

peak memory: 291.69 MiB, increment: 0.00 MiB
Time elapsed: 0.00000 (seconds)


In [8]:

# %memit (memory_profiler magic) prints the "peak memory" line shown in the cell output.
%memit
start_time = time.time()
# Load the dataset from the default FILE_PATH and bind the train/test split
# to notebook-level names used by the following cells. As a side effect,
# get_data() also fits the tokenizer and writes it to TOKENIZER_PATH.
X_train, y_train, X_test, y_test = get_data()

# Print the wall-clock time spent loading and preprocessing the data
end_time =   time.time() 
print(f'Time elapsed: {end_time - start_time :.5f} (seconds)')

peak memory: 291.69 MiB, increment: 0.00 MiB
Time elapsed: 18.35678 (seconds)


In [9]:

# %memit (memory_profiler magic) prints the "peak memory" line shown in the cell output.
%memit
start_time = time.time()
# Train the model; the test split is passed as the validation set, so early
# stopping is driven by the same data later referred to as "test".
trained_model, training_history = train_model(X_train, y_train, X_test, y_test)

# Print the wall-clock time spent training
end_time =   time.time() 
print(f'Time elapsed: {end_time - start_time :.5f} (seconds)')

peak memory: 484.36 MiB, increment: 0.00 MiB
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1000, 64)          3200000   
                                                                 
 flatten (Flatten)           (None, 64000)             0         
                                                                 
 dense (Dense)               (None, 32)                2048032   
                                                                 
 dense_1 (Dense)             (None, 64)                2112      
                                                                 
 dense_2 (Dense)             (None, 3)                 195       
                                                                 
Total params: 5,250,339
Trainable params: 5,250,339
Non-trainable params: 0
_________________________________________________________________
E

In [10]:

# %memit (memory_profiler magic) prints the "peak memory" line shown in the cell output.
%memit
start_time = time.time()
# Save the trained model to MODEL_PATH. This calls the model's own save()
# directly; the save_model() helper defined earlier is not used here.
trained_model.save(MODEL_PATH)

# Print the wall-clock time spent saving
end_time =   time.time() 
print(f'Time elapsed: {end_time - start_time :.5f} (seconds)')

peak memory: 577.30 MiB, increment: 0.00 MiB
INFO:tensorflow:Assets written to: classifier\assets
Time elapsed: 1.61927 (seconds)
