In [1]:
import pandas as pd
import numpy as np
import keras
import tensorflow as tf

Using TensorFlow backend.


# Data Importation

In [2]:
data = pd.read_csv('data/data.csv')
target = pd.read_csv('data/target.csv')

# Neural Network for text

## Data Preprocessing

In [3]:
tf.test.is_built_with_cuda()

True

In [4]:
tf.config.list_physical_devices('GPU')

[]

In [5]:
x_train = np.asarray(data[data['label'] == 'Train'][['description']])
x_val = np.asarray(data[data['label'] == 'Validation'][['description']])
x_test = np.asarray(data[data['label'] == 'Test'][['description']])

target.categ = pd.Categorical(target.categ)
target['categ_number'] = np.int32(target.categ.cat.codes)

y_train = np.asarray(target[target['label'] == 'Train'][['categ_number']])
y_val = np.asarray(target[target['label'] == 'Validation'][['categ_number']])
y_test = np.asarray(target[target['label'] == 'Test'][['categ_number']])

In [6]:
target['categ_number'].dtypes

dtype('int32')

First preprocessing found here : https://towardsdatascience.com/deep-transfer-learning-for-natural-language-processing-text-classification-with-universal-1a2c69e5baa9

In [7]:
import contractions
from bs4 import BeautifulSoup
import unicodedata
import re

def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    [s.extract() for s in soup(['iframe', 'script'])]
    stripped_text = soup.get_text()
    stripped_text = re.sub(r'[\r|\n|\r\n]+', '\n', stripped_text)
    return stripped_text

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def expand_contractions(text):
    return contractions.fix(text)

def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text

def pre_process_document(document):
    # strip HTML
    document = strip_html_tags(document)
    # lower case
    document = document.lower()
    # remove extra newlines (often might be present in really noisy text)
    document = document.translate(document.maketrans("\n\t\r", "   "))
    # remove accented characters
    document = remove_accented_chars(document)
    # expand contractions    
    document = expand_contractions(document)  
    # remove special characters and\or digits    
    # insert spaces between special characters to isolate them    
    special_char_pattern = re.compile(r'([{.(-)!}])')
    document = special_char_pattern.sub(" \\1 ", document)
    document = remove_special_characters(document, remove_digits=True)  
    # remove extra whitespace
    document = re.sub(' +', ' ', document)
    document = document.strip()
    
    return document


pre_process_corpus = np.vectorize(pre_process_document)

In [8]:
train_reviews = pre_process_corpus(x_train)
val_reviews = pre_process_corpus(x_val)
test_reviews = pre_process_corpus(x_test)

In [9]:
# Training input on the whole training set with no limit on training epochs.
train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
    {'sentence': train_reviews}, y_train, 
    batch_size=256, num_epochs=None, shuffle=True)
    
# Prediction on the whole training set.
predict_train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
    {'sentence': train_reviews}, y_train, shuffle=False)
    
# Prediction on the whole validation set.
predict_val_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
    {'sentence': val_reviews}, y_val, shuffle=False)
    
# Prediction on the test set.
predict_test_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
    {'sentence': test_reviews}, y_test, shuffle=False)   




In [10]:
import tensorflow_hub as hub
embedding_feature = hub.text_embedding_column(
    key='sentence', 
    module_spec="https://tfhub.dev/google/universal-sentence-encoder/2",
    trainable=False)

In [11]:
dnn = tf.estimator.DNNClassifier(
          hidden_units=[512, 128],
          feature_columns=[embedding_feature],
          n_classes=7,
          activation_fn=tf.nn.relu,
          dropout=0.1,
          optimizer=tf.keras.optimizers.Adamax(learning_rate=0.005))
# train for approx 12 epochs
# 256*1500 / 30000 == 12.8

INFO:tensorflow:Using default config.


INFO:tensorflow:Using default config.






INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\jpaul\\AppData\\Local\\Temp\\tmptkw3li1x', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\jpaul\\AppData\\Local\\Temp\\tmptkw3li1x', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [12]:
#tf.logging.set_verbosity(tf.logging.ERROR)
import time

TOTAL_STEPS = 1500
STEP_SIZE = 100
for step in range(0, TOTAL_STEPS+1, STEP_SIZE):
    print()
    print('-'*100)
    print('Training for step =', step)
    start_time = time.time()
    dnn.train(input_fn=train_input_fn, steps=STEP_SIZE)
    elapsed_time = time.time() - start_time
    print('Train Time (s):', elapsed_time)
    print('Eval Metrics (Train):', dnn.evaluate(input_fn=predict_train_input_fn))
    print('Eval Metrics (Validation):', dnn.evaluate(input_fn=predict_val_input_fn))


----------------------------------------------------------------------------------------------------
Training for step = 0
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.


Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.


Instructions for updating:
To construct input pipelines, use the `tf.data` module.


Instructions for updating:
To construct input pipelines, use the `tf.data` module.


Instructions for updating:
To construct input pipelines, use the `tf.data` module.


Instructions for updating:
To construct input pipelines, use the `tf.data` module.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


Instructions for updating:
To construct input pipelines, use the `tf.data` module.


Instructions for updating:
To construct input pipelines, use the `tf.data` module.


INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...


INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...


INFO:tensorflow:Saving checkpoints for 0 into C:\Users\jpaul\AppData\Local\Temp\tmptkw3li1x\model.ckpt.


INFO:tensorflow:Saving checkpoints for 0 into C:\Users\jpaul\AppData\Local\Temp\tmptkw3li1x\model.ckpt.


INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...


INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...


INFO:tensorflow:loss = 1.9495214, step = 0


INFO:tensorflow:loss = 1.9495214, step = 0


INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 100...


INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 100...


INFO:tensorflow:Saving checkpoints for 100 into C:\Users\jpaul\AppData\Local\Temp\tmptkw3li1x\model.ckpt.


INFO:tensorflow:Saving checkpoints for 100 into C:\Users\jpaul\AppData\Local\Temp\tmptkw3li1x\model.ckpt.


INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 100...


INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 100...


INFO:tensorflow:Loss for final step: 0.0339194.


INFO:tensorflow:Loss for final step: 0.0339194.


Train Time (s): 43.28112030029297
INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Starting evaluation at 2020-06-05T11:14:48Z


INFO:tensorflow:Starting evaluation at 2020-06-05T11:14:48Z


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Restoring parameters from C:\Users\jpaul\AppData\Local\Temp\tmptkw3li1x\model.ckpt-100


INFO:tensorflow:Restoring parameters from C:\Users\jpaul\AppData\Local\Temp\tmptkw3li1x\model.ckpt-100


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Inference Time : 8.99388s


INFO:tensorflow:Inference Time : 8.99388s


INFO:tensorflow:Finished evaluation at 2020-06-05-11:14:57


INFO:tensorflow:Finished evaluation at 2020-06-05-11:14:57


INFO:tensorflow:Saving dict for global step 100: accuracy = 0.99206346, average_loss = 0.034492888, global_step = 100, loss = 0.035000097


INFO:tensorflow:Saving dict for global step 100: accuracy = 0.99206346, average_loss = 0.034492888, global_step = 100, loss = 0.035000097


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 100: C:\Users\jpaul\AppData\Local\Temp\tmptkw3li1x\model.ckpt-100


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 100: C:\Users\jpaul\AppData\Local\Temp\tmptkw3li1x\model.ckpt-100


Eval Metrics (Train): {'accuracy': 0.99206346, 'average_loss': 0.034492888, 'loss': 0.035000097, 'global_step': 100}
INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Starting evaluation at 2020-06-05T11:15:01Z


INFO:tensorflow:Starting evaluation at 2020-06-05T11:15:01Z


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Restoring parameters from C:\Users\jpaul\AppData\Local\Temp\tmptkw3li1x\model.ckpt-100


INFO:tensorflow:Restoring parameters from C:\Users\jpaul\AppData\Local\Temp\tmptkw3li1x\model.ckpt-100


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Inference Time : 8.65618s


INFO:tensorflow:Inference Time : 8.65618s


INFO:tensorflow:Finished evaluation at 2020-06-05-11:15:10


INFO:tensorflow:Finished evaluation at 2020-06-05-11:15:10


INFO:tensorflow:Saving dict for global step 100: accuracy = 0.9095238, average_loss = 0.27344632, global_step = 100, loss = 0.27023602


INFO:tensorflow:Saving dict for global step 100: accuracy = 0.9095238, average_loss = 0.27344632, global_step = 100, loss = 0.27023602


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 100: C:\Users\jpaul\AppData\Local\Temp\tmptkw3li1x\model.ckpt-100


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 100: C:\Users\jpaul\AppData\Local\Temp\tmptkw3li1x\model.ckpt-100


Eval Metrics (Validation): {'accuracy': 0.9095238, 'average_loss': 0.27344632, 'loss': 0.27023602, 'global_step': 100}

----------------------------------------------------------------------------------------------------
Training for step = 100
INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


RuntimeError: Cannot set `iterations` to a new Variable after the Optimizer weights have been created