In [1]:
import numpy as np
import pandas as pd

import os
import difflib
from collections import Counter
import shutil
import string
import re

# TensorFlow
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.utils import plot_model

# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output

## matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (17, 6)
%matplotlib inline

## seaborn
import seaborn as sns

## plotly
from plotly.offline import init_notebook_mode, iplot 
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.figure_factory as ff
%config InlineBackend.figure_format = 'retina' 

import warnings
warnings.filterwarnings("ignore")

# Text Classification Pipeline with Tensorflow

This article is based on the Keras example [**Text classification from scratch**](https://keras.io/examples/nlp/text_classification_from_scratch/) and demonstrates a text classification pipeline using TensorFlow. The dataset used here is the [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/), which is downloaded below from its Stanford-hosted page.

<div class="alert alert-block alert-info">
<font size="+2"><b>
Large Movie Review Dataset
</b></font>
</div>

This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well. Raw text and already processed bag of words formats are provided. See the README file contained in the release for more details.

## Downloading the Dataset

In [2]:
def Get_Data(_URL, Remove = True):
    """Download and extract the archive at ``_URL`` and return the extracted folder.

    The archive is saved in the current working directory and extracted by
    ``tf.keras.utils.get_file``; the extracted content is then looked up in the
    ``datasets`` sub-folder next to the downloaded file.

    Parameters
    ----------
    _URL : str
        Address of the archive; its last path component is used as the file name.
    Remove : bool, default True
        If True, the downloaded archive is deleted once it has been extracted.

    Returns
    -------
    str
        Path of the extracted folder whose name is the closest match to the
        archive name (e.g. ``aclImdb`` for ``aclImdb_v1.tar.gz``).

    Raises
    ------
    FileNotFoundError
        If no folder resembling the archive name exists after extraction.
    """
    # Archive file name, i.e. the last component of the URL
    File = _URL.split('/')[-1]
    Full_Name = os.path.join(os.getcwd(), File)
    # Download the dataset file from the URL and extract it
    path_to_zip = tf.keras.utils.get_file(fname=Full_Name, origin=_URL, extract=True, cache_dir=os.getcwd())
    PATH = os.path.join(os.path.dirname(path_to_zip), 'datasets')
    # The extracted folder is not named exactly like the archive, hence the fuzzy match
    Folder = difflib.get_close_matches(File.split('.')[0], os.listdir(PATH))
    if not Folder:
        raise FileNotFoundError("No extracted folder resembling '%s' was found in '%s'"
                                % (File.split('.')[0], PATH))
    PATH = os.path.join(PATH, Folder[0])
    # Deleting the downloaded archive (absolute path, and only if it is really there)
    if Remove and os.path.isfile(Full_Name):
        os.remove(Full_Name)
    return PATH
    #-----------------------------------------------------------------
    
# Address of the Large Movie Review Dataset archive
_URL = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
# Download + extract; PATH is the extracted dataset folder (the archive itself is removed by default)
PATH = Get_Data(_URL)

## Dataset Directory Details

In [3]:
def Path_Tree(PATH):
    """Print a colour-coded, two-level overview of the directory ``PATH``.

    For every sub-folder of ``PATH`` its entries are listed. Nested folders are
    summarised by their number of files, the (upper-cased) extension of the first
    file and the names of up to five files; plain files are listed by name.
    Files located directly in ``PATH`` are not shown.

    Parameters
    ----------
    PATH : str
        Directory to describe.
    """
    sep = ' ' * 3
    # basename/normpath copes with the separators of the running OS and with a trailing separator
    title = os.path.basename(os.path.normpath(PATH))
    rule = Style.RESET_ALL + Fore.BLUE + Style.NORMAL + '=' * (len(title) +1) + Style.RESET_ALL
    print(rule)
    print(Back.BLACK + Fore.CYAN + Style.NORMAL + title +':'+ Style.RESET_ALL)
    print(rule)
    for entry in os.listdir(PATH):
        sub = os.path.join(PATH, entry)
        if not os.path.isdir(sub):
            continue
        print('└──',Back.CYAN + Fore.BLACK + Style.NORMAL + entry+':'+ Style.RESET_ALL)
        for entry1 in os.listdir(sub):
            sub1 = os.path.join(sub, entry1)
            if not os.path.isdir(sub1):
                # A plain file inside the set folder
                print(sep + '└──',Back.WHITE + Fore.BLACK + Style.NORMAL + entry1+ Style.RESET_ALL)
                continue
            print(sep + '└──',Back.MAGENTA + Fore.BLACK + Style.NORMAL + entry1+':'+ Style.RESET_ALL)
            List = os.listdir(sub1)
            if not List:
                # An empty folder has no "first file" to read an extension from
                print(2* sep, Back.YELLOW + Fore.BLACK + Style.NORMAL + '0 files' + Style.RESET_ALL)
                continue
            print(2* sep, Back.YELLOW + Fore.BLACK + Style.NORMAL +
                  '%i %s files' % (len(List), List[0].split('.')[-1].upper()) + Style.RESET_ALL)
            # Only announce a continuation when names were actually left out
            preview = ', '.join(List[:5])
            if len(List) > 5:
                preview += ', ...'
            print(2* sep, preview)
    #-----------------------------------------------------------------
    
# Print the layout of the extracted dataset folder
Path_Tree(PATH)

[40m[36m[22maclImdb:[0m
└── [46m[30m[22mtest:[0m
   └── [47m[30m[22mlabeledBow.feat[0m
   └── [45m[30m[22mneg:[0m
       [43m[30m[22m12500 TXT files[0m
       0_2.txt, 10000_4.txt, 10001_1.txt, 10002_3.txt, 10003_3.txt, ...
   └── [45m[30m[22mpos:[0m
       [43m[30m[22m12500 TXT files[0m
       0_10.txt, 10000_7.txt, 10001_9.txt, 10002_8.txt, 10003_8.txt, ...
   └── [47m[30m[22murls_neg.txt[0m
   └── [47m[30m[22murls_pos.txt[0m
└── [46m[30m[22mtrain:[0m
   └── [47m[30m[22mlabeledBow.feat[0m
   └── [45m[30m[22mneg:[0m
       [43m[30m[22m12500 TXT files[0m
       0_3.txt, 10000_4.txt, 10001_4.txt, 10002_1.txt, 10003_1.txt, ...
   └── [45m[30m[22mpos:[0m
       [43m[30m[22m12500 TXT files[0m
       0_9.txt, 10000_8.txt, 10001_10.txt, 10002_7.txt, 10003_8.txt, ...
   └── [45m[30m[22munsup:[0m
       [43m[30m[22m50000 TXT files[0m
       0_0.txt, 10000_0.txt, 10001_0.txt, 10002_0.txt, 10003_0.txt, ...
   └── [47m[30m[22m

We can remove unnecessary files and save the paths of the folders that we need for our modeling and analysis.

In [4]:
def Data_Info(PATH):
    """Prune the dataset folder down to its class folders and summarise what is left.

    WARNING: this function deletes data. Inside ``PATH`` it removes every plain
    file, and inside each set folder (e.g. ``train``/``test``) it removes every
    entry that is not a class folder. A class folder is a sub-folder whose name
    occurs in exactly two set folders.

    Parameters
    ----------
    PATH : str
        Root folder of the extracted dataset.

    Returns
    -------
    DataFrame_Info : pandas.DataFrame
        One row per kept class folder with the columns ``Set``, ``Subset`` and
        ``Size`` (number of entries in that folder).
    DataDirs : dict
        Maps ``'<set>'`` to the set folder and ``'<set>_<subset>'`` to each kept
        class folder.
    """
    def Remove_Path(Target):
        # Real directories need rmtree; files and symlinks are handled by os.remove
        if os.path.isdir(Target) and not os.path.islink(Target):
            shutil.rmtree(Target)
        else:
            os.remove(Target)

    Set, Subset, Size = [], [], []
    DataDirs = {}
    Entries = [(entry, os.path.join(PATH, entry)) for entry in os.listdir(PATH)]

    # Count in how many set folders each sub-folder name occurs
    Counts = Counter()
    for entry, sub in Entries:
        if os.path.isdir(sub):
            Counts.update(entry1 for entry1 in os.listdir(sub)
                          if os.path.isdir(os.path.join(sub, entry1)))
    # Keep only the sub-folders shared by exactly two sets (the threshold 2 is hard-coded)
    Shared = {name for name, count in Counts.items() if count == 2}

    for entry, sub in Entries:
        if not os.path.isdir(sub):
            # Plain files at the top level are not needed
            Remove_Path(sub)
            continue
        DataDirs[entry] = sub
        for entry1 in os.listdir(sub):
            sub1 = os.path.join(sub, entry1)
            if os.path.isdir(sub1) and entry1 in Shared:
                DataDirs[entry + '_' +entry1] = sub1
                Set.append(entry.title())
                Subset.append(entry1.title())
                Size.append(len(os.listdir(sub1)))
            else:
                Remove_Path(sub1)

    DataFrame_Info = pd.DataFrame({'Set': Set, 'Subset': Subset, 'Size':Size})
    display(DataFrame_Info.set_index(['Set' , 'Subset']).T)
    return DataFrame_Info, DataDirs
    #-----------------------------------------------------------------
    
# NOTE: this call deletes the unneeded files/folders under PATH before summarising the rest
DataFrame_Info, DataDirs = Data_Info(PATH)

Set,Test,Test,Train,Train
Subset,Neg,Pos,Neg,Pos
Size,12500,12500,12500,12500


In [5]:
def Line(L=120):
    """Print a blue horizontal rule consisting of ``L`` '=' characters."""
    rule = '=' * L
    print(''.join([Fore.BLUE, Style.NORMAL, rule, Style.RESET_ALL]))

def Header(Text='Title', L=120):
    """Print ``Text`` followed by a colon as a green-on-black label, padded with a blue
    '=' rule so that label, separating space and rule together span ``L`` characters."""
    label = Text + ':'
    filler = '=' * (L - len(label) - 1)
    pieces = [Back.BLACK, Fore.GREEN, Style.NORMAL, label, Style.RESET_ALL, ' ',
              Fore.BLUE, Style.NORMAL, filler, Style.RESET_ALL]
    print(''.join(pieces))

# ------------------------------------------------------------------------------------------------
# Number of reviews per batch
batch_size = 32
# Fraction of the labelled training files held out for validation (80% train / 20% validation)
train_val_split_ratio = 0.2
# ------------------------------------------------------------------------------------------------
# The "training" and "validation" subsets are drawn from the same folder; both calls use the
# same seed and the same split ratio.
Header('Train Data')
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    DataDirs['train'],
    batch_size=batch_size,
    validation_split=train_val_split_ratio,
    subset="training",
    seed=1337,
)
print("Number of batches:", int(tf.data.experimental.cardinality(raw_train_ds)))
#
Header('Validation Data')
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    DataDirs['train'],
    batch_size=batch_size,
    validation_split=train_val_split_ratio,
    subset="validation",
    seed=1337,
)
print("Number of batches:", int(tf.data.experimental.cardinality(raw_val_ds)))
#
Header('Test Data')
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    DataDirs['test'],
    batch_size=batch_size,
)
print("Number of batches:", int(tf.data.experimental.cardinality(raw_test_ds)))
Line(120)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Number of batches: 625
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Number of batches: 157
Found 25000 files belonging to 2 classes.
Number of batches: 782


Let's preview a few samples:


In [6]:
def Comment_Sample(N = 2, L=120, ds = raw_train_ds, batch_size = batch_size):
    """Print up to ``N`` positive and ``N`` negative reviews from the first batch of ``ds``.

    Parameters
    ----------
    N : int
        Maximum number of reviews shown per class.
    L : int
        Width (in characters) of the coloured rules around each review.
    ds : dataset yielding ``(text_batch, label_batch)`` pairs
        Only its first batch is read; label 1 is treated as positive, anything
        else as negative.
    batch_size : int
        Upper bound on how many items of the batch are inspected.
    """
    def Show(Text, Comment, Color, Background):
        # Title bar, the review itself (HTML breaks replaced by spaces), closing rule
        print(Background + Fore.WHITE + Style.NORMAL + Text + Style.RESET_ALL + ' ' + Color +
              Style.NORMAL +  (L- len(Text) - 1)*'=' + Style.RESET_ALL)
        print(Comment.replace("<br />", " "))
        print(Color + Style.NORMAL + L*'=' + Style.RESET_ALL)

    def Decode(Raw):
        # Decode byte strings so the text is printed, not its b'...' repr
        return Raw.decode('utf-8', errors='replace') if isinstance(Raw, bytes) else str(Raw)

    for text_batch, label_batch in ds.take(1):
        # Convert the tensors once instead of once per item
        Texts = text_batch.numpy()
        Labels = label_batch.numpy()
        # The batch can hold fewer than batch_size items (e.g. a small dataset)
        Count = min(batch_size, len(Labels))
        Pos = [i for i in range(Count) if Labels[i] == 1]
        Neg = [i for i in range(Count) if Labels[i] != 1]
        for i in Pos[:N]:
            Show('A Positive Comment:', Decode(Texts[i]), Fore.GREEN, Back.GREEN)
        print('\n\n')
        for i in Neg[:N]:
            Show('A Negative Comment:', Decode(Texts[i]), Fore.RED, Back.RED)
            
# Show one positive and one negative review from the first training batch
Comment_Sample(1)

b'I\'ve seen tons of science fiction from the 70s; some horrendously bad, and others thought provoking and truly frightening. Soylent Green fits into the latter category. Yes, at times it\'s a little campy, and yes, the furniture is good for a giggle or two, but some of the film seems awfully prescient. Here we have a film, 9 years before Blade Runner, that dares to imagine the future as somthing dark, scary, and nihilistic. Both Charlton Heston and Edward G. Robinson fare far better in this than The Ten Commandments, and Robinson\'s assisted-suicide scene is creepily prescient of Kevorkian and his ilk. Some of the attitudes are dated (can you imagine a filmmaker getting away with the "women as furniture" concept in our oh-so-politically-correct-90s?), but it\'s rare to find a film from the Me Decade that actually can make you think. This is one I\'d love to see on the big screen, because even in a widescreen presentation, I don\'t think the overall scope of this film would receive its

<div class="alert alert-block alert-info">
<font size="+2"><b>
Modeling
</b></font>
</div>

## Preprocessing

Each comment contains a number of HTML breaks, `<br />`. To remove these substrings, we can use functions from the [tf.strings](https://www.tensorflow.org/api_docs/python/tf/strings) module. In particular, we will use the following functions.

| Function      | Description                                                                     |
|---------------|---------------------------------------------------------------------------------|
|     lower     | Converts all uppercase characters into their respective lowercase replacements. |
| regex_replace |          Replace elements of input matching regex pattern with a rewrite.         |

In [7]:
def FilterData(Inp):
    """Standardise raw review text: lowercase it, replace every "<br />" with a
    space and delete all characters listed in ``string.punctuation``."""
    # Character class matching any single punctuation character
    punctuation_pattern = "[%s]" % re.escape(string.punctuation)
    lowered = tf.strings.lower(Inp)
    without_breaks = tf.strings.regex_replace(lowered, "<br />", " ")
    return tf.strings.regex_replace(without_breaks, punctuation_pattern, "")

# Model constants -----------------------------------------------------------------------------------------------
# Vocabulary size: the number of distinct tokens kept by the vectorizer. This is a modelling choice
# and is set explicitly; it was previously derived from the number of training files, an unrelated
# quantity that merely happened to give the same value (20000).
Max_Features = 20000
# Size of each token embedding vector
embedding_dim = 128
# Maximum Sequence Length: every review is padded/truncated to this many tokens
Max_Seq_Length = 500
# ---------------------------------------------------------------------------------------------------------------

vectorize_layer = layers.experimental.preprocessing.TextVectorization(standardize = FilterData,
                                                                      max_tokens = Max_Features,
                                                                      output_mode = "int",
                                                                      output_sequence_length = Max_Seq_Length)

# separating text from the labels
text_ds = raw_train_ds.map(lambda x, y: x)
# adapt: When this layer is adapted, it will analyze the dataset, determine the frequency of individual string values,
# and create a 'vocabulary' from them
vectorize_layer.adapt(text_ds)
clear_output()

## Vectorize the Data

In [8]:
def vectorize_text(text, label):
    """Map a ``(text, label)`` pair to ``(token ids, label)`` using ``vectorize_layer``."""
    # Append a trailing axis before handing the strings to the vectorization layer
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label


# Turn every raw (text, label) dataset into (token ids, label), then cache the result and prefetch
# up to 10 batches, so that the input pipeline prepares the data for step s+1 while the model is
# still executing training step s.
train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
test_ds = raw_test_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
clear_output()

## Build a model

In [9]:
# inputs: variable-length sequences of integer token ids
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Embedding layer: one embedding_dim-sized vector per vocabulary index
x = layers.Embedding(Max_Features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Conv1D + global max pooling
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# a  hidden layer:
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# Output layer: a single sigmoid unit for the binary sentiment label
predictions = layers.Dense(1, activation="sigmoid")(x)

model = tf.keras.Model(inputs, predictions, name="Keras_NLP")

# Compile the model with binary crossentropy loss and an adam optimizer.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

model.summary()
# Keep plot_model as the LAST expression of the cell: a notebook only renders the value of the
# final expression, so the diagram was silently dropped when another statement followed it.
plot_model(model, show_shapes=True, show_layer_names=False, expand_nested = True, rankdir = 'TB')

Model: "Keras_NLP"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 128)         2560000   
_________________________________________________________________
dropout (Dropout)            (None, None, 128)         0         
_________________________________________________________________
conv1d (Conv1D)              (None, None, 128)         114816    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 128)         114816    
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               16

## Train the model


In [10]:
# Number of passes over the training data
epochs = 3
# Fitting the model on the training dataset; the validation dataset is evaluated after each epoch.
history = model.fit(train_ds, epochs=epochs, validation_data=val_ds)
clear_output()

In [11]:
def Table_History(history):
    """Display the per-epoch training history as a colour-graded table.

    Each key of ``history.history`` becomes one column. Keys starting with
    ``'val_'`` are shown under 'Validation', all others under 'Train'; the
    remaining metric name is title-cased (``'val_loss'`` -> ``('Validation', 'Loss')``).
    Loss columns are shaded with the 'Wistia' colour map, all other metrics with
    'BuGn'. Values are shown with four decimals.

    Parameters
    ----------
    history : object with a ``history`` dict of equally long metric lists
        e.g. the value returned by ``model.fit``.
    """
    Logs = pd.DataFrame(history.history)
    # Build the column labels from the metric names instead of relying on their order
    Columns = []
    for key in Logs.columns:
        key = str(key)
        is_val = key.startswith('val_')
        metric = key[len('val_'):] if is_val else key
        Columns.append(('Validation' if is_val else 'Train', metric.replace('_', ' ').title()))
    Table = pd.DataFrame(Logs.values, columns = pd.MultiIndex.from_tuples(Columns))

    Loss_Columns = [col for col in Columns if col[1] == 'Loss']
    Metric_Columns = [col for col in Columns if col[1] != 'Loss']
    # Styler.set_precision is no longer available in recent pandas; an explicit
    # format string gives the same four-decimal output on old and new versions.
    Styled = Table.style.format("{:.4f}")
    if Metric_Columns:
        Styled = Styled.background_gradient(subset = Metric_Columns, cmap='BuGn')
    if Loss_Columns:
        Styled = Styled.background_gradient(subset = Loss_Columns, cmap='Wistia')
    display(Styled)
    # -------------------------------------------------------------------------------------------
    
# Show loss and accuracy per epoch for the training and validation data
Table_History(history) 

Unnamed: 0_level_0,Train,Train,Validation,Validation
Unnamed: 0_level_1,Loss,Accuracy,Loss,Accuracy
0,0.4844,0.73,0.3186,0.8638
1,0.2174,0.9158,0.3221,0.8762
2,0.1127,0.9592,0.4081,0.875


## Final Model

In [12]:
# Wrap the trained network so that it accepts raw strings: the adapted vectorization layer turns
# text into token ids, which are then fed to the trained `model`.
# NOTE: `inputs` is re-bound here; it previously referred to the integer input of `model`.
# A string input
inputs = tf.keras.Input(shape=(1,), dtype="string")
indices = vectorize_layer(inputs)
outputs = model(indices)

# Our end to end model
final_model = tf.keras.Model(inputs, outputs)
final_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
# Predictions
# NOTE(review): raw_test_ds was created without an explicit `shuffle` argument; if the loader
# shuffles by default, the rows of Pred will not line up with labels read in a separate pass over
# raw_test_ds — confirm before comparing Pred against labels.
Pred = final_model.predict(raw_test_ds)
clear_output()

***

# References
1. Text classification from scratch, https://keras.io/examples/nlp/text_classification_from_scratch/
1. Large Movie Review Dataset, https://ai.stanford.edu/~amaas/data/sentiment/
1. Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts. (2011). Learning Word Vectors for Sentiment Analysis. The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).
***