## Email Classification based on subject and body
This program demonstrates how to build three neural networks (NNs) that combine the classifications of an email's subject and body into a single result.

The background is that both the subject and the body need to be considered to classify an email. The approach is:<br>
1) Build one NN that predicts the email's label from the subject, and another NN that predicts the same label from the body.<br>
2) On top of these, build one more NN that takes their outputs as its input to predict the same label.<br>
3) Train the three NNs above to obtain the optimal weights and hyperparameters.

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [None]:
import itertools
import os
import logging
import csv        

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Embedding,LSTM,Flatten,GRU
from keras.preprocessing import text, sequence
from keras import utils

import gensim
from gensim.models import Word2Vec
from gensim.models import FastText

# This code was tested with TensorFlow v1.4
print("You have TensorFlow version", tf.__version__)

# Timestamped INFO-level logging plus a logger named after this module
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)
logger = logging.getLogger(__name__)

### Load the mock dataset

In [None]:
# Load the mock e-mail dataset from the Excel workbook.
# No `encoding` argument is passed: it is not a documented `pd.read_excel`
# parameter (a workbook is not a plain-text file), and newer pandas releases
# are expected to reject it as an unexpected keyword argument.
source = pd.read_excel('../98_data/mail_timesheet_admin_woissue.xlsx')
source.head()

In [None]:
# Keep only the three columns of interest and give them shorter names
df = source[['subject', 'mailquestion', 'Category']].rename(
    columns={'subject': 'Subject', 'mailquestion': 'Emails', 'Category': 'Cat'})
# Discard every row that lacks an email body or a subject
df = df.dropna(subset=['Emails', 'Subject'])
df.head()

In [None]:
# Count the missing values in each column
df.isnull().sum()

In [None]:
# Number of rows per category label
df['Cat'].value_counts()

In [None]:
# Helpers from a project-local module whose implementation is not visible in this notebook
from google_text_classification.explore_data import get_num_words_per_sample,plot_sample_length_distribution

# NOTE(review): the printed label says "Median" — presumably the helper returns the median word count; confirm in explore_data
print("Median words per sample:",get_num_words_per_sample(df['Emails']))

# presumably plots how the email lengths are distributed — verify against the helper
plot_sample_length_distribution(df['Emails'])

In [None]:
# Hold out part of the email bodies and their labels as a test set
# (only train and test parts are produced by this split)
train_narrative, test_narrative, train_product, test_product = train_test_split(
    df['Emails'], df['Cat'], train_size=0.8, random_state=42)
print("Train size: %d" % len(train_product))
print("Test size: %d" % len(test_product))

### Convert the words to integers

In [None]:
# Find out the maximum words for the dimension of word vectors
def text2word(doc):
    """
    Usage: Split every text of a collection into a list of lower-cased words.

    Each item is converted with str(), lower-cased and split on whitespace,
    so non-string items (numbers, None, ...) are tokenized by their string form.

    Input: doc - iterable of texts
    Output: return_docs - list of word lists, one per item of doc
            text_len - int: number of words in the longest text (0 if there is none)
            max_text - string list: the words of the longest text; the first one
                       wins on ties, and it is an empty list when doc is empty
    """
    return_docs = [str(item).lower().split() for item in doc]

    # max() returns the first of several equally long word lists, which matches
    # a left-to-right scan that only replaces the maximum on a strictly longer text.
    max_text = max(return_docs, key=len) if return_docs else []

    return return_docs, len(max_text), max_text

# max_words is the word count of the longest email over all of df
# (training and test rows alike).
# NOTE(review): it is passed as `num_words`, which Keras documents as a cap on
# the vocabulary size — confirm that the longest-email length is really the
# intended feature dimension.
documents,max_words,_=text2word(df['Emails'])
tokenize = text.Tokenizer(num_words=max_words, char_level=False)

In [None]:
# Learn the vocabulary from the training bodies only, then vectorize both splits
tokenize.fit_on_texts(train_narrative)
word_index = tokenize.word_index
x_train_o = tokenize.texts_to_matrix(train_narrative)
x_test_o = tokenize.texts_to_matrix(test_narrative)
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Map the label strings to integer ids with an encoder fitted on the training labels
encoder = LabelEncoder().fit(train_product)
y_train_o = encoder.transform(train_product)
y_test_o = encoder.transform(test_product)

In [None]:
# One-hot encode the integer labels; the class count is the largest training id plus one
num_classes = y_train_o.max() + 1
y_train_o = utils.to_categorical(y_train_o, num_classes=num_classes)
y_test_o = utils.to_categorical(y_test_o, num_classes=num_classes)

In [None]:
# Label strings known to the encoder that was fitted on the training labels
print(encoder.classes_)

In [None]:
# Fit a separate encoder on the test labels to list the classes that occur in the test set
encoder_t = LabelEncoder().fit(test_product)
print(encoder_t.classes_)

In [None]:
# Print the dimensions of the training and test arrays (helpful for debugging)
for caption, array in (('x_train shape:', x_train_o),
                       ('x_test shape:', x_test_o),
                       ('y_train shape:', y_train_o),
                       ('y_test shape:', y_test_o)):
    print(caption, array.shape)

## 1.  Model of Body

### Build the model

In [None]:
# Training hyperparameters handed to model.fit / model.evaluate:
#   batch_size - number of samples processed per batch
#   epochs     - number of training epochs
batch_size = 5
epochs = 50

In [None]:
# Body classifier: vectorized text -> 16 ReLU units -> dropout -> softmax over the classes
model = Sequential([
    Dense(16, input_shape=(max_words,)),
    Activation('relu'),
    Dropout(0.8),
    Dense(num_classes),
    Activation('softmax'),
])

### Loss function and optimizer
A model needs a loss function and an optimizer for training. Since this is a multi-class classification problem and the model outputs a probability distribution over the classes (a `num_classes`-unit layer followed by a softmax activation), we'll use the categorical_crossentropy loss function.
This isn't the only choice for a loss function, you could, for instance, choose mean_squared_error. But, generally, categorical_crossentropy is better for dealing with probabilities—it measures the "distance" between probability distributions, or in our case, between the ground-truth distribution and the predictions.
Later, when we are exploring regression problems (say, to predict the price of a house), we will see how to use another loss function called mean squared error.
Now, configure the model to use an optimizer and a loss function:

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy as reported metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

### Train the model

In [None]:
# Train the body model, holding back 20% of the training rows for validation
history = model.fit(
    x_train_o,
    y_train_o,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# Class probabilities of the body model for every training row; they become
# input features of the integration model.
# A single batched predict call replaces one predict call per row; the result
# is kept as float64, the dtype of the array the row-by-row version filled.
b_prediction = np.asarray(model.predict(x_train_o), dtype=np.float64)


In [None]:
# Sanity check: one row per training sample, one column per class
b_prediction.shape

In [None]:
# Measure loss and accuracy of the trained body model on the held-out test set
score = model.evaluate(x_test_o, y_test_o, verbose=1, batch_size=batch_size)
print('Test score:', score[0])
print('Test accuracy:', score[1])

In [None]:
# Produce the test set for the integrated model: class probabilities of the
# body model for every test row.
# A single batched predict call replaces one predict call per row; the result
# is kept as float64, the dtype of the array the row-by-row version filled.
bt_prediction = np.asarray(model.predict(x_test_o), dtype=np.float64)

### Create a graph of accuracy and loss over time

`model.fit()` returns a `History` object that contains a dictionary with everything that happened during training:

In [None]:
# history.history holds the values recorded by model.fit; list the available series
history_dict = history.history
history_dict.keys()

There are four entries: one for each monitored metric during training and validation. We can use these to plot the training and validation loss for comparison, as well as the training and validation accuracy:

In [None]:
import matplotlib.pyplot as plt

# Older Keras releases record accuracy under 'acc', newer ones under
# 'accuracy'; use whichever key this History object actually holds.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
acc = history.history[acc_key]
val_acc = history.history['val_' + acc_key]
loss = history.history['loss']
val_loss = history.history['val_loss']

# NOTE: this rebinds `epochs` from the training epoch count to the x-axis
# values (1..number of recorded epochs); reset `epochs` before training again.
epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
plt.clf()   # clear figure

# Older Keras releases record accuracy under 'acc', newer ones under
# 'accuracy'; use whichever key the history dictionary actually holds.
acc_key = 'acc' if 'acc' in history_dict else 'accuracy'
acc_values = history_dict[acc_key]
val_acc_values = history_dict['val_' + acc_key]

# x-axis: 1..number of recorded epochs, derived from the data plotted here
epoch_axis = range(1, len(acc_values) + 1)

plt.plot(epoch_axis, acc_values, 'bo', label='Training acc')
plt.plot(epoch_axis, val_acc_values, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()

In this plot, the dots represent the training loss and accuracy, and the solid lines are the validation loss and accuracy.
Notice the training loss decreases with each epoch and the training accuracy increases with each epoch. This is expected when using a gradient descent optimization—it should minimize the desired quantity on every iteration.
This isn't the case for the validation loss and accuracy—they seem to peak after about twenty epochs. This is an example of overfitting: the model performs better on the training data than it does on data it has never seen before. After this point, the model over-optimizes and learns representations specific to the training data that do not generalize to test data.
For this particular case, we could prevent overfitting by simply stopping the training after twenty or so epochs. Later, you'll see how to do this automatically with a callback.

### Evaluate model

#### Scenario 1: Verify 10 records in test dataset

In [None]:
# Show the body model's prediction next to the actual label for the first 10 test emails
text_labels = encoder.classes_

for i in range(10):
    prediction = model.predict(np.array([x_test_o[i]]))
    best_class = np.argmax(prediction)
    print(prediction, best_class)
    predicted_label = text_labels[best_class]
    print(test_narrative.iloc[i][:50], "...")
    print('Actual label:' + test_product.iloc[i])
    print("Predicted label: " + predicted_label + "\n")

In [None]:
# Label strings of the encoder; the sample predictions index into this array with argmax
encoder.classes_

## 2.  Model of Subject

In [None]:
# Hold out part of the subjects and their labels as a test set.
# Same labels, train_size and random_state as the split of the email bodies.
s_train_narrative, s_test_narrative, s_train_product, s_test_product = train_test_split(
    df['Subject'], df['Cat'], train_size=0.8, random_state=42)
print("Train size: %d" % len(s_train_product))
print("Test size: %d" % len(s_test_product))

In [None]:
# s_max_words is the word count of the longest subject over all of df.
# NOTE(review): it is passed as `num_words`, which Keras documents as a cap on
# the vocabulary size — confirm that the longest-subject length is really the
# intended feature dimension.
subjects,s_max_words,_=text2word(df['Subject'])
s_tokenize = text.Tokenizer(num_words=s_max_words, char_level=False)

In [None]:
# Learn the vocabulary from the training subjects only, then vectorize both splits
s_tokenize.fit_on_texts(s_train_narrative)
s_word_index = s_tokenize.word_index
s_x_train_o = s_tokenize.texts_to_matrix(s_train_narrative)
s_x_test_o = s_tokenize.texts_to_matrix(s_test_narrative)
print('Found %s unique tokens.' % len(s_word_index))

In [None]:
# Map the label strings to integer ids with an encoder fitted on the subject training labels
s_encoder = LabelEncoder().fit(s_train_product)
s_y_train_o = s_encoder.transform(s_train_product)
s_y_test_o = s_encoder.transform(s_test_product)

In [None]:
# One-hot encode the integer labels; the class count is the largest training id plus one
s_num_classes = s_y_train_o.max() + 1
s_y_train_o = utils.to_categorical(s_y_train_o, num_classes=s_num_classes)
s_y_test_o = utils.to_categorical(s_y_test_o, num_classes=s_num_classes)

In [None]:
# Fit a separate encoder on the test labels to list the classes that occur in the test set
s_encoder_t = LabelEncoder().fit(s_test_product)
print(s_encoder_t.classes_)

In [None]:
# Print the dimensions of the subject training and test arrays (helpful for debugging)
for caption, array in (('s_x_train shape:', s_x_train_o),
                       ('s_x_test shape:', s_x_test_o),
                       ('s_y_train shape:', s_y_train_o),
                       ('s_y_test shape:', s_y_test_o)):
    print(caption, array.shape)

In [None]:
# Training hyperparameters handed to s_model.fit / s_model.evaluate:
#   batch_size - number of samples processed per batch
#   epochs     - number of training epochs
batch_size = 5
epochs = 50

In [None]:
# Subject classifier: vectorized text -> 16 ReLU units -> dropout -> softmax over the classes
s_model = Sequential([
    Dense(16, input_shape=(s_max_words,)),
    Activation('relu'),
    Dropout(0.8),
    Dense(s_num_classes),
    Activation('softmax'),
])

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy as reported metric
s_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
s_model.summary()

In [None]:
# Train the subject model, holding back 20% of the training rows for validation
s_history = s_model.fit(
    s_x_train_o,
    s_y_train_o,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# Measure loss and accuracy of the trained subject model on the held-out test set
s_score = s_model.evaluate(s_x_test_o, s_y_test_o, verbose=1, batch_size=batch_size)
print('Test score:', s_score[0])
print('Test accuracy:', s_score[1])

In [None]:
# Produce the training set for the integrated model: class probabilities of
# the subject model for every training row.
# A single batched predict call replaces one predict call per row; the result
# is kept as float64, the dtype of the array the row-by-row version filled.
s_prediction = np.asarray(s_model.predict(s_x_train_o), dtype=np.float64)

In [None]:
# Produce the test set for the integrated model: class probabilities of the
# subject model for every test row.
# A single batched predict call replaces one predict call per row; the result
# is kept as float64, the dtype of the array the row-by-row version filled.
st_prediction = np.asarray(s_model.predict(s_x_test_o), dtype=np.float64)

## 3.  Model of Integration

In [None]:
#### Take the prediction of subject and body as input to predict the label

In [None]:
# Training hyperparameters handed to i_model.fit / i_model.evaluate:
#   batch_size - number of samples processed per batch
#   epochs     - number of training epochs
batch_size = 5
epochs = 50

In [None]:
# Integration classifier: its input is the concatenation of the body and the
# subject class probabilities -> 16 ReLU units -> dropout -> softmax over the classes
i_model = Sequential([
    Dense(16, input_shape=(s_num_classes + num_classes,)),
    Activation('relu'),
    Dropout(0.8),
    Dense(num_classes),
    Activation('softmax'),
])

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy as reported metric
i_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
i_model.summary()

In [None]:
# Training features of the integration model: body probabilities first, then
# subject probabilities (any later input must use this same column order).
# Row i of both arrays belongs to the same email only because the body and the
# subject splits were made with the same labels, train_size and random_state.
i_prediction=np.concatenate((b_prediction,s_prediction),axis=1)
i_prediction.shape

In [None]:
# Train the integration model on the stacked probabilities against the body-split labels,
# holding back 20% of the training rows for validation
i_history = i_model.fit(
    i_prediction,
    y_train_o,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# Measure loss and accuracy of the trained integration model on the test set.
# Test features: body probabilities first, then subject probabilities.
it_prediction = np.concatenate((bt_prediction, st_prediction), axis=1)
it_prediction.shape

i_score = i_model.evaluate(it_prediction, y_test_o, verbose=1, batch_size=batch_size)
print('Test score:', i_score[0])
print('Test accuracy:', i_score[1])

In [None]:
# Compare the subject, body and integrated predictions on the first 10 test emails.
# One label table serves all three models: the subject split uses the same
# labels, train_size and random_state as the body split, so both encoders were
# fitted on the same training labels.
text_labels = encoder.classes_

for i in range(10):
    # predict label based on subject:
    es_prediction = s_model.predict(np.array([s_x_test_o[i]]))
    predicted_label = text_labels[np.argmax(es_prediction)]
    print("[Subject]: ",s_test_narrative.iloc[i][:50], "...")
    print("Predicted label: " + predicted_label)

    # predict label based on body:
    prediction = model.predict(np.array([x_test_o[i]]))
    predicted_label = text_labels[np.argmax(prediction)]
    print("[Body]: ",test_narrative.iloc[i][:50], "...")
    print("Predicted label: " + predicted_label)

    # i_model was trained and evaluated on features ordered [body, subject]
    # (i_prediction / it_prediction), so the single-sample input must use the
    # same order; feeding [subject, body] would swap the two feature halves.
    ei_prediction = np.concatenate((prediction, es_prediction), axis=1)
    f_prediction = i_model.predict(ei_prediction)
    predicted_label = text_labels[np.argmax(f_prediction)]
    print(">> Integrated prediction")
    print("-->Predicted label: " + predicted_label)
    print('-->Actual label:' + test_product.iloc[i] + "\n")