# Deep Learning for Text Classification
## This program is used to take tensorflow to perform the text classification, which was made based on the folk from:
## - susanli2016/Machine-Learning-with-Python

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

# This code was tested with TensorFlow v1.4
print("You have TensorFlow version", tf.__version__)

  from ._conv import register_converters as _register_converters


You have TensorFlow version 1.7.0


Using TensorFlow backend.


In [3]:
df = pd.read_excel('./98_data/qalist_answer.xlsx', encoding='latin-1')
df.head()

Unnamed: 0,question,answer
0,I have the problem on The field city could not...,"The system setting is missing, and it is corre..."
1,I have the problem on The field city could not...,"The system setting is missing, and it is corre..."
2,I have the problem on The field city could not...,"The system setting is missing, and it is corre..."
3,I have the problem on The field city could not...,"The system setting is missing, and it is corre..."
4,I have the problem on The field city could not...,"The system setting is missing, and it is corre..."


In [6]:
df.columns=['Emails', 'Cat']
col = ['Emails', 'Cat']
#df = df[col]
df = df[pd.notnull(df['Emails'])]
df.head()

Unnamed: 0,Emails,Cat
0,I have the problem on The field city could not...,"The system setting is missing, and it is corre..."
1,I have the problem on The field city could not...,"The system setting is missing, and it is corre..."
2,I have the problem on The field city could not...,"The system setting is missing, and it is corre..."
3,I have the problem on The field city could not...,"The system setting is missing, and it is corre..."
4,I have the problem on The field city could not...,"The system setting is missing, and it is corre..."


In [7]:
df.isnull().sum()

Emails    0
Cat       0
dtype: int64

In [8]:
df['Cat'].value_counts()

The system bug and it is working now after applying patch.                                                      5808
The role was not changes since the change was not transferred to system.It was done, and should be work now.    3872
The system setting is missing, and it is corrected after reinputting it.                                        3872
The system change was done, and this field was removed. Please check the annoucement sent last month.           3872
The system maintenance was completed, and it is working now.                                                    1936
Please clear your IE cache, if not work, please upgrade your windows to windows 10.                              484
Your authorization is insufficient, please apply for the additional authorizations.                              242
The system setting was changed, and we will restore the system setting, after that, please try it again.         242
Name: Cat, dtype: int64

In [9]:
# Split data into train and test
train_size = int(len(df) * .8)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(df) - train_size))

Train size: 16262
Test size: 4066


In [10]:
train_narrative = df['Emails'][:train_size]
train_product = df['Cat'][:train_size]

test_narrative = df['Emails'][train_size:]
test_product = df['Cat'][train_size:]

In [11]:
# Open question: why the max_words is required?
max_words = 1000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)

In [12]:
tokenize.fit_on_texts(train_narrative) # only fit on train
x_train = tokenize.texts_to_matrix(train_narrative)
x_test = tokenize.texts_to_matrix(test_narrative)

In [13]:
# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(train_product)
y_train = encoder.transform(train_product)
y_test = encoder.transform(test_product)

In [14]:
# Converts the labels to a one-hot representation
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)

In [15]:
# Inspect the dimenstions of our training and test data (this is helpful to debug)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

x_train shape: (16262, 1000)
x_test shape: (4066, 1000)
y_train shape: (16262, 8)
y_test shape: (4066, 8)


In [16]:
# Open questions: why are those parameters used for?
batch_size = 32
epochs = 5

In [17]:
# Build the model
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [18]:
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Train on 14635 samples, validate on 1627 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
# Evaluate the accuracy of our trained model
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.0006621574547917816
Test accuracy: 1.0


In [20]:
# Here's how to generate a prediction on individual examples
text_labels = encoder.classes_ 

for i in range(10):
    prediction = model.predict(np.array([x_test[i]]))
    predicted_label = text_labels[np.argmax(prediction)]
    print(test_narrative.iloc[i][:50], "...")
    print('Actual label:' + test_product.iloc[i])
    print("Predicted label: " + predicted_label + "\n")

It is really frastruted to meet with the following ...
Actual label:The system change was done, and this field was removed. Please check the annoucement sent last month.
Predicted label: The system change was done, and this field was removed. Please check the annoucement sent last month.

It is really frastruted to meet with the following ...
Actual label:The system change was done, and this field was removed. Please check the annoucement sent last month.
Predicted label: The system change was done, and this field was removed. Please check the annoucement sent last month.

It is really frastruted to meet with the following ...
Actual label:The system change was done, and this field was removed. Please check the annoucement sent last month.
Predicted label: The system change was done, and this field was removed. Please check the annoucement sent last month.

It is really frastruted to meet with the following ...
Actual label:The system change was done, and this field was removed. Please