In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [3]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

print("You have tensorflow version", tf.__version__)

You have tensorflow version 2.14.0


In [9]:
df = pd.read_csv('customer_complaints.csv', encoding='latin-1')
df.dropna(inplace=True)
df.head()

Unnamed: 0.1,Unnamed: 0,product,narrative
0,0,credit_card,purchase order day shipping amount receive pro...
1,1,credit_card,forwarded message date tue subject please inve...
2,2,retail_banking,forwarded message cc sent friday pdt subject f...
3,3,credit_reporting,payment history missing credit report speciali...
4,4,credit_reporting,payment history missing credit report made mis...


In [5]:
df['product'].nuniqueque()

array(['credit_card', 'retail_banking', 'credit_reporting',
       'mortgages_and_loans', 'debt_collection'], dtype=object)

In [10]:
df['narrative'].isnull().sum()

0

In [11]:
df['product'].value_counts()

credit_reporting       91172
debt_collection        23148
mortgages_and_loans    18990
credit_card            15566
retail_banking         13535
Name: product, dtype: int64

In [12]:
train_size = int(len(df)*.8)
print("Train size:%d" % train_size)
print("Test size: %d" % (len(df) - train_size))

Train size:129928
Test size: 32483


In [14]:
train_narrative = df['narrative'][:train_size]
train_product = df['product'][:train_size]

test_narrative = df['narrative'][train_size:]
test_product = df['product'][train_size:]

In [16]:
max_word = 1000
tokenize = text.Tokenizer(num_words=max_word, char_level=False)

tokenize.fit_on_texts(train_narrative)
X_train = tokenize.texts_to_matrix(train_narrative)
X_test = tokenize.texts_to_matrix(test_narrative)

In [18]:
# use sklearn utility to convert string to number index
encoder = LabelEncoder()
encoder.fit(train_product)
y_train = encoder.transform(train_product)
y_test = encoder.transform(test_product)

In [19]:
# convert the labels to a one-hot representation.
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test =  utils.to_categorical(y_test, num_classes)

In [21]:
# Insect the dimenstions of our training and test data(this is helpful to debug)
print("x train:", X_train.shape)
print("x test:", X_test.shape)
print("y train:", y_train.shape)
print("y test:", y_test.shape)

x train: (129928, 1000)
x test: (32483, 1000)
y train: (129928, 5)
y test: (32483, 5)


In [22]:
# building model
model = Sequential()

# add layers(row)
model.add(Dense(512, input_shape=(max_word,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

In [24]:
batch_size = 32
epochs = 5

history = model.fit(X_train, y_train,
                   batch_size = batch_size,
                   epochs = epochs,
                   verbose= 1,
                   validation_split= 0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:
score = model.evaluate(X_test, y_test,
                      batch_size = batch_size, verbose = 1)
print(score)
print('Test score:', score[0])
print('Test accuracy:', score[1])

[0.4541122317314148, 0.8486900925636292]
Test score: 0.4541122317314148
Test accuracy: 0.8486900925636292


In [28]:
# here is how to generate a prediction on individual example
text_labels = encoder.classes_

for i in range(50):
    prediction = model.predict(np.array([X_test[i]]))
    prediction_label = text_labels[np.argmax(prediction)]
    print(test_narrative.iloc[i][:50], "...")
    print('Acutual label:' + test_product.iloc[i])
    print('predicted label:' + prediction_label + '\n')

following copy email message sent yesterday top ex ...
Acutual label:credit_card
predicted label:credit_reporting

following detailed account distress frustration we ...
Acutual label:mortgages_and_loans
predicted label:mortgages_and_loans

following list event occurred national credit syst ...
Acutual label:debt_collection
predicted label:debt_collection

following list credit card sychrony bank follows c ...
Acutual label:credit_card
predicted label:credit_card

following short summary complaint filed today flor ...
Acutual label:credit_reporting
predicted label:credit_reporting

following timeline event detailing issue complaint ...
Acutual label:credit_reporting
predicted label:credit_reporting

following timeline event detailing issue complaint ...
Acutual label:credit_reporting
predicted label:credit_reporting

following timeline event detailing issue complaint ...
Acutual label:credit_reporting
predicted label:credit_reporting

following incorrect personal information please re 

following show several account opened name social  ...
Acutual label:credit_reporting
predicted label:credit_reporting

following show several account opened name social  ...
Acutual label:credit_reporting
predicted label:credit_reporting

