In [3]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv("complaints_with_topics.csv")

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,complaint,type
0,0,Debt collection Credit card debt,2
1,1,Good morning my name is XXXX XXXX and I apprec...,5
2,2,I upgraded my XXXX XXXX card in XX/XX/2018 and...,5
3,3,Mortgage Conventional home mortgage,4
4,4,Credit card or prepaid card General-purpose c...,1


In [6]:
df["type"].unique()

array([2, 5, 4, 1, 3], dtype=int64)

In [7]:
import keras
import nltk, re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [8]:


def preprocess(texts):
    
    texts = [re.findall(r'\w+', line.lower()) for line in texts]
    # remove stopwords
    texts = [remove_stopwords(' '.join(line)).split() for line in texts]
    # remove punctuation
    texts = [strip_punctuation(' '.join(line)).split() for line in texts]
    # remove words that are only one or two characters
    texts = [[token for token in line if len(token) >2] for line in texts]
    # remove numbers
    texts = [[token for token in line if not token.isnumeric()] for line in texts]
    # lemmatization
    lemmatizer = WordNetLemmatizer()
    texts = [[word for word in lemmatizer.lemmatize(' '.join(line), pos='v').split()] for line in texts]
    texts = [" ".join(line) for line in texts]

    return texts

In [9]:
preprocessed_text = preprocess(df["complaint"])

In [10]:
preprocessed_text

['debt collection credit card debt',
 'good morning xxxx xxxx appreciate help stop chase bank cardmember services wrote chase asking debt verification sent statement acceptable asking bank validate debt instead receiving mail month attempting collect debt right know information consumer chase account xxxx xxxx xxxx xxxx thanks advance help debt collection credit card debt',
 'upgraded xxxx xxxx card told agent upgrade anniversary date change turned agent giving wrong information order upgrade account xxxx changed anniversary date xxxx xxxx consent xxxx recording agent misled credit card prepaid card general purpose credit card charge card',
 'mortgage conventional home mortgage',
 'credit card prepaid card general purpose credit card charge card',
 'checking savings account checking account',
 'checking savings account checking account',
 'mortgage conventional home mortgage',
 'checking savings account checking account',
 'checking savings account checking account',
 'chase card repor

In [12]:
from keras.preprocessing.text import one_hot

In [13]:
voc_size=10000

In [14]:
encoded = [one_hot(words, voc_size) for words in preprocessed_text]

In [15]:
encoded

[[4215, 3530, 1750, 7694, 4215],
 [6588,
  3086,
  5095,
  5095,
  624,
  2123,
  28,
  4203,
  8581,
  4635,
  5492,
  2805,
  4203,
  8456,
  4215,
  2097,
  667,
  4990,
  5448,
  8456,
  8581,
  1051,
  4215,
  8570,
  7426,
  3024,
  5992,
  1420,
  7225,
  4215,
  5470,
  490,
  8464,
  4217,
  4203,
  6906,
  5095,
  5095,
  5095,
  5095,
  4170,
  2164,
  2123,
  4215,
  3530,
  1750,
  7694,
  4215],
 [7751,
  5095,
  5095,
  7694,
  4709,
  913,
  6312,
  5035,
  5788,
  7847,
  2281,
  913,
  7978,
  7456,
  8464,
  1884,
  6312,
  6906,
  5095,
  1282,
  5035,
  5788,
  5095,
  5095,
  5182,
  5095,
  3635,
  913,
  3217,
  1750,
  7694,
  8441,
  7694,
  6181,
  920,
  1750,
  7694,
  8791,
  7694],
 [7713, 441, 9075, 7713],
 [1750, 7694, 8441, 7694, 6181, 920, 1750, 7694, 8791, 7694],
 [523, 7295, 6906, 523, 6906],
 [523, 7295, 6906, 523, 6906],
 [7713, 441, 9075, 7713],
 [523, 7295, 6906, 523, 6906],
 [523, 7295, 6906, 523, 6906],
 [4203,
  7694,
  5213,
  2788,
  5499,


In [16]:
from keras.layers import Embedding, Dense, Flatten
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer

In [19]:
padded_encoded = pad_sequences(encoded)

In [20]:
t = Tokenizer()
t.fit_on_texts(preprocessed_text)

In [21]:

vector_size = 200

In [22]:


embeddings_index = dict()   # A dictionary that will hold word and mapped numeric vector
f = open('glove.6B.200d.txt', encoding="utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

FileNotFoundError: [Errno 2] No such file or directory: 'glove.6B.200d.txt'

27965

In [119]:
matrix_size = len(t.word_index)+1
embedding_matrix = np.zeros((matrix_size, vector_size))
for word,i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [120]:
print(embedding_matrix)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.39943999  0.70542002 -0.075412   ... -0.0077033   0.044716
   0.78061998]
 [ 0.42771    -0.18483999  0.40634    ...  0.50226003  0.18767001
   0.32892999]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.025712    0.20636    -0.61913002 ...  0.52411997  0.82837999
  -0.20194   ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


[('googling', 8740),
 ('eighth', 14836),
 ('ccalled', 16456),
 ('deferrment', 13708),
 ('sincerity', 12420),
 ('blurry', 11794),
 ('banks', 251),
 ('censured', 14060),
 ('mybank', 20487),
 ('poas', 9188),
 ('illegall', 16198),
 ('notei', 26768),
 ('scores', 1629),
 ('violationof', 18174),
 ('struggles', 6536),
 ('frau', 16376),
 ('numeber', 22234),
 ('initialing', 14896),
 ('unsuitable', 10882),
 ('tantamount', 10453),
 ('obligator', 21516),
 ('squared', 8074),
 ('wedges', 15756),
 ('modificationwent', 25563),
 ('represntatives', 19183),
 ('guardianship', 5525),
 ('warrantied', 16325),
 ('bowever', 19787),
 ('buts', 18731),
 ('profusely', 7464),
 ('pairing', 27675),
 ('unaccountable', 9249),
 ('savigns', 13272),
 ('hurrican', 15282),
 ('retaliatory', 4545),
 ('inkling', 17957),
 ('noon', 5233),
 ('acceptable', 1600),
 ('leery', 24471),
 ('debate', 8941),
 ('bulb', 22618),
 ('reverseit', 22212),
 ('divert', 7243),
 ('instruct', 4742),
 ('taunting', 15358),
 ('progression', 9655),
 ('fie

In [139]:
input_len = len(padded_encoded[1])


In [140]:
input_len

3004

In [149]:
X = padded_encoded
y = pd.get_dummies(df["type"])

In [150]:
X.shape

(78313, 3004)

In [151]:
y.shape

(78313, 5)

In [238]:
model = Sequential()
model.add(Embedding(matrix_size,vector_size,weights=[embedding_matrix],input_length=input_len,trainable=False))
model.add(Flatten())
model.add(Dense(128,activation="relu"))
model.add(Dense(64,activation="relu"))
model.add(Dense(5,activation="softmax"))
model.compile(optimizer="rmsprop",loss="categorical_crossentropy",metrics=["accuracy"])

In [2]:
model.summary()

NameError: name 'model' is not defined

In [240]:
from sklearn.model_selection import train_test_split

In [241]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size=0.2,random_state=0)

In [None]:
model.fit(Xtrain,ytrain,batch_size=50,epochs=10,validation_data=(Xtest,ytest))

Train on 62650 samples, validate on 15663 samples
Epoch 1/10
13250/62650 [=====>........................] - ETA: 10:41 - loss: 0.0814 - acc: 0.9773

In [1]:
model.save("complaint_model_save")

NameError: name 'model' is not defined

In [198]:
loss, accuracy = model.evaluate(X,y)



In [231]:
test_doc = ["Today I check my bank account and I deposit some money please provide your bank services"]

In [232]:
encoded_test_doc = t.texts_to_sequences(test_doc)

In [233]:
encoded_test_doc

[[146, 24, 7, 2, 18446, 80, 20717, 12, 107, 7, 44]]

In [234]:
padded_encoded_doc = pad_sequences(encoded_test_doc,maxlen=input_len)

In [235]:
model.predict(padded_encoded_doc)

array([[2.6382942e-09, 2.6436223e-02, 9.7315526e-01, 6.9035667e-18,
        4.0851621e-04]], dtype=float32)