# Importing Modules

In [101]:
%load_ext autotime

import warnings
import numpy as np
import pandas as pd
import preprocessing as pp
from sklearn import metrics
import neptune.new as neptune
from tensorflow.keras import layers
from sklearn.pipeline import Pipeline
from gensim.models.word2vec import Word2Vec
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from neptune.new.integrations.tensorflow_keras import NeptuneCallback

warnings.filterwarnings("ignore")

# import pickle
# import subprocess

# import seaborn as sn
# import xgboost as xgb

# from tqdm.notebook import tqdm
# import matplotlib.pyplot as plt


# import sklearn
# 
# from sklearn.svm import LinearSVC
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import cross_validate
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.feature_extraction.text import TfidfVectorizer

# import nltk

# from nltk.stem import PorterStemmer
# from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer

# import tensorflow as tf


The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 0 ns (started: 2022-01-27 18:54:49 +05:30)


## Converting sentences to vec

In [4]:
def sent2vec(s, vec_s, embed):
    words = word_tokenize(s)
    M = []
    for w in words:
        try:
            M.append(embed[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(vec_s)
    return v / np.sqrt((v ** 2).sum())

time: 0 ns (started: 2022-01-27 16:30:46 +05:30)


# Importing Data

In [5]:
con_com_use = pd.read_csv('train.csv')

time: 110 ms (started: 2022-01-27 16:30:47 +05:30)


# Configuration

In [6]:
# Independent variables
INDEPENDENT_FEATURES = ['Consumer complaint narrative']

# Dependent variable
DEPENDENT_FEATURES = ['Product']

# Sample size
TRAIN_SIZE = 10000

TEST_SIZE = 3000

# Random state
RANDOM_STATE = 0

# Word2Vec config
VECTOR_SIZE = 160
MIN_COUNT = 2 # Minimum frequency of words to be used
WINDOW = 7 # Maximum distance between the current and predicted word within a sentence
EPOCHS=50 # Number of iterations over the corpus
SG=1 # Training algo. 1 means skip-gram

# Padding size
MAX_LENGHT = 140

time: 0 ns (started: 2022-01-27 16:30:47 +05:30)


# Train-test split

In [53]:
xtrain, xtest, ytrain, ytest = train_test_split(con_com_use[INDEPENDENT_FEATURES[0]],
                                                con_com_use[DEPENDENT_FEATURES[0]],
                                                stratify=con_com_use[DEPENDENT_FEATURES[0]],
                                                random_state=RANDOM_STATE,
                                                test_size=0.3)
print (xtrain.shape)
print (xtest.shape)

(7000,)
(3000,)
time: 16 ms (started: 2022-01-27 16:54:03 +05:30)


# Encoding Dependent Labels

In [54]:
lab_enc = LabelEncoder()

ytrain = lab_enc.fit_transform(ytrain)
ytest = lab_enc.transform(ytest)

ytrain_enc = to_categorical(ytrain)
ytest_enc = to_categorical(ytest)

time: 16 ms (started: 2022-01-27 16:54:03 +05:30)


# Preprocessing Pipeline

In [55]:
text_process_pipe = Pipeline([
    
    ('text_cleanup', pp.textcleanup()),
    
    ('text_tokenenize', pp.texttokenize()),
    
    ('text_stopwordremove', pp.textstopwordremove()),
    
    ('text_lemmatize', pp.textlemmatize()),
    
    ('text_stemmer', pp.textstemmer()),
    
    ('text_token', pp.texttokenize2(MAX_LENGHT)),
    
])

text_process_pipe.fit(xtrain)

Pipeline(steps=[('text_cleanup', textcleanup()),
                ('text_tokenenize', texttokenize()),
                ('text_stopwordremove', textstopwordremove()),
                ('text_lemmatize', textlemmatize()),
                ('text_stemmer', textstemmer()),
                ('text_token', texttokenize2(variable=140))])

time: 20.3 s (started: 2022-01-27 16:54:03 +05:30)


In [56]:
xtrain = text_process_pipe.transform(xtrain)
xtest = text_process_pipe.transform(xtest)

word_index = eval(text_process_pipe.named_steps['text_token'].token.get_config()['word_index'])

time: 29.3 s (started: 2022-01-27 16:54:23 +05:30)


# Training a Word2Vec model

In [57]:
word2vecModel = Word2Vec(con_com_use[INDEPENDENT_FEATURES[0]].str.split(' ').tolist(),
                         min_count=MIN_COUNT,
                         vector_size=VECTOR_SIZE,
                         workers=12, 
                         window=WINDOW, 
                         sg=SG,
                         epochs=EPOCHS)
gensim_embbed = dict(zip(word2vecModel.wv.index_to_key, word2vecModel.wv.vectors))

embedding_matrix = np.zeros((len(word_index) + 1, VECTOR_SIZE))
for word, i in word_index.items():
    embedding_vector = gensim_embbed.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

time: 1min 34s (started: 2022-01-27 16:54:53 +05:30)


# Bi-Directional LSTM

In [104]:
modelLSTM = Sequential()
modelLSTM.add(layers.Embedding(len(word_index)+1,
                               VECTOR_SIZE,
                               weights=[embedding_matrix],
                               input_length=MAX_LENGHT,
                               trainable=False))

modelLSTM.add(layers.Bidirectional(layers.LSTM(512, dropout=0.2)))

modelLSTM.add(layers.Dense(1024, activation='relu'))
modelLSTM.add(layers.Dropout(0.8))

modelLSTM.add(layers.Dense(1024, activation='relu'))
modelLSTM.add(layers.Dropout(0.8))

modelLSTM.add(layers.Dense(8))
modelLSTM.add(layers.Activation('softmax'))

modelLSTM.compile(loss='categorical_crossentropy', optimizer='adam')
earlystop = EarlyStopping(monitor='val_loss', patience=5, verbose=0)

run = neptune.init(
    project="kumars/Consumer-Complaint",
    api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI1MmY3NWVjYy0wNDliLTQ1MDktODIzYS01NzBhOWMzYmVkYTYifQ==",
) 
neptune_clbk = NeptuneCallback(run=run, base_namespace='metrics')

modelLSTM.summary()

https://app.neptune.ai/kumars/Consumer-Complaint/e/CON-3
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 140, 160)          1573120   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 1024)              2756608   
_________________________________________________________________
dense_15 (Dense)             (None, 1024)              1049600   
_________________________________________________________________
dropout_10 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_16 (Dense)             (None, 1024)  

In [105]:
modelLSTM.fit(xtrain,
              y=ytrain_enc,
              batch_size=512,
              epochs=100,
              verbose=1,
              validation_split=0.3,
              callbacks=[earlystop])

run.stop()

2022/01/27 18:58:00 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '4b944969db764809a1bd2637d508d98e', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current tensorflow workflow


Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100




INFO:tensorflow:Assets written to: C:\Users\krsid\AppData\Local\Temp\tmp40tzfvpy\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\krsid\AppData\Local\Temp\tmp40tzfvpy\model\data\model\assets


Shutting down background jobs, please wait a moment...
Done!


Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.


All 1 operations synced, thanks for waiting!
time: 1min 59s (started: 2022-01-27 18:58:00 +05:30)


In [63]:
print(metrics.classification_report(
    np.argmax(ytest_enc, axis=-1),
    np.argmax(modelLSTM.predict(xtest), axis=-1),
    target_names=con_com_use['Product'].unique()))

                                                                                 precision    recall  f1-score   support

                                                                Debt collection       0.54      0.51      0.53       215
   Credit reporting, credit repair services, or other personal consumer reports       0.39      0.17      0.23       149
                                                                   Student loan       0.57      0.71      0.63       328
                                                    Credit card or prepaid card       0.81      0.87      0.84      1150
                                                                       Mortgage       0.81      0.72      0.76       615
                             Money transfer, virtual currency, or money service       0.00      0.00      0.00        85
                                       Bank account or service, Savings account       0.65      0.82      0.72       342
Consumer loan, Vehicle loan or 

# Score new data

In [73]:
data = pd.read_csv('test.csv')

xtest_new = text_process_pipe.transform(data[INDEPENDENT_FEATURES[0]])
ytest_new = to_categorical(lab_enc.transform(data[DEPENDENT_FEATURES[0]]))

time: 9.48 s (started: 2022-01-27 17:48:20 +05:30)


In [74]:
pred_test_new = modelLSTM.predict(xtest_new)

time: 1.61 s (started: 2022-01-27 17:48:30 +05:30)


In [75]:
print(metrics.classification_report(
    np.argmax(ytest_new, axis=-1),
    np.argmax(pred_test_new, axis=-1),
    target_names=data[DEPENDENT_FEATURES[0]].unique()))

                                                                                 precision    recall  f1-score   support

                                                                   Student loan       0.58      0.53      0.55       215
                                                                Debt collection       0.32      0.15      0.20       149
                                       Bank account or service, Savings account       0.59      0.70      0.64       328
   Credit reporting, credit repair services, or other personal consumer reports       0.79      0.88      0.83      1150
                                                    Credit card or prepaid card       0.76      0.71      0.73       615
                             Money transfer, virtual currency, or money service       1.00      0.08      0.15        85
Consumer loan, Vehicle loan or lease, Payday loan, title loan, or personal loan       0.65      0.75      0.69       342
                               

# Conclusion

We are ready for deployment