In [None]:
# Load packages
import numpy as np
import pandas as pd
from google.colab import files
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec, KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, GRU, SpatialDropout1D, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import nltk
# Download the NLTK 'stopwords' corpus so it is available locally to nltk.corpus.stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the cleaned customer-complaints dataset (a product label and a narrative text per row).
# NOTE(review): the preview shows an extra 'Unnamed: 0' column — presumably an index saved
# with the CSV; confirm and consider reading with index_col=0.
df = pd.read_csv('/content/customer_complaints_clean_vF.csv')
df.head()

Unnamed: 0,product,narrative
0,credit_card,purchase order day shipping amount receive pro...
1,credit_card,forwarded message date tue subject please inve...
2,retail_banking,forwarded message cc sent friday pdt subject f...
3,credit_reporting,payment history missing credit report speciali...
4,credit_reporting,payment history missing credit report made mis...


In [None]:
import os
import urllib.request

url = 'https://nlp.stanford.edu/data/glove.6B.zip'
# Skip the download when the archive is already on disk, so that re-running the
# notebook does not fetch it again.
if not os.path.exists('glove.6B.zip'):
    # Download under a temporary name and rename only on success, so an
    # interrupted transfer is never mistaken for a complete archive next run.
    urllib.request.urlretrieve(url, 'glove.6B.zip.part')
    os.replace('glove.6B.zip.part', 'glove.6B.zip')


('glove.6B.zip', <http.client.HTTPMessage at 0x7fb82752ce80>)

In [None]:
import zipfile

# Unpack every member of the downloaded archive into the 'glove' directory
with zipfile.ZipFile('glove.6B.zip', mode='r') as glove_archive:
    glove_archive.extractall(path='glove')


In [None]:
def load_embedding(filename):
    """Load word vectors from a whitespace-separated text file into a dict.

    Every data line holds a word followed by its vector components. A leading
    word2vec-style header line (exactly two integers: vocabulary size and
    dimension) is skipped when present. Files without a header, such as raw
    GloVe text files, are read from the very first line so that the first word
    is not silently dropped.

    Args:
        filename: Path to the embedding text file (read as UTF-8).

    Returns:
        dict mapping each word (str) to a 1-D float32 numpy array.
    """
    embedding_dict = {}
    with open(filename, 'r', encoding='utf-8') as f:
        # Iterate lazily instead of readlines() so the whole file is never
        # held in memory at once.
        for line_number, line in enumerate(f):
            values = line.split()
            if not values:
                continue  # tolerate blank lines instead of failing on values[0]
            if line_number == 0 and len(values) == 2 and all(v.isdigit() for v in values):
                continue  # word2vec header: "<vocab_size> <dim>"
            embedding_dict[values[0]] = np.asarray(values[1:], dtype='float32')
    return embedding_dict

In [None]:
# Read the GloVe vectors from the extracted 100d file into a {word: vector} dict
embedding_dict = load_embedding('glove/glove.6B.100d.txt')

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pull the inputs (narrative text) and targets (product label) out of the frame,
# each as a plain list of strings
X, Y = (df[column].astype(str).tolist() for column in ('narrative', 'product'))

In [None]:
# Inspect the column dtypes of the loaded frame
print(df.dtypes)


product      object
narrative    object
dtype: object


In [None]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Convert target labels to integer class ids
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(Y)

# Convert target data to one-hot encoded vectors.
# The number of classes is taken from the fitted encoder instead of a
# hard-coded 5, so the encoding stays correct if the set of products changes.
# to_categorical already returns a 2-D (n_samples, num_classes) array for this
# 1-D input, so no further reshape is needed.
Y = to_categorical(Y, num_classes=len(label_encoder.classes_))

In [None]:
# Sanity check on the one-hot targets: expect (number of samples, number of classes)
Y.shape

(112191, 5)

In [None]:
# Make sure every narrative is a plain string before it reaches the tokenizer
X_str = list(map(str, X))

# Fit the tokenizer on the corpus (num_words=5000 caps the vocabulary used when
# converting text to sequences), then index each narrative and pad it to a
# fixed length of 200
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_str)
X = pad_sequences(tokenizer.texts_to_sequences(X_str), maxlen=200)


In [None]:
# Build the embedding matrix for the tokenizer's vocabulary: row i holds the
# GloVe vector of the word whose tokenizer index is i. Words without a GloVe
# vector keep an all-zero row.
word_index = tokenizer.word_index
# One extra row beyond the vocabulary size (presumably because tokenizer
# indices start at 1, leaving row 0 for padding — confirm).
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for token, row in word_index.items():
    if token in embedding_dict:
        embedding_matrix[row] = embedding_dict[token]

In [None]:
# Split data into training and testing sets: 20% is held out for testing, and
# random_state=42 fixes the shuffle so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Convert one-hot encoded labels to class labels
Y_train_labels = np.argmax(Y_train, axis=1)

# Compute 'balanced' class weights for the classes present in the training labels
class_labels = np.unique(Y_train_labels)
class_weights = compute_class_weight('balanced', classes=class_labels, y=Y_train_labels)

# Key each weight by its actual class label. compute_class_weight returns the
# weights in the order of `classes`, so pairing them by position (enumerate)
# is only correct when the labels are exactly 0..k-1 with none missing.
class_weights_dict = {
    int(label): float(weight)
    for label, weight in zip(class_labels, class_weights)
}

In [None]:
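# Y = np.squeeze(Y)  # left disabled: Y is already 2-D with no length-1 axis (see the Y.shape check above), so squeezing would be a no-op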

In [None]:
# Build GRU model
import tensorflow as tf
model = tf.keras.Sequential()
# Frozen embedding layer initialised from the pre-trained vectors. Its sizes
# are read from embedding_matrix / X_train so they cannot drift out of sync
# with the preprocessing cells.
model.add(Embedding(embedding_matrix.shape[0],
                    embedding_matrix.shape[1],
                    weights=[embedding_matrix],
                    input_length=X_train.shape[1],
                    trainable=False))
model.add(SpatialDropout1D(0.2))
model.add(Bidirectional(GRU(64, dropout=0.2, recurrent_dropout=0.2)))
# One softmax output per class, sized from the one-hot targets rather than hard-coded
model.add(Dense(Y_train.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop once val_loss has not improved by at least min_delta for 3 epochs.
# restore_best_weights=True rolls the model back to the best epoch when early
# stopping triggers, instead of keeping the weights from the last epoch run.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001,
                               restore_best_weights=True)

# class_weight applies the balanced weights computed in the previous cell;
# they were previously built but never passed to fit().
# NOTE(review): metrics recorded in this notebook before this change need a re-run.
history = model.fit(
    X_train,
    Y_train,
    epochs=10,
    batch_size=128,
    validation_split=0.2,
    class_weight=class_weights_dict,
    callbacks=[early_stopping],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Evaluate on the held-out test split. The model was compiled with accuracy as
# its only metric, so evaluate() yields the loss followed by the accuracy.
scores = model.evaluate(X_test, Y_test)
test_loss, test_accuracy = scores
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy*100:.2f}%")

Test Loss: 0.4571
Test Accuracy: 83.21%


In [None]:
# Collapse the one-hot targets and the predicted class probabilities to class ids.
# The results go into new names instead of overwriting Y_test, so Y_test stays
# one-hot and both this cell and the evaluation cell can be re-run safely.
Y_test_labels = np.argmax(Y_test, axis=1)
Y_pred_probs = model.predict(X_test)
Y_pred = np.argmax(Y_pred_probs, axis=1)

# Per-class precision / recall / F1 on the test split
print(classification_report(Y_test_labels, Y_pred))

              precision    recall  f1-score   support

           0       0.78      0.79      0.78      2978
           1       0.87      0.86      0.86      8896
           2       0.82      0.75      0.78      4145
           3       0.79      0.89      0.84      3760
           4       0.85      0.84      0.84      2660

    accuracy                           0.83     22439
   macro avg       0.82      0.83      0.82     22439
weighted avg       0.83      0.83      0.83     22439

