In [1]:
import re
import string

import joblib
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout, Input
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import set_random_seed

In [3]:
# Location of the raw complaints export. Held in one named constant so the
# path only has to be changed here when the notebook runs on another machine.
DATA_PATH = "D:/New download/rows.csv"

# Raw CSV column name -> short name used by the rest of the notebook.
COLUMN_RENAMES = {
    "Consumer complaint narrative": "text",
    "Product": "label",
}
raw_columns = list(COLUMN_RENAMES)

# Parse only the two columns that are actually used instead of the whole file.
# `usecols` returns columns in file order, so they are re-selected explicitly
# and renamed by name rather than by position.
# NOTE(review): the echoed `read_csv` line in this cell's output looks like the
# tail of a pandas parser warning; skipping the unused columns should avoid it
# if those columns were the cause - confirm on the next run.
df = (
    pd.read_csv(DATA_PATH, usecols=raw_columns)[raw_columns]
    .dropna()  # keep only rows that have both a narrative and a product
    .rename(columns=COLUMN_RENAMES)
)

  df = pd.read_csv("D:/New download/rows.csv")


In [5]:
# Maps the raw "Product" values onto the coarse classes the model predicts.
# Products that are not listed here are dropped from the dataset below.
LABEL_MAP = {
    "Credit card": "billing",
    "Bank account or service": "technical",
    "Mortgage": "service_delay",
    "Debt collection": "billing",
    "Student loan": "general",
    "Checking or savings account": "technical",
    "Money transfer": "billing",
}

# Make the cell safe to re-run: values that are already one of the target
# classes are kept as-is. Without this guard a second execution would map every
# (already mapped) label to NaN and the dropna below would empty the frame.
target_classes = set(LABEL_MAP.values())
already_mapped = df["label"].isin(target_classes)
df["label"] = df["label"].where(already_mapped, df["label"].map(LABEL_MAP))

# Rows whose product has no entry in LABEL_MAP now hold NaN and are removed.
df = df.dropna()

print("Dataset size:", df.shape)
print("Label distribution:\n", df["label"].value_counts())

Dataset size: (208111, 2)
Label distribution:
 label
billing          105548
service_delay     52987
technical         27766
general           21810
Name: count, dtype: int64


In [7]:
# Built once instead of on every call: clean_text is applied to every row.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)
_URL_RE = re.compile(r"http\S+|www\S+")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Normalise one complaint narrative for tokenisation.

    Steps, in order: lower-case, replace URL-like tokens (starting with
    ``http`` or ``www``) by a space, delete every ASCII punctuation character,
    collapse runs of whitespace to a single space and strip both ends.

    Args:
        text: The raw narrative. ``None`` and float NaN are treated as
            "no text" instead of raising ``AttributeError``.

    Returns:
        The cleaned string; ``""`` for missing input.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = text.lower()
    text = _URL_RE.sub(" ", text)
    # Punctuation is deleted, not replaced, so "don't" becomes "dont".
    text = text.translate(_PUNCT_TABLE)
    return _WHITESPACE_RE.sub(" ", text).strip()

# Normalise every narrative in place of the raw text column.
df["text"] = df["text"].map(clean_text)

In [9]:
# Learn the string-label -> integer-id mapping, then encode the label column.
le = LabelEncoder()
le.fit(df["label"])
df["label_enc"] = le.transform(df["label"])

# Number of distinct classes; used as the size of the model's output layer.
num_classes = len(le.classes_)

# Persist the fitted encoder so predicted ids can be decoded back to labels.
joblib.dump(le, "customer_label_encoder.joblib")
print("Customer Label encoder saved.")

Customer Label encoder saved.


In [11]:
MAX_VOCAB = 10_000  # vocabulary size cap passed to the tokenizer (num_words)
MAX_LEN = 100       # every sequence is padded/truncated to this many tokens

# NOTE(review): the tokenizer is fitted on all rows, before the train/test
# split performed in a later cell, so test-set text contributes to the
# word index.
tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(df["text"])

# Text -> integer ids -> fixed-width matrix, zero-padded at the end.
# NOTE(review): only the padding side is set; the truncation side of longer
# narratives is left at the library default - confirm that is intended.
sequences = tokenizer.texts_to_sequences(df["text"])
X = pad_sequences(sequences, maxlen=MAX_LEN, padding="post")
y = df["label_enc"].to_numpy()

# Persist the fitted tokenizer alongside the model artefacts.
joblib.dump(tokenizer, "tokenizer.joblib")
print("Tokenizer saved.")

Tokenizer saved.


In [13]:
# Hold out 20% of the rows for testing; stratify on y so both parts keep the
# same class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [15]:
EMBED_DIM = 100   # size of each learned word vector
LSTM_UNITS = 64   # hidden units per LSTM direction
SEED = 42

# Seed the random generators used for weight initialisation, dropout and batch
# shuffling so that repeated runs of the notebook are comparable.
set_random_seed(SEED)

# The input shape is declared with an explicit Input layer. The Embedding
# `input_length` argument is deprecated in Keras 3; with the Input layer the
# model is built as soon as it is constructed.
model = Sequential([
    Input(shape=(MAX_LEN,)),
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM),
    # return_sequences=False: only the final state of each direction is passed on.
    Bidirectional(LSTM(LSTM_UNITS, return_sequences=False)),
    Dropout(0.3),
    Dense(num_classes, activation="softmax"),
])

# Labels are integer class ids (not one-hot), hence the sparse loss.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])



In [19]:
# Build the model for inputs of MAX_LEN token ids per sample; the leading None
# leaves the batch dimension unspecified.
model.build(input_shape=(None, MAX_LEN))

In [21]:
# Show the model's layer-by-layer summary.
model.summary()

In [23]:
# Save the model only on epochs where the validation loss reaches a new
# minimum. The epoch number is part of the filename, so every improvement is
# written to its own file.
checkpoint = ModelCheckpoint(
    "customer_model-{epoch:03d}.keras",
    monitor="val_loss",
    verbose=1,
    save_best_only=True,
    mode="min",
)

In [25]:
# Train for up to 10 epochs. A fifth of the training rows is set aside as the
# validation set whose loss drives the checkpoint callback.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=64,
    callbacks=[checkpoint],
)

Epoch 1/10
2082/2082 ━━━━━━━━━━━━━━━━━━━━ 0s 124ms/step - accuracy: 0.7940 - loss: 0.5673
Epoch 1: val_loss improved from inf to 0.33082, saving model to customer_model-001.keras
2082/2082 ━━━━━━━━━━━━━━━━━━━━ 288s 136ms/step - accuracy: 0.7941 - loss: 0.5673 - val_accuracy: 0.8937 - val_loss: 0.3308
Epoch 2/10
2082/2082 ━━━━━━━━━━━━━━━━━━━━ 0s 109ms/step - accuracy: 0.9068 - loss: 0.2913
Epoch 2: val_loss improved from 0.33082 to 0.30739, saving model to customer_model-002.keras
2082/2082 ━━━━━━━━━━━━━━━━━━━━ 244s 117ms/step - accuracy: 0.9068 - loss: 0.2913 - val_accuracy: 0.9015 - val_loss: 0.3074
Epoch 3/10
2082/2082 ━━━━━━━━━━━━━━━━━━━━ 0s 108ms/step - accuracy: 0.9184 - loss: 0.2501
Epoch 3: val_loss improved from 0.30739 to 0.30537, saving model to customer_model-003.keras
2082/2082 ━━━━━━━━━━━━━━━━

In [27]:
# Derive the best checkpoint from this run's history instead of hard-coding an
# epoch number. The checkpoint callback only writes a file on epochs where
# val_loss improves, so the epoch with the lowest val_loss is the most recent
# file it saved. Filenames use 1-based epoch numbers (epoch 1 was saved as
# "customer_model-001.keras"), hence the +1.
val_losses = history.history["val_loss"]
best_epoch = int(np.nanargmin(val_losses)) + 1  # nanargmin skips NaN losses
best_checkpoint = f"customer_model-{best_epoch:03d}.keras"

model.load_weights(best_checkpoint)
print("Best model weights loaded.")

Best model weights loaded.


In [29]:
# Score the model on the held-out test split; evaluate() returns the loss
# followed by the compiled metric (accuracy).
loss, acc = model.evaluate(x=X_test, y=y_test, verbose=0)
print(f"\nTest Accuracy: {acc*100:.2f}%")


Test Accuracy: 90.43%
