In [None]:
pip install emoji

In [None]:
import re, string, nltk, emoji
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
import pandas as pd, numpy as np, re, nltk, matplotlib.pyplot as plt, warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

warnings.filterwarnings("ignore")

In [None]:
# Download the zipped complaints CSV from consumerfinance.gov and load it
# directly into a DataFrame (requires network access).
url = "https://files.consumerfinance.gov/ccdb/complaints.csv.zip"
df = pd.read_csv(url, compression="zip")
# Preview the first five rows as the cell output.
df.head(5)

In [None]:
def categorize_product(product):
    """Collapse a raw product name into one of four target classes.

    Args:
        product: Raw value of the ``Product`` column.

    Returns:
        ``'Debt collection'``, ``'Consumer Loan'`` or ``'Mortgage'`` when the
        value matches exactly; ``'Credit reporting, repair, or other'`` for any
        other string (including every credit reporting / credit repair
        variant). Non-string values (``None``, ``NaN``) are returned unchanged
        so a later ``dropna()`` can remove them instead of this function
        raising ``TypeError`` on the substring test.
    """
    # Missing values are not strings; `'...' in product` would raise on them.
    if not isinstance(product, str):
        return product

    if 'Credit reporting' in product or 'Credit repair' in product:
        return 'Credit reporting, repair, or other'

    # These three classes are kept under their own name.
    if product in ('Debt collection', 'Consumer Loan', 'Mortgage'):
        return product

    # Group all other categories into the 0th class
    return 'Credit reporting, repair, or other'

# Re-label every row with its collapsed product class.
recategorized_products = df['Product'].apply(categorize_product)
df['Product'] = recategorized_products

# Class distribution after the re-categorization.
product_counts = df["Product"].value_counts()
print(product_counts)

In [None]:
# Plain-text preview of the first rows after `Product` was re-categorized.
print(df.head())

In [None]:
# Normalise the column names, keep only the label and narrative columns, and
# discard rows where either of them is missing.
df = (
    df.rename(columns=lambda col: col.strip().lower())
      .loc[:, ["product", "consumer complaint narrative"]]
      .dropna()
)

categories = [
    "Credit reporting, repair, or other",
    "Debt collection",
    "Consumer Loan",
    "Mortgage",
]

# Restrict to the target classes, give the narrative a short name, and shuffle
# the rows reproducibly.
in_target_classes = df["product"].isin(categories)
df = (
    df[in_target_classes]
    .rename(columns={"consumer complaint narrative": "text"})
    .sample(frac=1, random_state=42)
)

print(f"Filtered dataset size: {len(df)}")
print(df["product"].value_counts(), "\n")

In [None]:
# Encode the product names as integer class ids in a new `label` column.
# `le` is kept because later cells read `le.classes_` for the number of output
# units and for the class names shown in reports and confusion matrices.
le = LabelEncoder()
df["label"] = le.fit_transform(df["product"])

In [None]:
import re, string, emoji, nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Make sure the NLTK corpora used below are installed. Without them a fresh
# kernel raises LookupError: `stopwords.words` needs the "stopwords" corpus and
# the WordNet lemmatizer needs "wordnet". The download is skipped when the
# resource is already present and up to date.
for _resource in ("stopwords", "wordnet"):
    nltk.download(_resource, quiet=True)

# Initialize objects
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Add extra common or dataset-specific stopwords
custom_stopwords = {
    "xxxx", "xx", "na", "nan", "n/a", "account", "creditor", "loan", "report",
    "complaint", "consumer", "finance", "company"
}
stop_words |= custom_stopwords

In [None]:
# Contraction fragments expanded before punctuation is stripped. Insertion
# order matters: the whole-word forms ("can't", "won't") must be replaced
# before the generic "n't" / "'t" suffixes.
_CONTRACTIONS = {
    "can't": "cannot", "won't": "will not", "n't": " not", "'re": " are",
    "'s": " is", "'d": " would", "'ll": " will", "'t": " not", "'ve": " have", "'m": " am"
}

# Maps every ASCII punctuation character to a space. Deleting punctuation
# instead would fuse neighbouring words ("late/missed" -> "latemissed") and
# turn redactions such as "xx/xx/xxxx" into "xxxxxxxx", which is not in the
# stopword list.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def clean_text_advanced(text):
    """Clean and normalize one consumer complaint narrative.

    Steps:
      1. Lowercase.
      2. Remove URLs, e-mail addresses, HTML tags and digits.
      3. Expand common contractions (can't -> cannot).
      4. Replace emojis and punctuation with spaces.
      5. Tokenize, drop stopwords and tokens of length <= 2, lemmatize, and
         drop lemmas that are stopwords (so "accounts" is removed just like
         "account").

    Args:
        text: Raw narrative; any value is coerced with ``str()``.

    Returns:
        The surviving lemmas joined by single spaces (possibly an empty string).
    """
    # Convert to lowercase
    text = str(text).lower()

    # Remove URLs, emails, HTML tags, and numbers
    text = re.sub(r"http\S+|www\S+|https\S+", " ", text)
    text = re.sub(r"\S*@\S*\s?", " ", text)
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"\d+", " ", text)

    # Typographic apostrophes would otherwise bypass the contraction table.
    text = text.replace("\u2019", "'")

    # Expand common contractions
    for fragment, expansion in _CONTRACTIONS.items():
        text = text.replace(fragment, expansion)

    # Replace emojis and punctuation with spaces
    text = emoji.replace_emoji(text, replace=' ')
    text = text.translate(_PUNCT_TO_SPACE)

    # Tokenize, filter, lemmatize
    clean_tokens = []
    for tok in word_tokenize(text):
        if tok in stop_words or len(tok) <= 2:
            continue
        lemma = lemmatizer.lemmatize(tok)
        # Re-check after lemmatization: plural forms of custom stopwords
        # ("loans", "reports") only match once reduced to their lemma.
        if lemma in stop_words:
            continue
        clean_tokens.append(lemma)

    # Join back into single string
    return " ".join(clean_tokens)

In [None]:
import nltk
# Fetch the 'punkt_tab' tokenizer data via the NLTK downloader (needs network
# access). NOTE(review): presumably this is the resource word_tokenize loads on
# recent NLTK releases — confirm against the installed version.
nltk.download('punkt_tab')

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Probe the tokenizer itself instead of looking up one specific resource name.
# The notebook downloads 'punkt_tab', so checking for 'tokenizers/punkt' can
# raise LookupError and force the split() fallback even though word_tokenize
# works. Calling it once is correct whichever resource the installed NLTK uses.
try:
    word_tokenize("tokenizer availability probe")
    tokenizer = word_tokenize
except LookupError:
    print("punkt tokenizer not found, using simple split() instead.")
    tokenizer = lambda x: x.split()

# Private alias for the cleaning function: later cells rebind the global name
# `tokenizer` to Keras / BERT tokenizer objects, which are not word tokenizers.
_word_tokenizer = tokenizer

# Contraction fragments expanded before punctuation is stripped. Insertion
# order matters: "can't" / "won't" must be handled before "n't" / "'t".
_CONTRACTIONS = {
    "can't": "cannot", "won't": "will not", "n't": " not", "'re": " are",
    "'s": " is", "'d": " would", "'ll": " will", "'t": " not",
    "'ve": " have", "'m": " am"
}

# Maps every ASCII punctuation character to a space, so that words separated
# only by punctuation are not fused and "xx/xx/xxxx" splits into stopword tokens.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def clean_text_final(text):
    """Clean one complaint narrative using the tokenizer selected above.

    Lowercases; removes URLs, e-mail addresses, HTML tags and digits; expands
    contractions; replaces emojis and punctuation with spaces; tokenizes; drops
    stopwords and tokens of length <= 2; lemmatizes; and drops lemmas that are
    stopwords.

    Args:
        text: Raw narrative; any value is coerced with ``str()``.

    Returns:
        The surviving lemmas joined by single spaces (possibly an empty string).
    """
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", " ", text)
    text = re.sub(r"\S*@\S*\s?", " ", text)
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"\d+", " ", text)

    # Typographic apostrophes would otherwise bypass the contraction table.
    text = text.replace("\u2019", "'")
    for fragment, expansion in _CONTRACTIONS.items():
        text = text.replace(fragment, expansion)

    text = emoji.replace_emoji(text, replace=' ')
    text = text.translate(_PUNCT_TO_SPACE)

    clean_tokens = []
    for tok in _word_tokenizer(text):
        if tok in stop_words or len(tok) <= 2:
            continue
        lemma = lemmatizer.lemmatize(tok)
        # Plural forms of custom stopwords ("accounts", "loans") only match the
        # stopword list after lemmatization.
        if lemma in stop_words:
            continue
        clean_tokens.append(lemma)
    return " ".join(clean_tokens)

# Apply cleaning
print("Cleaning text (this may take a minute)...")
df["clean_text"] = df["text"].apply(clean_text_final)

# Show sample results
print(df[["text", "clean_text"]].sample(3, random_state=42))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

y = df["label"]

# Split the cleaned text BEFORE fitting the vectorizer. Fitting TF-IDF on all
# rows lets the test documents influence the vocabulary and IDF weights
# (data leakage). With the same random_state and stratify target the rows
# selected for each side are the same as when splitting a pre-built matrix.
text_train, text_test, y_train, y_test = train_test_split(
    df["clean_text"], y, test_size=0.2, random_state=42, stratify=y
)

tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))  # unigrams + bigrams
X_train = tfidf.fit_transform(text_train)  # vocabulary / IDF learned from training text only
X_test = tfidf.transform(text_test)

# Full-corpus matrix kept under its original name for any cell that still
# expects `X`; it uses the train-fitted vocabulary.
X = tfidf.transform(df["clean_text"])

print("TF-IDF vectorization completed. Feature matrix shape:", X.shape)

print(f"Train set: {X_train.shape[0]} samples, Test set: {X_test.shape[0]} samples")

# RNN

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, SimpleRNN
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Vocabulary size and fixed sequence length shared by the tokenizer, the
# padding step and the embedding layer.
max_words = 5000  # Maximum number of words to keep
maxlen = 100  # Maximum length of sequences

# Fit a word-index tokenizer on the cleaned corpus, then encode every document
# as a list of integer word ids.
clean_corpus = df["clean_text"]
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(clean_corpus)
sequences = tokenizer.texts_to_sequences(clean_corpus)

In [None]:
# Bring every encoded document to exactly `maxlen` ids.
padded_sequences = pad_sequences(sequences, maxlen=maxlen)

# Stratified 80/20 split for the RNN, keyed on the integer labels.
rnn_labels = df["label"]
X_train_rnn, X_test_rnn, y_train_rnn, y_test_rnn = train_test_split(
    padded_sequences,
    rnn_labels,
    test_size=0.2,
    random_state=42,
    stratify=rnn_labels,
)

In [None]:
# Assemble the RNN layer by layer: embedding -> simple recurrent layer ->
# softmax over the encoded classes.
rnn_model = Sequential()
rnn_model.add(Embedding(max_words, 64, input_length=maxlen))
rnn_model.add(SimpleRNN(64))
rnn_model.add(Dense(len(le.classes_), activation='softmax'))  # one unit per class

In [None]:
# Compile the model
# The sparse variant of categorical cross-entropy is used because the targets
# are the integer class ids from df["label"], not one-hot vectors.
rnn_model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

In [None]:
# Fit the RNN; a fifth of the training arrays is held out by Keras for
# per-epoch validation, so the test split is not touched here.
rnn_fit_options = dict(epochs=5, batch_size=32, validation_split=0.2)
history = rnn_model.fit(X_train_rnn, y_train_rnn, **rnn_fit_options)

In [None]:
# Score the trained RNN on the held-out test split (loss first, then accuracy).
rnn_test_metrics = rnn_model.evaluate(X_test_rnn, y_test_rnn, verbose=0)
loss, accuracy = rnn_test_metrics
print(f"RNN Model Accuracy: {accuracy:.4f}")

In [None]:
# Turn the per-class probabilities into hard class predictions, then print the
# per-class precision / recall / F1 table.
rnn_class_probabilities = rnn_model.predict(X_test_rnn)
y_pred_rnn = np.argmax(rnn_class_probabilities, axis=-1)
print("Classification Report:\n")
rnn_report = classification_report(y_test_rnn, y_pred_rnn, target_names=le.classes_)
print(rnn_report)


In [None]:
# Confusion matrix of the RNN predictions, labelled with the class names.
cm_rnn = confusion_matrix(y_test_rnn, y_pred_rnn)
rnn_cm_display = ConfusionMatrixDisplay(
    confusion_matrix=cm_rnn,
    display_labels=le.classes_,
)
rnn_cm_display.plot(cmap="Blues", xticks_rotation=45)
plt.title("Confusion Matrix – Simple RNN")
plt.show()

# BiLSTM

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Vocabulary size and fixed sequence length for the BiLSTM inputs.
max_words = 5000  # Maximum number of words to keep
maxlen = 100      # Maximum length of sequences

# Fit a fresh word-index tokenizer on the cleaned corpus and encode each
# document as a list of integer word ids.
clean_corpus = df["clean_text"]
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(clean_corpus)
sequences = tokenizer.texts_to_sequences(clean_corpus)

In [None]:
# Bring every encoded document to exactly `maxlen` ids.
padded_sequences = pad_sequences(sequences, maxlen=maxlen)

# Stratified 80/20 split for the BiLSTM (rebinds X_train / X_test / y_train / y_test).
bilstm_labels = df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    bilstm_labels,
    test_size=0.2,
    random_state=42,
    stratify=bilstm_labels,
)

In [None]:
# Assemble the BiLSTM layer by layer: embedding -> bidirectional LSTM ->
# softmax over the encoded classes.
bilstm_model = Sequential()
bilstm_model.add(Embedding(max_words, 64, input_length=maxlen))
bilstm_model.add(Bidirectional(LSTM(64)))
bilstm_model.add(Dense(len(le.classes_), activation='softmax'))  # one unit per class

In [None]:
# Compile the model
# The sparse variant of categorical cross-entropy is used because the targets
# are the integer class ids from df["label"], not one-hot vectors.
bilstm_model.compile(optimizer='adam',
                     loss='sparse_categorical_crossentropy',
                     metrics=['accuracy'])

In [None]:
# Fit the BiLSTM; a fifth of the training arrays is held out by Keras for
# per-epoch validation, so the test split is not touched here.
bilstm_fit_options = dict(epochs=5, batch_size=32, validation_split=0.2)
history = bilstm_model.fit(X_train, y_train, **bilstm_fit_options)

In [None]:
# Score the trained BiLSTM on the held-out test split (loss first, then accuracy).
bilstm_test_metrics = bilstm_model.evaluate(X_test, y_test, verbose=0)
loss, accuracy = bilstm_test_metrics
print(f"BiLSTM Model Accuracy: {accuracy:.4f}")

In [None]:
# Turn the per-class probabilities into hard class predictions, then print the
# per-class precision / recall / F1 table.
bilstm_class_probabilities = bilstm_model.predict(X_test)
y_pred = np.argmax(bilstm_class_probabilities, axis=-1)
print("Classification Report:\n")
bilstm_report = classification_report(y_test, y_pred, target_names=le.classes_)
print(bilstm_report)

In [None]:
# Confusion matrix of the BiLSTM predictions, labelled with the class names.
cm = confusion_matrix(y_test, y_pred)
bilstm_cm_display = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=le.classes_,
)
bilstm_cm_display.plot(cmap="Blues", xticks_rotation=45)
plt.title("Confusion Matrix – BiLSTM")
plt.show()

# BERT

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.callbacks import EarlyStopping

In [None]:

# Parameters for tokenization
maxlen = 100  # Maximum length of sequences

# Tokenize the text using BERT tokenizer
# The whole cleaned corpus is encoded in one call, with padding enabled,
# truncation to at most `maxlen` tokens, and TensorFlow tensors as the output.
# This rebinds the global name `tokenizer` to the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
X_tokens = tokenizer(list(df["clean_text"]), padding=True, truncation=True, max_length=maxlen, return_tensors="tf")

In [None]:
# Split data for BERT
# scikit-learn selects the train/test rows by indexing with an integer array,
# which TensorFlow tensors do not support; convert the token ids to a NumPy
# array first so the split works whatever tensor type the tokenizer returned.
bert_input_ids = np.asarray(X_tokens['input_ids'])
X_train, X_test, y_train, y_test = train_test_split(
    bert_input_ids, df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)

In [None]:
# Convert the data into tf.data.Dataset format
def _bert_dataset(input_ids, labels):
    """Build a dataset of ({"input_ids", "attention_mask"}, label) examples.

    Passing only the token ids makes BERT attend to the padding positions.
    The attention mask is rebuilt from the ids: every position that is not the
    tokenizer's pad id is a real token (assumes the pad id occurs only as
    padding, which holds for tokenizer-produced sequences).
    """
    ids = np.asarray(input_ids)
    mask = (ids != tokenizer.pad_token_id).astype(ids.dtype)
    features = {"input_ids": ids, "attention_mask": mask}
    return tf.data.Dataset.from_tensor_slices((features, np.asarray(labels)))


train_data = _bert_dataset(X_train, y_train)
test_data = _bert_dataset(X_test, y_test)

In [None]:
# Load pretrained BERT with a classification head sized to the number of
# distinct encoded labels.
num_bert_labels = len(df["label"].unique())
bert_model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_bert_labels,
)

In [None]:
# Compile the model
# from_logits=True: the loss receives the model's raw logits (the prediction
# cell reads `.logits` from the output), not probabilities. Targets are the
# integer class ids from df["label"], hence the sparse loss.
bert_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),  # Use smaller learning rate for BERT
                   loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                   metrics=['accuracy'])

In [None]:
# Train the model with early stopping to avoid overfitting
# Early stopping selects the best epoch from the validation loss, so the
# validation data must not be the test set: that would tune the model on the
# data it is later scored on. Hold out the first 10% of the (already shuffled)
# training examples for validation instead.
n_train_examples = int(train_data.cardinality())
n_val_examples = max(1, n_train_examples // 10)
val_data = train_data.take(n_val_examples)
fit_data = train_data.skip(n_val_examples)

early_stopping = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
history = bert_model.fit(
    fit_data.batch(32),
    epochs=5,
    validation_data=val_data.batch(32),
    callbacks=[early_stopping]
)

In [None]:
# Score the fine-tuned BERT model on the batched test dataset
# (loss first, then accuracy).
bert_test_metrics = bert_model.evaluate(test_data.batch(32), verbose=0)
loss, accuracy = bert_test_metrics
print(f"BERT Model Accuracy: {accuracy:.4f}")

In [None]:
# Predict and display classification report
# Predict on the batched test dataset: `X_test` is a plain array and has no
# `.batch` method. Labels stored in the dataset are ignored by predict().
bert_logits = bert_model.predict(test_data.batch(32)).logits
y_pred = np.argmax(bert_logits, axis=-1)
print("\n📈 Classification Report:\n")
# target_names must be the class-name strings in encoded-label order;
# df["label"].unique() yields integers in order of first appearance.
print(classification_report(y_test, y_pred, target_names=le.classes_))

In [None]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
# Rows/columns of the matrix follow the sorted label ids, so the tick labels
# must come from the encoder's class list; df["label"].unique() is in order of
# first appearance and would attach the wrong label to each row.
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_).plot(
    cmap="Blues", xticks_rotation=45
)
plt.title("Confusion Matrix – BERT")
plt.show()