In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout, Input
import warnings
import spacy
import numpy as np
import re
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from transformers import pipeline
import joblib
tf.keras.utils.set_random_seed(42)
from imblearn.under_sampling import RandomUnderSampler

In [9]:
df = pd.read_csv('consumer_complaints.csv')
df, _ = train_test_split(df, train_size=26000, random_state=42)
df.rename(columns={'consumer_disputed?':"consumer_disputed"}, inplace=True)
df.replace({"Yes":1, "No":0}, inplace=True)
df.head()

Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zipcode,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed,complaint_id
317627,11/06/2015,Bank account or service,Checking account,"Account opening, closing, or management",,"Citizens Bank XXXX XXXX, RI sent me, by US mai...",,"Citizens Financial Group, Inc.",NH,030XX,Older American,Consent provided,Web,11/10/2015,Closed with explanation,1,0,1644032
399121,12/05/2011,Credit card,,Closing/Cancelling account,,,,Amex,NY,10580,,,Web,12/06/2011,Closed with relief,1,0,2801
505916,02/02/2016,Debt collection,Credit card,Taking/threatening an illegal action,Seized/Attempted to seize property,I have had my wages garnished starting on XX/X...,,"Weltman, Weinberg & Reis",OH,441XX,,Consent provided,Web,02/02/2016,Closed with explanation,1,0,1769306
538320,04/11/2016,Credit reporting,,Incorrect information on credit report,Information is not mine,,Company has responded to the consumer and the ...,"TransUnion Intermediate Holdings, Inc.",CA,90036,,Consent not provided,Web,04/11/2016,Closed with explanation,1,1,1872829
390186,08/27/2013,Consumer Loan,Vehicle loan,Managing the loan or lease,,,,Credit Acceptance Corporation,CA,92553,,,Phone,08/28/2013,Closed with explanation,1,0,508109


In [10]:
X = df["issue"]
y = df["consumer_disputed"]
X = X.to_frame()
y = y.to_frame()

# Randomly undersample target class to balance out the values
undersampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X, y)

print(y_resampled.value_counts())  # Print out value distribution

consumer_disputed
0                    5158
1                    5158
Name: count, dtype: int64


In [None]:
def preprocessing(text:str):
    text = text.lower()  # Lowercase the text
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"[\s+]", " ", text)  # Replace multiple spaces with one space
    
    # Vectorize the text
    nlp = spacy.load('en_core_web_lg')
    doc = nlp(text)
    return np.array([token.vector for token in doc if token.has_vector])  # Return the vectorized tokens
    
X_resampled = X_resampled["issue"].apply(preprocessing)

In [12]:
# X = pd.read_csv("X.csv")
# y = pd.read_csv("y.csv")

# X = X["issue"].apply(preprocessing)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, train_size=0.75, test_size=0.25, stratify=y_resampled, shuffle=True, random_state=42)

X_train = np.stack(X_train)
X_test = np.stack(X_test)

In [16]:
X_train.shape

(7737, 300)

In [None]:
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=100),
    Bidirectional(LSTM(126)),
    Dropout(0.3),
    Bidirectional(LSTM(64)),
    Bidirectional(LSTM(32)),
    Dropout(0.3),
    Bidirectional(LSTM(16)),
    Dense(1, activation="softmax")
])
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)
model.fit(X_train, y_train, epochs=15, validation_split=0.1, verbose=1)

X_test = np.array(X_test)
y_test = np.array(y_test)

test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy}")
print(f"Test Loss: {test_loss}")

Epoch 1/15


ValueError: Input 0 of layer "bidirectional_9" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 252)

In [20]:
model = Sequential([
    Input(shape=(300,)),
    Dense(128, activation="sigmoid"),
    Dense(64, activation="sigmoid"),
    Dropout(0.3),
    Dense(32, activation="sigmoid"),
    Dense(16, activation="sigmoid"),
    Dropout(0.3),
    Dense(8, activation="sigmoid"),
    Dense(4, activation="sigmoid"),
    Dense(1, activation="softmax")
])
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)
model.fit(X_train, y_train, epochs=15, validation_split=0.15, verbose=1)

X_test = np.array(X_test)
y_test = np.array(y_test)

test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy}")
print(f"Test Loss: {test_loss}")

Epoch 1/15
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.4971 - loss: 0.6957 - val_accuracy: 0.5228 - val_loss: 0.6936
Epoch 2/15
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4971 - loss: 0.6931 - val_accuracy: 0.5228 - val_loss: 0.6935
Epoch 3/15
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4971 - loss: 0.6924 - val_accuracy: 0.5228 - val_loss: 0.6921
Epoch 4/15
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4971 - loss: 0.6909 - val_accuracy: 0.5228 - val_loss: 0.6909
Epoch 5/15
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4971 - loss: 0.6893 - val_accuracy: 0.5228 - val_loss: 0.6889
Epoch 6/15
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4971 - loss: 0.6878 - val_accuracy: 0.5228 - val_loss: 0.6899
Epoch 7/15
[1m206/206[0m 

In [None]:
# summarization_model = pipeline('summarization')  # Create Summarization model

# # Save models
# X.to_csv("X.csv", index=False)
# y.to_csv("y.csv", index=False)
# joblib.dump(model, "../disputes/models/model.pkl")
# joblib.dump(summarization_model, "../disputes/models/summarization_model.pkl")

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


['../disputes/models/summarization_model.pkl']