In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout, Input
import warnings
import spacy
import numpy as np
import re
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from transformers import pipeline
import joblib
tf.keras.utils.set_random_seed(42)
from imblearn.under_sampling import RandomUnderSampler

In [17]:
df = pd.read_csv('consumer_complaints.csv')
df, _ = train_test_split(df, train_size=9600, random_state=42)
df.rename(columns={'consumer_disputed?':"consumer_disputed"}, inplace=True)
df.replace({"Yes":1, "No":0}, inplace=True)
df.head()

Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zipcode,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed,complaint_id
473484,06/20/2013,Credit reporting,,Incorrect information on credit report,Reinserted previously deleted info,,,Equifax,MS,39056,,,Web,06/20/2013,Closed with explanation,1,0,438917
186272,02/26/2015,Payday loan,Payday loan,Can't contact lender,Can't contact lender,,,ACE Cash Express Inc.,OH,44647,,,Postal mail,03/04/2015,Closed with explanation,1,0,1259078
416232,03/21/2012,Mortgage,Conventional fixed mortgage,"Loan modification,collection,foreclosure",,,,HSBC North America Holdings Inc.,NY,10003,,,Referral,03/22/2012,Closed without relief,1,1,39698
46539,03/21/2014,Debt collection,"Other (i.e. phone, health club, etc.)",Cont'd attempts collect debt not owed,Debt is not mine,,,"CMRE Financial Services, Inc.",TX,78202,,,Web,03/27/2014,Closed with explanation,1,0,771275
210005,03/25/2015,Bank account or service,Other bank product/service,"Account opening, closing, or management",,,,Bank of America,FL,33401,,,Referral,03/26/2015,Closed with explanation,1,0,1300921


In [18]:
X = df["issue"]
y = df["consumer_disputed"]
X = X.to_frame()
y = y.to_frame()

# Randomly undersample target class to balance out the values
undersampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X, y)

print(y_resampled.value_counts())  # Print out value distribution

consumer_disputed
0                    1942
1                    1942
Name: count, dtype: int64


In [15]:
def preprocessing(text:str):
    text = text.lower()  # Lowercase the text
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"[\s+]", " ", text)  # Replace multiple spaces with one space
    
    # Vectorize the text
    nlp = spacy.load('en_core_web_lg')
    doc = nlp(text)
    return doc.vector  # Return the vectorized text
    
# X_resampled = X_resampled["issue"].apply(preprocessing)

In [16]:
X = pd.read_csv("X.csv")
y = pd.read_csv("y.csv")

X = X["issue"].apply(preprocessing)

KeyboardInterrupt: 

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X["issue"], y, train_size=0.75, test_size=0.25, stratify=y, shuffle=True, random_state=42)

X_train = np.stack(X_train)
X_test = np.stack(X_test)

In [14]:
model = Sequential([
    Input(shape=(300,)),
    Dense(128, activation="relu"),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(32, activation="relu"),
    Dense(16, activation="relu"),
    Dropout(0.3),
    Dense(8, activation="relu"),
    Dense(4, activation="relu"),
    Dense(1, activation="softmax")
])
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)
model.fit(X_train, y_train, epochs=15, validation_split=0.1, verbose=1)

X_test = np.array(X_test)
y_test = np.array(y_test)

test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy}")
print(f"Test Loss: {test_loss}")

ValueError: Invalid dtype: str1280

In [None]:
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=100),
    Bidirectional(LSTM(126)),
    Dropout(0.3),
    Bidirectional(LSTM(64)),
    Bidirectional(LSTM(32)),
    Dropout(0.3),
    Bidirectional(LSTM(16)),
    Dense(1, activation="softmax")
])
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)
model.fit(X_train, y_train, epochs=15, validation_split=0.1, verbose=1)

X_test = np.array(X_test)
y_test = np.array(y_test)

test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy}")
print(f"Test Loss: {test_loss}")

Epoch 1/15
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 127ms/step - accuracy: 0.4990 - loss: 0.6942 - val_accuracy: 0.5514 - val_loss: 0.6843
Epoch 2/15
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 128ms/step - accuracy: 0.4990 - loss: 0.6888 - val_accuracy: 0.5514 - val_loss: 0.6812
Epoch 3/15
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 123ms/step - accuracy: 0.4990 - loss: 0.6884 - val_accuracy: 0.5514 - val_loss: 0.6801
Epoch 4/15
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 121ms/step - accuracy: 0.4990 - loss: 0.6873 - val_accuracy: 0.5514 - val_loss: 0.6792
Epoch 5/15
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 120ms/step - accuracy: 0.4990 - loss: 0.6875 - val_accuracy: 0.5514 - val_loss: 0.6786
Epoch 6/15
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 119ms/step - accuracy: 0.4990 - loss: 0.6870 - val_accuracy: 0.5514 - val_loss: 0.6784
Epoch 7/15
[1m82/82[

In [28]:
summarization_model = pipeline('summarization')  # Create Summarization model

# Save models
X.to_csv("X.csv", index=False)
y.to_csv("y.csv", index=False)
joblib.dump(model, "../disputes/models/model.pkl")
joblib.dump(summarization_model, "../disputes/models/summarization_model.pkl")

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


['../disputes/models/summarization_model.pkl']