**Importing Libraries**

In [None]:
!pip install klib

: 

In [None]:
from nltk.collocations import *


In [3]:
import concurrent.futures

In [None]:
pip install spacy

In [None]:
pip install pandas 


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import klib # install if not loaded '!pip install klib'
import spacy
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

In [9]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import string

In [10]:
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, LSTM, Dense, BatchNormalization, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Loading the dataset
data = pd.read_csv('consumer_complaints.csv')

In [None]:
# First 10 rows of the dataset
data.head(10)

In [None]:
data.info() # For checking the information about the columns

In [None]:
# Count the null values in each column of the dataset
data.isnull().sum()

In [None]:
# To drop the duplicate rows, dropping missing data, reducing memory usage.
data = klib.data_cleaning(data)

In [None]:
data.info()

In [17]:
# to clean the column names like converting it into lowercase
data = klib.clean_column_names(data)

In [18]:
# Converting the datatypes into appropriate datatypes
data = klib.convert_datatypes(data)

In [19]:
# Dropping Rows which have missing values in all columns
data = klib.drop_missing(data)

In [None]:
# Dataframe shape
data.shape

In [None]:
data.info()

In [None]:
# Columns to convert to the generic object dtype.
# The ZIP column is listed under both spellings used in this notebook
# ('zip_code' here, 'zipcode' in the later column drop), because the cleaned
# column name depends on the header of the raw CSV.
columns_to_convert = [
    'date_received', 'product', 'sub_product', 'issue', 'sub_issue',
    'company_public_response', 'company', 'state', 'zip_code', 'zipcode', 'tags',
    'consumer_consent_provided', 'submitted_via', 'date_sent_to_company',
    'company_response_to_consumer', 'timely_response'
]

# Keep only the names that exist in the frame, so a spelling that is absent
# from this dataset does not raise a KeyError on the selection below.
columns_to_convert = [col for col in columns_to_convert if col in data.columns]

# Convert specified columns to object
data[columns_to_convert] = data[columns_to_convert].astype(object)

In [23]:
# Rows without a value in the target column cannot be used for training,
# so keep only the rows where 'consumer_disputed' is present.
data = data.dropna(subset=['consumer_disputed'])

In [None]:
data.isnull().sum()

In [None]:
data.shape

In [None]:
data.head(5)

In [None]:
data.isnull().sum()

### **Handling Missing Values**

Because the dataset is large and modelling with every column is not feasible, we drop the columns that are not needed.

In [None]:
data.columns

In [29]:
# Columns that are not used for modelling. The ZIP column appears under two
# spellings in this notebook ('zipcode' and 'zip_code'), so both are listed;
# errors='ignore' skips whichever names are absent from the frame instead of
# raising a KeyError.
columns_to_drop = [
    'consumer_consent_provided', 'complaint_id', 'date_sent_to_company',
    'zipcode', 'zip_code', 'state', 'date_received', 'sub_product',
    'consumer_complaint_narrative', 'sub_issue',
]

# Drop the unused columns, then remove rows with any missing values.
clean_df = data.drop(columns=columns_to_drop, errors='ignore').dropna()


In [None]:
clean_df.isnull().sum()

In [None]:
clean_df.shape

In [32]:
clean_df = clean_df.reset_index(drop=True)

In [None]:
clean_df.head(100)

ideas ~

1. Either we delete 'consumer_complaint_narrative', or we first run a sentiment analysis on this column, classifying each text as positive, negative, or neutral, and then continue with the rest of the process.


Is the target column balanced or not?

In [None]:
clean_df['consumer_disputed'].value_counts()

The target column is clearly imbalanced, so we can consider the following methods to correct this:

- Synthetic Minority Over-sampling Technique (SMOTE)
- Random Over Sampler
- Random Forests or Support Vector Machines
- Stratified Sampling
- Ensemble Methods like Bagging and Boosting
- providing class weights to penalize misclassifications of the minority class more heavily

## Data Cleaning & Pre Processing

---
Checking the product columns for valid product names

In [None]:
# Print the distinct values of each categorical column, one value per line,
# followed by a rule of backticks. Each pair is (heading to print, column name).
rule = '```````````````````````````````````````````````````````'

headings_and_columns = [
    ('Product', 'product'),
    ('timely_response', 'timely_response'),
    ('tags', 'tags'),
    ('company_response_to_consumer', 'company_response_to_consumer'),
    ('submitted_via', 'submitted_via'),
    ('consumer_disputed', 'consumer_disputed'),
    ('company_public_response', 'company_public_response'),
]

for heading, column in headings_and_columns:
    print(heading)
    for value in clean_df[column].unique():
        print(value)

    print(rule)


In [None]:
clean_df['tags'] = clean_df['tags'].replace({'Older American, Servicemember': 'Older American and Servicemember'})

Performing all the NLP preprocessing tasks
- Removing punctuations
- Tokenization
- Lower Casing
- Stop Word Removal
- Stemming (Porter stemmer)

In [37]:
# Work on an independent copy so the text preprocessing below leaves clean_df untouched.
df = clean_df.copy()

In [38]:
import nltk

# Make sure the NLTK resources used below are installed before they are needed;
# nltk.data.find raises LookupError when a resource cannot be located.
# NOTE(review): recent NLTK releases may also require 'punkt_tab' for
# word_tokenize — confirm against the installed version.
for _package, _locator in (('punkt', 'tokenizers/punkt'), ('stopwords', 'corpora/stopwords')):
    try:
        nltk.data.find(_locator)
    except LookupError:
        nltk.download(_package)

stemmer = PorterStemmer()

# Build the stop-word set once. The previous version called
# stopwords.words('english') for every single token, re-reading the list each
# time and doing a linear search in it.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def preprocess_text(text):
    """Normalise one text value for embedding.

    Steps: strip punctuation, tokenize, lowercase, remove English stop words,
    and stem each remaining token with the Porter stemmer.

    Parameters
    ----------
    text : str
        Raw text. Non-string values are converted with str() first.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    if not isinstance(text, str):
        text = str(text)
    text = text.translate(_PUNCTUATION_TABLE)
    tokens = (word.lower() for word in word_tokenize(text))
    stems = [stemmer.stem(word) for word in tokens if word not in ENGLISH_STOPWORDS]
    return ' '.join(stems)

def parallel_preprocess(column):
    """Replace df[column] with the result of applying preprocess_text to each value."""
    df[column] = df[column].apply(preprocess_text)

columns_to_preprocess = ['product', 'issue', 'company_public_response', 'company', 'tags', 'submitted_via', 'company_response_to_consumer', 'timely_response']
with concurrent.futures.ThreadPoolExecutor() as executor:
    # executor.map only re-raises a worker's exception when its result is
    # retrieved. Without consuming the iterator, a failure (e.g. a missing NLTK
    # resource) is silently dropped and the column is left unprocessed, so the
    # results are drained here to surface any error.
    list(executor.map(parallel_preprocess, columns_to_preprocess))

**Vectorizing the dataset**

Changing the target column into 0 & 1

In [39]:
def _encode_dispute(label):
    """Return 1 when the label equals 'Yes', otherwise 0."""
    if label == 'Yes':
        return 1
    return 0

df['consumer_disputed'] = df['consumer_disputed'].apply(_encode_dispute)

**Now we can use Word2Vec for embedding the other columns**

In [None]:
df.head(3)

In [None]:
import nltk
nltk.download('punkt')


In [None]:
pip install nltk


In [None]:
import nltk
nltk.download('punkt', download_dir='C:/nltk_data')


In [None]:
import nltk
nltk.download('punkt')


In [None]:
import nltk
nltk.download('punkt', download_dir='C:/nltk_data')
nltk.data.path.append('C:/nltk_data')


In [None]:
nltk.download('punkt', download_dir='C:/nltk_data')
nltk.data.path.append('C:/nltk_data')


In [None]:
import nltk
print(nltk.data.path)


In [64]:
import nltk
nltk.data.path.append('C:/nltk_data')  # Adjust the path if necessary


In [58]:
# Concatenate the text columns, in this order, into one space-separated string per row.
text_columns = [
    'product', 'issue', 'company_public_response', 'company',
    'tags', 'submitted_via', 'company_response_to_consumer', 'timely_response',
]

combined = df[text_columns[0]].astype(str)
for column_name in text_columns[1:]:
    combined = combined + ' ' + df[column_name].astype(str)

df['combined_text'] = combined



In [None]:

df['tokenized_text'] = df['combined_text'].apply(word_tokenize)

In [None]:
# Train a Word2Vec model on the tokenized rows: 100-dimensional vectors, a context
# window of 5, and min_count=1 so that every token seen in the data gets a vector.
# NOTE(review): sg=0 should select the CBOW algorithm per the gensim docs — confirm.
w2v_model = Word2Vec(sentences = df['tokenized_text'], vector_size=100, window=5, min_count=1, sg=0)

In [None]:
# Each element of df['tokenized_text'] is a row's full token list (not a single
# word), so every lookup returns the vectors for all tokens of that row at once.
word_embeddings = [w2v_model.wv[row_tokens] for row_tokens in df['tokenized_text']]

In [None]:
from gensim.models import Word2Vec

w2v_model.save("word2vec_model.bin")

In [None]:
# Average each row's token vectors (axis 0) into a single document vector; when a
# row's vectors contain no non-zero value, use an explicit zero vector of the
# model's vector size instead.
document_embeddings = [np.mean(embeddings, axis=0) if embeddings.any() else np.zeros(w2v_model.vector_size) for embeddings in word_embeddings]

In [None]:
df['document_embeddings'] = document_embeddings

In [None]:
df.head(2)

In [None]:
len(df['document_embeddings'][0])

# **Modelling**

In [None]:
l = [len(i) for i in df['document_embeddings']]
print(sorted(l)[int(len(l)/2)])
print(len(df['document_embeddings'][0]))
print(len(df['document_embeddings']))

Total length 29480, each row length 100

In [None]:
# Features: one averaged embedding per row; target: the binary dispute label.
X = df['document_embeddings']
y = df['consumer_disputed']
# NOTE(review): every cell of X holds an array, so np.array(X) is presumably a 1-D
# object array rather than a 2-D float matrix — confirm with X.shape.
X = np.array(X)
y = np.array(y)

**RandomOverSampler is a technique used in machine learning to handle imbalanced datasets. It works by randomly duplicating instances of the minority class in the dataset until it is balanced with the majority class.**

In [None]:
X.shape,y.shape

In [None]:
# Stack the per-row embedding vectors into a 2-D float matrix of shape
# (n_samples, n_features). X may arrive as a 1-D object array whose elements are
# the individual vectors; a numeric array is used as it is.
X_matrix = np.vstack(list(X)) if X.dtype == object else np.asarray(X)

# Split BEFORE oversampling. Oversampling the full dataset first puts exact copies
# of the same minority rows into both the training and the test set, which leaks
# test data into training and inflates the evaluation scores. stratify=y keeps the
# original class ratio in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_matrix, y, test_size=0.2, random_state=42, stratify=y
)

# Balance the classes in the training partition only; the test set keeps the
# real class distribution.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
X_train, y_train = X_resampled, y_resampled

In [None]:
X_train = np.array(X_train.tolist())
X_test = np.array(X_test.tolist())
X_train.shape, X_test.shape

In [None]:
train_class_distribution = np.bincount(y_train)
test_class_distribution = np.bincount(y_test)

print(f"Train Class Distribution: {train_class_distribution}")
print(f"Test Class Distribution: {test_class_distribution}")

In [None]:
# Reshape to (samples, timesteps, 1) as expected by the LSTM input below. The
# number of timesteps is inferred (-1) instead of hard-coding 100, so this cell
# keeps working if the embedding size is changed.
X_train = X_train.reshape(X_train.shape[0], -1, 1)
X_test = X_test.reshape(X_test.shape[0], -1, 1)

In [None]:
# Stop training once the validation loss stops improving and restore the best
# weights. The previous setting monitored the training 'accuracy', which ignores
# the validation data passed to fit() and therefore cannot detect overfitting.
early_stopping = EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)

# Three stacked LSTM layers (128 -> 64 -> 32 units) with light dropout after each,
# followed by one sigmoid unit that outputs the probability of class 1.
model = Sequential()
model.add(LSTM(128, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True))
model.add(Dropout(0.1))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.1))
model.add(LSTM(32))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model_hist.history holds the per-epoch loss/accuracy curves plotted further down.
Model_hist= model.fit(X_train, y_train, epochs=10, batch_size=32,
                      validation_data=(X_test, y_test),verbose=1,callbacks=[early_stopping])

In [None]:
from sklearn.metrics import classification_report

# The network ends in a single sigmoid unit, so predict() returns one column of
# class-1 probabilities. Taking argmax over that single column always yields 0,
# which made every prediction class 0 regardless of the threshold. Flatten the
# probabilities and apply the threshold to them directly instead.
y_prob = model.predict(X_test).ravel()
y_pred = (y_prob > 0.2).astype('int32')
classification_rep = classification_report(y_test, y_pred)

print(classification_rep)

In [None]:
from sklearn.metrics import classification_report

# The network ends in a single sigmoid unit, so predict() returns one column of
# class-1 probabilities. Taking argmax over that single column always yields 0,
# which made every prediction class 0 regardless of the threshold. Flatten the
# probabilities and apply the stricter 0.75 threshold to them directly instead.
y_prob = model.predict(X_test).ravel()
y_pred = (y_prob > 0.75).astype('int32')
classification_rep = classification_report(y_test, y_pred)

print(classification_rep)

In [None]:
# Training vs. validation loss per epoch.
fig, ax = plt.subplots()
for curve in ('loss', 'val_loss'):
    ax.plot(Model_hist.history[curve])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
fig, ax = plt.subplots()
for curve in ('accuracy', 'val_accuracy'):
    ax.plot(Model_hist.history[curve])
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
model.save('lstm_model.h5')

In [None]:
from tensorflow.keras.models import load_model

loaded_model = load_model('lstm_model.h5')

Further Prediction

In [None]:
w2v_model = Word2Vec.load("word2vec_model.bin")

In [None]:
example_text = "mortgag Loan servicing, payments, escrow account compani choos provid public respons bank america nation associ older american postal mail close explan ye"


tokenized_text = word_tokenize(example_text.lower())  # Convert to lowercase for consistency

word_embeddings_2 = [w2v_model.wv[word] for word in tokenized_text if word in w2v_model.wv]


In [None]:
# word_embeddings_2 is a flat list of 1-D word vectors (one per in-vocabulary
# token), so the document embedding is their element-wise mean across tokens.
# The previous comprehension averaged each word vector on its own, which produced
# one scalar per word instead of a single document vector. Fall back to a zero
# vector when no token of the example is in the vocabulary.
document_embeddings_2 = (
    np.mean(word_embeddings_2, axis=0)
    if len(word_embeddings_2) > 0
    else np.zeros(w2v_model.vector_size)
)

In [None]:
import numpy as np

# Tokenize the example text in lowercase. The training text was lowercased in
# preprocess_text, so a capitalised token such as "Loan" would never match the
# vocabulary and would be silently dropped from the average.
example_tokens = word_tokenize(example_text.lower())

# Collect the vectors of the tokens the model knows; out-of-vocabulary tokens
# are skipped.
example_vectors = [w2v_model.wv[token] for token in example_tokens if token in w2v_model.wv]

# If there are no vectors for any tokens, add a zero vector
if not example_vectors:
    example_vectors.append(np.zeros(w2v_model.vector_size))

# Calculate the mean of the word vectors
example_vector = np.mean(example_vectors, axis=0)

# 'example_vector' now contains the vector representation of the example text

# 'example_vector' now contains the vector representation of the example text



In [None]:
vec = np.array(example_vector)

final_vec= np.expand_dims(vec, axis=-1)

In [None]:
final_vec.shape

In [None]:
final_vec = vec.reshape((1, vec.shape[0],1))

In [None]:
pred = model.predict(final_vec)

In [None]:
pred

In [None]:
# The target was encoded as 'Yes' -> 1 and everything else -> 0, and the model's
# sigmoid output is the probability of class 1. A probability of at least 0.5
# therefore means 'Yes'; the previous mapping had the two labels swapped.
pred_class = 'Yes' if pred[0][0] >= 0.5 else 'No'

pred_class