In [43]:
import pandas as pd
import numpy as np
from collections import Counter

from joblib import dump, load

from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


import nltk
# Download the four NLTK packages this notebook requests, in the same order.
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

from gensim.models import Phrases
from gensim.utils import simple_preprocess
from gensim.models.phrases import ENGLISH_CONNECTOR_WORDS

[nltk_data] Downloading package punkt to /Users/kd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/kd/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/kd/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/kd/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Classification of Consumer Complaints

The Consumer Financial Protection Bureau publishes the Consumer Complaint Database, a collection of complaints about consumer financial products and services that were sent to companies for response. Complaints are published after the company responds, confirming a commercial relationship with the consumer, or after 15 days, whichever comes first. 

You have been provided with a dataset of over 350,000 such complaints for 5 common issue types. Your goal is to train a text classification model to identify the issue type based on the consumer complaint narrative.

In [2]:
# Load the complaints table from CSV; the path is relative to the notebook's working directory.
complaints = pd.read_csv("../data/complaints.csv")

In [3]:
# Lowercase every string cell in the frame (issue labels included); non-string
# values such as numbers or NaN pass through unchanged.
# DataFrame.applymap is deprecated in newer pandas releases, so the same
# element-wise function is applied column by column through Series.map, which
# behaves identically and is available in every pandas version.
complaints = complaints.apply(
    lambda column: column.map(lambda cell: cell.lower() if isinstance(cell, str) else cell)
)

In [6]:
# Remove the redaction masks: runs of two or more x's such as "xx" or "xxxx"
# (the text has already been lowercased by the earlier cell).
# The previous pattern r'[xx]' was a character class matching ANY single "x",
# so it also stripped the letter out of ordinary words ("tax" -> "ta").
# The lookarounds leave x-runs inside a word untouched while still matching
# masks that touch digits or punctuation ("xx/xx/xxxx", "xxxx1234").
complaints = complaints.replace(to_replace=r'(?<![a-z])x{2,}(?![a-z])', value='', regex=True)

In [10]:
# Strip every digit character from the string values in the frame.
# Preprocessing reference:
# https://medium.com/@maleeshadesilva21/preprocessing-steps-for-natural-language-processing-nlp-a-beginners-guide-d6d9bf7689c9
complaints = complaints.replace(regex=r'\d', value='')

In [13]:
# Count the whitespace-separated words in each narrative.
# Approach from https://stackoverflow.com/questions/61217024/getting-the-length-of-text-in-a-dataframe-in-python
narrative_words = complaints['Consumer complaint narrative'].str.split()
complaints['word_count'] = narrative_words.str.len()

In [12]:
# Number of complaints per issue label, ordered by label rather than by count.
complaints['Issue'].value_counts().sort_index()

attempts to collect debt not owed        73163
communication tactics                    21243
fraud or scam                            12347
incorrect information on your report    229305
struggling to pay mortgage               17374
Name: Issue, dtype: int64

In [35]:
# Display the working frame (pandas abbreviates long frames when rendering).
complaints

Unnamed: 0,Consumer complaint narrative,Issue,word_count
0,"[my, name, is, this, complaint, is, not, made,...",incorrect information on your report,115
1,"[i, searched, on, for, and, was, pointed, to, ...",fraud or scam,346
2,"[i, have, a, particular, account, that, is, st...",incorrect information on your report,55
3,"[i, have, not, supplied, proof, under, the, do...",attempts to collect debt not owed,578
4,"[hello, i, 'm, writing, regarding, account, on...",incorrect information on your report,156
...,...,...,...
353427,"[collections, account, i, have, no, knowledge,...",attempts to collect debt not owed,7
353428,"[dear, cfpb, team, ,, the, reason, for, my, co...",attempts to collect debt not owed,759
353429,"[frca, violations, :, failing, to, follow, deb...",attempts to collect debt not owed,697
353430,"[my, father, ,, a, acquired, an, hecm, reverse...",struggling to pay mortgage,856


In [16]:
# Tokenize each narrative with NLTK's word_tokenize; the text column is
# overwritten with the resulting token lists.
narrative_column = 'Consumer complaint narrative'
complaints[narrative_column] = complaints[narrative_column].map(word_tokenize)

In [25]:
# English stopwords from NLTK, held in a set for fast membership checks.
stop_words = {stopword for stopword in stopwords.words('english')}

In [32]:
# Preview the first rows of the working frame.
complaints.head()

Unnamed: 0,Consumer complaint narrative,Issue,word_count
0,"[my, name, is, this, complaint, is, not, made,...",incorrect information on your report,115
1,"[i, searched, on, for, and, was, pointed, to, ...",fraud or scam,346
2,"[i, have, a, particular, account, that, is, st...",incorrect information on your report,55
3,"[i, have, not, supplied, proof, under, the, do...",attempts to collect debt not owed,578
4,"[hello, i, 'm, writing, regarding, account, on...",incorrect information on your report,156


In [40]:
# Tally how often each non-stopword token occurs across all narratives.
# Tokens are fed in narrative order, so ties in most_common() keep the same
# first-seen ordering as a single-pass generator would give.
complaints_counter = Counter()
for narrative_tokens in complaints['Consumer complaint narrative']:
    complaints_counter.update(
        token for token in narrative_tokens if token not in stop_words
    )


In [41]:
# Show only the 50 most frequent tokens: most_common() without an argument
# returns every distinct token, which floods the cell output.
complaints_counter.most_common(50)

[('.', 3011365),
 (',', 1795935),
 ('credit', 732755),
 ('//', 541817),
 ('account', 492410),
 (')', 465493),
 ('(', 445156),
 ('report', 414122),
 ('information', 384477),
 (':', 353717),
 ('$', 334477),
 ('{', 324690),
 ('}', 324364),
 ('reporting', 280577),
 ('debt', 229362),
 ('consumer', 199730),
 ('accounts', 175985),
 ('balance', 152193),
 ('company', 149242),
 ('payment', 142067),
 ('identity', 138022),
 ('``', 132470),
 ('would', 129381),
 ('please', 126594),
 ('also', 122655),
 ('theft', 122599),
 (';', 121747),
 ('never', 121730),
 ('date', 120777),
 ('loan', 116558),
 ('sent', 115688),
 ('received', 113752),
 ('collection', 110479),
 ('letter', 107567),
 ('number', 107290),
 ('time', 102001),
 ('file', 101413),
 ('remove', 96982),
 ('days', 96509),
 ('name', 96338),
 ('reported', 94409),
 ("n't", 93362),
 ('section', 93036),
 ('told', 92752),
 ('due', 87858),
 ('paid', 87509),
 ('dispute', 87390),
 ('!', 86086),
 ('payments', 85989),
 ('-', 85698),
 ('fraudulent', 85648),
 

In [44]:
# Porter stemmer instance (not yet used in the cells shown in this part of the notebook).
porter = PorterStemmer()

In [None]:
seed = 711


def print_issue_samples(issue, n=3, random_state=seed):
    """Print up to ``n`` randomly sampled narratives for one issue label.

    Parameters
    ----------
    issue : str
        Value of the 'Issue' column to filter on.
    n : int, default 3
        Number of narratives to print. Capped at the number of matching rows
        so a rare or missing label does not make ``sample`` raise.
    random_state : int, default ``seed``
        Seed passed to ``Series.sample`` so the same rows are drawn on every run.
    """
    narratives = complaints.loc[complaints['Issue'] == issue, 'Consumer complaint narrative']
    for statement in narratives.sample(min(n, len(narratives)), random_state=random_state):
        print(statement)
        print('-----------------------------')


# The same three issue labels the separate copy-pasted cells inspected, in the
# same order and with the same seed.
for issue_label in ('fraud or scam', 'attempts to collect debt not owed', 'struggling to pay mortgage'):
    print_issue_samples(issue_label)

In [None]:
# Display the working frame again after the preprocessing cells above.
complaints

In [None]:
# sent_tokenize only accepts a string, but the narrative column holds token
# lists once the word_tokenize cell above has run, so rebuild a string first.
# NOTE(review): re-joined tokens are space-separated (punctuation included), so
# sentence boundaries may differ slightly from the raw text - confirm this is
# acceptable or keep an untokenized copy of the narratives instead.
narrative_1 = complaints['Consumer complaint narrative'].loc[1]
if not isinstance(narrative_1, str):
    narrative_1 = ' '.join(narrative_1)
test_1 = sent_tokenize(narrative_1)

In [None]:
# Inline display of the sentence split for the narrative at index label 1.
# sent_tokenize needs a string; the narrative column holds token lists after
# the word_tokenize cell above, so tokens are re-joined when necessary.
sample_narrative = complaints['Consumer complaint narrative'].loc[1]
sent_tokenize(sample_narrative if isinstance(sample_narrative, str) else ' '.join(sample_narrative))

In [None]:
# Inspect the sentence list stored in test_1.
test_1

In [None]:
# Run gensim's simple_preprocess over each sentence in test_1, keeping one
# result per sentence in the original order.
processed_sentences = list(map(simple_preprocess, test_1))

In [None]:
# Print the preprocessed sentences to check the result of simple_preprocess.
print(processed_sentences)

In [None]:
# Fit a gensim Phrases model on the preprocessed sentences.
# NOTE(review): processed_sentences is built from a single narrative (test_1),
# so min_count=25 presumably filters out nearly every candidate phrase - confirm
# whether the model should be fit on the whole corpus or use a lower threshold.
bigram_finder = Phrases(
    sentences=processed_sentences,
    min_count=25,
    connector_words=ENGLISH_CONNECTOR_WORDS
)

In [None]:
# List the detected phrases from highest to lowest score. reverse=True keeps
# the original relative order of equal scores, matching a negated-key sort.
phrase_scores = bigram_finder.find_phrases(processed_sentences)
sorted(phrase_scores.items(), key=lambda phrase_and_score: phrase_and_score[1], reverse=True)