In [24]:
import pandas as pd
import numpy as np

from joblib import dump, load

from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

from nltk.tokenize import sent_tokenize

from gensim.models import Phrases
from gensim.utils import simple_preprocess
from gensim.models.phrases import ENGLISH_CONNECTOR_WORDS

# Classification of Consumer Complaints

The Consumer Financial Protection Bureau publishes the Consumer Complaint Database, a collection of complaints about consumer financial products and services that were sent to companies for response. Complaints are published after the company responds, confirming a commercial relationship with the consumer, or after 15 days, whichever comes first. 

You have been provided with a dataset of over 350,000 such complaints for 5 common issue types. Your goal is to train a text classification model to identify the issue type based on the consumer complaint narrative.

In [4]:
complaints = pd.read_csv("../data/complaints.csv")

In [8]:
complaints.head(10)

Unnamed: 0,Consumer complaint narrative,Issue
0,My name is XXXX XXXX this complaint is not mad...,Incorrect information on your report
1,I searched on XXXX for XXXXXXXX XXXX and was ...,Fraud or scam
2,I have a particular account that is stating th...,Incorrect information on your report
3,I have not supplied proof under the doctrine o...,Attempts to collect debt not owed
4,Hello i'm writing regarding account on my cred...,Incorrect information on your report
5,On XXXX XX/XX/XXXX an electronic letter was se...,Incorrect information on your report
6,"By the way, also following Federal Trade Commi...",Attempts to collect debt not owed
7,This account was opened over 10 years ago. Thi...,Incorrect information on your report
8,In accordance with the Fair Credit Reporting a...,Incorrect information on your report
9,I got an alert a late payment was reported on ...,Incorrect information on your report


In [7]:
complaints['Issue'].value_counts().sort_index()

Attempts to collect debt not owed        73163
Communication tactics                    21243
Fraud or scam                            12347
Incorrect information on your report    229305
Struggling to pay mortgage               17374
Name: Issue, dtype: int64

In [14]:
seed = 711
for statement in complaints.loc[complaints['Issue'] == 'Fraud or scam', 'Consumer complaint narrative'].sample(3, random_state=seed):
    print(statement)
    print('-----------------------------')

I have been sending multiple written requests however, the incorrect items listed below still remain on my Credit Report in violation of Federal Law. They failed to comply with Fair Credit Reporting Act, 15 USC sections 1681i within the time set forth by law and continued reporting of unverified ( erroneous ) information which now, given all my attempts to address it directly with the creditor, as willful negligence and non-compliance with federal statutes. 

XXXX Date Opened : XX/XX/XXXX Balance : {$0.00} XXXX/XXXX Date Opened : XX/XX/XXXX Balance : {$0.00} XXXX XXXX XXXX Date Opened : XX/XX/XXXX Balance : {$0.00} XXXX  XXXX Date Opened : XX/XX/XXXX Balance : {$0.00} XXXX XXXX Date Opened : XX/XX/XXXX Balance : {$400.00} XXXX  XXXX  XXXX Date Opened : XX/XX/XXXX Balance : {$470.00}
-----------------------------
I listed 2 XXXX XXXX concert tickets on XXXX on Friday, XX/XX/XXXX and received a text inquiry from a XXXX XXXX offering me {$600.00} for the 2 3-Day pass wristbands. He offere

In [15]:
seed = 711
for statement in complaints.loc[complaints['Issue'] == 'Attempts to collect debt not owed', 'Consumer complaint narrative'].sample(3, random_state=seed):
    print(statement)
    print('-----------------------------')

On  XXXX   XXXX ,2017, I received a call from a company that did not disclose their name who said that I owed money to a credit card company. They said that if I did not contact them to resolve it, it would affect employment, assets and/or bank accounts. This matter has been resolved through the legitimate company and HSBC Bank and is no longer owed. They told me when I called them that they will see me in court and said I better have proof of my payment. I am worried that they will continue to harass me for this bill. This is the second fraudulent company trying to collect on this bill. I think they are just passing my information around to different people and that worries me. The number they called me from is  XXXX .
-----------------------------
XX/XX/XXXX XXXX XXXX XXXX XXXX XXXX XXXX 
XXXX XXXX, CA XXXX XXXX : XX/XX/XXXX SS # : XXXX WILLIAMS AND FUDGE INC XXXX XXXX XXXX XXXX XXXX ( XXXX ) XXXX To Whom It May Concern, There is no possibility that this is my debt. My name is XXXX X

In [16]:
seed = 711
for statement in complaints.loc[complaints['Issue'] == 'Struggling to pay mortgage', 'Consumer complaint narrative'].sample(3, random_state=seed):
    print(statement)
    print('-----------------------------')

My husband who is a 100 % combat related XXXX veteran died on XX/XX/XXXX. Our home mortgage is in his name. I as the surviving spouse reported his death to Mr. Cooper mortgage company. With XXXX effecting so many people I was offered forbearance for a year. I graciously took this. At the end of the year, I was told that I had forbearance till the end of XXXX, XXXX. I would be given the opportunity to renegotiate the mortgage terms as of XXXX, XXXX. 

I have contacted Mr. Cooper on numerous occasions to get this process accomplished. Every time I am told that I did not send them complete information. For example, Mr. Cooper wanted verification of XXXX XXXX. They said they would accept either a letter from XXXX or 2 months of bank statements with the XXXX XXXX credited to my account. I gave them 2 months of bank statements. And I highlighted the deposits on each month to make it easy to identify. Six weeks after I submit these statements, I get a call from Mr. Cooper stating that they wi

In [21]:
complaints['word_count'] = complaints['Consumer complaint narrative'].str.split().str.len()

In [22]:
complaints

Unnamed: 0,Consumer complaint narrative,Issue,word_count
0,My name is XXXX XXXX this complaint is not mad...,Incorrect information on your report,123
1,I searched on XXXX for XXXXXXXX XXXX and was ...,Fraud or scam,358
2,I have a particular account that is stating th...,Incorrect information on your report,55
3,I have not supplied proof under the doctrine o...,Attempts to collect debt not owed,607
4,Hello i'm writing regarding account on my cred...,Incorrect information on your report,160
...,...,...,...
353427,Collections account I have no knowledge of,Attempts to collect debt not owed,7
353428,"Dear CFPB Team, The reason for my complaint is...",Attempts to collect debt not owed,813
353429,FRCA violations : Failing to Follow Debt Dispu...,Attempts to collect debt not owed,713
353430,"My Father, a XXXX XXXX acquired an HECM rever...",Struggling to pay mortgage,891


In [29]:
test_1 = sent_tokenize(complaints['Consumer complaint narrative'].loc[1])

In [42]:
sent_tokenize(complaints['Consumer complaint narrative'].loc[1])

['I searched on XXXX for XXXXXXXX XXXX  and was pointed to a website I legitimately believed was XXXX.',
 'The website was XXXX whereas the authentic website is XXXX I proceeded to buy a gun from the fraudulent website and sent my money via XXXX as the spam seller requested, this was a total of {$450.00} on XX/XX/XXXX.',
 'I received an email stating the transaction was approved and on XX/XX/XXXX I attempted to reach back out to the company in an attempt to give them the contact info of my chosen XXXX dealer, the emails started to bounce back ( there are also photos of this ).',
 'This was when I realized that this website was a faked, copycat website.',
 'I attempted to report it to my bank immediately when I realized a company I was dealing with was committing fraud and pretending to be a firearms dealer that has been in business for over 80 years and is a very well-known company all over the world.',
 'As my bank, I expected them to help protect me.',
 "I was told to wait until the 

In [39]:
test_1

['I searched on XXXX for XXXXXXXX XXXX  and was pointed to a website I legitimately believed was XXXX.',
 'The website was XXXX whereas the authentic website is XXXX I proceeded to buy a gun from the fraudulent website and sent my money via XXXX as the spam seller requested, this was a total of {$450.00} on XX/XX/XXXX.',
 'I received an email stating the transaction was approved and on XX/XX/XXXX I attempted to reach back out to the company in an attempt to give them the contact info of my chosen XXXX dealer, the emails started to bounce back ( there are also photos of this ).',
 'This was when I realized that this website was a faked, copycat website.',
 'I attempted to report it to my bank immediately when I realized a company I was dealing with was committing fraud and pretending to be a firearms dealer that has been in business for over 80 years and is a very well-known company all over the world.',
 'As my bank, I expected them to help protect me.',
 "I was told to wait until the 

In [44]:
processed_sentences = [simple_preprocess(sentence) for sentence in test_1]

In [46]:
print(processed_sentences)

[['searched', 'on', 'xxxx', 'for', 'xxxxxxxx', 'xxxx', 'and', 'was', 'pointed', 'to', 'website', 'legitimately', 'believed', 'was', 'xxxx'], ['the', 'website', 'was', 'xxxx', 'whereas', 'the', 'authentic', 'website', 'is', 'xxxx', 'proceeded', 'to', 'buy', 'gun', 'from', 'the', 'fraudulent', 'website', 'and', 'sent', 'my', 'money', 'via', 'xxxx', 'as', 'the', 'spam', 'seller', 'requested', 'this', 'was', 'total', 'of', 'on', 'xx', 'xx', 'xxxx'], ['received', 'an', 'email', 'stating', 'the', 'transaction', 'was', 'approved', 'and', 'on', 'xx', 'xx', 'xxxx', 'attempted', 'to', 'reach', 'back', 'out', 'to', 'the', 'company', 'in', 'an', 'attempt', 'to', 'give', 'them', 'the', 'contact', 'info', 'of', 'my', 'chosen', 'xxxx', 'dealer', 'the', 'emails', 'started', 'to', 'bounce', 'back', 'there', 'are', 'also', 'photos', 'of', 'this'], ['this', 'was', 'when', 'realized', 'that', 'this', 'website', 'was', 'faked', 'copycat', 'website'], ['attempted', 'to', 'report', 'it', 'to', 'my', 'bank', 

In [47]:
bigram_finder = Phrases(
    sentences=processed_sentences,
    min_count=25,
    connector_words=ENGLISH_CONNECTOR_WORDS
)

In [48]:
sorted(bigram_finder.find_phrases(processed_sentences).items(), key = lambda x: -x[1])

[]