In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import io
import requests
import zipfile

%matplotlib inline

from tqdm import tqdm
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

# Multiclass Text Classification 

## 1. Problem Definition
The problem at hand is to develop a multiclass classification model using the Naive Bayes algorithm on processed text data. The goal is to enable machines to understand and gain insights from human language by categorizing text into more than two classes.

## 2. Data
The dataset contains more than two million customer complaints about consumer financial products. Amongst the various available columns, we have a column that contains the actual text of the complaint and one column containing the product for which the customer is raising the complaint
The original data came from the USA data catalog.

https://catalog.data.gov/dataset/consumer-complaint-database


## 3. Evaluation


## Data Exploration

In [7]:
url = "https://files.consumerfinance.gov/ccdb/complaints.json.zip"
response = requests.get(url)
zip_file = zipfile.ZipFile(io.BytesIO(response.content))
file_name = zip_file.namelist()[0]
data = pd.read_json(zip_file.open(file_name))

df = pd.DataFrame(data)
df.head()

Unnamed: 0,date_received,product,sub_product,issue,sub_issue,complaint_what_happened,company_public_response,company,state,zip_code,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response,timely,consumer_disputed,complaint_id
0,2022-01-26,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",VA,20166,,Consent not provided,Web,2022-01-26,Closed with explanation,Yes,,5152476
1,2022-03-10,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Reporting company used your report improperly,,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,CA,92394,,,Phone,2022-03-10,Closed with explanation,Yes,,5308860
2,2022-03-10,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account status incorrect,,,DISCOVER BANK,DE,19703,,,Referral,2022-03-11,Closed with monetary relief,Yes,,5318105
3,2022-03-10,Checking or savings account,Checking account,Closing an account,Can't close your account,,,CAPITAL ONE FINANCIAL CORPORATION,TN,38135,,,Referral,2022-03-10,Closed with explanation,Yes,,5308021
4,2021-11-12,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account status incorrect,I am XXXX XXXX XXXX and I am submitting this c...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",FL,34236,,Consent provided,Web,2021-11-30,Closed with non-monetary relief,Yes,,4906602


In [8]:
df.shape

(3501441, 18)

In [11]:
df['complaint_what_happened'][3401]

'I WAS TAKEN ADVANTAGE OF DUE TO MALPRACTICE WHILE SEVERAL LAWS WERE BEING BROKEN BY YOUR COMPANY UNDER THE F.C.R.A, THIS HAS CAUSE ME FINANCIAL INJURY AND THIS IS DEFAMATION OF CHARACHTER. I HAVE BEEN DENIED JOBS BECAUSE OF WHATS ON MY CREDIT REPORT. I HAVE BEEN UNABLE TO LEVERAGE MY ASSETS DURING THE PANDEMIC BECAUSE OF WHATS ON MY CREDIT REPORT. I HAVE BEEN DENIED AN S.B.A. LOAN BECAUSE OF THE NEGATIVE, UNDULY AND ERRONEOUS ITEMS ON MY CREDIT REPORT.'

In [12]:
df['product'][3401]

'Credit reporting, credit repair services, or other personal consumer reports'

### Selecting only the important columns for this project

In [13]:
data = df[['product', 'sub_product', 'complaint_what_happened']]
data.head()

Unnamed: 0,product,sub_product,complaint_what_happened
0,"Credit reporting, credit repair services, or o...",Credit reporting,
1,"Credit reporting, credit repair services, or o...",Credit reporting,
2,"Credit reporting, credit repair services, or o...",Credit reporting,
3,Checking or savings account,Checking account,
4,"Credit reporting, credit repair services, or o...",Credit reporting,I am XXXX XXXX XXXX and I am submitting this c...


In [14]:
data.isna().sum()

product                    0
sub_product                0
complaint_what_happened    0
dtype: int64

In [None]:
data.isnull().sum()

### Drop rows with missing `complaint`

In [15]:
data.dropna(inplace=True)

KeyboardInterrupt: 