In [38]:
import numpy as np
import pandas as pd
import json 

# Loading data


In [39]:
data = json.load(open("Client_data.json"))

In [40]:
complaint = []

In [42]:
data

[{'_id': '3211475',
  '_index': 'complaint-public-v2',
  '_score': 0.0,
  '_source': {'company': 'JPMORGAN CHASE & CO.',
   'company_public_response': None,
   'company_response': 'Closed with explanation',
   'complaint_id': '3211475',
   'complaint_what_happened': '',
   'consumer_consent_provided': 'Consent not provided',
   'consumer_disputed': 'N/A',
   'date_received': '2019-04-13T12:00:00-05:00',
   'date_sent_to_company': '2019-04-13T12:00:00-05:00',
   'issue': 'Attempts to collect debt not owed',
   'product': 'Debt collection',
   'state': 'CA',
   'sub_issue': 'Debt is not yours',
   'sub_product': 'Credit card debt',
   'submitted_via': 'Web',
   'tags': None,
   'timely': 'Yes',
   'zip_code': '90301'},
  '_type': 'complaint'},
 {'_id': '3229299',
  '_index': 'complaint-public-v2',
  '_score': 0.0,
  '_source': {'company': 'JPMORGAN CHASE & CO.',
   'company_public_response': None,
   'company_response': 'Closed with explanation',
   'complaint_id': '3229299',
   'complai

In [43]:
data[1]["_source"]["complaint_what_happened"]

'Good morning my name is XXXX XXXX and I appreciate it if you could help me put a stop to Chase Bank cardmember services. \nIn 2018 I wrote to Chase asking for debt verification and what they sent me a statement which is not acceptable. I am asking the bank to validate the debt. Instead I been receiving mail every month from them attempting to collect a debt. \nI have a right to know this information as a consumer. \n\nChase account # XXXX XXXX XXXX XXXX Thanks in advance for your help.'

In [44]:
for x in data:
    complaint.append(str(x["_source"]["complaint_what_happened"])+" " +str(x["_source"]["product"]) +" "+ str(x["_source"]["sub_product"]))

In [45]:
len(complaint)

78313

In [46]:
complaint = np.array(complaint)

In [47]:
complaint = complaint[complaint != np.array(None)]

In [48]:
import nltk
import re

In [49]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation

# Preproceessing data

In [50]:
def preprocessing(texts):
   
    texts = [re.findall(r'\w+', line.lower()) for line in texts]
    texts = [remove_stopwords(' '.join(line)).split() for line in texts]
    texts = [strip_punctuation(' '.join(line)).split() for line in texts]
    texts = [[token for token in line if len(token) >2] for line in texts]
    texts = [[token for token in line if not token.isnumeric()] for line in texts]
    lemmatizer = WordNetLemmatizer()
    texts = [[word for word in lemmatizer.lemmatize(' '.join(line), pos='v').split()] for line in texts]

    return texts

In [51]:
pre_processed_complaints = preprocessing(complaint)

In [52]:
pre_processed_complaints[1]

['good',
 'morning',
 'xxxx',
 'xxxx',
 'appreciate',
 'help',
 'stop',
 'chase',
 'bank',
 'cardmember',
 'services',
 'wrote',
 'chase',
 'asking',
 'debt',
 'verification',
 'sent',
 'statement',
 'acceptable',
 'asking',
 'bank',
 'validate',
 'debt',
 'instead',
 'receiving',
 'mail',
 'month',
 'attempting',
 'collect',
 'debt',
 'right',
 'know',
 'information',
 'consumer',
 'chase',
 'account',
 'xxxx',
 'xxxx',
 'xxxx',
 'xxxx',
 'thanks',
 'advance',
 'help',
 'debt',
 'collection',
 'credit',
 'card',
 'debt']

In [53]:
pre_processed_text = list(filter(None,pre_processed_complaints))

In [54]:
pre_processed_text

[['debt', 'collection', 'credit', 'card', 'debt'],
 ['good',
  'morning',
  'xxxx',
  'xxxx',
  'appreciate',
  'help',
  'stop',
  'chase',
  'bank',
  'cardmember',
  'services',
  'wrote',
  'chase',
  'asking',
  'debt',
  'verification',
  'sent',
  'statement',
  'acceptable',
  'asking',
  'bank',
  'validate',
  'debt',
  'instead',
  'receiving',
  'mail',
  'month',
  'attempting',
  'collect',
  'debt',
  'right',
  'know',
  'information',
  'consumer',
  'chase',
  'account',
  'xxxx',
  'xxxx',
  'xxxx',
  'xxxx',
  'thanks',
  'advance',
  'help',
  'debt',
  'collection',
  'credit',
  'card',
  'debt'],
 ['upgraded',
  'xxxx',
  'xxxx',
  'card',
  'told',
  'agent',
  'upgrade',
  'anniversary',
  'date',
  'change',
  'turned',
  'agent',
  'giving',
  'wrong',
  'information',
  'order',
  'upgrade',
  'account',
  'xxxx',
  'changed',
  'anniversary',
  'date',
  'xxxx',
  'xxxx',
  'consent',
  'xxxx',
  'recording',
  'agent',
  'misled',
  'credit',
  'card',
  

In [55]:
from gensim.corpora import Dictionary

In [56]:
dict_data = Dictionary(pre_processed_text)

In [57]:
corpus = [dict_data.doc2bow(text) for text in pre_processed_text]

In [58]:
from gensim.models import LdaModel
from sklearn.decomposition import NMF

# Model traning

In [79]:
model = LdaModel(corpus=corpus,id2word=dict_data,num_topics=5,chunksize=600,passes=30)

In [110]:
model.save("model_final")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [111]:
model = LdaModel.load("model_final")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [82]:
model.print_topics(num_topics=5,num_words=3)

[(0, '0.313*"card" + 0.219*"credit" + 0.056*"charge"'),
 (1, '0.158*"credit" + 0.062*"reporting" + 0.056*"debt"'),
 (2, '0.270*"account" + 0.117*"checking" + 0.116*"bank"'),
 (3, '0.322*"mortgage" + 0.068*"loan" + 0.057*"conventional"'),
 (4, '0.126*"xxxx" + 0.046*"chase" + 0.009*"told"')]

1 - Banking Services
0 - Loans
2 - Fraudulent Reporting
3 - Card
4 - others

In [83]:
model.show_topics(num_topics=5)

[(0,
  '0.313*"card" + 0.219*"credit" + 0.056*"charge" + 0.037*"general" + 0.037*"prepaid" + 0.036*"purpose" + 0.011*"cards" + 0.008*"balance" + 0.007*"points" + 0.006*"fee"'),
 (1,
  '0.158*"credit" + 0.062*"reporting" + 0.056*"debt" + 0.043*"consumer" + 0.041*"loan" + 0.032*"personal" + 0.031*"report" + 0.031*"collection" + 0.030*"reports" + 0.026*"services"'),
 (2,
  '0.270*"account" + 0.117*"checking" + 0.116*"bank" + 0.076*"service" + 0.045*"savings" + 0.035*"money" + 0.026*"check" + 0.017*"funds" + 0.016*"product" + 0.013*"transfer"'),
 (3,
  '0.322*"mortgage" + 0.068*"loan" + 0.057*"conventional" + 0.045*"home" + 0.035*"fixed" + 0.016*"modification" + 0.015*"student" + 0.015*"property" + 0.014*"line" + 0.014*"equity"'),
 (4,
  '0.126*"xxxx" + 0.046*"chase" + 0.009*"told" + 0.008*"payment" + 0.008*"called" + 0.007*"received" + 0.007*"time" + 0.007*"said" + 0.005*"bank" + 0.005*"letter"')]

In [84]:
model.get_document_topics(corpus[2])

[(0, 0.39963576), (2, 0.016381752), (4, 0.573912)]

In [85]:
model.log_perplexity(corpus)

-6.36663831502579

In [86]:
!pip install pyldavis



DEPRECATION: Python 3.5 reached the end of its life on September 13th, 2020. Please upgrade your Python as Python 3.5 is no longer maintained. pip 21.0 will drop support for Python 3.5 in January 2021. pip 21.0 will remove support for this functionality.


In [87]:
import pyLDAvis
import pyLDAvis.gensim as ldavis
pyLDAvis.enable_notebook()
vis = ldavis.prepare(model, corpus, dict_data)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [99]:
def result(x):
    max_tup = (0,0)
    for i in x:
        if i[1]>max_tup[1]:
            max_tup = i
    return max_tup[0]

In [100]:
model.get_document_topics(corpus[0])

[(0, 0.24925222),
 (1, 0.6507478),
 (2, 0.033333335),
 (3, 0.033333335),
 (4, 0.033333335)]

In [101]:
complaint[1]

'Good morning my name is XXXX XXXX and I appreciate it if you could help me put a stop to Chase Bank cardmember services. \nIn 2018 I wrote to Chase asking for debt verification and what they sent me a statement which is not acceptable. I am asking the bank to validate the debt. Instead I been receiving mail every month from them attempting to collect a debt. \nI have a right to know this information as a consumer. \n\nChase account # XXXX XXXX XXXX XXXX Thanks in advance for your help. Debt collection Credit card debt'

In [102]:
print(complaint)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [103]:
complaint = list(filter(None,complaint))

In [104]:
topics = []
for i in corpus:
    topics.append(result(model.get_document_topics(i)) + 1)

2 - Banking
4 - Loan
5 - Fraud
3 - Card
1 - Others

In [105]:
df_results= pd.DataFrame([complaint,topics]).T

In [106]:
df_results.columns = ["complaint","type"]

In [107]:
pd.set_option('display.max_colwidth', 500)

In [108]:
df_results.sample(40)

Unnamed: 0,complaint,type
5849,Debt collection Credit card debt,2
58559,Mortgage Other mortgage,4
48842,Chase is not doing what they said they would for under water and people behind on their mortage they did nothing with the arrears did n't not lower my mortgage and they are now foreclosing on my wife and I which we are XXXX. And I have been trying to get a modification and they go up on monthly payments Mortgage Conventional adjustable mortgage (ARM),4
22732,"Today I went to chase bank in XXXX XXXX at XXXX to open a bank account. I was told to have a seat and someone would be right with me. After waiting about 25 minutes I wanted to ask if someone was available to help me, so I approach a young blonde female and as I approach her window she walked away to talk to a young XXXX XXXX female. As they were smiling and trying to avoid me I felt so upset I walked away but as I was walking away I noticed the blonde female walking back up to the window so...",5
53582,Bank account or service Checking account,3
75120,"The reason for my dispute is with 6 credits cards companies is to try to get my money back for using them to buy XXXX memberships and start an on line business but XXXX has been shut down for an investigation by the FTC for fraudulent activity and being a scam. \n-Description of Services/Products : XXXX is a business education platform. I paid and registered for a training program, sponsored by XXXX XXXX, to learn about online business practices and marketing techniques. XXXX XXXX which we...",5
29976,"I 'm former account holder with Chase, I say former because over the past 3 years all my accounts at Chase ( checking / savings / credit card ) have been jacked four times - very interesting that only my accounts with Chase have been affected, none of my other accounts with other financial institution have been ever affected. \nAnyhow, On XXXX XXXX my Chase Freedom Account was part of a fraudulent scheme yet again ( over XXXX funds transfer / point redemption, etc. ). I went to the nearest b...",5
178,"When booking a hotel with XXXX, I checked the properties listed ( 4 ) as the stay being in one of the four. There was not a listing for "" XXXX ''. XXXX, XXXX, XXXX and one other was shown. Again -- no "" XXXX '' listing. I stayed at the XXXX in 2014 and it was TERRIBLE. Feeling safe, I hit "" book it now '', and what turns out to be the property? The XXXX. XXXX hid the name "" XXXX '' by using the name XXXX. I immediately called XXXX to rebook to another property. XXXX service center ran me a...",5
5050,Credit card or prepaid card General-purpose credit card or charge card,1
13837,Bank account or service Checking account,3


In [109]:
df_results.to_csv("complaints_with_topics.csv")