<a href="https://colab.research.google.com/github/ITU-Business-Analytics-Team/Business_Analytics_for_Professionals/blob/main/Part%20I%20%3A%20Methods%20%26%20Technologies%20for%20Business%20Analytics/Chapter%207%3A%20Text%20Analytics/7_5_Topic_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Text Analytics**
## Topic Models
Topic modeling, similar to clustering, is an unsupervised learning method that helps to find the topics in a text when the groups being searched for are not known in advance. In this method, the primary goal is to use mathematical and statistical methods to identify hidden (latent) semantic patterns in a corpus of data.

In [None]:
import string
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads in a later cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Load the complaints CSV from Google Drive.
# The share link is turned into a direct-download link by pulling out the
# file id (the second-to-last path segment) and appending it to the uc endpoint.
url = 'https://drive.google.com/file/d/1Qcgk-dsDsAi1zO_OiZltvCGBC2U6fVLc/view?usp=sharing'
file_id = url.rsplit('/', 2)[-2]
path = 'https://drive.google.com/uc?export=download&id=' + file_id
df = pd.read_csv(path)

In [None]:
# Preview the first rows that actually carry a complaint narrative
df.loc[df['Consumer complaint narrative'].notna()].head()

Unnamed: 0.1,Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
1,623002,08/07/2016,Student loan,Federal student loan servicing,Dealing with my lender or servicer,Having problems with customer service,Enrolled in paperless billing. Was not billed ...,,ACS Education Services,NY,110XX,,Consent provided,Web,08/07/2016,Closed with explanation,Yes,No,2049664
12,226883,11/11/2015,Debt collection,"Other (i.e. phone, health club, etc.)",Cont'd attempts collect debt not owed,Debt was paid,We lost our home ( of almost 14 years ) to for...,,"Diversified Consultants, Inc.",WI,539XX,,Consent provided,Web,11/11/2015,Closed with explanation,Yes,No,1650657
28,602022,09/20/2016,Credit reporting,,Credit reporting company's investigation,No notice of investigation status/result,Trans Union had a glitch in their system that ...,Company has responded to the consumer and the ...,"TransUnion Intermediate Holdings, Inc.",TX,770XX,,Consent provided,Web,09/20/2016,Closed with non-monetary relief,Yes,,2121069
32,378090,02/23/2016,Debt collection,Credit card,Disclosure verification of debt,Not disclosed as an attempt to collect,received notice from collection agency that I ...,,Synchrony Financial,NY,139XX,Older American,Consent provided,Web,02/25/2016,Closed with monetary relief,Yes,No,1799253
37,365525,11/02/2015,Credit card,,Closing/Cancelling account,,I have had a Citi Bank Credit card for several...,Company chooses not to provide a public response,Citibank,LA,703XX,,Consent provided,Web,11/02/2015,Closed with explanation,Yes,No,1634875


In [None]:
# Keep only the complaint narratives; rows without a narrative are dropped.
narratives = df['Consumer complaint narrative'].dropna()

# Strip the redaction masks. Matching any run of two or more capital X's
# (instead of the literal 'XXXX') also removes date masks such as
# 'XX/XX/XXXX'; otherwise the leftover 'XX/XX/' collapses into the token
# 'xxxx' once punctuation is removed and pollutes the vocabulary.
# Building a fresh frame (rather than assigning into a filtered slice)
# avoids pandas' SettingWithCopyWarning.
cl_data = narratives.str.replace(r'X{2,}', '', regex=True).to_frame()
complaints = cl_data['Consumer complaint narrative'].tolist()

In [None]:
# Load the English stop words as a frozenset: preprocess_text tests every
# token with `in`, which is a hash lookup on a set but a linear scan on a list.
stop = frozenset(stopwords.words('english'))

# WordNet lemmatizer used by preprocess_text to reduce tokens to their lemma
lem = wordnet.WordNetLemmatizer()

In [None]:
# Translation table that deletes every ASCII punctuation character in one pass
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def preprocess_text(text):
    """Turn a raw document into a list of cleaned, lemmatised tokens.

    Steps: lowercase, split on whitespace, drop stop words, delete
    punctuation characters, lemmatise. Relies on the module-level ``stop``
    (a collection of stop words) and ``lem`` (an object with ``lemmatize``).

    Parameters
    ----------
    text : str
        The raw document.

    Returns
    -------
    list of str
        Tokens in their original order; tokens that are stop words or that
        consist only of punctuation are omitted.
    """
    tokens = []
    for raw in text.lower().split():
        # Token with all punctuation deleted (what ends up in the vocabulary)
        word = raw.translate(_PUNCT_TABLE)
        if not word:
            # Pure punctuation, e.g. a stand-alone dash
            continue
        # Check the stop list against the token *with* its inner punctuation
        # as well: contraction stop words such as "don't" would otherwise
        # slip through as "dont" once the apostrophe has been removed.
        if word in stop or raw.strip(string.punctuation) in stop:
            continue
        tokens.append(lem.lemmatize(word))
    return tokens

In [None]:
# WordNet data is required by the lemmatizer inside preprocess_text
nltk.download('wordnet')

# TF-IDF over unigrams and bigrams, tokenised by preprocess_text.
# max_df / min_df are given as proportions of documents.
vectorizer = TfidfVectorizer(max_features=10000, use_idf=True, tokenizer=preprocess_text, ngram_range=(1,2), max_df=0.9, min_df=0.1)

# Fit the vectorizer to the data
tfidf_matrix = vectorizer.fit_transform(complaints)

# get_feature_names() was removed in scikit-learn 1.2; prefer the
# replacement and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = np.array(vectorizer.get_feature_names_out())
else:
    terms = np.array(vectorizer.get_feature_names())
print(tfidf_matrix.shape)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
(1688, 89)


In [None]:
# Number of topics (n_components) requested from each of the models fitted below
topic_count = 5

### 5.1	Latent Dirichlet Allocation (LDA)
LDA uses Dirichlet distributions to build a topics-per-document model and a words-per-topic model, both of which are used in the analysis. The model works with the topic assigned to each word, the distribution over topics for each document, and the distribution of words for each topic.

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit an LDA model on the TF-IDF matrix; random_state fixes the seed.
lda_model = LatentDirichletAllocation(
    n_components=topic_count,
    max_iter=100,
    max_doc_update_iter=50,
    learning_method='online',
    batch_size=1740,
    learning_offset=50.,
    random_state=1,
    n_jobs=8,
)

# Document-topic matrix returned by fit_transform
document_topics = lda_model.fit_transform(tfidf_matrix)

In [None]:
# Term-topic matrix: after the transpose, rows are vocabulary terms and
# columns are topics.
# The topic labels are derived from topic_count so they stay in step with
# the number of components the models were fitted with (a hard-coded list
# of five names breaks as soon as topic_count is changed).
topic_names = [f"topic_{i}" for i in range(1, topic_count + 1)]
encoding_matrix = pd.DataFrame(lda_model.components_, index=topic_names, columns=terms).T
encoding_matrix

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5
account,55.747196,12.141492,12.913193,70.248232,44.956023
agency,1.803167,8.224702,0.412084,35.927665,0.704039
also,19.735404,15.796552,2.736318,12.137725,5.855707
amount,34.079858,9.123685,0.232214,8.795503,6.176427
another,17.698695,4.892368,0.741612,8.804510,3.408676
...,...,...,...,...,...
without,14.387017,3.267772,0.405290,10.197797,7.038904
work,14.428655,17.852717,0.225111,1.392196,1.607232
would,51.219257,17.425856,0.651501,9.426870,14.539913
xxxx,48.938334,2.524256,0.408600,15.075444,2.129077


In [None]:
# Highest-weighted term in each topic column
encoding_matrix.idxmax(axis="index")

topic_1       loan
topic_2       call
topic_3    request
topic_4     credit
topic_5       card
dtype: object

In [None]:
# How many key terms to list per topic
top_terms = 5

# Topic-term weights of the fitted LDA model
topic_terms = lda_model.components_

# Rank the terms of every topic by descending absolute weight and keep the
# column indices of the first `top_terms` of them
term_ranking = np.argsort(-np.abs(topic_terms), axis=1)
topic_key_term_idxs = term_ranking[:, :top_terms]

# Look the indices up in the vocabulary
topic_keyterms = terms[topic_key_term_idxs]

In [None]:
# One comma-separated string of key terms per topic, shown as a one-column table
topics = list(map(', '.join, topic_keyterms))
topics_df = pd.DataFrame({'Terms per Topic': topics}, index=topic_names)
topics_df

Unnamed: 0,Terms per Topic
topic_1,"loan, payment, mortgage, bank, account"
topic_2,"call, debt, phone, number, company"
topic_3,"request, file, complaint, information, account"
topic_4,"credit, report, debt, credit report, account"
topic_5,"card, credit card, account, credit, charge"


### 5.2	Latent Semantic Indexing (LSI)
This method accepts a collection of documents as input. The document co-occurrence matrix is used to generate a word-document matrix. It takes advantage of the TF-IDF transformation to down-weight superfluous high-frequency terms in the word-document matrix. Then, each document's weight vector is normalized to unit length (normalization process). Finally, the Singular Value Decomposition (SVD) technique is used to reduce the dimensionality of the matrix, keeping only the largest singular values.

In [None]:
from sklearn.decomposition import TruncatedSVD

# LSI: truncated SVD of the TF-IDF matrix with a fixed seed
lsi_model = TruncatedSVD(
    n_components=topic_count,
    n_iter=100,
    random_state=1,
)

# Document-topic matrix returned by fit_transform
document_topics = lsi_model.fit_transform(tfidf_matrix)

# Topic-term matrix of the fitted model; show its shape
topic_terms = lsi_model.components_
topic_terms.shape

(5, 89)

In [None]:
# Topic-term weights of the fitted LSI model (these can be negative, hence
# the ranking by absolute value)
topic_terms = lsi_model.components_

# Column indices of the `top_terms` largest-magnitude weights in each topic
abs_weights = np.abs(topic_terms)
topic_key_term_idxs = np.argsort(-abs_weights, axis=1)[:, :top_terms]

# Map the indices back to vocabulary terms
topic_keyterms = terms[topic_key_term_idxs]

In [None]:
# Join each topic's key terms into one string and tabulate them per topic
topics = [', '.join(key_terms) for key_terms in topic_keyterms]
topics_df = pd.DataFrame(
    data=topics,
    index=topic_names,
    columns=['Terms per Topic'],
)
topics_df

Unnamed: 0,Terms per Topic
topic_1,"account, credit, payment, loan, report"
topic_2,"credit, report, loan, credit report, payment"
topic_3,"account, debt, loan, card, bank"
topic_4,"debt, loan, credit, report, credit report"
topic_5,"account, card, credit card, loan, credit"


### 5.3	Non-Negative Matrix Factorization (NMF)
NMF factorizes a higher-dimensional matrix into lower-dimensional factors whose coefficients are all non-negative. For example, take a matrix of articles by words: when this matrix is decomposed, an articles-by-topics matrix and a topics-by-words matrix are obtained.

In [None]:
import inspect

from sklearn.decomposition import NMF

# Settings shared by every scikit-learn release
nmf_params = dict(n_components=topic_count, solver='cd', max_iter=500, random_state=1, l1_ratio=.80)

# Regularisation strength. The `alpha` keyword was deprecated in
# scikit-learn 1.0 and removed in 1.2 in favour of `alpha_W` / `alpha_H`,
# so passing it unconditionally raises TypeError on current releases.
legacy_alpha = .1
if 'alpha_W' in inspect.signature(NMF).parameters:
    # Per the scikit-learn docs the new penalties are multiplied by
    # n_features (for W) and n_samples (for H); dividing by those sizes
    # keeps the effective penalty equal to the legacy unscaled `alpha`.
    n_docs, n_terms = tfidf_matrix.shape
    nmf_params.update(alpha_W=legacy_alpha / n_terms, alpha_H=legacy_alpha / n_docs)
else:
    # Releases older than 1.0 only know the single `alpha` keyword
    nmf_params['alpha'] = legacy_alpha

nmf_model = NMF(**nmf_params)

# Document-topic matrix returned by fit_transform
document_topics = nmf_model.fit_transform(tfidf_matrix)

In [None]:
# Topic-term weights of the fitted NMF model
topic_terms = nmf_model.components_

# Sort each topic's terms by descending absolute weight, then keep the
# indices of the leading `top_terms` entries
descending_order = np.argsort(-np.abs(topic_terms), axis=1)
topic_key_term_idxs = descending_order[:, :top_terms]

# Translate the indices into vocabulary terms
topic_keyterms = terms[topic_key_term_idxs]

In [None]:
# Comma-separated key terms for every topic, displayed as a one-column table
topics = list(map(', '.join, topic_keyterms))
topics_df = pd.DataFrame({'Terms per Topic': topics}, index=topic_names)
topics_df.head()

Unnamed: 0,Terms per Topic
topic_1,"loan, payment, mortgage, would, told"
topic_2,"credit, report, credit report, reporting, info..."
topic_3,"account, bank, information, check, money"
topic_4,"debt, collection, company, call, agency"
topic_5,"card, credit card, credit, charge, fee"
