In [11]:
# DF-ICF - Data Analysis

import os
import re
import nltk
import itertools

import pandas as pd
import numpy as np

from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

# Min percentage of docs in cluster where a term must appear to be chosen as a topic term
min_term_df = 0.03    # ~0.03

#file_start_text = "Data_for_DF-ICF_"    # A unique string that identifies a particular file as a cluster of documents
file_start_text = "issues"

data_dir = os.getcwd() + "/"

#indir = "/Volumes/IN-REGI-MerckProject38/Devon-Shuo/OutofShuoWang/DF-ICF/Scale(1-3)/bi/" 
indir = "C:/Users/wangs/OneDrive/桌面/"
outdir = os.getcwd() + "/"

In [12]:
# Counts the number of POSTS in a CLUSTER with at least one occurence of a term
def get_post_as_doc_freq(ck):
    term_counts = dict()
    num_posts = len(ck)
    
    for l, post in enumerate(ck):
        tmp = str(post).replace(",", "").split()
        
        
        single_terms = list(set(tmp))
        
        for i, wi in enumerate(single_terms):                        
            if wi in term_counts.keys():
                term_counts[wi] += 1
            
            else:
                term_counts.update({str(wi): 1})
      
    return term_counts, num_posts

In [13]:
# Counts the number of CLUSTERS in which a term appears at least once
def get_clus_as_doc_freq(c_fs):
    clus_term_counts = dict()
    
    for k, ck_fs in c_fs.items():
        tmp = list(ck_fs.keys())
        all_term = list(set(tmp))    # Removes duplicate terms


        for i, wi in enumerate(list(ck_fs.keys())):
            if wi in clus_term_counts.keys():
                clus_term_counts[wi] += 1
            
            else:
                clus_term_counts.update({str(wi): 1})
   
    return clus_term_counts

In [14]:
# Calculates the Topic Diversity as the percentage of unique terms given all clusters top "n" topic terms
def calculate_diversity(all_top_terms):
    no_duplicate_terms = list(set(all_top_terms))
    
    total_topic_word_count = len(all_top_terms)    # This should be 250 in the case of 10 clusters and 25 topic words
    unique_word_count = len(no_duplicate_terms)
    
    return float(unique_word_count/total_topic_word_count)

In [15]:
# Counts the number of POSTS in a CLUSTER with at least one occurence of both terms wi and wj
def get_wi_wj_overlap_freq(wi, wj, ck):
    num_overlaps = 0
    num_posts = len(ck)
    
    for l, post in enumerate(ck):
        tmp = str(post).replace(",", "").split()
        single_terms = list(set(tmp))
        
        if (wi in single_terms) and (wj in single_terms):            
            num_overlaps += 1
            
    return num_overlaps    # Returns the number of post co-occurrences within cluster ck for terms wi and wj

In [16]:
# Calculate the Normalized Pointwise Mutual Information for a given topic/cluster ck
def calculate_npmi(corp_dfs, ck_top_terms, corp_size, corp_texts):    
    num_top_terms = len(ck_top_terms)
    D = int(corp_size)
    TC_k = 0.0
    
    for i, dfitf_tuple in enumerate(ck_top_terms):
        t_wi = str(dfitf_tuple[0])    # The actual string term wj
        
        try:
            D_wi= int(corp_dfs[t_wi])    # Number of documents in the input corpus with wi

        except (KeyError):
            D_wi = 0
                
        j = i + 1
        
        while (j < (num_top_terms)) and (j > i):            
            t_wj = str(tuple(ck_top_terms[j])[0])   # The actual string term wj
            
            try:
                D_wj = int(corp_dfs[t_wj])    # Number of documents in the input corpus with wj
            
            except (KeyError):
                D_wj = 0
            
            D_wi_wj = get_wi_wj_overlap_freq(t_wi, t_wj, corp_texts)  # Number of documents in the input corpus with wi and wj
            
            if D_wi_wj == 0:  # In the case there is no term overlap for topic terms wi and wj
                f_wi_wj = -1
            
            else:    # Otherwise calculate the NPMI with the previously determined frequencies
                f_wi_wj = -1 + (np.log(D_wi)+np.log(D_wj)-2.0*np.log(D))/(np.log(D_wi_wj)-np.log(D))
            
            TC_k += f_wi_wj    # Sum across the top 10 terms in a topic/cluster ck
            j += 1
        
    return TC_k

In [17]:
def calculate_topic_metrics(clus_dfs, clus_itfs, clus_dfitfs, clus_texts, clus_sizes):    
    num_top_terms_coh = 10
    num_top_terms_div = 25
    
    num_clusters = len(list(clus_sizes.keys()))
    all_top_terms = list()
    clus_TCs = dict()
    corpus_term_dfs = dict()
    
    # Make a flat list of all documents in the corpus, effectively combines all clusters into a single corpus list of posts
    flat_corpus = [entry for sublist in list(clus_texts.values()) for entry in sublist]
    
    # For each post in the entire corpus
    for k, doc in enumerate(flat_corpus):
        for term in str(doc).split():    # For each term in each document in the corpus
            if term in corpus_term_dfs.keys():
                corpus_term_dfs[term] += 1

            else:
                corpus_term_dfs.update({str(term): int(1)})
    
    # Calculate the TC for each cluster
    for k in clus_sizes.keys():        
        ck_top_terms = list(clus_dfitfs[k])
        
        all_top_terms.extend(list(ck_top_terms))    # Make a list of ALL top terms selected from all topics for TD calculation
        
        # Considering the entire corpus (Blei's Method):
        TC_ck = float(calculate_npmi(dict(corpus_term_dfs), list(ck_top_terms[:num_top_terms_coh]), int(len(corpus_term_dfs)), list(flat_corpus)))
        
        TC_ck /= (num_top_terms_coh*(num_top_terms_coh-1)/2)    # This is where the (1/45) is applied
        
        print("Topic: ", k)
        print("TC_ck = ", TC_ck)
        
        clus_TCs.update({str(k): float(TC_ck)})        
            
    # The overall TC is then calculated as the average TC for each topic/cluster
    TC = (1/num_clusters) * sum(list(clus_TCs.values()))
    # TC = np.mean(list(clus_TCs.values()))    # This gives the same values as the above line
    TD = float(calculate_diversity(all_top_terms))
    
    print("\nTopic Coherence: ", TC)
    print("Topic Diversity: ", TD)

In [24]:
# Determine topic terms using DF-ITF, where each POST is a DOCUMENT and each CLUSTER is a TOPIC
def calculate_dfitf(indf_dict):
    df_clus_dict = dict()
    itf_clus_dict = dict()
    clus_texts = dict()
    clus_dfitfs = dict()
    clus_sizes = dict()

    for k, ck_df in indf_dict.items():
        ck_tex = ck_df[ck_df.columns[0]].tolist()    # Get all posts in cluster ck as a list of strings
        #print(ck_df.columns[7])
        temp_dfs, ck_size = get_post_as_doc_freq(ck_tex)

        df_clus_dict.update({str(k): dict(temp_dfs)})
        clus_texts.update({str(k): list(ck_tex)})
        clus_sizes.update({str(k): int(ck_size)})

    clus_doc_freqs = get_clus_as_doc_freq(df_clus_dict)
 
    for k, ck_dfs in df_clus_dict.items():
        term_dfitfs = dict()
        min_count_topic_terms = dict()
        ck_size = int(clus_sizes[k])

        for i, (wi, df) in enumerate(ck_dfs.items()):
            wi_df = float(df/clus_sizes[k])    # Adjusted for cluster/topic size
            # wi_df = float(df)
            wi_itf = float(np.log(((num_clusters)/(clus_doc_freqs[wi]))))

            wi_dfitf = float(wi_df*wi_itf)

            if wi in term_dfitfs.keys():
                term_dfitfs[wi] += float(wi_dfitf)

            else:
                term_dfitfs.update({str(wi): float(wi_dfitf)})

        # Uncomment the next two lines to take average instead of summation
        # for i, (wi, df) in enumerate(ck_dfs.items()):
            # term_dfitfs[wi] /= int(df)

        for wi, dfitf in term_dfitfs.items():
            wi_df = float(ck_dfs[wi])
            
            if wi_df >= (ck_size*min_term_df):
                min_count_topic_terms.update({str(wi): float(dfitf)})
            
        clus_dfitfs.update({str(k): sorted(min_count_topic_terms.items(), key=lambda item: item[1], reverse=True)})

    for k, ck_dfitfs in clus_dfitfs.items():
        print("\nTopic ", k, ":")    #TEMP
        print(ck_dfitfs[:20])    #TEMP
    
    print("\n")
    
    return dict(df_clus_dict), dict(itf_clus_dict), dict(clus_dfitfs), dict(clus_texts), dict(clus_sizes)

In [25]:
# Initialize indf dictionary and cluster counter
indf_dict = dict()
num_clusters = 0

for file in os.listdir(indir):
    if (file.endswith(".csv")) and (file.startswith(str(file_start_text))):
        df = pd.read_csv(str(indir) + str(file), encoding='utf-8')
        #clus_num = re.search(r"(\d+)", str(file))
        #print(clus_num)
        #indf_dict.update({str(clus_num.group(1)): df})
        for i in range(10):
            indf_dict.update({str(i): df})
        #num_clusters += 1

df_clus_dict, itf_clus_dict, clus_dfitfs, clus_texts, clus_sizes = calculate_dfitf(dict(indf_dict))


calculate_topic_metrics(dict(df_clus_dict), dict(itf_clus_dict), dict(clus_dfitfs), dict(clus_texts), dict(clus_sizes))


Topic  0 :
[('increase', -inf), ('power', -inf), ('alert', -inf), ('conservative', -inf), ('oos', -inf), ('tower', -inf), ('transformer', -inf), ('cool', -inf), ('one', -inf), ('supply', -inf), ('hold', -inf), ('to', -inf), ('due', -inf), ('outage', -inf), ('maintenance', -inf), ('in', -inf), ('condenser', -inf), ('progress', -inf), ('planned', -inf), ('reduced', -inf)]

Topic  1 :
[('increase', -inf), ('power', -inf), ('alert', -inf), ('conservative', -inf), ('oos', -inf), ('tower', -inf), ('transformer', -inf), ('cool', -inf), ('one', -inf), ('supply', -inf), ('hold', -inf), ('to', -inf), ('due', -inf), ('outage', -inf), ('maintenance', -inf), ('in', -inf), ('condenser', -inf), ('progress', -inf), ('planned', -inf), ('reduced', -inf)]

Topic  2 :
[('increase', -inf), ('power', -inf), ('alert', -inf), ('conservative', -inf), ('oos', -inf), ('tower', -inf), ('transformer', -inf), ('cool', -inf), ('one', -inf), ('supply', -inf), ('hold', -inf), ('to', -inf), ('due', -inf), ('outage', -

  wi_itf = float(np.log(((num_clusters)/(clus_doc_freqs[wi]))))


Topic:  0
TC_ck =  -0.22516620002950802
Topic:  1
TC_ck =  -0.22516620002950802
Topic:  2
TC_ck =  -0.22516620002950802
Topic:  3
TC_ck =  -0.22516620002950802
Topic:  4
TC_ck =  -0.22516620002950802
Topic:  5
TC_ck =  -0.22516620002950802
Topic:  6
TC_ck =  -0.22516620002950802
Topic:  7
TC_ck =  -0.22516620002950802
Topic:  8
TC_ck =  -0.22516620002950802
Topic:  9
TC_ck =  -0.22516620002950802

Topic Coherence:  -0.22516620002950802
Topic Diversity:  0.1


In [26]:
df

Unnamed: 0,Reason_text
0,increase power
1,conservative operations alert
2,one cool tower transformer oos
3,power supply alert
4,hold power due to s g condition
...,...
1581,waterbox cleaning
1582,waterbox cleaning rod pattern adjustment
1583,will be come offline 1500 edt to repair a stat...
1584,will conduct a turbine valve freedom test today
