# Gauge how well the clusters separate the data

#### Suggested approach:

Coherence is not meant to work with clusters, only topics. It can only work with either 
- an existing LDA model, in which case it does something clever
- a corpus, a dictionary and a list of keyword lists (one list of key words per topic)

Making models from different sources comparable could be quite tricky.

The other tool I could use is silhouette scores, but they only really work for flat clustering models.

TODO
- Implement coherence against tokens
- Implement Silhouette scoring against TF-IDF representations
- Implement relevance scoring for retrieval of top tokens

In [1]:
import re
import json
import gensim

import numpy as np
import pandas as pd
import seaborn as sns

from gensim.models.coherencemodel import CoherenceModel
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

from nltk.stem.porter import *

# Define which stemmer to use in the pipeline later.
# Created once at module level so preprocess_desc reuses the same instance for every token.
stemmer = PorterStemmer()

# Useful flatten function from Alex Martelli on https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-list-of-lists
def flatten(l):
    """Return one flat list holding the items of every sub-iterable of ``l``.

    Exactly one level of nesting is removed; the items themselves are left
    untouched, and their order is preserved.
    """
    # A named def (rather than a lambda bound to a name) gives the helper a
    # proper __name__ in tracebacks and somewhere to hang this docstring.
    return [item for sublist in l for item in sublist]

In [2]:
def preprocess_desc(description):
    """ Helper, tokeniser: turn one description into a list of stemmed tokens.

    The input is coerced with str(), tokenised with gensim's simple_preprocess,
    filtered against gensim's STOPWORDS and finally stemmed with the
    module-level ``stemmer``.
    """
    raw_tokens = simple_preprocess(str(description))
    # Stopwords are dropped before stemming, so the check is made on the unstemmed token
    kept_tokens = (tok for tok in raw_tokens if tok not in STOPWORDS)
    return [stemmer.stem(tok) for tok in kept_tokens]


def get_stats(df):
    """ Helper, prints basic information on the extent of a corpus.

    Printed, in order: the mean document size (whitespace-separated word
    count of ``clean_text``), the number of rows, the largest ``date`` value
    and the smallest ``date`` value.

    Side effect: a ``doc_size`` column is added to (or overwritten in) ``df``.

    Always returns 0.
    """
    def _word_count(text):
        # Document size = number of whitespace-separated chunks
        return len(text.split())

    df['doc_size'] = df['clean_text'].apply(_word_count)

    dates = df['date']
    print(np.mean(df['doc_size']))
    print(len(df))
    print(max(dates))
    print(min(dates))

    return 0


def get_keyword_stats(df, search_term_path="D:/Dropbox/news_crow/scrape_settings.json"):
    """Retrieve the set of search terms used for Bing, count the stories that contain each.

    The terms are read from the ``disaster_search_list`` entry of the JSON
    file at ``search_term_path``. The entries are joined with spaces, stripped
    of every character that is not an ASCII letter, digit or space,
    lower-cased and split on whitespace, so a multi-word search phrase is
    counted one word at a time.

    Matching is a plain substring test against the lower-cased ``clean_text``
    value of each row, so a term also matches inside longer words.

    Args:
        df: DataFrame with a ``clean_text`` column of strings.
        search_term_path: path of the JSON scrape-settings file.

    Returns:
        dict mapping each distinct term to the number of rows containing it.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which is not UTF-8 on every system.
    with open(search_term_path, "r", encoding="utf-8") as f:
        scrape_config = json.load(f)

    joined_terms = " ".join(scrape_config['disaster_search_list'])
    search_terms = set(re.sub(r"[^0-9A-Za-z ]", "", joined_terms).lower().split())

    return {
        term: sum(df['clean_text'].apply(lambda x: term in x.lower()))
        for term in search_terms
    }

In [3]:
def get_corpus_model_coherence(df, cluster_column="cluster", tokens_column="tokens"):
    """ Helper, encapsulates entire coherence model-building process for (flat) models.

    Each distinct value of ``cluster_column`` is treated as one "topic" whose
    word list is the concatenation of the token lists of every document
    carrying that value.

    Args:
        df: DataFrame with one row per document.
        cluster_column: name of the column holding each document's cluster label.
        tokens_column: name of the column holding each document's list of tokens.

    Returns:
        dict mapping the coherence measure name ('c_v', 'c_npmi') to the value
        returned by ``CoherenceModel.get_coherence()`` for that measure.
    """
    # Read the token lists once; the same column must feed the dictionary,
    # the topics and the reference texts, otherwise they can disagree.
    texts = list(df[tokens_column])

    # Create the vocabulary record
    bow_dictionary = gensim.corpora.Dictionary(texts)

    # Flattened list of all tokens for all documents for each "topic".
    # Uses the column names passed in rather than hard-coded 'cluster'/'tokens'.
    topics = {}
    for topic in pd.unique(df[cluster_column]):
        subset = df[df[cluster_column] == topic]
        topics[topic] = flatten(list(subset[tokens_column]))
    topic_tokens = list(topics.values())

    # Calculate ALL THE COHERENCE
    # - c_v is most performant indirect confirmation measure
    # - c_npmi is most performant direct confirmation measure (that I don't have to implement myself)
    # u_mass (which would need a doc2bow corpus) is left out as a redundant measure.
    coherences = {}
    for measure in ('c_v', 'c_npmi'):
        cm = CoherenceModel(topics=topic_tokens,
                            texts=texts,
                            dictionary=bow_dictionary,
                            coherence=measure)
        coherences[measure] = cm.get_coherence()

    return coherences

In [7]:
def load_evaluate_corpus(data_path):
    """ Helper - read a corpus csv from ``data_path`` and return its coherence scores.

    The ``tokens`` column is built (or overwritten) by running
    ``preprocess_desc`` over ``clean_text``; the frame is then scored with
    ``get_corpus_model_coherence`` using its default column names.
    """
    corpus = pd.read_csv(data_path)

    # Tokens are regenerated from the cleaned text here.
    # The alternative below restores "tokens" from its stringified form instead.
    # REF CHECK THIS FOR COMPATIBILITY WITH OTHER CLEANING STEPS
    #corpus['tokens'] = corpus['tokens'].apply(lambda x: re.sub(r"[^a-zA-Z0-9,]", "", x).split(","))
    corpus["tokens"] = corpus["clean_text"].apply(preprocess_desc)

    scores = get_corpus_model_coherence(corpus)
    return scores

## Testing requirements of manual coherence scoring

### Create corpus, dictionary and lists of all tokens for each topic

In [8]:
# Clustered corpora to score, one csv per clustering method.
# The scoring loop reads each of them from the "working/" directory.
corpus_files = [
    "disaster_clustered_lda.csv",
    "disaster_clustered_w2v.csv",
    "disaster_clustered_cliques.csv",
    "disaster_clustered_louvain.csv",
    "disaster_clustered_bigclam.csv",
]

In [9]:
# Coherence scores per corpus, keyed by the file name without its ".csv" extension
coherences = {}

for file in corpus_files:
    print("Processing: {}".format(file))
    # str.strip(".csv") removes any of the characters ".", "c", "s", "v" from
    # both ends rather than the suffix, which mangles names such as
    # "..._w2v.csv" -> "..._w2" and "..._cliques.csv" -> "..._clique".
    # Cut off exactly the trailing extension instead.
    corpus_name = file[:-len(".csv")] if file.endswith(".csv") else file
    coherences[corpus_name] = load_evaluate_corpus("working/" + file)
    print(coherences[corpus_name])

KeyboardInterrupt: 

In [None]:
# Display the scores collected so far (the loop above was stopped by a KeyboardInterrupt, so this may be incomplete)
coherences