# Gauge how well the clusters separate the data

#### Suggested approach:

Coherence is designed to score topics, not clusters. Gensim's `CoherenceModel` accepts either:
- an existing LDA model, in which case it extracts the topics from the model itself
- a corpus (or tokenised texts), a dictionary and a list of keyword lists, one list per topic

Making models from different sources comparable could be quite tricky.

The other tool I could use is silhouette scoring, but it only really works for flat clustering models.

TODO
- Implement coherence against tokens
- Implement Silhouette scoring against TF-IDF representations
- Implement relevance scoring for retrieval of top tokens

In [9]:
import re
import json
import gensim

import numpy as np
import pandas as pd
import seaborn as sns

from gensim.models.coherencemodel import CoherenceModel

# Useful flatten function from Alex Martelli on https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-list-of-lists
def flatten(l):
    """Return one flat list holding every item of every sub-iterable in ``l``.

    Only a single level of nesting is removed; the order of items is kept.
    """
    return [item for sublist in l for item in sublist]

In [10]:
def get_stats(df):
    """Print basic figures describing the extent of the corpus held in ``df``.

    Side effect: a ``doc_size`` column is written onto ``df`` in place, holding
    the number of whitespace-separated words in each row's ``clean_text``.

    Printed, one value per line: the mean ``doc_size``, the number of rows,
    the largest ``date`` value and the smallest ``date`` value.

    Always returns 0.
    """
    word_counts = df['clean_text'].apply(lambda text: len(text.split()))
    df['doc_size'] = word_counts

    print(np.mean(df['doc_size']))
    print(len(df.index))

    dates = df['date']
    print(max(dates))
    print(min(dates))

    return 0

In [11]:
def get_keyword_stats(df, search_term_path="D:/Dropbox/news_crow/scrape_settings.json"):
    """Retrieve the search terms used for Bing and count the stories containing each.

    The JSON file at ``search_term_path`` must hold a ``disaster_search_list``
    entry: a list of search phrases. The phrases are joined, stripped of
    everything except ASCII letters, digits and spaces, lower-cased and split
    into a set of individual words.

    Args:
        df: DataFrame with a ``clean_text`` column of strings.
        search_term_path: Path to the scrape-settings JSON file.

    Returns:
        dict mapping each search word to the number of rows whose lower-cased
        ``clean_text`` contains it as a substring (so "flood" also matches
        "flooding").
    """
    # JSON is UTF-8 by specification; don't rely on the platform default encoding.
    with open(search_term_path, "r", encoding="utf-8") as f:
        scrape_config = json.load(f)

    joined_terms = " ".join(scrape_config['disaster_search_list'])
    search_terms = set(re.sub(r"[^0-9A-Za-z ]", "", joined_terms).lower().split())

    # Lower-case every document once, rather than once per search term.
    lowered_texts = [text.lower() for text in df['clean_text']]

    return {
        term: sum(term in text for text in lowered_texts)
        for term in search_terms
    }

In [26]:
def get_corpus_model_coherence(df, cluster_column="cluster", tokens_column="tokens"):
    """Helper, encapsulates the entire coherence model-building process for (flat) models.

    Each distinct value in ``cluster_column`` is treated as one "topic" whose
    word list is the concatenation of the token lists of all its documents.

    Args:
        df: DataFrame with one row per document.
        cluster_column: Name of the column holding each document's cluster label.
        tokens_column: Name of the column holding each document's list of tokens.

    Returns:
        dict with keys ``'c_v'``, ``'c_npmi'`` and ``'u_mass'`` mapped to the
        value of ``CoherenceModel.get_coherence()`` for that measure.
    """
    texts = list(df[tokens_column])

    # Create the vocabulary record
    bow_dictionary = gensim.corpora.Dictionary(texts)

    # Create a BOW model
    bow_corpus = [bow_dictionary.doc2bow(doc) for doc in texts]

    # Flattened list of all tokens for all documents for each "topic",
    # in order of first appearance of the cluster label.
    # NOTE(review): CoherenceModel has a `topn` setting (default 20) that
    # presumably truncates each topic to its leading tokens, so only the first
    # few tokens of each cluster may actually be scored - confirm.
    topics = []
    for cluster_id in pd.unique(df[cluster_column]):
        member_docs = df.loc[df[cluster_column] == cluster_id, tokens_column]
        topics.append([token for doc in member_docs for token in doc])

    # Calculate ALL THE COHERENCE
    coherences = {}

    # c_v and c_npmi are computed from the tokenised texts...
    for measure in ('c_v', 'c_npmi'):
        cm = CoherenceModel(topics=topics, texts=texts, dictionary=bow_dictionary, coherence=measure)
        coherences[measure] = cm.get_coherence()

    # ...whereas u_mass is computed from the bag-of-words corpus.
    cm = CoherenceModel(topics=topics, corpus=bow_corpus, dictionary=bow_dictionary, coherence='u_mass')
    coherences['u_mass'] = cm.get_coherence()

    return coherences

## Testing requirements of manual coherence scoring

### Create corpus, dictionary and lists of all tokens for each topic

In [20]:
df = pd.read_csv("working/disaster_clustered_lda.csv")

# The CSV stores each token list as a single string. Rebuild the list by
# dropping every character except letters, digits and commas, then splitting
# on the commas.
# REF CHECK THIS FOR COMPATIBILITY WITH OTHER CLEANING STEPS
df['tokens'] = df['tokens'].map(
    lambda raw: re.sub(r"[^a-zA-Z0-9,]", "", raw).split(",")
)

df.head()

Unnamed: 0,node,index,title,summary,date,link,source_url,retrieval_timestamp,origin,clean_text,tokens,corpus_tfidf,cluster,score
0,0,0,West Midlands <b>flood</b> warnings prompt &#3...,Residents have been warned to &quot;remain vig...,2019-11-17T17:35:00.0000000Z,https://www.bbc.co.uk/news/uk-england-50451817,www.bbc.co.uk,2019-11-17 19:50:58.278878,bing_news_api,West Midlands flood warnings prompt ;remain vi...,"[west, midland, flood, warn, prompt, remain, v...","[(0, 0.10888666697011559), (1, 0.1157487091387...",22,0.320047
1,1,1,New <b>flood</b> warnings issued with more hom...,The Environment Agency has a number of <b>floo...,2019-11-17T18:35:00.0000000Z,https://www.hulldailymail.co.uk/news/hull-east...,www.hulldailymail.co.uk,2019-11-17 19:50:58.278928,bing_news_api,New flood warnings issued with more homes at r...,"[new, flood, warn, issu, home, risk, environ, ...","[(1, 0.237740646083191), (2, 0.118534432452920...",72,0.328762
2,2,2,UK weather forecast – More than 100 <b>flood</...,<b>FLOOD</b>-ravaged villages in the UK have b...,2019-11-17T13:45:00.0000000Z,https://www.thesun.co.uk/news/10342583/uk-weat...,www.thesun.co.uk,2019-11-17 19:50:58.278953,bing_news_api,UK weather forecast – More than 100 flood aler...,"[uk, weather, forecast, flood, alert, britain,...","[(2, 0.22914879635567142), (8, 0.1474814226459...",74,0.344483
3,3,5,UK <b>flood</b> warning map: <b>Flood</b> chao...,The Environment Agency has issued 57 <b>flood<...,2019-11-17T16:38:00.0000000Z,https://www.express.co.uk/news/weather/1205629...,www.express.co.uk,2019-11-17 19:50:58.279028,bing_news_api,UK flood warning map: Flood chaos to continue ...,"[uk, flood, warn, map, flood, chao, continu, t...","[(1, 0.13183366830610094), (2, 0.1314611473326...",72,0.3709
4,4,6,UK weather forecast: <b>Flood</b> chaos contin...,Despite some areas enduring their &#39;wettest...,2019-11-17T18:32:00.0000000Z,https://www.mirror.co.uk/news/uk-news/uk-weath...,www.mirror.co.uk,2019-11-17 19:50:58.279047,bing_news_api,UK weather forecast: Flood chaos continues wit...,"[uk, weather, forecast, flood, chao, continu, ...","[(8, 0.1193340118121436), (9, 0.25139370146632...",22,0.235525


In [27]:
# Score the clustering held in `df`; the column names are spelled out but
# are the function's defaults.
coherences = get_corpus_model_coherence(
    df,
    cluster_column="cluster",
    tokens_column="tokens",
)

In [28]:
# Bare expression: as the last line of a notebook cell it displays the scores dict.
coherences

{'c_v': 0.6540867957747499,
 'c_npmi': 0.1013493367023562,
 'u_mass': -3.23000860531844}