# Gauge how well the clusters separate the data

#### Suggested approach:

Coherence is meant to work with topics, not clusters. It can only work with either:
- an existing LDA model, in which case it does something clever
- a corpus, a dictionary and a list of [sets of?] key words representing topics

Making models from different sources comparable could be quite tricky.

The other tool I could use is silhouette scores, but they will only really work for flat clustering models.

TODO
- Implement coherence against tokens
- Implement Silhouette scoring against TF-IDF representations
- Implement relevance scoring for retrieval of top tokens

In [28]:
import re
from collections import Counter

import gensim
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models.coherencemodel import CoherenceModel

# Useful flatten function from Alex Martelli on https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-list-of-lists
def flatten(l):
    """Return one flat list holding the items of every sub-iterable of ``l``.

    Only a single level of nesting is removed; the items themselves are
    appended unchanged and in their original order.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [21]:
def _parse_token_string(raw):
    """Rebuild a list of tokens from the string form stored in the CSV.

    Every character that is not an ASCII letter, digit or comma is removed,
    then the remainder is split on commas. Empty strings are dropped so that
    an empty stored list (e.g. "[]") yields [] rather than [''].

    Parameters
    ----------
    raw : str
        The stringified token list read from the "tokens" column.

    Returns
    -------
    list of str
        The non-empty tokens, in their original order.
    """
    stripped = re.sub(r"[^a-zA-Z0-9,]", "", raw)
    return [token for token in stripped.split(",") if token]


# Only the first 200 rows are loaded
df = pd.read_csv("working/disaster_clustered_lda.csv", nrows=200)

# Restore "tokens" from horrific string format
# REF CHECK THIS FOR COMPATIBILITY WITH OTHER CLEANING STEPS
df['tokens'] = df['tokens'].apply(_parse_token_string)

df.head()

Unnamed: 0,node,index,title,summary,date,link,source_url,retrieval_timestamp,origin,clean_text,tokens,corpus_tfidf,cluster,score
0,0,0,West Midlands <b>flood</b> warnings prompt &#3...,Residents have been warned to &quot;remain vig...,2019-11-17T17:35:00.0000000Z,https://www.bbc.co.uk/news/uk-england-50451817,www.bbc.co.uk,2019-11-17 19:50:58.278878,bing_news_api,West Midlands flood warnings prompt ;remain vi...,"[west, midland, flood, warn, prompt, remain, v...","[(0, 0.10888666697011559), (1, 0.1157487091387...",22,0.320047
1,1,1,New <b>flood</b> warnings issued with more hom...,The Environment Agency has a number of <b>floo...,2019-11-17T18:35:00.0000000Z,https://www.hulldailymail.co.uk/news/hull-east...,www.hulldailymail.co.uk,2019-11-17 19:50:58.278928,bing_news_api,New flood warnings issued with more homes at r...,"[new, flood, warn, issu, home, risk, environ, ...","[(1, 0.237740646083191), (2, 0.118534432452920...",72,0.328762
2,2,2,UK weather forecast – More than 100 <b>flood</...,<b>FLOOD</b>-ravaged villages in the UK have b...,2019-11-17T13:45:00.0000000Z,https://www.thesun.co.uk/news/10342583/uk-weat...,www.thesun.co.uk,2019-11-17 19:50:58.278953,bing_news_api,UK weather forecast – More than 100 flood aler...,"[uk, weather, forecast, flood, alert, britain,...","[(2, 0.22914879635567142), (8, 0.1474814226459...",74,0.344483
3,3,5,UK <b>flood</b> warning map: <b>Flood</b> chao...,The Environment Agency has issued 57 <b>flood<...,2019-11-17T16:38:00.0000000Z,https://www.express.co.uk/news/weather/1205629...,www.express.co.uk,2019-11-17 19:50:58.279028,bing_news_api,UK flood warning map: Flood chaos to continue ...,"[uk, flood, warn, map, flood, chao, continu, t...","[(1, 0.13183366830610094), (2, 0.1314611473326...",72,0.3709
4,4,6,UK weather forecast: <b>Flood</b> chaos contin...,Despite some areas enduring their &#39;wettest...,2019-11-17T18:32:00.0000000Z,https://www.mirror.co.uk/news/uk-news/uk-weath...,www.mirror.co.uk,2019-11-17 19:50:58.279047,bing_news_api,UK weather forecast: Flood chaos continues wit...,"[uk, weather, forecast, flood, chao, continu, ...","[(8, 0.1193340118121436), (9, 0.25139370146632...",22,0.235525


## Testing requirements of manual coherence scoring

In [11]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.test.utils import common_corpus, common_dictionary

# Minimal check of what CoherenceModel needs when no trained model is supplied:
# hand-written key-word lists as topics, plus a corpus and its dictionary.
topics = [
    ['human', 'computer', 'system', 'interface'],
    ['graph', 'minors', 'trees', 'eps'],
]

cm = CoherenceModel(
    topics=topics,
    corpus=common_corpus,
    dictionary=common_dictionary,
    coherence='u_mass',
)
coherence = cm.get_coherence()
coherence

-9.243677429867034

In [7]:
# Inspect the toy corpus that was passed to CoherenceModel above
common_corpus

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]

In [8]:
# Inspect the toy dictionary that was passed to CoherenceModel above
common_dictionary

<gensim.corpora.dictionary.Dictionary at 0x1947566ba88>

### Create corpus, dictionary and lists of all tokens for each topic

In [34]:
# Tokenised documents, materialised once and reused for both steps below
tokenised_docs = df['tokens'].tolist()

# Create the vocabulary record
bow_dictionary = gensim.corpora.Dictionary(tokenised_docs)

# Remove extreme values (words that are too rare, too common)
# NOTE: currently disabled, so the vocabulary keeps every token seen in the documents
#bow_dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)

# Create a BOW model: one bag-of-words vector per document, in row order
bow_corpus = [bow_dictionary.doc2bow(doc) for doc in tokenised_docs]

In [35]:
# Map every cluster label to one flat list holding the tokens of all documents
# assigned to that cluster (labels appear in order of first occurrence)
topics = {
    label: flatten(list(df.loc[df['cluster'] == label, 'tokens']))
    for label in pd.unique(df['cluster'])
}

In [36]:
# Display the flattened token list of every cluster (long output: every token is printed)
list(topics.values())

[['west',
  'midland',
  'flood',
  'warn',
  'prompt',
  'remain',
  'vigil',
  'alert',
  'resid',
  'warn',
  'quot',
  'remain',
  'vigilantquot',
  'flood',
  'warn',
  'place',
  'west',
  'midland',
  'rain',
  'forecast',
  'warn',
  'affect',
  'worcestershir',
  'river',
  'severn',
  'avon',
  'teme',
  'shropshir',
  'flood',
  'defenc',
  'ironbridg',
  'saturday',
  'even',
  'environ',
  'agenc',
  'ea',
  'said',
  'river',
  'uk',
  'weather',
  'forecast',
  'flood',
  'chao',
  'continu',
  'freez',
  'follow',
  'wettest',
  'autumn',
  'despit',
  'area',
  'endur',
  'wettest',
  'autumn',
  'need',
  'relief',
  'heavi',
  'rainfal',
  'forecast',
  'flood',
  'hit',
  'area',
  'come',
  'day',
  'midland',
  'flood',
  'alert',
  'monday',
  'coldest',
  'night',
  'autumn',
  'hit',
  'dozen',
  'flood',
  'alert',
  'warn',
  'remain',
  'place',
  'region',
  'saturday',
  'morn',
  'novemb',
  'minu',
  'week',
  'part',
  'midlad',
  'risk',
  'flood',
  '

In [37]:
# CoherenceModel expects each topic to be a short list of distinct key words and
# scores at most `topn` of them. The flattened per-cluster lists hold every token
# with repeats, so passing them directly would score an arbitrary, duplicated
# prefix of each cluster. Instead, rank each cluster's tokens by frequency and
# keep the TOP_N most common distinct ones.
# NOTE(review): raw frequency is a stand-in until the relevance scoring listed in
# the TODO at the top of this notebook is implemented.
TOP_N = 20  # matches CoherenceModel's default `topn`

top_tokens = {
    cluster: [token for token, _ in Counter(tokens).most_common(TOP_N)]
    for cluster, tokens in topics.items()
}

cm2 = CoherenceModel(
    topics=list(top_tokens.values()),
    corpus=bow_corpus,
    dictionary=bow_dictionary,
    coherence='u_mass',
    topn=TOP_N,
)
coherence = cm2.get_coherence()
coherence

In [39]:
# Exploration aid: list the attributes and methods available on the dictionary object
dir(bow_dictionary)

['__abstractmethods__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_adapt_by_suffix',
 '_load_specials',
 '_save_specials',
 '_smart_save',
 'add_documents',
 'cfs',
 'compactify',
 'dfs',
 'doc2bow',
 'doc2idx',
 'filter_extremes',
 'filter_n_most_frequent',
 'filter_tokens',
 'from_corpus',
 'from_documents',
 'get',
 'id2token',
 'items',
 'iteritems',
 'iterkeys',
 'itervalues',
 'keys',
 'load',
 'load_from_text',
 'merge_with',
 'num_docs',
 'num_nnz',
 'num_pos',
 'patch_with_special_tokens',
 'save',
 'save_as_text',
 'token2id',
 'values']