# HDBSCAN should work well on embedding representations

In [1]:
import os
import re
import json
import hdbscan

import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from collections import Counter
import matplotlib.pyplot as plt

import lib.helper as helper
import lib.embedding_models as reps

from importlib import reload

%matplotlib inline

## 1.  Retrieve Corpus

The corpus is scraped by the "run_news_scrapes.py" script (together with Windows Task Scheduler) every 12 hours, a little after midday and a little after midnight.

The "bing" corpus consists of news titles and text extracts retrieved from the Bing News Search API, using a few Home Office-related keywords.

The "RSS" corpus is pulled directly from a number of RSS feeds for world news sites and local British news sites, with no filters applied for news story type or subject.

In [2]:
# Folder holding the scrape results (each scrape is a separate json file).
# Should be the same path on all my PCs.
storage_path = "D:/Dropbox/news_crow/scrape_results"

# Corpus selector: "bing" is the targeted news search corpus,
# "RSS" comes from specific world and local news feeds.
corpus_type = "RSS"

# Load the cleaned corpus, renumber the rows from 0 (the previous index is
# kept as a regular column) and call the fresh index "node".
corpus = (
    helper.load_clean_corpus(storage_path, corpus_type)
    .reset_index()
    .rename_axis("node")
)

# See how it turned out
print(corpus.shape)
corpus.head()

Total files: 484
9.9 percent of files read.
19.8 percent of files read.
29.8 percent of files read.
39.7 percent of files read.
49.6 percent of files read.
59.5 percent of files read.
69.4 percent of files read.
79.3 percent of files read.
89.3 percent of files read.
99.2 percent of files read.
(119964, 9)


Unnamed: 0_level_0,index,title,summary,date,link,source_url,retrieval_timestamp,origin,clean_text
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,Hurricane Dorian lashes US as Bahamas counts cost,"Life-threatening US storm surges are feared, a...","Thu, 05 Sep 2019 16:03:44 GMT",https://www.bbc.co.uk/news/world-us-canada-495...,http://feeds.bbci.co.uk/news/world/rss.xml,2019-09-05 21:35:06.925873,rss_feed,Hurricane Dorian lashes US as Bahamas counts c...
1,1,Kohistan video murders: Three guilty in 'honou...,They are relatives of a group of Pakistani wom...,"Thu, 05 Sep 2019 13:53:17 GMT",https://www.bbc.co.uk/news/world-asia-49592540,http://feeds.bbci.co.uk/news/world/rss.xml,2019-09-05 21:35:06.925873,rss_feed,Kohistan video murders: Three guilty in 'honou...
2,2,MH17 Ukraine plane crash: 'Key witness' released,A Ukrainian court releases a potentially key w...,"Thu, 05 Sep 2019 13:46:06 GMT",https://www.bbc.co.uk/news/world-europe-49591148,http://feeds.bbci.co.uk/news/world/rss.xml,2019-09-05 21:35:06.925873,rss_feed,MH17 Ukraine plane crash: 'Key witness' releas...
3,3,Article 370: The weddings 'ruined' by Kashmir'...,Indian-administered Kashmir is under a securit...,"Thu, 05 Sep 2019 07:32:34 GMT",https://www.bbc.co.uk/news/world-asia-india-49...,http://feeds.bbci.co.uk/news/world/rss.xml,2019-09-05 21:35:06.925873,rss_feed,Article 70: The weddings 'ruined' by Kashmir's...
4,4,Syria war: Turkey warns Europe of new migrant ...,President Erdogan demands international help t...,"Thu, 05 Sep 2019 16:11:48 GMT",https://www.bbc.co.uk/news/world-europe-49599297,http://feeds.bbci.co.uk/news/world/rss.xml,2019-09-05 21:35:06.925873,rss_feed,Syria war: Turkey warns Europe of new migrant ...


## 2.  Build Text Model (Representation, eg; word2vec, entities list...)

- Tried both the world corpus and the bing corpus; neither worked with InferSent.  I suspect the problem lies in the PCA step, which may not work well on such high-dimensional vectors (vector length = 4096).
- Summed keywords works rather better with the world corpus.
- Summed keywords still fail the bing/home office corpus, giving me a cluster about "immigration" and a cluster for the American Supreme Court.

In [3]:
# Windows didn't play nicely with the vector datasets, Some obscure encoding problem (python in Conda
# kept trying to decode using cp1252 regardless of whatever other options I specified!)
# Solution; rewrite file and drop any characters the Windows encoder refuses to recognise.
# I shouldn't lose too much info.
#with open('./lib/InferSent/dataset/fastText/crawl-300d-2M.vec', "r", encoding="cp1252", errors="ignore") as infile:
#    with open('./lib/InferSent/dataset/fastText/crawl-300d-2M_win.vec', "wb") as outfile:
#        for line in infile:
#            outfile.write(line.encode('cp1252'))

In [4]:
#infersent = reps.InferSentModel(list(corpus['clean_text']),
#                                list(corpus['clean_text']),
#                                W2V_PATH = './lib/InferSent/dataset/fastText/crawl-300d-2M_win.vec')

#embeddings = infersent.get_embeddings()

In [5]:
#reload(reps)

In [6]:
# Whereas this worked first time!
#glove = reps.NounGloveWordModel(list(corpus['clean_text']), list(corpus['clean_text']))

#embeddings = glove.get_embeddings()

In [7]:
# Turn that into a DF for me
#embeddings_df = pd.DataFrame({"clean_text": list(embeddings.keys()),
#                              "embeddings": list(embeddings.values())})
#embeddings_df.shape

In [8]:
#embeddings_df.head()

## 2a.  Try a really simple averaged word vector model!

With a complex noun extraction function 'cause that part's slow so I multi-threaded it.

In [9]:
from gensim.models import Word2Vec
import spacy
# spaCy pipeline used below for part-of-speech tags and lemmas
nlp = spacy.load('en_core_web_sm')
from gensim.models.phrases import Phrases, Phraser

# Set vector size (passed to Word2Vec as `size` when the model is trained)
vec_size = 100

In [10]:
def get_phrased_nouns(sentences, min_count=5, threshold=0.5):
    """Extract proper-noun lemmas per document and conjoin frequent bigrams.

    Each text is parsed with the module-level spaCy pipeline `nlp`; only the
    lemmas of tokens tagged `PROPN` are kept. A gensim `Phrases` model is then
    fitted on those lemma lists and applied back to every list.

    Args:
        sentences: iterable of document strings.
        min_count: `min_count` forwarded to `Phrases` (default 5, the value
            that was previously hard-coded).
        threshold: `threshold` forwarded to `Phrases` (default 0.5, the value
            that was previously hard-coded).

    Returns:
        A list with one entry per input document, in input order, holding the
        result of applying the fitted phrase model to that document's lemmas.
    """
    # nlp.pipe processes the texts as a batched stream, which avoids the
    # per-call overhead of invoking nlp() once per document.
    noun_lists = [
        [token.lemma_ for token in parsed if token.pos_ == 'PROPN']
        for parsed in nlp.pipe(sentences)
    ]

    # Fit the phrase model on the full set of lemma lists
    phrases = Phrases(noun_lists, min_count=min_count, threshold=threshold)

    # Re-express every document with detected phrases conjoined
    return [phrases[nouns] for nouns in noun_lists]

# Get phrase-conjoined, lemmatized tokens (one token list per document)
test = get_phrased_nouns(corpus['clean_text'])

# Train a Word2Vec model on those token lists; the bigram detection has
# already been done inside get_phrased_nouns.
# NOTE(review): no seed is passed and workers=10, so the learned vectors
# presumably differ between runs - confirm if reproducibility matters.
model = Word2Vec(test, size=vec_size, window=5, min_count=1, workers=10)

In [11]:
def get_averaged_vec(token_list, model):
    """Average the word vectors of `token_list` into one document vector.

    Args:
        token_list: iterable of tokens to look up in `model.wv`.
        model: object exposing `wv`, which is indexable by token and carries
            a `vector_size` attribute (e.g. a trained gensim Word2Vec model).

    Returns:
        A 1-D numpy array of length `model.wv.vector_size` holding the
        element-wise mean of the token vectors. Tokens missing from the
        vocabulary contribute a zero vector to the mean; an empty
        `token_list` yields an all-zero vector.
    """
    # Take the width from the model itself rather than a notebook-level
    # global, so the fallback vectors always match the model's vectors.
    dim = model.wv.vector_size

    vecs = []
    for token in token_list:
        try:
            vector = model.wv[token]
        except KeyError:
            # Only an out-of-vocabulary lookup is expected here; any other
            # error is a real problem and should surface.
            vector = np.zeros(dim)
        vecs.append(vector)

    if not vecs:
        return np.zeros(dim)
    return np.mean(np.asarray(vecs), axis=0)

In [12]:
# One averaged vector per document, in the same order as `test`
vectors = [get_averaged_vec(tokens, model) for tokens in test]

In [13]:
# Peek at the first two document vectors
vectors[0:2]

[array([ 1.53781638e-01,  1.45246491e-01,  4.81294952e-02,  9.48079899e-02,
        -9.42944176e-03,  2.29330864e-02, -2.49224395e-04, -1.95647225e-01,
        -5.35061397e-02,  1.34792432e-01, -8.04454982e-02,  1.03012256e-01,
         1.39015868e-01, -1.48054585e-01,  1.28703788e-02,  1.57565400e-01,
        -2.62922615e-01, -5.50911129e-02,  2.24276543e-01,  1.87517181e-01,
        -7.09659234e-02, -4.66934629e-02, -4.49050404e-02, -6.00992851e-02,
        -1.50685273e-02, -8.90230313e-02, -4.17743534e-01,  2.08824635e-01,
        -1.86446775e-02,  9.76672173e-02, -3.62350978e-02,  1.63821891e-01,
        -1.82177246e-04, -2.11004123e-01,  6.59414977e-02, -2.25947931e-01,
        -2.74425358e-01, -2.22146377e-01, -1.83696255e-01,  7.64691159e-02,
         5.38772047e-02, -2.95495182e-01, -1.76657736e-01,  3.36392149e-02,
         3.71960193e-01,  1.79060921e-02,  1.33628771e-01, -1.23042464e-01,
        -1.23182228e-02, -4.78181355e-02,  1.20381452e-01,  2.35077932e-01,
        -1.6

## 3. Cluster Text

This is the part where the pipelines get a little more experimental
- First; PCA (HDBSCAN prefers < 50 dimensions if possible) (also, try just fitting 50 vectors!)
- Second; HDBSCAN clustering

In [14]:
# Stack the per-document vectors row-wise into a single array
embeddings_array = np.vstack(vectors)

# First, PCA the data
pca = PCA(n_components=20, svd_solver='full')

# Fit and check
# NOTE(review): the clustering cell fits on embeddings_array, so embeddings_pca
# is only used for the diagnostics printed here - confirm that is intended.
embeddings_pca = pca.fit_transform(embeddings_array)

# Diagnostic stats - both should show exponential decay
print(pca.explained_variance_ratio_)
print(pca.singular_values_) 

[6.92003942e-01 1.62133154e-01 8.27845463e-02 2.28646405e-02
 1.32674371e-02 9.42607334e-03 6.38616359e-03 5.83969587e-03
 2.23419251e-03 1.10530670e-03 5.53566769e-04 4.31514541e-04
 2.64000561e-04 1.41732850e-04 9.77158000e-05 6.88924149e-05
 4.67668211e-05 4.56959620e-05 3.67081870e-05 2.66543060e-05]
[686.90759955 332.49104497 237.58481172 124.86083235  95.11249079
  80.16954973  65.98786192  63.10142281  39.03051961  27.45272369
  19.4280424   17.15306584  13.41672327   9.83058353   8.16256087
   6.85377339   5.64693726   5.58191154   5.00294242   4.26312166]


In [21]:
# Clusterer fitting
# NOTE(review): this fits on the full-width embeddings_array rather than the
# PCA-reduced embeddings_pca, although the section text lists PCA as the first
# step - confirm this "direct" fit is intentional.
clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=10)
clusterer.fit(embeddings_array)

# Examine results: number of records per label (-1 is the outlier label)
Counter(clusterer.labels_)

Counter({-1: 92324,
         1021: 6311,
         104: 12,
         765: 32,
         443: 34,
         285: 19,
         326: 20,
         601: 22,
         439: 6,
         440: 19,
         384: 6,
         166: 11,
         204: 34,
         25: 108,
         1020: 1071,
         951: 9,
         321: 13,
         288: 14,
         21: 24,
         652: 36,
         387: 28,
         30: 42,
         100: 13,
         191: 61,
         46: 14,
         13: 72,
         296: 162,
         306: 12,
         10: 30,
         263: 11,
         420: 29,
         8: 127,
         474: 37,
         344: 33,
         165: 10,
         346: 56,
         219: 5,
         561: 44,
         494: 12,
         356: 31,
         282: 18,
         144: 17,
         336: 16,
         132: 5,
         228: 19,
         28: 11,
         108: 20,
         106: 20,
         341: 11,
         113: 13,
         59: 15,
         244: 12,
         5: 11,
         669: 73,
         99: 30,
         169: 130

In [22]:
# Diagnostic - percentage of records whose label is -1, i.e. assigned "outlier"
100.0 * (clusterer.labels_ == -1).sum() / len(clusterer.labels_)

76.95975459304458

In [23]:
# Number of distinct labels assigned (the -1 outlier label is counted too)
pd.unique(clusterer.labels_).size

1024

In [24]:
# Record cluster each was assigned to
corpus['cluster'] = clusterer.labels_

# Record the reverse of outlier score (therefore, higher number = higher certainty of membership)
# NOTE(review): the saved output of this cell shows a warning raised while
# hdbscan computed the outlier scores; check `score` for NaN/inf values before
# relying on it - TODO confirm.
corpus['score'] = 1.0 - clusterer.outlier_scores_

# Written without the DataFrame index.
# NOTE(review): assumes the "working" directory already exists - confirm.
corpus.to_csv("working/RSS_clustered_w2v_direct.csv", index=False)

  self._outlier_scores = outlier_scores(self._condensed_tree)
