# Experimenting with Cleaning, Clustering & Summarization Pipelines

### To do (technical)
- Implement date windows on my corpus loader function

In [1]:
import os
import re
import json

import numpy as np
import pandas as pd
import networkx as nx

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

import lib.helper as helper
import lib.embedding_models as reps

from importlib import reload

%matplotlib inline

# Flatten one level of nesting; adapted from Alex Martelli's answer at
# https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-list-of-lists
def flatten(l):
    """Return one list holding every item of every sub-iterable of `l`, in order."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

## 1.  Retrieve Corpus

The corpus is being scraped by the "run_news_scrapes.py" script (run via Windows Task Scheduler) every 12 hours, a bit past midday and a bit past midnight.

The "bing" corpus consists of news titles and text extracts retrieved from the Bing News Search API, using a few Home Office-related keywords.

The "RSS" corpus is plugged directly into a number of RSS feeds for world news sites and local British news sites, with no filters for news story types or subjects applied.

In [2]:
# Every scrape lands in this folder as its own JSON file; the path should be
# the same on all of my PCs.
storage_path = "D:/Dropbox/news_crow/scrape_results"

# Which corpus to load: "bing" (targeted news search) or "RSS" (specific world
# and local news feeds).
corpus_type = "RSS"

# The helper goes and finds the JSON files created by the scrapers and returns
# the loaded, cleaned corpus.
corpus = helper.load_clean_world_corpus(storage_path, corpus_type)

# Re-index from 0 after cleaning and call the new index "node".  Without
# drop=True the previous index is kept as an ordinary column.
corpus = corpus.reset_index()
corpus.index.name = "node"

# Quick look at the result
print(corpus.shape)
corpus.head()

Total files: 495


KeyboardInterrupt: 

## 2. Use Detected Nouns to create a Graph Representation

In [3]:
# Build the noun-based text representation.  The cleaned text is supplied for
# both positional arguments, each as its own list.
clean_text_column = corpus['clean_text']
model = reps.NounAdjacencyModel2(clean_text_column.tolist(), clean_text_column.tolist())

# Take a copy of the model's table so later edits leave the model untouched
nouns_df = model.table.copy()
nouns_df.head()

Unnamed: 0,lindstaedt,apach,fearsom,caiffa,mcenani,arquett,kamala_harri,petrel,nangarhar,muhandi,...,mathew,bobi,jarrid,mirjan,carolina_bahama,river,wetland,cabinet,pluvign,schaffner
"Hurricane Dorian lashes US as Bahamas counts cost. Life-threatening US storm surges are feared, as rescue work continues in the devastated Bahamas.",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Kohistan video murders: Three guilty in 'honour killing' blood feud. They are relatives of a group of Pakistani women killed after being filmed singing at a wedding.,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MH17 Ukraine plane crash: 'Key witness' released. A Ukrainian court releases a potentially key witness to the downing of the Malaysian airliner MH17.,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Article 70: The weddings 'ruined' by Kashmir's lockdown. Indian-administered Kashmir is under a security crackdown after it was stripped of its special status.,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Syria war: Turkey warns Europe of new migrant wave. President Erdogan demands international help to create a refugee ""safe zone"" in northern Syria.",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Drop any noun/noun phrase containing one of the search terms, then create an adjacency matrix

#### Drop any noun/phrase occurring too infrequently

In [4]:
# Retrieve the set of search terms from the scrape settings, so we can remove
# them before clustering.
with open("D:/Dropbox/news_crow/scrape_settings.json", "r") as f:
    scrape_config = json.load(f)

# Join every term into one string, strip everything except letters, digits and
# spaces, lower-case it, and keep the distinct words.
joined_terms = " ".join(scrape_config['disaster_search_list'])
search_terms = set(re.sub(r"[^0-9A-Za-z ]", "", joined_terms).lower().split())

In [5]:
# Rank every noun column by its total count over all rows, most frequent first
noun_totals = nouns_df.sum(axis=0).sort_values(ascending=False)
nouns_to_keep = noun_totals.index.tolist()

# (Disabled) cut out any nouns containing the original search terms
#nouns_to_keep = [noun for noun in nouns_to_keep if sum([term in noun for term in search_terms]) == 0]

# Retain only the 1500 highest-ranked nouns
nouns_to_keep = nouns_to_keep[:1500]

# Restrict the nouns table to those columns
nouns_df = nouns_df[nouns_to_keep]

print(nouns_df.shape)

(31393, 1500)


In [6]:
# Entry (i, j) of `adjacency` is the dot product of rows i and j of the
# noun table, taken over the retained noun columns.
embeddings = nouns_df.to_numpy()
adjacency = embeddings @ embeddings.T
print(adjacency.max())

13


In [7]:
# With a "lower" limit of 1 the graph has so many edges that it eats ALL the
# memory of my desktop, even with just 500-ish stories to process.
upper = 100
lower = 2

# Every (row, col) position whose adjacency value lies in [lower, upper]
# becomes a weighted edge between node `row` and node `col`.
in_range = (adjacency >= lower) & (adjacency <= upper)
rows, cols = np.nonzero(in_range)
weights = adjacency[rows, cols].astype(float).tolist()
edges = zip(rows.tolist(), cols.tolist(), weights)

G = nx.Graph()
G.add_weighted_edges_from(edges)

# Simplify; remove self-edges - not sure if needed?
G.remove_edges_from(nx.selfloop_edges(G))

G.number_of_edges()

356404

In [8]:
# Persist the graph to disk in GML format.
# NOTE(review): the cells further down read "working/RSS_graph_2lim_permissive.gml",
# not this file - confirm which graph variant is intended.
nx.write_gml(G, "working/RSS_graph_2lim_tight.gml")

12596 to beat

## 3c.  Try CDLIB

In [3]:
import cdlib
from cdlib import algorithms
from cdlib import evaluation

In [4]:
def assign_cluster_from_model(corpus, model, threshold=5):
    """Label each row of `corpus` with the community its node belongs to.

    Parameters
    ----------
    corpus : pandas.DataFrame
        One row per node; the index values are the node ids.  The frame is
        modified in place: 'node' and 'cluster' columns are (over)written.
    model : object
        Must expose `communities`, an iterable of iterables of node ids.
    threshold : int, default 5
        Any cluster with fewer than this many rows is relabelled as -1.

    Returns
    -------
    pandas.DataFrame
        The same `corpus` object, with 'node' and 'cluster' columns set.
        Cluster -1 marks outliers: nodes in no community, or in a community
        smaller than `threshold`.
    """
    # Node id -> community index.  Keys are normalised to str so that both
    # integer ids and string ids match the str() lookup below.  If a node
    # appears in several communities the last one wins.
    community_lookup = {
        str(member): comm_index
        for comm_index, members in enumerate(model.communities)
        for member in members
    }

    # Add cluster to DF.  If node not in any community, assign -1 (outlier)
    corpus['node'] = corpus.index
    corpus['cluster'] = [community_lookup.get(str(node), -1) for node in corpus['node']]

    # If cluster is smaller than minimum limit, designate as outlier
    cluster_sizes = corpus['cluster'].map(corpus['cluster'].value_counts())
    corpus.loc[cluster_sizes < threshold, 'cluster'] = -1

    return corpus

def generate_louvain(G, corpus, resolution=1.0, threshold=5):
    """Run Louvain community detection on `G` and label `corpus` with the result.

    Parameters
    ----------
    G : graph
        Passed unchanged to `algorithms.louvain`.
    corpus : pandas.DataFrame
        Passed to `assign_cluster_from_model`, which adds the 'cluster' column.
    resolution : float, default 1.0
        Forwarded to `algorithms.louvain` as its `resolution` argument.
    threshold : int, default 5
        Forwarded to `assign_cluster_from_model` as the minimum cluster size.

    Returns
    -------
    tuple
        `(corpus, n_clusters, pct_outliers)` where `n_clusters` is the number
        of distinct values in the 'cluster' column (this count includes the
        -1 outlier label when present) and `pct_outliers` is the percentage
        of rows labelled -1.
    """
    print("\nLouvain resolution: ", resolution)

    # Find the communities
    louvain_coms = algorithms.louvain(G, resolution=resolution)

    # Forward the caller's threshold; it used to be dropped here, so the
    # default minimum cluster size was always applied.
    corpus = assign_cluster_from_model(corpus, louvain_coms, threshold=threshold)

    # What percentage are now classed as outliers?
    pct_outliers = 100.0 * (corpus['cluster'] == -1).sum() / corpus.shape[0]
    print("Percent classed outlier: ", pct_outliers)

    # How many distinct cluster labels after all this?  (-1 is counted too)
    n_clusters = len(pd.unique(corpus['cluster']))
    print("Number of unique clusters: ", n_clusters)

    return corpus, n_clusters, pct_outliers

In [5]:
G = nx.read_gml("working/RSS_graph_2lim_permissive.gml")

# Sweep the Louvain resolution and record, for each setting, the number of
# cluster labels and the percentage of outliers it produces.
resolutions = [0.5, 1.0, 2.0, 3.0, 5.0, 7.0, 10.0]
cluster_count, outlier_pct = [], []
for res in resolutions:
    # The labelled frame itself is not needed during the sweep
    _, n_clusters, pct_outliers = generate_louvain(G, corpus, resolution=res)
    cluster_count.append(n_clusters)
    outlier_pct.append(pct_outliers)


Louvain resolution:  1.0
Percent classed outlier:  40.2159717134393
Number of unique clusters:  70

Louvain resolution:  3.0
Percent classed outlier:  48.45029146625044
Number of unique clusters:  683

Louvain resolution:  5.0
Percent classed outlier:  48.19545758608607
Number of unique clusters:  625

Louvain resolution:  7.0
Percent classed outlier:  48.20182843309018
Number of unique clusters:  625

Louvain resolution:  10.0
Percent classed outlier:  48.392953843213455
Number of unique clusters:  625


In [6]:
G = nx.read_gml("working/RSS_graph_2lim_permissive.gml")

# generate_louvain returns (corpus, n_clusters, pct_outliers); unpack it so that
# `corpus` stays a DataFrame and the .to_csv() call below works.
corpus, n_clusters, pct_outliers = generate_louvain(G, corpus, resolution=5.0)
corpus.to_csv("working/RSS_clustered_louvain_2lim_permissive_highres.csv")


Louvain resolution:  5.0
Percent classed outlier:  48.19545758608607
Number of unique clusters:  625
