# Experimenting with Cleaning, Clustering & Summarization Pipelines

### To do (technical)
- Implement date windows on my corpus loader function

In [1]:
import os
import re
import json

import numpy as np
import pandas as pd
import networkx as nx

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

import lib.helper as helper
import lib.embedding_models as reps

from importlib import reload

%matplotlib inline

# Useful flatten function from Alex Martelli on https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-list-of-lists
flatten = lambda l: [item for sublist in l for item in sublist]

## 1.  Retrieve Corpus

The corpus is being scraped by the "run_news_scrapes.py" script (and windows task scheduler) every 12 hours, a bit past midday and a bit past midnight.

The "bing" corpus are news titles and text extracts gotten from the bing news search API, using a few Home Office - related keywords.

The "RSS" corpus is plugged directly into a number of RSS feeds for world news sites and local british news sites, with no filters for news story types or subjects applied.

In [2]:
# Should be same path for all my PC's, it's where each scrape goes as a separate json file.
storage_path = "D:/Dropbox/news_crow/scrape_results"

# "bing" is targeted news search corpus, "RSS" is from specific world and local news feeds.
corpus_type = "RSS"

# There's a helper function to go find and drag out the various JSON files created by the scrapers.
corpus = helper.load_clean_world_corpus(storage_path, corpus_type)

# Make sure after cleaning etc it's indexed from 0
corpus.reset_index(inplace=True)
corpus.index.name = "node"

# See how it turned out
print(corpus.shape)
corpus.head()

Total files: 495
9.9 percent of files read.
19.8 percent of files read.
29.7 percent of files read.
39.6 percent of files read.
49.5 percent of files read.
59.4 percent of files read.
69.3 percent of files read.
79.2 percent of files read.
89.1 percent of files read.
99.0 percent of files read.
(31393, 9)


Unnamed: 0_level_0,index,title,summary,date,link,source_url,retrieval_timestamp,origin,clean_text
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,Hurricane Dorian lashes US as Bahamas counts cost,"Life-threatening US storm surges are feared, a...","Thu, 05 Sep 2019 16:03:44 GMT",https://www.bbc.co.uk/news/world-us-canada-495...,http://feeds.bbci.co.uk/news/world/rss.xml,2019-09-05 21:35:06.925873,rss_feed,Hurricane Dorian lashes US as Bahamas counts c...
1,1,Kohistan video murders: Three guilty in 'honou...,They are relatives of a group of Pakistani wom...,"Thu, 05 Sep 2019 13:53:17 GMT",https://www.bbc.co.uk/news/world-asia-49592540,http://feeds.bbci.co.uk/news/world/rss.xml,2019-09-05 21:35:06.925873,rss_feed,Kohistan video murders: Three guilty in 'honou...
2,2,MH17 Ukraine plane crash: 'Key witness' released,A Ukrainian court releases a potentially key w...,"Thu, 05 Sep 2019 13:46:06 GMT",https://www.bbc.co.uk/news/world-europe-49591148,http://feeds.bbci.co.uk/news/world/rss.xml,2019-09-05 21:35:06.925873,rss_feed,MH17 Ukraine plane crash: 'Key witness' releas...
3,3,Article 370: The weddings 'ruined' by Kashmir'...,Indian-administered Kashmir is under a securit...,"Thu, 05 Sep 2019 07:32:34 GMT",https://www.bbc.co.uk/news/world-asia-india-49...,http://feeds.bbci.co.uk/news/world/rss.xml,2019-09-05 21:35:06.925873,rss_feed,Article 70: The weddings 'ruined' by Kashmir's...
4,4,Syria war: Turkey warns Europe of new migrant ...,President Erdogan demands international help t...,"Thu, 05 Sep 2019 16:11:48 GMT",https://www.bbc.co.uk/news/world-europe-49599297,http://feeds.bbci.co.uk/news/world/rss.xml,2019-09-05 21:35:06.925873,rss_feed,Syria war: Turkey warns Europe of new migrant ...


## 2. Use Detected Nouns to create a Graph Representation

In [3]:
# Generate the text representation
model = reps.NounAdjacencyModel(list(corpus['clean_text']), list(corpus['clean_text']))

# Tabulate for convenience
nouns_df = model.table.copy()
nouns_df.head()

Unnamed: 0,Naegleria,islandapos;s,Ski,Gloria,Tam,Michelin,Uruguay,CDC,Clinton_Trump,U.S.,...,BOOTED,IRA,Newfoundland,Invisible,apos;suspiciousapos,cupidapos;s,Coronavirus_France,Shead,Shane,Brexiteers
"Hurricane Dorian lashes US as Bahamas counts cost. Life-threatening US storm surges are feared, as rescue work continues in the devastated Bahamas.",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Kohistan video murders: Three guilty in 'honour killing' blood feud. They are relatives of a group of Pakistani women killed after being filmed singing at a wedding.,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MH17 Ukraine plane crash: 'Key witness' released. A Ukrainian court releases a potentially key witness to the downing of the Malaysian airliner MH17.,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Article 70: The weddings 'ruined' by Kashmir's lockdown. Indian-administered Kashmir is under a security crackdown after it was stripped of its special status.,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Syria war: Turkey warns Europe of new migrant wave. President Erdogan demands international help to create a refugee ""safe zone"" in northern Syria.",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Drop any noun/noun phrase containing one of the search terms, then create an adjacency matrix

#### Drop any noun/phrase occuring too infrequently

In [4]:
# Retrive the set of search terms used for Bing, so we can remove them before
# clustering.
with open("D:/Dropbox/news_crow/scrape_settings.json", "r") as f:
    scrape_config = json.load(f)

search_terms = scrape_config['disaster_search_list']
search_terms = re.sub(r"[^0-9A-Za-z ]", "", " ".join(search_terms)).lower().split()
search_terms = set(search_terms)

In [5]:
# Get X most common nouns WHY NOT USE TFIDF HERE?
nouns_to_keep = list(nouns_df.\
                    sum(axis=0).\
                    sort_values(ascending=False).\
                    index)

# Cut out any nouns containing the original search terms
#nouns_to_keep = [noun for noun in nouns_to_keep if sum([term in noun for term in search_terms]) == 0]

# Keep only most common
nouns_to_keep = nouns_to_keep[:2000]

# Subset nouns dataframe
nouns_df = nouns_df[nouns_to_keep]

print(nouns_df.shape)

(31393, 2000)


In [6]:
embeddings = np.asarray(nouns_df)
adjacency = np.dot(embeddings, embeddings.T)
print(np.max(adjacency))

14


In [7]:
# If the "lower" limit is 1, the graph has so many edges it eats ALL the memory of my desktop, even
# with just 500-ish stories to process.
upper = 100
lower = 2
G = nx.Graph()
rows, cols = np.where((upper >= adjacency) & (adjacency >= lower))
weights = [float(adjacency[rows[i], cols[i]]) for i in range(len(rows))]
edges = zip(rows.tolist(), cols.tolist(), weights)
G.add_weighted_edges_from(edges)

# Simplify; remove self-edges - not sure if needed?
G.remove_edges_from(nx.selfloop_edges(G))

In [8]:
G.number_of_edges()

295996

In [9]:
nx.write_gml(G, "working/RSS_graph_2lim_permissive.gml")

12596 to beat

## 3c.  Try CDLIB

In [12]:
import cdlib
from cdlib import algorithms
from cdlib import evaluation

In [10]:
G = nx.read_gml("working/RSS_graph_2lim_permissive.gml")

In [13]:
# Simple (flat) clustering
lp_coms = algorithms.label_propagation(G)

# Traditional (easy) community detection
louvain_coms = algorithms.louvain(G)

In [14]:
# This result implies that the two methods have come to very similar conclusions...
# This function apparently isn't defined for overlapping communities
evaluation.normalized_mutual_information(lp_coms, louvain_coms)

MatchingResult(score=0.7776875206045083, std=None)

In [18]:
# Build dict of node-to-cluster lookup
community_lookup = {}
for comm_index, members in enumerate(lp_coms.communities):
    for member in members:
        community_lookup[member] = comm_index

# Add cluster to DF.  If node not in cluster, assign -1 (outlier)
corpus['node'] = corpus.index
corpus['cluster'] = corpus['node'].apply(lambda x: community_lookup.get(x, -1))
corpus[['clean_text', 'cluster']].head(10)

# If cluster is smaller than minimum limit, designate as outlier
cs_lookup = corpus['cluster'].value_counts().to_dict()
corpus['cluster'] = corpus['cluster'].apply(lambda x: -1 if (cs_lookup[x] < 5) else x)

In [19]:
# What percentage are now classed as outliers?
100.0 * sum(corpus['cluster']==-1) / corpus.shape[0]

100.0

In [20]:
# How many unique clusters after all this?  (minus one for outliers)
len(pd.unique(corpus['cluster']))

1

In [None]:
corpus.to_csv("working/RSS_clustered_louvain.csv")

In [None]:
#bigclam_coms.communities

In [None]:
#bigclam_coms.average_internal_degree()

In [None]:
#bigclam_coms.newman_girvan_modularity()

## 3.  Create (overlapping) clusters using Maximal Cliques
Idea from the docs, explanation at https://en.wikipedia.org/wiki/Clique_(graph_theory)
Expanded using k-clique-communities REF FIND PAPER

In [None]:
c = list(nx.algorithms.community.kclique.k_clique_communities(G, 4))
cliques = [(len(x), x) for x in c]

In [None]:
cliques

In [None]:
cliques_df = pd.DataFrame({"nodes_list": [x[1] for x in cliques],
                           "clique_size": [x[0] for x in cliques]}).\
                    sort_values("clique_size", ascending=False).\
                    reset_index()

cliques_df = cliques_df[(cliques_df['clique_size'] >= 3) & (cliques_df['clique_size'] <=100)]

In [None]:
cliques_df

In [None]:
cliqued = set(flatten(list(cliques_df['nodes_list'])))
len(cliqued)

In [None]:
# Flatten the cliques DF into long format
flattened = {"cluster_index":[], "node":[]}

for index, row in cliques_df.iterrows():
    for node in row["nodes_list"]:
        flattened["cluster_index"].append(index)
        flattened["node"].append(node)
        

partition_df = pd.DataFrame(flattened)

# Create a single string variable (";" separated) to record all clusters/cliques a single record belongs in
partition_df["cluster"] = partition_df.\
                          groupby("node")["cluster_index"].\
                          transform(lambda x: ";".join([str(i) for i in x if type(i)==int]))

# Clean up, set index of this and corpus so the two DF's can be joined with little effort
partition_df = partition_df[["node", "cluster"]].\
               drop_duplicates(["node", "cluster"], keep="first").\
               set_index("node")

corpus.drop(["cluster", "node"], axis=1).join(partition_df).\
       to_csv("working/RSS_clustered_cliques.csv")

### The below attempts overlapping community detection but can only run on connected graphs, think this is an implicit restriction of the algorithm logic.

In [None]:
# Get all connected components (will become less of an issue as graph size increases)
ccs = [(len(x), x) for x in nx.connected_components(G)]

# Sort by size (largest first)
ccs.sort(key = lambda x: x[0], reverse=True)

# Extract largest connected sub-graph
connected_sub = G.subgraph(ccs[0][1])

# re-index nodes from zero to maintain compatibility with CDLIB (sub-dependency, Karate)
# Will need to reverse this indexing when matching assigned clusters back to data
node_relabel_dict = {val: i for i, val in enumerate(list(connected_sub.nodes))}

connected_sub = nx.relabel_nodes(connected_sub, node_relabel_dict)

# Fire algo!
bigclam_coms = algorithms.big_clam(connected_sub)
#leiden_coms = algorithms.leiden(connected_sub)

In [None]:
bigclam_coms.communities

In [None]:
# Build dict of node-to-cluster lookup
community_lookup = {}
for comm_index, members in enumerate(bigclam_coms.communities):
    for member in members:
        community_lookup[member] = community_lookup.get(member, []) + [comm_index]

# Add cluster to DF.  If node not in cluster, assign -1 (outlier)
corpus['node'] = corpus.index
corpus['cluster'] = corpus['node'].apply(lambda x: community_lookup.get(x, [-1]))
corpus[['clean_text', 'cluster']].head(10)

# If cluster is smaller than minimum limit, designate as outlier
cs_lookup = corpus['cluster'].value_counts().to_dict()
corpus['cluster'] = corpus['cluster'].apply(lambda x: -1 if (cs_lookup[x] < 5) else x)

In [None]:
# What percentage are now classed as outliers?
100.0 * sum(corpus['cluster']==-1) / corpus.shape[0]

In [None]:
# How many unique clusters after all this?  (minus one for outliers)
len(pd.unique(corpus['cluster']))

In [None]:
corpus.to_csv("working/disaster_clustered_bigclam.csv")