# Experimenting with Cleaning, Clustering & Summarization Pipelines

### To do (technical)
- Implement date windows on my corpus loader function

In [1]:
import os
import re
import json

import numpy as np
import pandas as pd
import networkx as nx

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

import lib.helper as helper
import lib.embedding_models as reps

from importlib import reload

%matplotlib inline

In [2]:
# Useful flatten function from Alex Martelli on https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-list-of-lists
def flatten(l):
    """Flatten one level of nesting.

    Returns a new list holding every item of every sublist in `l`, in order.
    Only the outermost level is flattened; items that are themselves nested
    are kept as they are.
    """
    return [item for sublist in l for item in sublist]

In [3]:
# Directory where each scrape is written as a separate JSON file.  The default is
# the Dropbox location that should be the same on all of my PCs; set the
# NEWS_CROW_STORAGE_PATH environment variable to point the notebook somewhere else
# without editing this cell.
storage_path = os.environ.get(
    "NEWS_CROW_STORAGE_PATH",
    "/home/ozwald/Dropbox/news_crow/scrape_results",
)

# "bing" is the targeted news search corpus, "RSS" is from specific world and local news feeds.
corpus_type = "RSS"

## 1.  Retrieve Corpus

The corpus is scraped by the "run_news_scrapes.py" script (run by the Windows Task Scheduler) every 12 hours, a bit past midday and a bit past midnight.

The "bing" corpus consists of news titles and text extracts retrieved from the Bing News Search API, using a few Home Office-related keywords.

The "RSS" corpus is plugged directly into a number of RSS feeds for world news sites and local British news sites, with no filters for news story types or subjects applied.

### First, get a list of all the news dumps created so far

In [4]:
# Load the stored scrapes for the selected corpus type through the project helper.
# Judging by the outputs in this notebook (`.head()`, `.shape`), the result appears
# to be a pandas DataFrame that includes a `clean_text` column.
corpus = helper.load_clean_corpus(storage_path, corpus_type)

Total files: 203
Loading file: RSS_corpus_2019-11-01_0022.json
Loading file: RSS_corpus_2019-11-12_0021.json
Loading file: RSS_corpus_2019-10-25_1222.json
Loading file: RSS_corpus_2019-12-15_0022.json
Loading file: RSS_corpus_2019-11-23_0023.json
Loading file: RSS_corpus_2019-10-26_1221.json
Loading file: RSS_corpus_2019-10-18_1222.json
Loading file: RSS_corpus_2019-09-23_0020.json
Loading file: RSS_corpus_2019-10-15_0022.json
Loading file: RSS_corpus_2019-11-05_0022.json
Loading file: RSS_corpus_2019-09-06_0020.json
Loading file: RSS_corpus_2019-10-06_1223.json
Loading file: RSS_corpus_2019-12-23_1222.json
Loading file: RSS_corpus_2019-09-14_1222.json
Loading file: RSS_corpus_2019-09-20_1222.json
Loading file: RSS_corpus_2019-09-22_1222.json
Loading file: RSS_corpus_2019-11-18_1223.json
Loading file: RSS_corpus_2019-09-12_1222.json
Loading file: RSS_corpus_2019-11-30_0022.json
Loading file: RSS_corpus_2019-11-04_0022.json
Loading file: RSS_corpus_2019-10-28_0022.json
Loading file: RSS

Loading file: RSS_corpus_2019-10-03_1223.json
Loading file: RSS_corpus_2019-12-06_0022.json
Loading file: RSS_corpus_2019-09-17_1222.json
Loading file: RSS_corpus_2019-10-08_1223.json
Loading file: RSS_corpus_2019-10-22_0021.json
Loading file: RSS_corpus_2019-10-24_0022.json
Loading file: RSS_corpus_2019-12-07_0023.json
Loading file: RSS_corpus_2019-10-07_0021.json
Loading file: RSS_corpus_2019-12-05_0022.json
Loading file: RSS_corpus_2019-10-12_1223.json
Loading file: RSS_corpus_2019-10-31_0648.json
Loading file: RSS_corpus_2019-09-29_1224.json
Loading file: RSS_corpus_2019-09-21_0020.json
Loading file: RSS_corpus_2019-09-24_0020.json
Loading file: RSS_corpus_2019-10-23_1221.json
Loading file: RSS_corpus_2019-10-23_0022.json
Loading file: RSS_corpus_2019-11-02_1222.json
Loading file: RSS_corpus_2019-10-24_1222.json
Loading file: RSS_corpus_2019-12-09_1222.json
Loading file: RSS_corpus_2019-10-19_1222.json
Loading file: RSS_corpus_2019-11-16_1222.json
Loading file: RSS_corpus_2019-09-1

In [5]:
# Preview the first rows to check which columns were loaded.
corpus.head()

Unnamed: 0,title,summary,date,link,source_url,retrieval_timestamp,origin,clean_text
0,Trump impeachment: House votes to formalise in...,The Democratic-controlled chamber approves a r...,"Thu, 31 Oct 2019 20:21:13 GMT",https://www.bbc.co.uk/news/world-us-canada-502...,http://feeds.bbci.co.uk/news/world/rss.xml,2019-11-01 00:21:58.608417,rss_feed,Trump impeachment: House votes to formalise in...
1,Five men acquitted of gang-raping teenager in ...,A court ruled the men did not commit rape beca...,"Thu, 31 Oct 2019 23:23:02 GMT",https://www.bbc.co.uk/news/world-europe-50257922,http://feeds.bbci.co.uk/news/world/rss.xml,2019-11-01 00:21:58.608436,rss_feed,Five men acquitted of gang-raping teenager in ...
2,Brazil wildfires: Blaze advances across Pantan...,The area is one of the most biodiverse regions...,"Fri, 01 Nov 2019 00:11:01 GMT",https://www.bbc.co.uk/news/world-latin-america...,http://feeds.bbci.co.uk/news/world/rss.xml,2019-11-01 00:21:58.608446,rss_feed,Brazil wildfires: Blaze advances across Pantan...
3,Islamic State group names its new leader as Ab...,The jihadist group names Abu Ibrahim al-Hashem...,"Thu, 31 Oct 2019 19:03:25 GMT",https://www.bbc.co.uk/news/world-middle-east-5...,http://feeds.bbci.co.uk/news/world/rss.xml,2019-11-01 00:21:58.608455,rss_feed,Islamic State group names its new leader as Ab...
4,Iraq protests: How tuk-tuks are saving lives i...,"From a nuisance to a necessity, tuk-tuks have ...","Thu, 31 Oct 2019 19:11:51 GMT",https://www.bbc.co.uk/news/world-middle-east-5...,http://feeds.bbci.co.uk/news/world/rss.xml,2019-11-01 00:21:58.608464,rss_feed,Iraq protests: How tuk-tuks are saving lives i...


In [6]:
# (rows, columns) of the loaded corpus.
corpus.shape

(50679, 8)

## 2. Clustering using Entity Detection And Network Analytics

This doesn't resolve very well for the Bing corpus, because the text contains a whole bunch of keywords from the original searches.  I suspect that has a lot to do with the failure of the other methods too.  For the network analytics method I'm going to try removing the keywords from the table first.

In [7]:
#with open("/home/ozwald/Dropbox/news_crow/scrape_settings.json", "r") as f:
#    scrape_config = json.load(f)
#
#search_terms = scrape_config['disaster_search_list']
#search_terms = re.sub(r"[^0-9A-Za-z ]", "", " ".join(search_terms)).lower().split()
#search_terms = set(search_terms)

In [8]:
#search_terms

In [None]:
# Build the noun-adjacency model from the cleaned story text.
# NOTE(review): the same `clean_text` list is passed for both arguments — confirm
# against NounAdjacencyModel's signature that this is intended.
model = reps.NounAdjacencyModel(list(corpus['clean_text']), list(corpus['clean_text']))
# Spot-check a single entry of the model's `noun_sets`.
model.noun_sets[3]

PARALLEL AWESOMENESS!!!
found all nouns
Reduced noun lists to sets


In [None]:
# Take a copy of the model's table to work on, then preview it.
nouns_df = model.table.copy()
nouns_df.head()

### Drop any noun/noun phrase containing one of the search terms, then create an adjacency matrix

### Drop any noun/phrase occurring too infrequently

In [None]:
# Rank the noun columns by their column totals, most common first
nouns_by_frequency = nouns_df.sum(axis=0).sort_values(ascending=False)
nouns_to_keep = list(nouns_by_frequency.index)

# Cut out any nouns containing the original search terms
#nouns_to_keep = [noun for noun in nouns_to_keep if sum([term in noun for term in search_terms]) == 0]

# Keep only the 500 most common nouns
nouns_to_keep = nouns_to_keep[:500]

# Restrict the nouns dataframe to those columns
nouns_df = nouns_df[nouns_to_keep]

print(nouns_df.shape)

In [None]:
# Array form of the noun table (one row per row of nouns_df).
embeddings = np.asarray(nouns_df)
# Row-by-row dot products: entry (i, j) combines rows i and j of the noun table.
adjacency = embeddings.dot(embeddings.T)
print(adjacency.max())

In [None]:
# If the "lower" limit is 1, the graph has so many edges it eats ALL the memory of my desktop, even
# with just 500-ish stories to process.
upper = 100
lower = 3

# Pick out every (row, col) pair whose adjacency value lies within [lower, upper].
within_limits = (adjacency >= lower) & (adjacency <= upper)
rows, cols = np.where(within_limits)
weights = [float(weight) for weight in adjacency[rows, cols]]

# Each selected pair becomes a weighted edge of an undirected graph.
G = nx.Graph()
edges = zip(rows.tolist(), cols.tolist(), weights)
G.add_weighted_edges_from(edges)

# Simplify; remove self-edges
G.remove_edges_from(nx.selfloop_edges(G))

In [None]:
# Edge count of the thresholded graph G.
G.number_of_edges()

In [None]:
#G_plot = nx.petersen_graph()
#plt.subplot(121)
#nx.draw(G, with_labels=True, font_weight='bold')
#plt.subplot(122)
#nx.draw_shell(G, nlist=[range(5, 10), range(5)], with_labels=True, font_weight='bold')

### Cliques, worth a look?
Idea from the docs, explanation at https://en.wikipedia.org/wiki/Clique_(graph_theory)

So, cliques are allowed to overlap - should've thought of that.  Still, good preliminary results and I've found I can disambiguate the cliques to some degree by cutting out weaker links (fewer shared entities).

I should add that it also appears to suffer from the same problem as the other clustering methods: clusters are ultimately hierarchical!

In [None]:
# Record each clique yielded by nx.find_cliques as (size, sorted node list).
cliques = [(len(members), sorted(members)) for members in nx.find_cliques(G)]

In [None]:
# One row per clique, largest first.  reset_index() renumbers the rows and keeps
# the pre-sort position as an extra "index" column.
cliques_df = (
    pd.DataFrame({
        "nodes_list": [nodes for _, nodes in cliques],
        "clique_size": [size for size, _ in cliques],
    })
    .sort_values("clique_size", ascending=False)
    .reset_index()
)

In [None]:
# How many cliques contain at least 5 nodes?
len(cliques_df[cliques_df['clique_size'] >= 5])

In [None]:
# Show the cliques that contain at least 5 nodes.
cliques_df[cliques_df['clique_size'] >= 5]

In [None]:
# Distinct nodes that appear in at least one clique, and how many there are.
cliqued = {node for nodes in cliques_df['nodes_list'] for node in nodes}
len(cliqued)

In [None]:
# Print the text of every story in the clique at row 0 of cliques_df.
# reset_index() builds a new DataFrame on every call, so do it once up front
# instead of once per node.
articles = nouns_df.reset_index()
for node in cliques_df.iloc[0]['nodes_list']:
    article = articles.iloc[node]
    print(article['clean_text'])

In [None]:
# Print the text of every story in the clique at row 1 of cliques_df.
# reset_index() builds a new DataFrame on every call, so do it once up front
# instead of once per node.
articles = nouns_df.reset_index()
for node in cliques_df.iloc[1]['nodes_list']:
    article = articles.iloc[node]
    print(article['clean_text'])

In [None]:
# Print the text of every story in the clique at row 3 of cliques_df.
# reset_index() builds a new DataFrame on every call, so do it once up front
# instead of once per node.
articles = nouns_df.reset_index()
for node in cliques_df.iloc[3]['nodes_list']:
    article = articles.iloc[node]
    print(article['clean_text'])

In [None]:
# Print the text of every story in the clique at row 17 of cliques_df.
# reset_index() builds a new DataFrame on every call, so do it once up front
# instead of once per node.
articles = nouns_df.reset_index()
for node in cliques_df.iloc[17]['nodes_list']:
    article = articles.iloc[node]
    print(article['clean_text'])

### Connected components

In [None]:
# Number of connected components in G.
nx.number_connected_components(G)

In [None]:
# Materialise the connected components of G into a list.
components = list(nx.connected_components(G))

In [None]:
# Total number of nodes across all components.
sum(len(members) for members in components)

### Community Detection Algorithm

In [None]:
from community import best_partition

In [None]:
# Apply Louvain Community Detection
# The keys are nodes, the values are the partitions they belong to
partition = best_partition(G)

# Partition ids are numbered from 0, so count the distinct ids; taking the
# largest id under-counts by one and raises on an empty graph.
number_partitions = len(set(partition.values()))
number_partitions

In [None]:
# Group the nodes by the partition they were assigned to: {partition id: [nodes]}
partition_contents = {}
for node, partition_id in partition.items():
    # Append in place; rebuilding the list with `+` for every node is quadratic.
    partition_contents.setdefault(partition_id, []).append(node)

# Drop partitions that are too small (fewer than 5 nodes)
partition_contents = {
    partition_id: nodes
    for partition_id, nodes in partition_contents.items()
    if len(nodes) >= 5
}

In [None]:
# Size of each "cluster" left after removing the tiny ones, plus the total
# number of nodes they cover
partition_lengths = [len(members) for members in partition_contents.values()]
print(partition_lengths, sum(partition_lengths))

In [None]:
# Partition ids currently held in partition_contents.
partition_contents.keys()

In [None]:
# Print the text of every story assigned to partition 2.
# reset_index() builds a new DataFrame on every call, so do it once up front
# instead of once per node.
articles = nouns_df.reset_index()
for node in partition_contents[2]:
    article = articles.iloc[node]
    print(article['clean_text'])

In [None]:
# Print the text of every story assigned to partition 9.
# reset_index() builds a new DataFrame on every call, so do it once up front
# instead of once per node.
articles = nouns_df.reset_index()
for node in partition_contents[9]:
    article = articles.iloc[node]
    print(article['clean_text'])

In [None]:
# Print the text of every story assigned to partition 15.
# reset_index() builds a new DataFrame on every call, so do it once up front
# instead of once per node.
articles = nouns_df.reset_index()
for node in partition_contents[15]:
    article = articles.iloc[node]
    print(article['clean_text'])