## Imports

In [1]:
import os
import random
import re
import string

import nltk
import numpy as np
import pandas as pd

from collections import Counter

from gensim.models import Word2Vec

from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import warnings
warnings.filterwarnings('ignore')

# Resources needed further down: the stopword list, plus the Punkt sentence
# tokenizer that nltk.word_tokenize relies on. Newer NLTK releases ship it as
# "punkt_tab"; requesting a name the installed release does not know is
# expected to only log an error rather than raise (verify on your version).
for resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(resource)

# Seed every source of randomness used in this notebook.
SEED = 42
random.seed(SEED)
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so assigning it here
# does not change string hashing in the already-running kernel; export it
# before launching Jupyter if hash-order determinism is required.
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JOY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Read data

In [2]:
# Load the raw articles; the path is relative to the notebook's working directory.
df_raw = pd.read_csv("data/news_data.csv")

In [3]:
# Preview three randomly drawn rows of the raw data.
df_raw.sample(3)

Unnamed: 0.1,Unnamed: 0,source_id,source_name,author,title,description,url,url_to_image,published_at,content,top_article,engagement_reaction_count,engagement_comment_count,engagement_share_count,engagement_comment_plugin_count
8005,8005,cbs-news,CBS News,Emily Tillett,"300 former officials call out Trump for ""uncon...",Group of bipartisan national security experts ...,https://www.cbsnews.com/news/donald-trump-ukra...,https://cbsnews2.cbsistatic.com/hub/i/r/2017/0...,2019-09-27T12:17:01Z,More than 300 former national security officia...,0.0,14330.0,6414.0,4197.0,0.0
9495,9495,cnn,CNN,"Jason Hanna and Aaron Cooper, CNN",WWII-era bomber crashes at an airport near Har...,A World War II-era aircraft crashed Wednesday ...,https://www.cnn.com/2019/10/02/us/connecticut-...,https://cdn.cnn.com/cnnnext/dam/assets/1910021...,2019-10-02T14:35:10Z,,0.0,3373.0,988.0,1265.0,0.0
7428,7428,business-insider,Business Insider,"lramsey@businessinsider.com (Lydia Ramsey), Ly...",Dispensed: Amazon and Best Buy's expanding hea...,"REUTERS/Joshua Roberts Hello, There must be so...",https://www.businessinsider.com/dispensed-week...,https://image.businessinsider.com/5c2f7f05bd77...,2019-09-27T14:13:56Z,"Hello,\r\nThere must be something in the water...",0.0,0.0,0.0,1627.0,0.0


## Clean data

### Define function to clean and tokenize

In [4]:
def clean_text(text, tokenizer, stopwords):
    """Normalize a raw string and split it into filtered tokens.

    Args:
        text: Value to clean; it is converted with ``str()`` first.
        tokenizer: Callable that turns the cleaned string into tokens.
        stopwords: Container of tokens to discard.

    Returns:
        List of tokens that are not stopwords, not purely digits, and
        longer than one character.
    """
    # Substitutions are applied in this exact order.
    substitutions = (
        (r"\[(.*?)\]", ""),  # Drop bracketed fragments such as "[+XYZ chars]"
        (r"\s+", " "),  # Collapse whitespace runs into a single space
        (r"\w+…|…", ""),  # Drop the ellipsis and the word attached to it
        (r"(?<=\w)-(?=\w)", " "),  # Split words joined by a dash
        (f"[{re.escape(string.punctuation)}]", ""),  # Strip ASCII punctuation
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    kept = []
    for candidate in tokenizer(cleaned):
        # Stopwords and all-digit tokens are discarded outright
        if candidate in stopwords or candidate.isdigit():
            continue
        # Single-character (or empty) tokens carry no signal
        if len(candidate) > 1:
            kept.append(candidate)
    return kept

### Apply function and remove duplicates

In [5]:
custom_stopwords = set(stopwords.words("english") + ["news", "new", "top"])
text_columns = ["title", "description", "content"]

df = df_raw.copy()

# Fill missing values in every text column before casting to str; otherwise a
# missing title or description would become the literal string "nan" and leak
# into the text and tokens.
for col in text_columns:
    df[col] = df[col].fillna("").astype(str)

# Create text column based on title, description, and content
df["text"] = df[text_columns].apply(lambda x: " | ".join(x), axis=1)
df["tokens"] = df["text"].map(lambda x: clean_text(x, word_tokenize, custom_stopwords))

# Remove duplicates after preprocessing. np.unique returns, for each distinct
# token list, the index of its first occurrence, ordered by the sorted token
# lists, so the rows are also reordered by this step.
_, idx = np.unique(df["tokens"], return_index=True)
df = df.iloc[idx, :]

# Remove rows whose token list is empty and keep only the columns used later
df = df.loc[df.tokens.map(lambda x: len(x) > 0), ["text", "tokens"]]

print(f"Original dataframe: {df_raw.shape}")
print(f"Pre-processed dataframe: {df.shape}")

Original dataframe: (10437, 15)
Pre-processed dataframe: (9882, 2)


### Check vocabulary

In [6]:
docs = df["text"].values
tokenized_docs = df["tokens"].values

# Corpus-wide token frequencies, counted across every tokenized document
vocab = Counter(term for doc_tokens in tokenized_docs for term in doc_tokens)

In [7]:
# Ten most frequent tokens in the cleaned corpus.
vocab.most_common(10)

[('us', 2757),
 ('said', 2519),
 ('year', 1781),
 ('president', 1756),
 ('trump', 1705),
 ('world', 1620),
 ('says', 1511),
 ('one', 1418),
 ('two', 1284),
 ('first', 1195)]

## Generate vectors from documents

### Define function for creating a single vector from word embeddings

In [8]:
def vectorize(list_of_docs, model):
    """Generate one vector per document by averaging its word embeddings.

    Args:
        list_of_docs: Iterable of tokenized documents (each an iterable of
            tokens).
        model: Gensim's Word Embedding; it must expose ``vector_size`` and a
            ``wv`` lookup supporting ``token in model.wv`` and
            ``model.wv[token]``.

    Returns:
        List with one array per document: the mean of the vectors of the
        tokens found in ``model.wv``, or a zero vector of length
        ``model.vector_size`` when none of the tokens are known.
    """
    features = []

    for tokens in list_of_docs:
        # The membership test already guarantees the lookup succeeds, so no
        # KeyError handling is needed around it.
        vectors = [model.wv[token] for token in tokens if token in model.wv]
        if vectors:
            features.append(np.asarray(vectors).mean(axis=0))
        else:
            # Allocate a fresh zero vector only when it is actually needed,
            # so documents never share the same array object.
            features.append(np.zeros(model.vector_size))
    return features

### Apply function to previously pre-processed text

In [9]:
# Train the word embeddings on the tokenized corpus. A single worker plus a
# fixed seed keeps training repeatable; reuse SEED instead of repeating the
# literal so the notebook has one source of truth for randomness.
model = Word2Vec(sentences=tokenized_docs, vector_size=100, workers=1, seed=SEED)

In [10]:
# Sanity check of the embeddings: nearest neighbours of "trump".
model.wv.most_similar("trump")

[('trumps', 0.988541841506958),
 ('president', 0.9746480584144592),
 ('donald', 0.9274919629096985),
 ('ivanka', 0.9203823804855347),
 ('impeachment', 0.9195769429206848),
 ('pences', 0.9152195453643799),
 ('avlon', 0.9148270487785339),
 ('biden', 0.9146018624305725),
 ('breitbart', 0.9143953323364258),
 ('vice', 0.9067230224609375)]

In [11]:
# Collapse each document's word vectors into a single feature vector.
vectorized_docs = vectorize(tokenized_docs, model=model)
# Number of documents and length of the first document vector.
len(vectorized_docs), len(vectorized_docs[0])

(9882, 100)

### Generate and analyze clusters

In [12]:
def mbkmeans_clusters(X, k, mb=500, print_silhouette_values=False, random_state=None):
    """Fit Mini-batch K-means on X and print clustering diagnostics.

    Prints the number of clusters, the overall silhouette coefficient and the
    inertia. Optionally prints size/avg/min/max silhouette per cluster, sorted
    by average silhouette in descending order.

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches. Defaults to 500.
        print_silhouette_values: Print silhouette values per cluster.
        random_state: Seed forwarded to MiniBatchKMeans. Defaults to None,
            which leaves the estimator's default behaviour unchanged.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(
        n_clusters=k, batch_size=mb, random_state=random_state
    ).fit(X)
    labels = km.labels_
    print(f"For n_clusters = {k}")

    # The overall silhouette coefficient is the mean of the per-sample values,
    # so when those are needed anyway compute them once instead of running the
    # pairwise-distance computation twice.
    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, labels)
        overall_silhouette = sample_silhouette_values.mean()
    else:
        sample_silhouette_values = None
        overall_silhouette = silhouette_score(X, labels)

    print(f"Silhouette coefficient: {overall_silhouette:0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        print("Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[labels == i]
            if cluster_silhouette_values.size == 0:
                # A cluster that received no samples has no statistics;
                # min()/max() on an empty array would raise.
                continue
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        # Best-separated clusters first
        silhouette_values = sorted(
            silhouette_values, key=lambda tup: tup[2], reverse=True
        )
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, labels

In [13]:
# Cluster the document vectors into 50 groups and print per-cluster silhouette statistics.
clustering, cluster_labels = mbkmeans_clusters(X=vectorized_docs, k=50, print_silhouette_values=True)
# Keep the original text, the space-joined tokens and the assigned cluster side by side.
df_clusters = pd.DataFrame({
    "text": docs,
    "tokens": [" ".join(text) for text in tokenized_docs],
    "cluster": cluster_labels
})

For n_clusters = 50
Silhouette coefficient: 0.11
Inertia:3580.118044419166
Silhouette values:
    Cluster 25: Size:51 | Avg:0.38 | Min:-0.05 | Max: 0.59
    Cluster 2: Size:124 | Avg:0.30 | Min:-0.02 | Max: 0.49
    Cluster 29: Size:23 | Avg:0.29 | Min:0.01 | Max: 0.50
    Cluster 17: Size:88 | Avg:0.28 | Min:-0.14 | Max: 0.51
    Cluster 48: Size:82 | Avg:0.27 | Min:0.01 | Max: 0.47
    Cluster 19: Size:100 | Avg:0.26 | Min:-0.04 | Max: 0.44
    Cluster 28: Size:104 | Avg:0.24 | Min:-0.04 | Max: 0.48
    Cluster 3: Size:115 | Avg:0.24 | Min:-0.05 | Max: 0.40
    Cluster 23: Size:83 | Avg:0.23 | Min:0.03 | Max: 0.41
    Cluster 30: Size:58 | Avg:0.22 | Min:-0.11 | Max: 0.47
    Cluster 44: Size:139 | Avg:0.20 | Min:-0.04 | Max: 0.41
    Cluster 26: Size:241 | Avg:0.19 | Min:-0.13 | Max: 0.41
    Cluster 41: Size:550 | Avg:0.18 | Min:-0.04 | Max: 0.40
    Cluster 16: Size:442 | Avg:0.17 | Min:0.00 | Max: 0.37
    Cluster 4: Size:95 | Avg:0.15 | Min:-0.06 | Max: 0.40
    Cluster 27: Size

In [14]:
print("Top terms per cluster (based on centroids):")
# Iterate over the fitted centroids themselves so the loop always matches the
# number of clusters the model was trained with, rather than a hard-coded 50.
for i, centroid in enumerate(clustering.cluster_centers_):
    # Words whose embeddings are most similar to the cluster centroid
    most_representative = model.wv.most_similar(positive=[centroid], topn=5)
    # Each term is followed by a space (including the last one) to keep the
    # printed lines identical to the previous format.
    tokens_per_cluster = "".join(f"{t[0]} " for t in most_representative)
    print(f"Cluster {i}: {tokens_per_cluster}")

Top terms per cluster (based on centroids):
Cluster 0: panel buttigieg hill rogue opposing 
Cluster 1: mosquito collapsed train boats borne 
Cluster 2: pm johnsons proposals delay benjamin 
Cluster 3: entertainment likes calloway patch contact 
Cluster 4: girl boy whose apartment raping 
Cluster 5: delegation amid erdogan envoy undermine 
Cluster 6: tournament victory injury beat finished 
Cluster 7: professional expensive virtual edition ones 
Cluster 8: category humberto tropical landfall strengthened 
Cluster 9: repeal urged renewed kiev agencies 
Cluster 10: knife indiana pleaded duluth arizona 
Cluster 11: speech dominic israels block suspend 
Cluster 12: asian followed gained sep gain 
Cluster 13: bomb dozens kills soldiers victims 
Cluster 14: glasgow father jailed daughter accident 
Cluster 15: vizcarra congressional ukrainian aides volodymyr 
Cluster 16: orleans training corps follows male 
Cluster 17: tanker ablaze yemen arabian strikes 
Cluster 18: obama moat blower tweet re

In [15]:
# Cluster whose centroid is inspected below.
test_cluster = 48
# Rank all documents by Euclidean distance between their vector and the chosen
# centroid, closest first. Every document is ranked, not only those labelled
# with this cluster.
most_representative_docs = np.argsort(
    np.linalg.norm(vectorized_docs - clustering.cluster_centers_[test_cluster], axis=1)
)
# Print the ten documents nearest to the centroid.
for d in most_representative_docs[:10]:
    print(docs[d])
    print("-------------")
US woman arrested at Manila airport with baby hidden in bag | Get breaking national and world news, broadcast video coverage, and exclusive interviews. Find the top news online at ABC news. | An American woman who attempted to carry a 6-day-old baby out of the Philippines hidden inside a sling bag has been arrested at Manila's airport and charged with human trafficking, officials said Thursday.
They said Jennifer Talbot was able to pass through t… [+1496 chars]
-------------
Police: No evidence of shooting at northern Virginia mall | Get breaking national and world news, broadcast video coverage, and exclusive interviews. Find the top news online at ABC news. | Authorities in northern Virginia say they have found no evidence that a shooting occurred at a popular mall.
The Arlington County Police Department tweeted Saturday night that authorities were continuing to conduct a search at the Ballston Quarter mall in Ar… [+162 chars]
-------------
12th man arrested in statutory rape case 