In [1]:
import numpy as np
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from numpy.linalg import norm
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from wordcloud import WordCloud
import os
import shutil
import pathlib
import gc
from tqdm.auto import tqdm

In [2]:
# `root_dir` is where the zip archive of the whole output folder ends up;
# `base_dir` (a sub-folder of it) is the base directory for every file the
# script writes.
root_dir = pathlib.Path("/kaggle/working/")
base_dir = root_dir / "output"
base_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Only the `text` column is used downstream, so read just that column instead
# of loading every column of the CSV into memory first.
article_texts = pd.read_csv(
    "/kaggle/input/medium-articles/medium_articles.csv", usecols=["text"]
)["text"].values

# tf.data serves here purely as a batching iterator: every element of
# `articles_dataset` is one batch of up to 8000 article texts.
articles_dataset = tf.data.Dataset.from_tensor_slices(article_texts).batch(8000)

# The dataset was built from the array above; drop the extra reference so the
# raw array does not stay alive for the rest of the notebook.
del article_texts

total_batches = len(articles_dataset)
print(total_batches)

## We've got 25 batches of up to 8000 articles each. The dataset holds a little over 190K articles, so the first 24 batches account for 192K of them and the last batch is left with only ~1K texts or fewer. That may be too few to be really suitable, but we generate word clouds from it anyway.

In [None]:
# All the code is collected in one cell to reduce memory consumption, as I kept running out of memory on Kaggle when running SVD.
#------------------


def rank_words_by_tfidf(indices, words_list, tfidf_matrix=None):
    """Rank words by their TF-IDF score summed over the selected rows.

    Parameters
    ----------
    indices : sequence of int
        Row positions of the TF-IDF matrix to include in the sum (sent in by
        ``cluster_to_cloud``).
    words_list : sequence of str
        One word per column of the TF-IDF matrix, in column order.
    tfidf_matrix : array-like, optional
        Matrix whose rows are selected by ``indices`` and whose columns line
        up with ``words_list``. When omitted, the notebook-level global named
        ``tfidf_matrix`` is used, which is how existing callers rely on it.

    Returns
    -------
    pandas.DataFrame
        Columns ``Words`` and ``Summed_TFIDF``, sorted by ``Summed_TFIDF`` in
        descending order.

    Raises
    ------
    NameError
        If no matrix is passed and no global ``tfidf_matrix`` exists.
    """
    if tfidf_matrix is None:
        # Backward-compatible fallback: existing callers do not pass the matrix
        # and expect it to be picked up from the notebook's global namespace.
        try:
            tfidf_matrix = globals()["tfidf_matrix"]
        except KeyError:
            raise NameError("name 'tfidf_matrix' is not defined") from None

    # Summing a scipy sparse matrix or np.matrix over axis 0 yields a (1, n)
    # result; flatten so the column is always 1-dimensional for the DataFrame.
    summed_tfidf = np.asarray(tfidf_matrix[indices].sum(axis=0)).ravel()

    ranking = pd.DataFrame({"Words": words_list, "Summed_TFIDF": summed_tfidf})
    return ranking.sort_values("Summed_TFIDF", ascending=False)


#-------------------


def cluster_to_cloud(df_cluster, max_words=15, cluster_num=0, words_list=None, batch_num=0, save_dir=None):
    """Build a word cloud from the highest-ranked words of one cluster.

    The rows listed in ``df_cluster.Index`` are handed to
    ``rank_words_by_tfidf``. The top ``max_words`` words of that ranking go
    into the cloud; every remaining word is written to
    ``save_dir / "cluster_{cluster_num}_words_batch_{batch_num}.csv"``.

    ``save_dir`` must support the ``/`` path operator (e.g. a
    ``pathlib.Path``). Returns the fitted ``WordCloud`` object.
    """

    ranked_words = rank_words_by_tfidf(df_cluster.Index.values, words_list)
    cloud_words = ranked_words[:max_words]

    # Persist the words that did NOT make it into the cloud.
    leftovers_path = save_dir / f"cluster_{cluster_num}_words_batch_{batch_num}.csv"
    ranked_words[max_words:].to_csv(leftovers_path, index=False)

    # Each row is a (word, summed score) pair; the score acts as the word's weight.
    weights = {}
    for row in cloud_words.values:
        weights[row[0]] = row[1]

    generator = WordCloud(
        background_color="white",
        random_state=1,
        width=2000,
        height=1000,
    )
    return generator.fit_words(weights)


#------------------


def vectorizeX(batch):
    """Vectorize a batch of texts with TF-IDF and summarise the scores per word.

    The X stands for eXtra: besides the TF-IDF matrix itself, the function
    also hands back a few derived objects.

    Returns a 4-tuple:
      1. the vocabulary (feature names) of the fitted vectorizer,
      2. the TF-IDF matrix as a dense array (one column per vocabulary word),
      3. a DataFrame with columns "Words" / "Summed_TFIDF" (column sums),
      4. that same DataFrame sorted by "Summed_TFIDF", highest first.
    """

    vectorizer = TfidfVectorizer(stop_words="english")
    # NOTE: .toarray() densifies the matrix; callers index and sum it as a
    # plain numpy array.
    dense_tfidf = vectorizer.fit_transform(batch).toarray()
    vocabulary = vectorizer.get_feature_names_out()

    per_word_totals = pd.DataFrame(
        {
            "Words": vocabulary,
            "Summed_TFIDF": dense_tfidf.sum(axis=0),
        }
    )
    ranked_totals = per_word_totals.sort_values(by="Summed_TFIDF", ascending=False)

    return vocabulary, dense_tfidf, per_word_totals, ranked_totals


#-------------------


def clusterItUp(shrunk_norm_matrix, batch_num=0, n_clusters=15,
                small_batch_num=24, small_n_clusters=2, random_state=None):
    """Cluster the rows of a normalized, dimension-reduced matrix with KMeans.

    In this notebook every row of ``shrunk_norm_matrix`` is one article of the
    batch (its SVD-reduced TF-IDF vector), so the resulting cluster ids group
    articles; the words of each group are ranked later on.

    Parameters
    ----------
    shrunk_norm_matrix : array-like
        Rows to cluster.
    batch_num : int
        Index of the batch being processed.
    n_clusters : int
        Number of clusters for a regular batch. 15 was found to work well in
        the initial prototypes (that code is not included here).
    small_batch_num : int
        Index of the batch treated as "small". The last batch (index 24 for
        this dataset) holds only around 1K texts, i.e. fewer words and less
        diversity, so it is split into fewer groups.
    small_n_clusters : int
        Number of clusters used for the small batch.
    random_state : int or None
        Forwarded to ``KMeans``; pass an int for reproducible clusters.

    Returns
    -------
    pandas.DataFrame
        Columns ``Index`` (row position, 0..n-1) and ``Cluster`` (cluster id
        assigned to that row).
    """

    print("\tClustering words into groups...")

    wanted_clusters = small_n_clusters if batch_num == small_batch_num else n_clusters
    # KMeans refuses to fit when there are fewer rows than clusters, so never
    # ask for more clusters than there are rows.
    wanted_clusters = min(wanted_clusters, len(shrunk_norm_matrix))

    cluster_model = KMeans(n_clusters=wanted_clusters, random_state=random_state)
    clusters = cluster_model.fit_predict(shrunk_norm_matrix)

    # The Z makes the plural in "clusters" more apparent and easier to tell
    # apart from the per-cluster frames built later.
    clusterZ_df = pd.DataFrame({'Index': range(clusters.size), 'Cluster': clusters})

    print("\t\tSuccessfully Clustered!")
    return clusterZ_df

# -------------

# NOTE: keep this loop at notebook (top) level. rank_words_by_tfidf(), reached
# through cluster_to_cloud(), resolves `tfidf_matrix` from the notebook's
# global namespace, so the name assigned below must be a global.
for batch_num, batch in tqdm(enumerate(articles_dataset), total=total_batches):

    print(f"\nProcessing batch {batch_num+1} / {total_batches}")

    # Convert the tf tensor to numpy, since the rest of the workflow is
    # numpy/pandas based.
    batch = batch.numpy()

    # One folder per batch keeps the output organised and easy to download.
    print("\tMaking directories...")
    batch_dir = base_dir / f"batch{batch_num}"
    cluster_words_dir = batch_dir / "cluster_words"
    cluster_clouds_dir = batch_dir / "cluster_clouds"
    os.makedirs(cluster_words_dir, exist_ok=True)
    os.makedirs(cluster_clouds_dir, exist_ok=True)
    print("\t\tSuccessfully created required directories.")

    # Vocabulary, TF-IDF matrix and the per-word summed-score tables.
    print("\tPerforming vectorization x...")
    words, tfidf_matrix, words_tfidf_df, sorted_words_tfidf_df = vectorizeX(batch)
    print("\t\tSuccessfully vectorized x.")

    print(f"\tTotal Words -> {len(words)}")

    # Unsorted table (vocabulary order). The message used to be printed
    # without the file ever being written; it is now saved.
    filename = batch_dir / f"words_summed_tfidf_batch_{batch_num}.csv"
    print(f"\tSaving {filename} to disk...")
    words_tfidf_df.to_csv(filename, index=False)

    # Same table, sorted by summed score (highest first).
    filename = batch_dir / f"sorted_words_summed_tfidf_batch_{batch_num}.csv"
    print(f"\tSaving {filename} to disk...")
    sorted_words_tfidf_df.to_csv(filename, index=False)

    # Reduce every row of the TF-IDF matrix to 100 components.
    print("\tApplying SVD...")
    shrunk_matrix = TruncatedSVD(n_components=100).fit_transform(tfidf_matrix)
    print("\t\tSuccessfully applied SVD.")

    # Scale every row to unit length before clustering.
    print("\tNormalizing shrunk matrix...")
    shrunk_norm_matrix = normalize(shrunk_matrix)
    print("\t\tTarget normalized.")

    clusterZ_df = clusterItUp(shrunk_norm_matrix, batch_num=batch_num)

    # This file maps every row of the batch (i.e. every article position) to
    # the id of the cluster it was assigned to.
    filename = batch_dir / f"clustered_groups_batch_{batch_num}.csv"
    clusterZ_df.to_csv(filename, index=False)

    # (cluster id, rows of that cluster) pairs, ordered by cluster id.
    cluster_groups = list(clusterZ_df.groupby("Cluster"))

    # The number of words we want in each word cloud image.
    # 15 seems to work well, but it can be changed freely.
    max_words = 15

    print(f"\tGenerating word cloud images for clusters... ")
    for cluster_id, cluster_df in tqdm(cluster_groups):
        # Name the files after the real cluster id so they line up with the
        # "Cluster" column saved above.
        cluster_id = int(cluster_id)
        wordcloud_image = cluster_to_cloud(cluster_df, max_words=max_words,
                                           cluster_num=cluster_id,
                                           words_list=words, batch_num=batch_num,
                                           save_dir=cluster_words_dir)

        filename = cluster_clouds_dir / f"cluster_{cluster_id}_cloud_batch_{batch_num}.png"
        wordcloud_image.to_file(filename)

    print(f"\tYay! Everything went well for batch {batch_num}. Onto the next one!\n")

    # Drop every large per-batch object (including the normalized matrix)
    # before the next batch is vectorized, so peak memory stays bounded.
    print("\tClearning up memory for next iteration so we dont run out of memory...")
    del batch, words, tfidf_matrix, words_tfidf_df, sorted_words_tfidf_df
    del shrunk_matrix, shrunk_norm_matrix, clusterZ_df, cluster_groups
    gc.collect()


# Bundle everything under `base_dir` into a single zip archive named "output"
# placed in `root_dir`.
shutil.make_archive(base_name=root_dir / "output", format="zip", root_dir=base_dir)