In [1]:
import numpy as np
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from numpy.linalg import norm
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from wordcloud import WordCloud
import os
import shutil
import pathlib

In [None]:
# Root folder that collects every per-batch output (CSV files and word cloud images).
base_dir = pathlib.Path("/kaggle/working/output/")
# exist_ok=True keeps this cell re-runnable: without it a second execution
# raises FileExistsError because the folder was created by the first run.
os.makedirs(base_dir, exist_ok=True)

In [None]:
# array = np.arange(100)

In [None]:
# dataset = tf.data.Dataset.from_tensor_slices(array)
# dataset = dataset.batch(10)
# for i, batch in enumerate(dataset):
#     print(f"{i}: {batch.numpy()}")
# print(len(dataset))

In [None]:
# all_batches = []
# for i in range(10):
#     batch = np.random.choice(array, size=10, replace=False)
#     all_batches.extend(batch)
#     print(f"{i}th iter: {batch}")
# print(f"Total unique elements sampled: {len(np.unique(np.array(all_batches), return_counts=False))}")

In [None]:
# Read the Medium articles CSV and wrap its "text" column in a tf.data.Dataset
# so the texts can be consumed in fixed-size batches further down.
articles_dataset = tf.data.Dataset.from_tensor_slices(
    pd.read_csv("/kaggle/input/medium-articles/medium_articles.csv")["text"].values
)

In [None]:
# Group the article texts into batches of 9000.
# NOTE: this rebinds `articles_dataset`, so running the cell twice batches the batches.
articles_dataset = articles_dataset.batch(batch_size=9000)
# Report how many batches the dataset now holds.
print(f"{len(articles_dataset)}")

## We've got 22 batches of up to 9000 texts each. Since we have a little over 190K articles, the last batch holds only around 1K texts, which may not be ideal for clustering, but we generate word clouds from it anyway.

In [None]:
def rank_words_by_tfidf(indices, words_list, matrix=None):
    """Rank words by their TF-IDF score summed over the selected rows.

    Parameters
    ----------
    indices : array-like of int
        Row positions to select from the TF-IDF matrix. In this notebook they
        are sent in by the "cluster_to_cloud" function.
    words_list : sequence of str
        One word per column of the TF-IDF matrix.
    matrix : numpy.ndarray or scipy sparse matrix, optional
        TF-IDF matrix to rank against. When omitted, the module-level
        ``tfidf_matrix`` is used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        Columns "Words" and "Summed_TFIDF", sorted by descending summed score.
    """
    if matrix is None:
        # Fall back to the notebook-level matrix set by the batch loop.
        matrix = tfidf_matrix

    # Summing a sparse matrix (or np.matrix) gives a 1 x n_words 2-D result,
    # which pandas rejects as a column; ravel() makes it 1-D in every case.
    summed_tfidf = np.asarray(matrix[indices].sum(axis=0)).ravel()
    data = {"Words": words_list,
            "Summed_TFIDF": summed_tfidf}
    return pd.DataFrame(data).sort_values("Summed_TFIDF", ascending=False)

In [None]:
def cluster_to_cloud(df_cluster, max_words=15, cluster_num=0, batch_num=0, words_list=None):
    """Generate a word cloud image from the top-ranked words of one cluster.

    Words are ranked by their summed TF-IDF score via ``rank_words_by_tfidf``.
    The top ``max_words`` words go into the cloud; the remaining ranked words
    are written to "cluster_{cluster_num}_words_batch_{batch_num}.csv" in the
    current working directory.

    Parameters
    ----------
    df_cluster : pandas.DataFrame
        Rows of a single cluster; its "Index" column holds the row positions
        used to select from the TF-IDF matrix.
    max_words : int
        Number of top-ranked words shown in the cloud.
    cluster_num, batch_num : int
        Only used to build the CSV file name.
    words_list : sequence of str, optional
        Vocabulary matching the TF-IDF matrix columns. When omitted, the
        module-level ``words`` set by the batch loop is used.

    Returns
    -------
    wordcloud.WordCloud
        The fitted word cloud; call ``.to_file(...)`` on it to save an image.
    """
    if words_list is None:
        # The original call passed no vocabulary at all, which raised a
        # TypeError; fall back to the notebook-level vocabulary instead.
        words_list = words

    indices = df_cluster.Index.values
    df_ranked_words_all = rank_words_by_tfidf(indices, words_list)
    df_ranked_words_in_cloud = df_ranked_words_all[:max_words]
    df_ranked_words_remaining = df_ranked_words_all[max_words:]

    df_ranked_words_remaining.to_csv(f"cluster_{cluster_num}_words_batch_{batch_num}.csv")
    # Map each displayed word to its score; the score sets the word's size.
    words_to_score = dict(zip(df_ranked_words_in_cloud["Words"],
                              df_ranked_words_in_cloud["Summed_TFIDF"]))
    cloud_generator = WordCloud(background_color="white",
                                random_state=1, width=2000, height=1000)
    wordcloud_image = cloud_generator.fit_words(words_to_score)
    return wordcloud_image

In [None]:
def vectorizeX(batch, batch_num=0):
    """Vectorize a batch of texts with TF-IDF.

    The X in the name stands for eXtra: besides the TF-IDF matrix it also
    returns the vocabulary and a per-word summed-score table.

    Parameters
    ----------
    batch : iterable of str/bytes, or a tensor exposing ``.numpy()``
        The texts to vectorize. A tensor (as yielded by a tf.data.Dataset) is
        converted to a NumPy array first.
    batch_num : int
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    words : list of str
        Vocabulary, one entry per matrix column.
    tfidf_matrix : numpy.ndarray
        Dense (n_texts, n_words) TF-IDF matrix.
    sorted_words_tfidf_df : pandas.DataFrame
        Columns "Words" and "Summed_TFIDF", sorted by descending summed score.
    """
    # scikit-learn needs an iterable of str/bytes; iterating a tensor yields
    # scalar tensors instead, so unwrap it to a NumPy array of byte strings.
    if hasattr(batch, "numpy"):
        batch = batch.numpy()

    tfidf_vectorizer = TfidfVectorizer(stop_words="english")
    # NOTE(review): .toarray() densifies the matrix, which can be very large
    # for big batches; kept dense because downstream code indexes it as ndarray.
    tfidf_matrix = tfidf_vectorizer.fit_transform(batch).toarray()

    # get_feature_names() was removed in scikit-learn 1.2; prefer the new API
    # and fall back to the old one on legacy versions.
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        words = tfidf_vectorizer.get_feature_names_out().tolist()
    else:
        words = tfidf_vectorizer.get_feature_names()

    words_tfidf_df = pd.DataFrame({"Words": words, "Summed_TFIDF": tfidf_matrix.sum(axis=0)})
    sorted_words_tfidf_df = words_tfidf_df.sort_values(by="Summed_TFIDF", ascending=False)

    return words, tfidf_matrix, sorted_words_tfidf_df

In [None]:
def clusterItUp(batch, batch_num=0, n_clusters=None):
    """Cluster the rows of a normalized, SVD-shrunk matrix with KMeans.

    Parameters
    ----------
    batch : array-like of shape (n_rows, n_components)
        The normalized shrunk matrix of one batch (one row per text in this
        notebook).
    batch_num : int
        Batch number; used to pick the default number of clusters.
    n_clusters : int, optional
        Number of clusters. When omitted: 3 for batch 21 and 30 otherwise.

    Returns
    -------
    pandas.DataFrame
        Columns "Index" (row position in ``batch``) and "Cluster" (assigned id).
    """
    # As mentioned earlier, the last batch (number 21) contains only around 1K
    # texts, so it has fewer words and less diversity. We cluster it into 3
    # groups instead of 30, which is the number of groups found in our earlier
    # testing and prototyping.
    if n_clusters is None:
        n_clusters = 3 if batch_num == 21 else 30
    # KMeans raises when asked for more clusters than there are rows.
    n_clusters = min(n_clusters, len(batch))

    print("\tClustering words into groups...")
    cluster_model = KMeans(n_clusters=n_clusters)
    # Cluster the matrix that was passed in rather than a notebook-level global.
    clusters = cluster_model.fit_predict(batch)
    # The Z makes the plural in "clusters" more apparent and easier to tell apart.
    clusterZ_df = pd.DataFrame({'Index': range(clusters.size), 'Cluster': clusters})
    print("\t\tSuccessfully Clustered!")
    return clusterZ_df

In [None]:
# Main pipeline: for every batch of articles, vectorize the texts, reduce the
# TF-IDF matrix with SVD, cluster the texts and save one word cloud per cluster.
for batch_num, batch in enumerate(articles_dataset):

    print(f"\nProcessing batch {batch_num}...")

    # TODO: i- implement zipping the whole base output folder
    #       ii- save the vectorized matrix for each cluster in a file, to later read it in the
    #           browser and use it to compute a user-entered word's similarity with the cluster
    # Now that I think about it, there's no way to calculate similarity with a mere dot product
    # between a single word vector and the vector of all the words in the given space or word cloud.
    # Sure, we can measure similarity between two sentences, but applying the same method to a
    # sentence and a word doesn't seem like it would work.

    # Make directories to neatly organize our output files;
    # this also makes them a lot easier to download later.
    batch_dir = base_dir / f"batch{batch_num}"
    cluster_words_dir = batch_dir / "cluster_words"
    cluster_clouds_dir = batch_dir / "cluster_clouds"
    # exist_ok=True so that re-running the notebook does not crash here.
    os.makedirs(cluster_words_dir, exist_ok=True)
    os.makedirs(cluster_clouds_dir, exist_ok=True)

    # The dataset yields a string tensor; scikit-learn needs an iterable of
    # str/bytes, so hand it the underlying NumPy array instead.
    texts = batch.numpy()

    # Calculate the words, tfidf_matrix and sorted words df by calling the vectorizeX function.
    # `words` and `tfidf_matrix` are also read as notebook-level globals by the helper functions.
    words, tfidf_matrix, sorted_words_tfidf_df = vectorizeX(texts, batch_num=batch_num)
    print(f"\tTotal Words -> {len(words)}")

    filename = batch_dir / f"sorted_words_summed_tfidf_batch_{batch_num}.csv"
    sorted_words_tfidf_df.to_csv(f"{filename}", index=False)

    print("\tApplying SVD...")
    shrunk_matrix = TruncatedSVD(n_components=100).fit_transform(tfidf_matrix)
    # Normalize the matrix so every row has unit length.
    shrunk_norm_matrix = normalize(shrunk_matrix)
    print("\t\tSuccessfully applied SVD.")

    clusterZ_df = clusterItUp(shrunk_norm_matrix, batch_num=batch_num)
    # Make a list of clustered groups for further manipulation.
    cluster_groups = [df_cluster for _, df_cluster in clusterZ_df.groupby("Cluster")]
    total_groups = len(cluster_groups)

    # The number of words we want in the word cloud image.
    # 15 seems to work well, but you can change it to whatever you want.
    max_words = 15

    print(f"\tGenerating word cloud images for clusters... ")
    for i, cluster_df in enumerate(cluster_groups):
        wordcloud_image = cluster_to_cloud(cluster_df, max_words=max_words,
                                           cluster_num=i, batch_num=batch_num)
        # Save inside this batch's folder; a bare file name would be
        # overwritten by the same cluster number of the next batch.
        wordcloud_image.to_file(str(cluster_clouds_dir / f"cluster_{i}_cloud.png"))

        # cluster_to_cloud writes its leftover-words CSV to the working
        # directory; move it next to the other outputs of this batch.
        words_csv = pathlib.Path(f"cluster_{i}_words_batch_{batch_num}.csv")
        if words_csv.exists():
            shutil.move(str(words_csv), str(cluster_words_dir / words_csv.name))

        # Parentheses matter: `i+1%10` parses as `i + (1 % 10)`, which is never 0.
        if (i + 1) % 10 == 0:
            print(f"\t\t{i+1} of {total_groups} word clouds generated...")

    print(f"\tYay! Everything went well for batch {batch_num}. Onto the next one!\n")