In [1]:
import numpy as np
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from numpy.linalg import norm
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from wordcloud import WordCloud
import os
import shutil
import pathlib
import gc
from tqdm.auto import tqdm

In [2]:
# Everything this notebook writes goes under base_dir.
# root_dir is the parent folder; the zip archive of base_dir is placed there.
root_dir = pathlib.Path("/kaggle/working/")
base_dir = root_dir / "output"
base_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Load the "text" column of the articles CSV, wrap it in a tf.data pipeline
# and split it into batches of 8000 texts so each batch is processed on its own.
articles_csv = "/kaggle/input/medium-articles/medium_articles.csv"
articles_dataset = (
    tf.data.Dataset
    .from_tensor_slices(pd.read_csv(articles_csv)["text"].values)
    .batch(8000)
)

# Number of batches; used for progress reporting in the processing loop.
total_batches = len(articles_dataset)
print(total_batches)

2022-06-17 05:02:25.975503: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


25


## We've got 25 batches of up to 8,000 articles each. The dataset has a little over 190K articles, so the first 24 batches account for 192K of them, and the last batch holds only around 1K texts or fewer. That may be too small to cluster well, but we generate word clouds from it anyway.

In [4]:
#Collected all the code in one cell to save memory consumption
#------------------


def rank_words_by_tfidf(indices, words_list, matrix=None):
    """Rank words by their TF-IDF score summed over a subset of matrix rows.

    Args:
        indices: Row indices to include in the sum (``cluster_to_cloud``
            passes the ``Index`` values of one cluster).
        words_list: One word per matrix column, in column order.
        matrix: TF-IDF matrix to rank against, dense or scipy-sparse.
            When omitted, the notebook-level global ``tfidf_matrix`` is used,
            which is what existing callers rely on.

    Returns:
        A DataFrame with columns "Words" and "Summed_TFIDF", sorted by
        "Summed_TFIDF" in descending order.
    """
    if matrix is None:
        # Backward-compatible fallback: read the global set by the batch loop.
        matrix = tfidf_matrix

    # Summing a sparse matrix (or np.matrix) over axis 0 yields shape (1, n);
    # ravel() flattens that to the 1-D column the DataFrame needs and is a
    # no-op for the dense ndarray case.
    summed_tfidf = np.asarray(matrix[indices].sum(axis=0)).ravel()
    data = {"Words": words_list,
            "Summed_TFIDF": summed_tfidf}
    return pd.DataFrame(data).sort_values("Summed_TFIDF", ascending=False)


#-------------------


def cluster_to_cloud(df_cluster, max_words=15, cluster_num=0, words_list=None, batch_num=0, save_dir=None):
    """Build a word cloud from the top-ranked words of one cluster.

    The words are ranked with ``rank_words_by_tfidf`` over the rows listed in
    ``df_cluster.Index``. The first ``max_words`` words go into the cloud; the
    words ranked after them are written to a CSV file when ``save_dir`` is given.

    Args:
        df_cluster: DataFrame of one cluster; must have an ``Index`` column
            holding the matrix row indices that belong to the cluster.
        max_words: Number of top-ranked words drawn in the cloud.
        cluster_num: Cluster number, used only in the CSV file name.
        words_list: One word per TF-IDF matrix column, in column order.
        batch_num: Batch number, used only in the CSV file name.
        save_dir: Directory (``pathlib.Path`` or str) for the CSV of the
            remaining words. When ``None``, no CSV is written.

    Returns:
        The ``WordCloud`` object returned by ``fit_words``.
    """
    indices = df_cluster.Index.values
    ranked_words = rank_words_by_tfidf(indices, words_list)
    cloud_words = ranked_words[:max_words]
    remaining_words = ranked_words[max_words:]

    # Previously the default save_dir=None crashed on `None / "..."`;
    # saving is now simply skipped when no directory is supplied.
    if save_dir is not None:
        filename = pathlib.Path(save_dir) / f"cluster_{cluster_num}_words_batch_{batch_num}.csv"
        # Only the words NOT shown in the cloud are stored in this file.
        remaining_words.to_csv(filename, index=False)

    words_to_score = {word: score
                      for word, score in cloud_words.values}
    cloud_generator = WordCloud(background_color="white",
                                random_state=1, width=2000, height=1000)
    wordcloud_image = cloud_generator.fit_words(words_to_score)
    return wordcloud_image


#------------------


def vectorizeX(batch):
    """Vectorize a batch of texts with TF-IDF and summarize the word scores.

    English stop words are dropped by the vectorizer. The "X" in the name
    stands for eXtra: besides the matrix, the function also returns the
    vocabulary and two per-word score tables.

    Returns a 4-tuple, in this order:
        words: the vectorizer's feature names (one per matrix column).
        tfidf_matrix: the TF-IDF matrix converted to a dense array.
        words_tfidf_df: DataFrame with columns "Words" and "Summed_TFIDF"
            (column sums of the matrix), in vocabulary order.
        sorted_words_tfidf_df: the same table sorted by "Summed_TFIDF",
            highest score first.
    """
    vectorizer = TfidfVectorizer(stop_words="english")
    dense_tfidf = vectorizer.fit_transform(batch).toarray()
    vocabulary = vectorizer.get_feature_names_out()

    # Total TF-IDF weight of every word across all texts in the batch.
    column_totals = dense_tfidf.sum(axis=0)
    unsorted_scores = pd.DataFrame({"Words": vocabulary, "Summed_TFIDF": column_totals})
    ranked_scores = unsorted_scores.sort_values(by="Summed_TFIDF", ascending=False)

    return vocabulary, dense_tfidf, unsorted_scores, ranked_scores


#-------------------


def clusterItUp(shrunk_norm_matrix, batch_num=0, n_clusters=10,
                last_batch_num=24, last_batch_clusters=3):
    """Cluster the rows of a normalized, dimension-reduced matrix with KMeans.

    Every row of ``shrunk_norm_matrix`` gets a cluster id. In this notebook
    the caller passes the SVD-reduced TF-IDF matrix of one batch, so each row
    (and therefore each "Index" value in the result) refers to a text of the
    batch, not to a word.

    Args:
        shrunk_norm_matrix: 2-D array accepted by ``KMeans.fit_predict``.
        batch_num: Zero-based number of the batch being clustered.
        n_clusters: Cluster count for regular batches. 30 was used in earlier
            prototypes but produced too many similar clouds, hence 10.
        last_batch_num: Batch number treated as the small trailing batch
            (24 for the 25 batches of this dataset).
        last_batch_clusters: Cluster count for that trailing batch, which
            holds far fewer texts and so less diversity.

    Returns:
        DataFrame with columns "Index" (row position, 0..n_rows-1) and
        "Cluster" (the KMeans label of that row).
    """
    print("\tClustering words into groups...")

    wanted_clusters = last_batch_clusters if batch_num == last_batch_num else n_clusters
    # KMeans refuses to fit more clusters than there are rows, so cap the
    # request for very small inputs instead of failing.
    n_rows = np.shape(shrunk_norm_matrix)[0]
    cluster_model = KMeans(n_clusters=min(wanted_clusters, n_rows))
    clusters = cluster_model.fit_predict(shrunk_norm_matrix)

    # The Z makes the plural in "clusters" easier to tell apart from
    # the single-cluster frames used elsewhere.
    clusterZ_df = pd.DataFrame({'Index': range(clusters.size), 'Cluster': clusters})
    print("\t\tSuccessfully Clustered!")
    return clusterZ_df

# -------------

# Number of top-ranked words drawn in each word cloud image.
# 15 seems to work well, but you can change it to whatever you want.
max_words = 15

# tqdm wraps the dataset itself (with an explicit total) so the outer bar shows
# real progress; wrapping enumerate() hides the length and only prints "0it".
for batch_num, batch in enumerate(tqdm(articles_dataset, total=total_batches)):

    print(f"\nProcessing batch {batch_num+1} / {total_batches}")

    # Convert the tf tensor to numpy since the rest of the workflow is numpy/pandas.
    batch = batch.numpy()

    # TODO: save the vectorized matrix of each cluster to a file, to later read it in
    #       the browser and compute the similarity of a user-entered word with the cluster.
    # On second thought, a plain dot product between a single word vector and the
    # vector of all the words of a word cloud does not give a meaningful similarity.
    # Sentence-to-sentence similarity works, but applying the same method to a
    # sentence and a single word does not look like it would.

    # Make directories to neatly organize the output files;
    # this also makes them a lot easier to download later.
    print("\tMaking directories...")
    batch_dir = base_dir / f"batch{batch_num}"
    cluster_words_dir = batch_dir / "cluster_words"
    cluster_clouds_dir = batch_dir / "cluster_clouds"
    os.makedirs(cluster_words_dir, exist_ok=True)
    os.makedirs(cluster_clouds_dir, exist_ok=True)
    print("\t\tSuccessfully created required directories.")

    # NOTE: `tfidf_matrix` must keep this exact name at notebook level because
    # rank_words_by_tfidf reads it as a global when ranking cluster words.
    print("\tPerforming vectorization x...")
    words, tfidf_matrix, words_tfidf_df, sorted_words_tfidf_df = vectorizeX(batch)
    print("\t\tSuccessfully vectorized x.")

    print(f"\tTotal Words -> {len(words)}")

    # The unsorted table was announced in the log but never written; save it for real.
    filename = batch_dir / f"words_summed_tfidf_batch_{batch_num}.csv"
    print(f"\tSaving {filename} to disk...")
    words_tfidf_df.to_csv(filename, index=False)

    filename = batch_dir / f"sorted_words_summed_tfidf_batch_{batch_num}.csv"
    print(f"\tSaving {filename} to disk...")
    sorted_words_tfidf_df.to_csv(filename, index=False)

    print("\tApplying SVD...")
    shrunk_matrix = TruncatedSVD(n_components=100).fit_transform(tfidf_matrix)
    print("\t\tSuccessfully applied SVD.")

    # Normalize the reduced matrix before clustering.
    print("\tNormalizing shrunk matrix...")
    shrunk_norm_matrix = normalize(shrunk_matrix)
    print("\t\tTarget normalized.")

    clusterZ_df = clusterItUp(shrunk_norm_matrix, batch_num=batch_num)

    # Save the cluster assignment: one line per matrix row ("Index") with the
    # id of the cluster that row was put in.
    filename = batch_dir / f"clustered_groups_batch_{batch_num}.csv"
    clusterZ_df.to_csv(filename, index=False)

    # One DataFrame per cluster, in ascending cluster-id order.
    cluster_groups = [df_cluster for _, df_cluster in clusterZ_df.groupby("Cluster")]

    print("\tGenerating word cloud images for clusters... ")
    for i, cluster_df in enumerate(tqdm(cluster_groups)):
        # max_words is passed explicitly so that changing it above takes effect.
        wordcloud_image = cluster_to_cloud(cluster_df, max_words=max_words, cluster_num=i,
                                           words_list=words, batch_num=batch_num,
                                           save_dir=cluster_words_dir)

        filename = cluster_clouds_dir / f"cluster_{i}_cloud_batch_{batch_num}.png"
        wordcloud_image.to_file(filename)

    print(f"\tYay! Everything went well for batch {batch_num}. Onto the next one!\n")

    # Drop the large per-batch objects before the next iteration allocates new ones.
    print("\tClearning up memory for next iteration so we dont run out of memory...")
    del (batch, words, tfidf_matrix, words_tfidf_df, sorted_words_tfidf_df,
         shrunk_matrix, shrunk_norm_matrix, clusterZ_df, cluster_groups)
    gc.collect()

# Zip up the whole output folder
shutil.make_archive(root_dir / "output", "zip", base_dir)

0it [00:00, ?it/s]


Processing batch 1 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 96452
	Saving /kaggle/working/output/batch0/words_summed_tfidf_batch_0.csv to disk...
	Saving /kaggle/working/output/batch0/sorted_words_summed_tfidf_batch_0.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 0. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 2 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 101738
	Saving /kaggle/working/output/batch1/words_summed_tfidf_batch_1.csv to disk...
	Saving /kaggle/working/output/batch1/sorted_words_summed_tfidf_batch_1.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 1. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 3 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 100347
	Saving /kaggle/working/output/batch2/words_summed_tfidf_batch_2.csv to disk...
	Saving /kaggle/working/output/batch2/sorted_words_summed_tfidf_batch_2.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 2. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 4 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 99737
	Saving /kaggle/working/output/batch3/words_summed_tfidf_batch_3.csv to disk...
	Saving /kaggle/working/output/batch3/sorted_words_summed_tfidf_batch_3.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 3. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 5 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 100453
	Saving /kaggle/working/output/batch4/words_summed_tfidf_batch_4.csv to disk...
	Saving /kaggle/working/output/batch4/sorted_words_summed_tfidf_batch_4.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 4. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 6 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 103910
	Saving /kaggle/working/output/batch5/words_summed_tfidf_batch_5.csv to disk...
	Saving /kaggle/working/output/batch5/sorted_words_summed_tfidf_batch_5.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 5. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 7 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 101115
	Saving /kaggle/working/output/batch6/words_summed_tfidf_batch_6.csv to disk...
	Saving /kaggle/working/output/batch6/sorted_words_summed_tfidf_batch_6.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 6. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 8 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 99650
	Saving /kaggle/working/output/batch7/words_summed_tfidf_batch_7.csv to disk...
	Saving /kaggle/working/output/batch7/sorted_words_summed_tfidf_batch_7.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 7. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 9 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 101513
	Saving /kaggle/working/output/batch8/words_summed_tfidf_batch_8.csv to disk...
	Saving /kaggle/working/output/batch8/sorted_words_summed_tfidf_batch_8.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 8. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 10 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 94883
	Saving /kaggle/working/output/batch9/words_summed_tfidf_batch_9.csv to disk...
	Saving /kaggle/working/output/batch9/sorted_words_summed_tfidf_batch_9.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 9. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 11 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 95785
	Saving /kaggle/working/output/batch10/words_summed_tfidf_batch_10.csv to disk...
	Saving /kaggle/working/output/batch10/sorted_words_summed_tfidf_batch_10.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 10. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 12 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 99021
	Saving /kaggle/working/output/batch11/words_summed_tfidf_batch_11.csv to disk...
	Saving /kaggle/working/output/batch11/sorted_words_summed_tfidf_batch_11.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 11. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 13 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 86298
	Saving /kaggle/working/output/batch12/words_summed_tfidf_batch_12.csv to disk...
	Saving /kaggle/working/output/batch12/sorted_words_summed_tfidf_batch_12.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 12. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 14 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 100489
	Saving /kaggle/working/output/batch13/words_summed_tfidf_batch_13.csv to disk...
	Saving /kaggle/working/output/batch13/sorted_words_summed_tfidf_batch_13.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 13. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 15 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 98249
	Saving /kaggle/working/output/batch14/words_summed_tfidf_batch_14.csv to disk...
	Saving /kaggle/working/output/batch14/sorted_words_summed_tfidf_batch_14.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 14. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 16 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 105084
	Saving /kaggle/working/output/batch15/words_summed_tfidf_batch_15.csv to disk...
	Saving /kaggle/working/output/batch15/sorted_words_summed_tfidf_batch_15.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 15. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 17 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 96444
	Saving /kaggle/working/output/batch16/words_summed_tfidf_batch_16.csv to disk...
	Saving /kaggle/working/output/batch16/sorted_words_summed_tfidf_batch_16.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 16. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 18 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 106302
	Saving /kaggle/working/output/batch17/words_summed_tfidf_batch_17.csv to disk...
	Saving /kaggle/working/output/batch17/sorted_words_summed_tfidf_batch_17.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 17. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 19 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 115056
	Saving /kaggle/working/output/batch18/words_summed_tfidf_batch_18.csv to disk...
	Saving /kaggle/working/output/batch18/sorted_words_summed_tfidf_batch_18.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 18. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 20 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 112884
	Saving /kaggle/working/output/batch19/words_summed_tfidf_batch_19.csv to disk...
	Saving /kaggle/working/output/batch19/sorted_words_summed_tfidf_batch_19.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 19. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 21 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 121007
	Saving /kaggle/working/output/batch20/words_summed_tfidf_batch_20.csv to disk...
	Saving /kaggle/working/output/batch20/sorted_words_summed_tfidf_batch_20.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 20. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 22 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 112130
	Saving /kaggle/working/output/batch21/words_summed_tfidf_batch_21.csv to disk...
	Saving /kaggle/working/output/batch21/sorted_words_summed_tfidf_batch_21.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 21. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 23 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 115608
	Saving /kaggle/working/output/batch22/words_summed_tfidf_batch_22.csv to disk...
	Saving /kaggle/working/output/batch22/sorted_words_summed_tfidf_batch_22.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 22. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 24 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 98380
	Saving /kaggle/working/output/batch23/words_summed_tfidf_batch_23.csv to disk...
	Saving /kaggle/working/output/batch23/sorted_words_summed_tfidf_batch_23.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/10 [00:00<?, ?it/s]

	Yay! Everything went well for batch 23. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...

Processing batch 25 / 25
	Making directories...
		Successfully created required directories.
	Performing vectorization x...
		Successfully vectorized x.
	Total Words -> 18789
	Saving /kaggle/working/output/batch24/words_summed_tfidf_batch_24.csv to disk...
	Saving /kaggle/working/output/batch24/sorted_words_summed_tfidf_batch_24.csv to disk...
	Applying SVD...
		Successfully applied SVD.
	Normalizing shrunk matrix...
		Target normalized.
	Clustering words into groups...
		Successfully Clustered!
	Generating word cloud images for clusters... 


  0%|          | 0/3 [00:00<?, ?it/s]

	Yay! Everything went well for batch 24. Onto the next one!

	Clearning up memory for next iteration so we dont run out of memory...


'/kaggle/working/output.zip'