In [4]:
import pandas as pd
from collections import Counter

In [5]:
def cluster_word_frequency(models, column='text'):
    """
    Writes one word-frequency CSV per cluster for each clustering model.

    For every name in ``models`` the dataframe pickled at ``'{model}.pkl'``
    is loaded. Its rows are grouped by the values of the ``'cluster'``
    column, the whitespace-separated words of ``column`` are counted per
    cluster, and the result is written to ``'{model}_cluster{n}.csv'``
    with the columns ``Name`` (the word), ``Count`` and ``Cluster``,
    ordered from the most to the least frequent word.

    Parameters
    ----------
    models : list
        The list of models used for clustering. Each entry is used as the
        file stem of the pickle to read and of the CSV files to write.
    column : str
        The column containing the text data.
        Default: 'text'

    Returns
    -------
    None
        The results are only written to disk.

    Notes
    -----
    Entries of ``column`` that are not strings (e.g. missing values) are
    skipped instead of aborting the whole run. Words with equal counts
    keep the order in which they were first encountered.
    """
    for model in models:
        # NOTE: unpickling can execute arbitrary code; only load pickle
        # files that come from a trusted source.
        df = pd.read_pickle(f'{model}.pkl')
        for n in df['cluster'].unique():
            # texts belonging to the current cluster
            cluster_texts = df.loc[df['cluster'] == n, column]
            # count words, ignoring missing / non-string entries which
            # cannot be split into words
            word_counts = Counter(
                word
                for text in cluster_texts
                if isinstance(text, str)
                for word in text.split()
            )
            # most_common() already yields (word, count) pairs sorted by
            # descending count, so the frame gets a clean 0..k-1 index
            word_counts_df = pd.DataFrame(
                word_counts.most_common(), columns=['Name', 'Count']
            )
            # add cluster column
            word_counts_df['Cluster'] = n
            # write to csv (index column kept for downstream readers)
            word_counts_df.to_csv(f'{model}_cluster{n}.csv')
    return None

In [6]:
# Names of the clustering runs to summarise; each one must have a matching
# '<name>.pkl' file in the current working directory.
models = ['KMeans', 'Agglomerative', 'DBSCAN']
# Write the per-cluster word-frequency CSV files for every model listed.
cluster_word_frequency(models=models, column='text')