Main Code

In [17]:
import base64
import pandas as pd
from io import StringIO

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline


def sfs_to_df(sfs_data):
    if not isinstance(sfs_data, dict) or 'content' not in sfs_data:
        return None
    content = sfs_data['content']
    if not content:
        return []
    try:
        decoded_content = base64.b64decode(content).decode('utf-8')
    except Exception:
        decoded_content = content
    csv_buffer = StringIO(decoded_content)
    df = pd.read_csv(csv_buffer)
    return df


def do_bow(documents, min_phrase_length=1, max_phrase_length=3):
    count_vectorizer = CountVectorizer(
        lowercase=True,
        stop_words='english',
        ngram_range=(min_phrase_length, max_phrase_length),
        min_df=2,
        max_df=0.95
    )
    bow_matrix = count_vectorizer.fit_transform(documents)
    feature_names = count_vectorizer.get_feature_names_out()
    return bow_matrix, feature_names


def dimensionality_reduction(K, bow_matrix):
    lsa = make_pipeline(
        TruncatedSVD(n_components=min(100, K * 2)),  # Use fewer components if K is small
        Normalizer(copy=False)
    )
    bow_matrix_lsa = lsa.fit_transform(bow_matrix)
    # Convert sparse matrix to dense for easier manipulation downstream
    bow_matrix_dense = bow_matrix.toarray()  # Shape: (n_docs, n_features)
    return bow_matrix_lsa, bow_matrix_dense


def do_kmeans_clustering(K, documents, bow_matrix_lsa, bow_matrix_dense, feature_names):
    kmeans = KMeans(
        n_clusters=K,
        n_init=10,  # Multiple runs for better initialization
        max_iter=300,
        random_state=42
    )
    cluster_labels = kmeans.fit_predict(bow_matrix_lsa)
    # Calculate importance of each phrase within its cluster to balance intra-cluster relevance and cluster size
    # define importance as: (frequency in cluster / total frequency in corpus) * (cluster size / total documents)
    phrase_importance = {}
    # For each cluster, find the top phrases based on TF-IDF-like weighting within the cluster
    for cluster_id in range(K):
        # Get documents belonging to this cluster
        mask = (cluster_labels == cluster_id)
        cluster_doc_indices = np.where(mask)[0]
        # If no documents in this cluster, skip
        if len(cluster_doc_indices) == 0:
            continue
        # Compute the average TF (term frequency) of each word within this cluster
        cluster_tf = bow_matrix_dense[cluster_doc_indices].sum(axis=0)  # Sum across docs in cluster
        # Total number of documents in the dataset
        total_docs = len(documents)
        # Cluster size
        cluster_size = len(cluster_doc_indices)
        # For each phrase, calculate its importance
        for i, phrase in enumerate(feature_names):
            # Term frequency within cluster
            tf_in_cluster = cluster_tf[i]
            # Global term frequency across all documents
            global_tf = bow_matrix_dense[:, i].sum()
            # Avoid division by zero
            if global_tf == 0:
                continue
            # Weighted importance: higher if frequent in cluster and rare globally
            # Also penalize if cluster is very small
            importance = (tf_in_cluster / global_tf) * (cluster_size / total_docs)
            # Accumulate importance across clusters
            if phrase not in phrase_importance:
                phrase_importance[phrase] = 0
            phrase_importance[phrase] += importance
    phrase_percentage_pairs = [(phrase, score) for phrase, score in phrase_importance.items()]
    return phrase_percentage_pairs


def get_top_K(K, results):
    results.sort(key=lambda x: x[1], reverse=True)
    top_k_ngrams = results[:K]
    return top_k_ngrams


def handler(input_data):
    sfs_file = input_data.get("File")
    K = input_data.get("K", 5)
    df = sfs_to_df(sfs_file)
    documents = df.iloc[:, 1]  # use col 2

    bow_matrix, feature_names = do_bow(documents, 1, 3)
    bow_matrix_lsa, bow_matrix_dense = dimensionality_reduction(K, bow_matrix)
    phrase_percentage_pairs = do_kmeans_clustering(K, documents, bow_matrix_lsa, bow_matrix_dense, feature_names)
    top_k_ngrams = get_top_K(K, phrase_percentage_pairs)
    return top_k_ngrams

In [18]:
import top_k_phrases_main as tkm
tkm.main("questionnaire responses.csv", 5)

Processing CSV file: questionnaire responses.csv
The top 5 Phrases are:
• ??? — 91.67% mention: clients
• ??? — 91.67% mention: managers
• ??? — 91.67% mention: need
• ??? — 83.33% mention: marketing
• ??? — 83.33% mention: sales


In [19]:
K = 5
csv_file_path = "questionnaire responses.csv"
df = pd.read_csv(csv_file_path, encoding='latin1')
documents = df.iloc[:, 1]
bow_matrix, feature_names = do_bow(documents, 1, 3)

print(bow_matrix)
feature_names

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 1019 stored elements and shape (12, 246)>
  Coords	Values
  (0, 24)	1
  (0, 94)	1
  (0, 46)	1
  (0, 33)	1
  (0, 128)	1
  (0, 173)	1
  (0, 230)	1
  (0, 140)	2
  (0, 61)	1
  (0, 0)	1
  (0, 189)	1
  (0, 123)	1
  (0, 116)	1
  (0, 59)	1
  (0, 64)	1
  (0, 200)	1
  (0, 105)	1
  (0, 197)	1
  (0, 179)	1
  (0, 30)	1
  (0, 233)	1
  (0, 19)	1
  (0, 87)	1
  (0, 113)	1
  (0, 211)	1
  :	:
  (11, 144)	1
  (11, 181)	1
  (11, 32)	1
  (11, 235)	1
  (11, 22)	1
  (11, 91)	1
  (11, 115)	1
  (11, 213)	1
  (11, 205)	1
  (11, 164)	1
  (11, 136)	1
  (11, 5)	1
  (11, 132)	1
  (11, 238)	1
  (11, 121)	1
  (11, 122)	1
  (11, 145)	1
  (11, 239)	1
  (11, 207)	1
  (11, 17)	1
  (11, 208)	1
  (11, 148)	2
  (11, 1)	2
  (11, 2)	1
  (11, 86)	1


array(['android', 'attractive', 'audience', 'audience sales',
       'audience sales marketing', 'audiences', 'awareness', 'best',
       'brands', 'buttons', 'calculator', 'calculator sales',
       'calculator sales pitch', 'calculator training',
       'calculator training design', 'case', 'case studies',
       'case studies pricing', 'case studies sales', 'cases',
       'cases industry', 'cases industry hands', 'cases industry local',
       'challenge', 'challenging', 'challenging introduce',
       'challenging introduce rcs', 'challenging iâ', 'challenging iâ ve',
       'channels', 'clear', 'clear use', 'clear use cases', 'clients',
       'clients familiar', 'clients familiar rcs', 'clients know',
       'clients meet', 'clients meet relevant', 'clients moved',
       'clients moved forwardâ', 'compare', 'compared', 'conversation',
       'conversation flows', 'conversation flows clients',
       'conversations', 'conversations clients',
       'conversations clients meet', 

In [20]:
lsa = make_pipeline(
    TruncatedSVD(n_components=min(100, K * 2)),  # Use fewer components if K is small
    Normalizer(copy=False)
)
bow_matrix_lsa = lsa.fit_transform(bow_matrix)
# Convert sparse matrix to dense for easier manipulation downstream
bow_matrix_dense = bow_matrix.toarray()  # Shape: (n_docs, n_features)

print(bow_matrix_lsa)

[[ 8.04625081e-01 -3.40576150e-01  3.42218436e-01  2.69781794e-02
  -2.05412073e-01  1.12075219e-01 -1.03427587e-01 -1.88825401e-01
   1.84540472e-02  1.31519048e-01]
 [ 5.19153440e-01 -2.93488460e-01 -2.62933607e-01  6.14194187e-01
  -1.76176690e-01  4.29445044e-02 -3.41898687e-01  9.43851998e-02
  -1.79685857e-01 -8.36833708e-02]
 [ 8.30081308e-01  5.52307926e-01  3.17506010e-02  4.51038261e-02
  -9.93578227e-03 -2.41315450e-02 -6.74493424e-03 -4.35444044e-02
  -1.52271428e-02 -4.89847180e-03]
 [ 7.98428841e-01 -1.71253970e-01 -2.79149934e-01 -1.44553304e-01
  -4.74962538e-02  3.51693643e-01  2.75168140e-01  1.67885280e-01
  -6.71965051e-02 -6.82249588e-04]
 [ 5.35597870e-01 -2.32296156e-02  2.59603485e-01 -3.78138805e-01
   4.03287744e-01  5.66822673e-02 -3.59429072e-01  3.88854035e-01
   1.17329623e-01 -2.05415527e-01]
 [ 6.90986533e-01 -1.96185308e-01 -1.08489711e-01 -3.27463025e-01
  -2.77335013e-01 -4.57007754e-01 -6.62569235e-02  1.97782204e-01
   4.50031976e-02  1.83692737e-01

In [21]:
print(bow_matrix_dense)

[[1 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 1 6]
 ...
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 1 1 0]
 [0 2 1 ... 0 0 0]]


K Means

In [22]:
kmeans = KMeans(
    n_clusters=K,
    n_init=10,  # Multiple runs for better initialization
    max_iter=300,
    random_state=42
)
cluster_labels = kmeans.fit_predict(bow_matrix_lsa)
# Calculate importance of each phrase within its cluster to balance intra-cluster relevance and cluster size
# define importance as: (frequency in cluster / total frequency in corpus) * (cluster size / total documents)
phrase_importance = {}
# For each cluster, find the top phrases based on TF-IDF-like weighting within the cluster
for cluster_id in range(K):
    # Get documents belonging to this cluster
    mask = (cluster_labels == cluster_id)
    cluster_doc_indices = np.where(mask)[0]
    # If no documents in this cluster, skip
    if len(cluster_doc_indices) == 0:
        continue
    # Compute the average TF (term frequency) of each word within this cluster
    cluster_tf = bow_matrix_dense[cluster_doc_indices].sum(axis=0)  # Sum across docs in cluster
    # Total number of documents in the dataset
    total_docs = len(documents)
    # Cluster size
    cluster_size = len(cluster_doc_indices)
    # For each phrase, calculate its importance
    for i, phrase in enumerate(feature_names):
        # Term frequency within cluster
        tf_in_cluster = cluster_tf[i]
        # Global term frequency across all documents
        global_tf = bow_matrix_dense[:, i].sum()
        # Avoid division by zero
        if global_tf == 0:
            continue
        # Weighted importance: higher if frequent in cluster and rare globally
        # Also penalize if cluster is very small
        importance = (tf_in_cluster / global_tf) * (cluster_size / total_docs)
        # Accumulate importance across clusters
        if phrase not in phrase_importance:
            phrase_importance[phrase] = 0
        phrase_importance[phrase] += importance
phrase_percentage_pairs = [(phrase, score) for phrase, score in phrase_importance.items()]
print(phrase_percentage_pairs)

[('android', np.float64(0.20833333333333331)), ('attractive', np.float64(0.2222222222222222)), ('audience', np.float64(0.17592592592592593)), ('audience sales', np.float64(0.16666666666666666)), ('audience sales marketing', np.float64(0.16666666666666666)), ('audiences', np.float64(0.19444444444444442)), ('awareness', np.float64(0.2222222222222222)), ('best', np.float64(0.25)), ('brands', np.float64(0.20833333333333331)), ('buttons', np.float64(0.20833333333333331)), ('calculator', np.float64(0.2361111111111111)), ('calculator sales', np.float64(0.20833333333333331)), ('calculator sales pitch', np.float64(0.20833333333333331)), ('calculator training', np.float64(0.25)), ('calculator training design', np.float64(0.25)), ('case', np.float64(0.23333333333333334)), ('case studies', np.float64(0.23958333333333331)), ('case studies pricing', np.float64(0.25)), ('case studies sales', np.float64(0.25)), ('cases', np.float64(0.21296296296296297)), ('cases industry', np.float64(0.208333333333333

Sort by Top K

In [23]:
phrase_percentage_pairs.sort(key=lambda x: x[1], reverse=True)
phrase_percentage_pairs

[('best', np.float64(0.25)),
 ('calculator training', np.float64(0.25)),
 ('calculator training design', np.float64(0.25)),
 ('case studies pricing', np.float64(0.25)),
 ('case studies sales', np.float64(0.25)),
 ('cases industry local', np.float64(0.25)),
 ('channels', np.float64(0.25)),
 ('compare', np.float64(0.25)),
 ('compared', np.float64(0.25)),
 ('conversation flows clients', np.float64(0.25)),
 ('ctr', np.float64(0.25)),
 ('directors', np.float64(0.25)),
 ('experience', np.float64(0.25)),
 ('flows clients', np.float64(0.25)),
 ('help', np.float64(0.25)),
 ('image', np.float64(0.25)),
 ('industry local', np.float64(0.25)),
 ('industry local success', np.float64(0.25)),
 ('lead', np.float64(0.25)),
 ('level', np.float64(0.25)),
 ('managers executive', np.float64(0.25)),
 ('managers executive specialist', np.float64(0.25)),
 ('pitch templates pricing', np.float64(0.25)),
 ('reply', np.float64(0.25)),
 ('roi calculator training', np.float64(0.25)),
 ('sessions', np.float64(0.25)),

In [24]:
top_k_ngrams = phrase_percentage_pairs[:K]
top_k_ngrams

[('best', np.float64(0.25)),
 ('calculator training', np.float64(0.25)),
 ('calculator training design', np.float64(0.25)),
 ('case studies pricing', np.float64(0.25)),
 ('case studies sales', np.float64(0.25))]