Main Code

In [None]:
import base64
import pandas as pd
from io import StringIO
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF + n-grams
from typing import Dict, Any, List, Tuple
import numpy as np


def sfs_to_df(
    file_content
) -> pd.DataFrame:
    try:
        decoded_content = base64.b64decode(file_content).decode('utf-8')
    except Exception:
        decoded_content = file_content
    csv_buffer = StringIO(decoded_content)
    df = pd.read_csv(csv_buffer)
    return df


def do_tfidf(
    documents: List[str], 
    min_phrase_length: int = 3, 
    max_phrase_length: int = 3
) -> Tuple[np.ndarray, List[str]]:
    tfidf_vectorizer = TfidfVectorizer(
        lowercase=True,
        stop_words='english',
        ngram_range=(min_phrase_length, max_phrase_length),
        min_df=2,
        max_df=0.95
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    feature_names = tfidf_vectorizer.get_feature_names_out()
    return tfidf_matrix, feature_names


def calc_percentages(
    total_respondents: int, 
    tfidf_matrix: np.ndarray, 
    feature_names: List[str]
) -> List[Tuple[str, float]]:
    phrase_freq_pairs = {}
    for i, phrase in enumerate(feature_names):
        # Binary count non-zero entries in the TF-IDF matrix for this phrase
        phrase_freq = (tfidf_matrix[:, i] > 0).sum()
        phrase_freq_pairs[phrase] = phrase_freq
    phrase_percentage_pairs = []
    for phrase, phrase_freq in phrase_freq_pairs.items():
        percentage = (phrase_freq / total_respondents) * 100
        phrase_percentage_pair = (phrase, percentage)
        phrase_percentage_pairs.append(phrase_percentage_pair)
    return phrase_percentage_pairs


def get_top_K(
    K: int, 
    results: List[Tuple[str, float]]
) -> List[Tuple[str, float]]:
    results.sort(key=lambda x: x[1], reverse=True)
    top_k_ngrams = results[:K]
    return top_k_ngrams


def handler(
    input_data: Dict[str, Any]
) -> Dict[str, Any]:
    file_content = input_data.get("File").get("content")
    K = input_data.get("K", 5)
    df = sfs_to_df(file_content)
    documents = df.iloc[:, 1]  # use col 2

    tfidf_matrix, feature_names = do_tfidf(documents, 1, 3)
    phrase_percentage_pairs = calc_percentages(len(documents), tfidf_matrix, feature_names)
    top_k_ngrams = get_top_K(K, phrase_percentage_pairs)
    return top_k_ngrams

Simulating node being run ...

In [3]:
import top_k_phrases_main as tkm
tkm.main("questionnaire responses.csv", 5)

Processing CSV file: questionnaire responses.csv
The top 5 Phrases are:
• ??? — 91.67% mention: clients
• ??? — 91.67% mention: managers
• ??? — 91.67% mention: need
• ??? — 83.33% mention: marketing
• ??? — 83.33% mention: sales


Testing

In [2]:
K = 5
csv_file_path = "questionnaire responses.csv"
df = pd.read_csv(csv_file_path, encoding='latin1')
documents = df.iloc[:, 1]

tfidf_matrix, feature_names = do_tfidf(documents, 1, 3)
print(tfidf_matrix)
feature_names

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1019 stored elements and shape (12, 246)>
  Coords	Values
  (0, 24)	0.09530449320891317
  (0, 94)	0.08098845740963533
  (0, 46)	0.08098845740963533
  (0, 33)	0.06929140446470394
  (0, 128)	0.08098845740963533
  (0, 173)	0.08098845740963533
  (0, 230)	0.08774798465164782
  (0, 140)	0.13858280892940789
  (0, 61)	0.08774798465164782
  (0, 0)	0.15823074232701018
  (0, 189)	0.07487372123160878
  (0, 123)	0.07487372123160878
  (0, 116)	0.06929140446470394
  (0, 59)	0.15823074232701018
  (0, 64)	0.09530449320891317
  (0, 200)	0.09530449320891317
  (0, 105)	0.15823074232701018
  (0, 197)	0.09530449320891317
  (0, 179)	0.09530449320891317
  (0, 30)	0.08774798465164782
  (0, 233)	0.08098845740963533
  (0, 19)	0.08774798465164782
  (0, 87)	0.08774798465164782
  (0, 113)	0.09530449320891317
  (0, 211)	0.09530449320891317
  :	:
  (11, 144)	0.10897318633401004
  (11, 181)	0.10897318633401004
  (11, 32)	0.09205788627253741
  (11, 235)	0.09

array(['android', 'attractive', 'audience', 'audience sales',
       'audience sales marketing', 'audiences', 'awareness', 'best',
       'brands', 'buttons', 'calculator', 'calculator sales',
       'calculator sales pitch', 'calculator training',
       'calculator training design', 'case', 'case studies',
       'case studies pricing', 'case studies sales', 'cases',
       'cases industry', 'cases industry hands', 'cases industry local',
       'challenge', 'challenging', 'challenging introduce',
       'challenging introduce rcs', 'challenging iâ', 'challenging iâ ve',
       'channels', 'clear', 'clear use', 'clear use cases', 'clients',
       'clients familiar', 'clients familiar rcs', 'clients know',
       'clients meet', 'clients meet relevant', 'clients moved',
       'clients moved forwardâ', 'compare', 'compared', 'conversation',
       'conversation flows', 'conversation flows clients',
       'conversations', 'conversations clients',
       'conversations clients meet', 

In [9]:
phrase_percentage_pairs = calc_percentages(len(documents), tfidf_matrix, feature_names)
phrase_percentage_pairs

[('android', np.float64(16.666666666666664)),
 ('attractive', np.float64(16.666666666666664)),
 ('audience', np.float64(25.0)),
 ('audience sales', np.float64(16.666666666666664)),
 ('audience sales marketing', np.float64(16.666666666666664)),
 ('audiences', np.float64(16.666666666666664)),
 ('awareness', np.float64(16.666666666666664)),
 ('best', np.float64(16.666666666666664)),
 ('brands', np.float64(16.666666666666664)),
 ('buttons', np.float64(16.666666666666664)),
 ('calculator', np.float64(50.0)),
 ('calculator sales', np.float64(16.666666666666664)),
 ('calculator sales pitch', np.float64(16.666666666666664)),
 ('calculator training', np.float64(16.666666666666664)),
 ('calculator training design', np.float64(16.666666666666664)),
 ('case', np.float64(66.66666666666666)),
 ('case studies', np.float64(58.333333333333336)),
 ('case studies pricing', np.float64(16.666666666666664)),
 ('case studies sales', np.float64(16.666666666666664)),
 ('cases', np.float64(66.66666666666666)),


In [10]:
phrase_percentage_pairs.sort(key=lambda x: x[1], reverse=True)
phrase_percentage_pairs

[('clients', np.float64(91.66666666666666)),
 ('managers', np.float64(91.66666666666666)),
 ('need', np.float64(91.66666666666666)),
 ('marketing', np.float64(83.33333333333334)),
 ('sales', np.float64(83.33333333333334)),
 ('clients meet', np.float64(75.0)),
 ('clients meet relevant', np.float64(75.0)),
 ('conversations', np.float64(75.0)),
 ('conversations clients', np.float64(75.0)),
 ('conversations clients meet', np.float64(75.0)),
 ('introduce', np.float64(75.0)),
 ('introduce rcs', np.float64(75.0)),
 ('introduce rcs conversations', np.float64(75.0)),
 ('meet', np.float64(75.0)),
 ('meet relevant', np.float64(75.0)),
 ('rcs conversations', np.float64(75.0)),
 ('rcs conversations clients', np.float64(75.0)),
 ('relevant', np.float64(75.0)),
 ('use', np.float64(75.0)),
 ('case', np.float64(66.66666666666666)),
 ('cases', np.float64(66.66666666666666)),
 ('cases industry', np.float64(66.66666666666666)),
 ('clear', np.float64(66.66666666666666)),
 ('clear use', np.float64(66.666666

In [11]:
top_k_ngrams = phrase_percentage_pairs[:K]
top_k_ngrams

[('clients', np.float64(91.66666666666666)),
 ('managers', np.float64(91.66666666666666)),
 ('need', np.float64(91.66666666666666)),
 ('marketing', np.float64(83.33333333333334)),
 ('sales', np.float64(83.33333333333334))]