In [None]:
# hide
# default_exp test.test_clustering_no_params

In [None]:
# exportn_step:first


def something():
    """Placeholder step: takes no arguments, does nothing, returns None."""
    return None

# Text Discovery with Top2Vec

> Top2Vec is an unsupervised topic-detection algorithm. It finds clusters of similar texts and groups them into meaningful topics.
* Get Topics
* Get Words
* Get Docs

# Look at:

* https://github.com/fastai/fastdoc
* https://github.com/fastai/fastpages

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# export


import numpy as np
import pandas as pd
from sciflow.utils import odbc_connect, query

In [None]:
pd.set_option("display.max_colwidth", 800)

# Params

> These parameters are managed by papermill execution

In [None]:
# export

# Number of traffic segments to include; reaches get_experiment_segment via preprocess.
traffic_percent = 1
# Forwarded to Topics through fit().
speed = "fast-learn"
# Forwarded to Topics through fit(); also captured as the default of fit()'s `workers` argument.
workers = 8
# NOTE(review): odbc_connect() is called as soon as this cell runs; presumably this also
# happens when the exported module is imported — confirm that is intended.
odbc_conn = odbc_connect()
# Value compared against the `model` column in get_utterances.
model_level = "TopLevelDispatcher"
# Lower bound (format YYYY-MM-dd) applied to the utterance timestamp in get_utterances.
min_date = "2021-01-01"

In [None]:
# export
def get_traffic_text(percent):
    """Render ``percent`` as text, prefixing "0" when its integer value is below 10.

    Args:
        percent: Any value accepted by both ``int()`` and ``str()`` (e.g. ``3`` or ``"3"``).

    Returns:
        ``str(percent)``, with a single leading "0" added when ``int(percent) < 10``.

    Raises:
        ValueError: If ``int(percent)`` cannot parse the value.
    """
    needs_leading_zero = int(percent) < 10
    text = str(percent)
    if needs_leading_zero:
        return "0" + text
    return text

In [None]:
# String inputs: a single digit gains a leading zero, two-digit values pass through unchanged.
assert get_traffic_text("3") == "03"
assert get_traffic_text("13") == "13"
assert get_traffic_text("78") == "78"

# Preprocess Data

In [None]:
# export
def get_experiment_segment(traffic_percent):
    """Return the text labels for every integer in ``range(traffic_percent)``.

    Args:
        traffic_percent: Number of labels to produce (passed to ``range``).

    Returns:
        A tuple of strings, one per integer, each formatted by ``get_traffic_text``.
    """
    return tuple(map(get_traffic_text, range(traffic_percent)))

In [None]:
# One zero-padded label per integer in range(traffic_percent).
assert get_experiment_segment(1) == ("00",)
assert get_experiment_segment(3) == ("00", "01", "02")
# NOTE(review): the "' '" separator is never inserted here because the tuple has one element.
assert "' '".join(get_experiment_segment(1)) == "00"
# Same join expression that get_utterances uses to build its IN (...) list.
assert f"""IN ('{"','".join(get_experiment_segment(3))}')""" == "IN ('00','01','02')"
assert len(get_experiment_segment(50)) == 50
assert max([int(x) for x in get_experiment_segment(100)]) == 99

In [None]:
# export


def get_utterances(odbc_conn, model, min_date, traffic_percent):
    """Run the utterance query for one model, date floor and traffic segment.

    The SQL selects ``Utterance`` from the ``finn_feedback`` table, filtered by
    ``model``, by a timestamp-derived date being on or after ``min_date``, and by
    a substring of ``AccountNumber`` being one of the segment labels.

    Args:
        odbc_conn: Connection object handed straight to ``query``.
        model: Value compared against the ``model`` column.
        min_date: Lower date bound, written as ``YYYY-MM-dd``.
        traffic_percent: Passed to ``get_experiment_segment`` to build the labels.

    Returns:
        Whatever ``query`` returns for the generated SQL.
    """
    segment_labels = get_experiment_segment(traffic_percent)
    # NOTE(review): values are interpolated directly into the SQL text, so callers
    # must only pass trusted values.
    utterance_sql = f"""
    select Utterance from "chatbot_unpublish_s3"."lambda-output"."finn_feedback"
    where model = '{model}' and to_date(substr("Timestamp", 0, 10), 'YYYY-MM-dd') >= to_date('{min_date}', 'YYYY-MM-dd')
    and substr(AccountNumber, 15, 16) IN ('{"','".join(segment_labels)}')
    """
    return query(odbc_conn, utterance_sql)

In [None]:
# export


def get_button_responses_filter(odbc_conn):
    """Collect the texts that should be treated as button responses.

    Combines the ``text`` values returned by the quick-reply query with a fixed
    list of extra phrases.

    Args:
        odbc_conn: Connection object handed straight to ``query``.

    Returns:
        A list: the queried ``text`` values first, then the hard-coded phrases.
    """
    quick_reply_sql = """
    SELECT "text"
    FROM "chatbot_unpublish_s3"."lambda-output"."live_person".messages a
    inner join "chatbot_unpublish_s3"."lambda-output".digital.events b
    on a."conversationId" = b."LivePersonConversationId"
    where b.QuickReplyButton = true and a.eventBy = 'Consumer'
    """
    quick_reply_rows = query(odbc_conn, quick_reply_sql)
    hardcoded_phrases = [
        "Transaction enquiry",
        "Transaction Enquiry",
        "Hi",
        "Hello",
        "Card declined",
        "Close account",
    ]
    return [*quick_reply_rows.text.tolist(), *hardcoded_phrases]

In [None]:
# exportn_step:preprocess


def preprocess(odbc_conn, model_level, min_date, traffic_percent):
    """Fetch utterances and drop those that match a known button response.

    Args:
        odbc_conn: Connection used for both underlying queries.
        model_level: Model value forwarded to ``get_utterances``.
        min_date: Date floor forwarded to ``get_utterances``.
        traffic_percent: Segment size forwarded to ``get_utterances``.

    Returns:
        ``{"documents": [...]}`` where the list holds the remaining ``Utterance``
        values.
    """
    utterances = get_utterances(odbc_conn, model_level, min_date, traffic_percent)
    excluded_texts = get_button_responses_filter(odbc_conn)
    is_button_text = utterances.Utterance.isin(excluded_texts)
    free_text_rows = utterances[~is_button_text].copy()
    return {"documents": free_text_rows.Utterance.tolist()}

In [None]:
documents = preprocess(odbc_conn, model_level, min_date, traffic_percent)["documents"]

In [None]:
# Preprocessing must leave at least one document.
assert len(documents) > 0
# NOTE(review): "Payment Issues" and "Credit Limit Enquiry" are not in the hard-coded list
# inside get_button_responses_filter, so this check relies on the queried button responses.
assert (
    pd.Series(["Transaction Enquiry", "Payment Issues", "Credit Limit Enquiry"])
    .isin(pd.Series(documents))
    .sum()
    == 0
)  # no button response texts

In [None]:
# export


class Topics:
    """Stand-in topic model whose methods return fixed, hard-coded values.

    Every method ignores its arguments.
    """

    def __init__(self, documents, workers, speed):
        # The arguments are accepted but neither stored nor used.
        return None

    @staticmethod
    def _fixed_triplet():
        """Build a fresh (items, scores, ids) tuple used by both lookup methods."""
        items = ["cat", "sat", "mat", "mouse", "house", "grouse"]
        scores = np.asarray([1] * 6)
        ids = list(range(1, 7))
        return items, scores, ids

    def get_num_topics(self):
        """Return the fixed topic count, 6."""
        return 6

    def get_topic_sizes(self):
        """Return two separate lists, each holding 1 through 6."""
        first = list(range(1, 7))
        second = list(range(1, 7))
        return first, second

    def get_topics(self, num_topics):
        """Return the fixed (items, scores, ids) tuple; ``num_topics`` is ignored."""
        return self._fixed_triplet()

    def search_documents_by_topic(self, topic_num, num_docs):
        """Return the fixed (items, scores, ids) tuple; both arguments are ignored."""
        return self._fixed_triplet()

    def generate_topic_wordcloud(self, topic_num):
        """Print a marker line instead of drawing anything."""
        print("wordcloud")

    def hierarchical_topic_reduction(self, num_topics):
        """Return a fixed three-item list; ``num_topics`` is ignored."""
        return ["cat", "sat", "mat"]

# Fit

In [None]:
# exportn_step:fit


def fit(documents, workers=workers, speed="fast-learn"):
    """Construct a ``Topics`` model from the documents.

    Args:
        documents: Passed to ``Topics`` as its first argument.
        workers: Passed to ``Topics``. The default is the value the module-level
            ``workers`` held when this ``def`` statement was executed.
        speed: Passed to ``Topics``.

    Returns:
        ``{"model": <Topics instance>}``.
    """
    return {"model": Topics(documents, workers=workers, speed=speed)}

In [None]:
# slow
# Pauses for 10 seconds; presumably a stand-in for a long-running step — confirm.
import time

time.sleep(10)

In [None]:
model = fit(documents, workers=workers, speed=speed)["model"]

# Evaluate

# Number of Topics

In [None]:
model.get_num_topics()

# Size of Topics

In [None]:
# Every reported topic size must be positive.
topic_sizes, topic_nums = model.get_topic_sizes()
assert all([s > 0 for s in topic_sizes])

# Get Topic Words & Scores

In [None]:
# Requesting all topics must yield exactly one word entry per topic.
topic_words, word_scores, topic_nums = model.get_topics(model.get_num_topics())
assert len(topic_words) == model.get_num_topics()

# Get Representative Documents for Topics

In [None]:
# export
def get_num_docs(topic_idx, topic_sizes, max_k=50):
    """Return the size of one topic, capped at ``max_k``.

    Args:
        topic_idx: Index into ``topic_sizes``.
        topic_sizes: Indexable collection of topic sizes.
        max_k: Upper bound on the returned value.

    Returns:
        ``topic_sizes[topic_idx]`` when it is below ``max_k``, otherwise ``max_k``.
    """
    # max_k comes first so that it is the value returned when the two are equal.
    return min(max_k, topic_sizes[topic_idx])

In [None]:
# Distance in word space & in intent space
# Do topics match sub-intents?

In [None]:
# Pick one topic index at random (no seed is set, so the choice can differ between runs).
i = np.random.choice(model.get_num_topics(), 1)[0]
# Request at most 20 documents for that topic, capped by the topic's reported size.
docs, doc_scores, doc_ids = model.search_documents_by_topic(
    topic_num=i, num_docs=get_num_docs(i, topic_sizes, max_k=20)
)

# Every returned document must be a str that encodes to UTF-8 bytes.
assert all([type(doc) == str for doc in docs])
assert all([type(doc.encode("utf-8")) == bytes for doc in docs])

docs

In [None]:
# vis
# time.sleep(120)
model.generate_topic_wordcloud(0)

In [None]:
# exportn_step:evaluate


def evaluate(model):
    """Check that the model's topics pass three sanity conditions.

    The conditions are: every topic's word entry is non-empty, all word scores
    lie within [0.0, 1.0], and the topic count equals both the number of word
    entries and the first dimension of the score array.

    Args:
        model: Object providing ``get_num_topics()`` and ``get_topics(n)``; the
            scores it returns must support ``.min()``, ``.max()`` and ``.shape``.

    Returns:
        A truthy value when all three conditions hold, otherwise a falsy one.
    """
    words_per_topic, scores, _topic_ids = model.get_topics(model.get_num_topics())

    non_empty_flags = [len(words) > 0 for words in words_per_topic]
    every_topic_has_words = all(non_empty_flags)

    scores_within_unit_interval = scores.min() >= 0.0 and scores.max() <= 1.0

    n_word_entries = len(words_per_topic)
    counts_agree = (
        model.get_num_topics() == n_word_entries
        and n_word_entries == scores.shape[0]
    )

    return every_topic_has_words and scores_within_unit_interval and counts_agree

# Serve

In [None]:
# export
def serve_num_topics(model):
    """Return the number of topics reported by ``model.get_num_topics()``."""
    topic_count = model.get_num_topics()
    return topic_count

In [None]:
assert serve_num_topics(model) > 0

In [None]:
# export
def serve_reduced_hierarchies(model, desired_num_topics):
    """Return ``model.hierarchical_topic_reduction(desired_num_topics)`` unchanged.

    Args:
        model: Object providing ``hierarchical_topic_reduction(num_topics)``.
        desired_num_topics: Target topic count forwarded to the model.

    Returns:
        Whatever the model's reduction method returns.
    """
    reduced = model.hierarchical_topic_reduction(desired_num_topics)
    return reduced

In [None]:
# Call serve_reduced_hierarchies with out-of-range topic counts; the named errors are caught and reported.
# NOTE(review): the Topics class defined in this notebook returns a fixed list and never
# raises, so neither handler fires when it is used.
try:
    serve_reduced_hierarchies(model, -1)
except IndexError:
    print("Negative indexing not possible")
try:
    serve_reduced_hierarchies(model, model.get_num_topics() + 1)  # > #topics
except ValueError as ve:
    print(ve)