In [2]:
# hide
# default_exp test.test_clustering

In [3]:
# exportn_step:first


def something():
    """Print a fixed message marking the first step; returns nothing."""
    message = "The first step"
    print(message)

# Example Usage: Mock Unsupervised Modelling

Auto-reloading modules is very useful when using `nbdev` as changes to underlying modules are picked up without having to restart the kernel.

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
# export


import numpy as np
import pandas as pd

from sciflow.utils import lib_path

# Params

> `sciflow` uses the papermill format for parameterising notebooks. See here for how to specify papermill params: https://papermill.readthedocs.io/en/latest/usage-parameterize.html. These parameters will be available to use in your flows.

In [6]:
# export

# Notebook parameters. `preprocess` and `get_utterances` accept keywords named
# `model_level`, `min_date` and `traffic_percent`; `fit` uses `workers` as the
# default value of its `workers` argument.
traffic_percent = 1
workers = 8
model_level = "dispatcher"
min_date = "2021-01-01"

In [7]:
# export


def get_traffic_text(percent):
    """Format a traffic percentage as a zero-padded string of at least two characters.

    Args:
        percent: An integer, or any value `int()` accepts (e.g. ``"3"``, ``"03"``).

    Returns:
        str: The integer value left-padded with zeros to width two, e.g.
        ``3 -> "03"``, ``"13" -> "13"``, ``100 -> "100"``.

    Raises:
        ValueError, TypeError: If `percent` cannot be converted by `int()`.
    """
    # Format the parsed integer rather than the raw input, so values that are
    # already padded ("03") or carry whitespace (" 3") are not padded a second time.
    return f"{int(percent):02d}"

`nbdev` tests are any cells which do not export code and do not carry a flag that excludes them from testing.

In [8]:
# A single-digit value gains a leading "0"; two-digit values come back unchanged.
assert get_traffic_text("3") == "03"
assert get_traffic_text("13") == "13"
assert get_traffic_text("78") == "78"

# Preprocess Data

In [9]:
# export


def get_experiment_segment(traffic_percent):
    """Return the traffic-text labels for ``0 .. traffic_percent - 1`` as a tuple."""
    bucket_ids = range(traffic_percent)
    return tuple(map(get_traffic_text, bucket_ids))

In [10]:
# Labels start at "00" and there is exactly one per requested percent.
assert get_experiment_segment(1) == ("00",)
assert get_experiment_segment(3) == ("00", "01", "02")
# Joining with the quoted-comma separator builds the body of an IN (...) list;
# with a single segment the separator never appears.
assert "','".join(get_experiment_segment(1)) == "00"
assert f"""IN ('{"','".join(get_experiment_segment(3))}')""" == "IN ('00','01','02')"
assert len(get_experiment_segment(50)) == 50
assert max(int(x) for x in get_experiment_segment(100)) == 99

In [11]:
# export


def get_utterances(model_level=None, min_date=None, traffic_percent=100):
    """
    Stand-in for real data preparation: builds synthetic utterances so the example has no external dependencies.

    `model_level` and `min_date` are accepted but not used. `traffic_percent` is only
    passed to `get_experiment_segment`, whose result is discarded. Returns a
    `pd.Series` named "utterance" holding 100 phrases drawn with `np.random.choice`
    (no seed is set here, so the draw differs between runs).
    """
    # The segment labels are not used by the synthetic data below; the call is kept
    # so the function still exercises `get_experiment_segment` with this argument.
    get_experiment_segment(traffic_percent)
    phrases = [
        "Hello",
        "Goodbye",
        "Hi",
        "Can you help?",
        "I have an issue, can you help me?",
    ]
    sampled = np.random.choice(phrases, 100)
    return pd.Series(sampled, name="utterance")

In [12]:
# exportn_step:preprocess


def preprocess(model_level=None, min_date=None, traffic_percent=100):
    """Fetch the utterances and return them as a plain list under the "documents" key."""
    utterances = get_utterances(model_level, min_date, traffic_percent)
    return {"documents": utterances.tolist()}

In [13]:
# Pass the notebook parameters by keyword: a lone positional argument would bind
# to `model_level` and leave `traffic_percent` at its default of 100.
documents = preprocess(
    model_level=model_level, min_date=min_date, traffic_percent=traffic_percent
)["documents"]

In [14]:
assert len(documents) > 0
# no button response texts
unexpected_texts = pd.Series(
    ["Some other text", "Which should not be in the utterances"]
)
assert unexpected_texts.isin(pd.Series(documents)).sum() == 0

In [15]:
# export


class Topics:
    """Mock topic model: every method returns fixed values regardless of its input."""

    def __init__(self, documents, workers):
        """Accept `documents` and `workers` without storing or using them."""

    def get_num_topics(self):
        """Return the fixed topic count, 6."""
        return 6

    def get_topic_sizes(self):
        """Return a pair of independent lists, each ``[1, 2, 3, 4, 5, 6]``."""
        first = list(range(1, 7))
        second = list(range(1, 7))
        return first, second

    def get_topics(self, num_topics):
        """Return a fixed ``(words, scores, numbers)`` triple; `num_topics` is ignored.

        `words` is a list of six strings, `scores` a NumPy array of six ones and
        `numbers` the list ``[1, 2, 3, 4, 5, 6]``.
        """
        words = ["cat", "sat", "mat", "mouse", "house", "grouse"]
        scores = np.asarray([1] * 6)
        numbers = [1, 2, 3, 4, 5, 6]
        return words, scores, numbers

    def plot_wordcloud(self):
        """Print a reminder instead of drawing anything."""
        reminder = "you may want to remove plotting code from testing to speed things up"
        print(reminder)

# Fit

In [16]:
# exportn_step:fit


def fit(documents, workers=workers):
    """Build a `Topics` model from `documents` and return it under the "model" key.

    The default for `workers` is the value the module-level `workers` name holds
    when this function is defined.
    """
    return {"model": Topics(documents, workers=workers)}

> Long-running tests can be excluded from test execution. You can use the `tst_flags` setting in settings.ini or create your own flags in the same file. See https://nbdev.fast.ai/test for more info. In this example we use `# slow` to indicate that the cell should be skipped.

In [17]:
# slow
import time

# Placeholder for slow work: this cell only pauses for three seconds.
time.sleep(3)

In [18]:
# Run the fit step on the preprocessed documents and keep only the model object.
model = fit(documents, workers=workers)["model"]

# Evaluate

# Number of Topics

In [19]:
# Display the topic count reported by the model.
model.get_num_topics()

6

# Size of Topics

In [20]:
topic_sizes, topic_nums = model.get_topic_sizes()
# Every reported topic size must be strictly positive.
assert all(size > 0 for size in topic_sizes)

# Get Topic Words & Scores

In [21]:
# Request as many topics as the model reports; note this rebinds `topic_nums`.
topic_words, word_scores, topic_nums = model.get_topics(model.get_num_topics())
# There must be exactly one word entry per topic.
assert len(topic_words) == model.get_num_topics()

In [22]:
# vis
# A `time.sleep(120)` call is left disabled here — presumably it simulated slow plotting.
model.plot_wordcloud()

you may want to remove plotting code from testing to speed things up


In [23]:
# exportn_step:evaluate


def evaluate(model):
    """Sanity-check the model's topic words and scores, and collect artifacts and metrics.

    Args:
        model: Object exposing ``get_num_topics()`` and ``get_topics(num_topics)``.
            The latter must return ``(topic_words, word_scores, topic_nums)``, where
            ``word_scores`` is a NumPy-style array (``size``, ``min``, ``max`` and
            ``shape`` are used).

    Returns:
        dict with keys:
            "word_summaries": plain ``bool``. True only if every topic word entry is
                non-empty, the scores are non-empty and all lie in [0, 1], and the
                topic count equals both ``len(topic_words)`` and
                ``word_scores.shape[0]``.
            "artifacts": list of local file paths.
            "metrics": list of ``(name, value, step)`` tuples.
    """
    # Ask for the topic count once so every check below compares against the same value.
    num_topics = model.get_num_topics()
    topic_words, word_scores, _ = model.get_topics(num_topics)

    topic_contains_non_empty_words = all(len(tw) > 0 for tw in topic_words)
    # min()/max() raise on a zero-size array, so an empty score array is reported
    # as a failed range check instead of crashing the step.
    word_scores_in_range = (
        word_scores.size > 0
        and word_scores.min() >= 0.0
        and word_scores.max() <= 1.0
    )
    as_many_items_as_topics = num_topics == len(topic_words) == word_scores.shape[0]
    # The range check yields a NumPy boolean; coerce so callers get a built-in bool.
    word_summaries = bool(
        topic_contains_non_empty_words
        and word_scores_in_range
        and as_many_items_as_topics
    )
    # You can add artifacts in a step that will be saved to block storage. Add the paths to the file on the local filesystem
    # and the artifact will be uploaded to remote storage.
    artifacts = [lib_path("nbs", "test", "dataframe_artifact.csv")]
    # You can add step metrics too; this time just add a list of 3-tuples where tuple order = (name, value, step)
    metrics = [("mae", 100, 0), ("mae", 67, 1), ("mae", 32, 2)]
    results = {
        "word_summaries": word_summaries,
        "artifacts": artifacts,
        "metrics": metrics,
    }
    return results

In [24]:
# Run the evaluate step on the fitted model and pin each entry of its result dict.
results = evaluate(model)
assert results["word_summaries"]
assert results["metrics"] == [("mae", 100, 0), ("mae", 67, 1), ("mae", 32, 2)]
assert results["artifacts"] == [lib_path("nbs", "test", "dataframe_artifact.csv")]

# Serve

In [25]:
# export
def serve_num_topics(model):
    """Return whatever `model.get_num_topics()` reports."""
    num_topics = model.get_num_topics()
    return num_topics

In [26]:
# The served topic count must be positive for the fitted model.
assert serve_num_topics(model) > 0