# Usage

This notebook expects a document-topic matrix (`doc-topic.parquet`) and a term-topic matrix (`term-topic.parquet`), both in parquet format and in the same directory. It also expects a `settings.json` file in the same directory. The settings file mostly just points to information about the original text, so you can also run the notebook without it and follow the errors to see what needs to be filled in. The data path, index column, and text column should be all that you need to change, which is just three strings.

[Click here](#settings) to edit paths and/or column names.

[Click here](#interactive-display) to jump to results.

# On Reporting

You will see some numbers next to the topics; each one is the average weight of that topic across all documents in the corpus. That number is OK to use and think about; however, do not use it in conjunction with human-assigned topic names. Also avoid reporting it to casual audiences who do not fully grasp the difference between human-understandable topics and machine topics. A topic is a distribution over the whole vocabulary, so even unlikely words that you won't look at can matter, since there are so many of them.

# Imports

## Libraries

In [None]:
from ipywidgets import interact, Combobox
from IPython.display import display, display_html
import json

In [None]:
import pandas as pd
import numpy as np
from scipy.stats.mstats import gmean

## Settings

In [None]:
# Load the optional settings file. A missing, unreadable or malformed file is
# tolerated on purpose: the placeholder defaults below then make the later
# cells fail with messages that say which value needs to be filled in.
try:
    with open("settings.json") as config_file:
        settings = json.load(config_file)
except (OSError, ValueError):
    # OSError: file missing/unreadable. ValueError: invalid JSON or undecodable
    # bytes (json.JSONDecodeError and UnicodeDecodeError are both subclasses).
    # Anything else (e.g. KeyboardInterrupt) is deliberately left to propagate.
    settings = {}
# Necessary pieces of information to know about the file containing the original text.
data_path = settings.get("data_filepath","Your data filepath here")
nice_text_col = settings.get("nice_text_column","Your plain text column here")
index_col = settings.get("index_column","Your index col here") # Column to use as index for dataframe
# Other nice things to know
model_name = settings.get("job_name","Your model file name here")
long_job_name = settings.get("long_name","No long job name in config.")
include_metadata = settings.get("include_metadata",False) # Metadata is extra document-level data

## Data

In [None]:
# Read only the index column and the display-text column of the original data;
# the text is needed purely for showing responses later on.
# The index is assigned in a separate step because specifying it directly in
# the read_csv call was causing trouble.
preprocessed_SES = pd.read_csv(
    data_path,
    usecols=[index_col, nice_text_col],
).set_index(index_col)

## Import Model and create components


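Term score is a slightly better measure of word relevance per topic than the raw probability, especially for human consumption. Here $v$ indexes terms, $k$ indexes topics, $K$ is the number of topics, and $\hat \beta_{v,k}$ is the probability of term $v$ under topic $k$: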
$$ \text{term-score}_{v,k} = \hat \beta_{v,k} \log \left( \frac{\hat \beta_{v,k}}{\left( \prod_{j=1}^K \hat \beta_{v,j}\right)^{\frac{1}{K}}} \right)$$

In [None]:
def convert_row_to_term_score(row):
    '''Converts a word-topic row to a term score row.

    Input should be a series of probabilities (intent is that the term is the index
    of the surrounding dataframe, so one row holds one term across all topics).
    Each entry b becomes b * log(b / g), where g is the geometric mean of the row.
    Returns a series with the same index (and name) as the input.'''
    normalizer = gmean(row) # Geometric mean of the word probabilities across the row
    # Vectorized over the whole row instead of calling a Python lambda per element
    return row * np.log(row / normalizer)

In [None]:
# Load the matrix of word probabilities
term_topic_matrix = pd.read_parquet("term-topic.parquet")
# Build the term score matrix by transforming the probabilities one row at a time
term_score_matrix = term_topic_matrix.apply(convert_row_to_term_score, axis="columns")
print(f"Number of terms: {term_score_matrix.shape[0]}")
print("Term Score Examples")
# Last expression of the cell: the notebook renders these first rows
term_score_matrix.head(3)

In [None]:
# Load the document-topic matrix
document_topic_matrix = pd.read_parquet("doc-topic.parquet")
print(f"Number of documents: {document_topic_matrix.shape[0]}")
print("Document Distribution Examples")
# Last expression of the cell: the notebook renders these first rows
document_topic_matrix.head(3)

In [None]:
# Mean of each topic column over all documents, rounded to 3 decimals; these
# values are shown next to the topic names in the displays further down.
topic_means = document_topic_matrix.mean().round(3)

# Explorer

## Functions

In [None]:
def get_top_responses(topic_name,number_responses,doc_metadata = None):
    '''Displays the {number_responses} responses with the largest value for topic {topic_name}.

    topic_name: column of document_topic_matrix to rank the documents by.
    number_responses: how many of the highest-ranked documents to show.
    doc_metadata: optional dataframe indexed by document id; when given, the row for
        each document is displayed (index hidden) above its text.
    Returns None; everything is rendered through display/display_html.'''
    ranked = document_topic_matrix.sort_values(by=topic_name,ascending = False)
    doc_ids = ranked.index[:number_responses].tolist()
    # Print results
    for doc_id in doc_ids:
        if doc_metadata is not None: # Check if we want to display metadata with each comment
            styler = doc_metadata.loc[[doc_id]].style
            # Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0;
            # Styler.hide(axis="index") replaces it. Fall back for older pandas.
            if hasattr(styler, "hide"):
                styler = styler.hide(axis="index")
            else:
                styler = styler.hide_index()
            display(styler)
        display_html(" • " + preprocessed_SES.loc[doc_id, nice_text_col] + "<br><br><br>", raw = True)

In [None]:
def get_random_responses(topic_name, number_responses, doc_metadata = None):
    '''Gives up to {number_responses} random responses that are dominated by topic {topic_name}.

    A response counts as dominated by the topic when the topic's value is at least as
    large as the largest value in that document's row of document_topic_matrix.
    If fewer such responses exist than requested, all of them are shown.

    doc_metadata: optional dataframe indexed by document id; when given, the row for
        each document is displayed (index hidden) above its text.
    Returns None; everything is rendered through display/display_html.'''
    # Row-wise maximum computed in one vectorized pass instead of a Python call per row
    is_dominant = document_topic_matrix[topic_name] >= document_topic_matrix.max(axis = 1)
    relevant_responses = document_topic_matrix[is_dominant]
    # sample() without replacement raises ValueError when asked for more rows than
    # are available, so cap the request at the number of dominated responses.
    sample_size = min(number_responses, len(relevant_responses))
    doc_ids = relevant_responses.sample(sample_size).index.tolist() if sample_size else []
    for doc_id in doc_ids:
        if doc_metadata is not None: # Check if we want to display metadata with each comment
            styler = doc_metadata.loc[[doc_id]].style
            # Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0;
            # Styler.hide(axis="index") replaces it. Fall back for older pandas.
            if hasattr(styler, "hide"):
                styler = styler.hide(axis="index")
            else:
                styler = styler.hide_index()
            display(styler)
        display_html(" • " + preprocessed_SES.loc[doc_id, nice_text_col] + "<br><br><br>", raw = True)

In [None]:
def display_side_by_side(*args):
    '''Renders the given dataframes next to each other in one HTML output.

    args: any number of objects exposing to_html() (dataframes in this notebook).
    Each table is followed by five non-breaking spaces and is switched to inline
    display so that consecutive tables sit on the same line.'''
    spacer = "\xa0" * 5 # Spaces
    html_str = "".join(df.to_html() + spacer for df in args)
    # Only rewrite opening <table tags. Replacing the bare word 'table' would also
    # alter closing tags and any cell text that happens to contain "table".
    display_html(html_str.replace('<table','<table style="display:inline"'),raw=True)

## Classic Display


In [None]:
# Print the most relevant words for every topic
num_top_words = 14
for c in term_score_matrix.columns:
    # Rank all terms by their term score in column 'c', highest first
    ranked_terms = term_score_matrix[c].sort_values(ascending=False)
    # Keep the first num_top_words entries; the index holds the words themselves
    best_terms = ranked_terms.head(num_top_words).index.tolist()
    # Feel free to replace the plain print with some nicer display function
    print(f'\n Topic {c} -- {topic_means[c]} \n', best_terms)

## Interactive Display

### Word

In [None]:
# continuous_update and value are Combobox options. Passed to interact() they
# match no parameter of plot_term and appear to be dropped, so they are set on
# the widget itself: the box starts at "class" and only refreshes on submit.
@interact(word = Combobox(options = list(term_score_matrix.index), continuous_update = False, value = "class"))
def plot_term(word = "class"):
    '''Shows a bar chart of the term-topic matrix row for {word}, one bar per topic.

    word: term to look up in the index of term_topic_matrix. While the text typed
        so far is not in the index, a waiting message is printed instead.'''
    try:
        display_html(f"<h4> Probability(term|topic) for \"{word}\"",raw=True)
        # NOTE(review): the plot call creates the figure; wrapping the returned axes in
        # display_html presumably renders nothing extra on its own -- confirm before removing.
        display_html(term_topic_matrix.loc[[word]].transpose().plot.bar(ylabel = "Conditional term probability",xlabel = "Topic"))
    except KeyError:
        print("Waiting for valid input")

### Topic

#### Most relevant words by topic

In [None]:
@interact(topic = document_topic_matrix.columns, num = (5,100), cols = (1,10),include_term_score = True)
def top_words(topic,num = 30, cols = 4, include_term_score = True):
    '''Shows the {num} terms with the highest term score for {topic}.

    topic: column of term_score_matrix to rank terms by.
    num: number of terms to show.
    cols: number of side-by-side tables to spread the terms over.
    include_term_score: when True the terms are shown in tables together with their
        score; when False only the list of terms is printed.'''
    # Sort terms by score and keep exactly the requested number before splitting into
    # columns, so the last column cannot run past `num` when cols does not divide it.
    top_terms = term_score_matrix.sort_values(by = topic, ascending = False)[[topic]].head(num)
    top_terms.columns = ["Term Score"]
    display_html(f"<h4><u> Most Relevant words for Topic {topic} ({topic_means[topic]}):", raw = True) # Heading
    if include_term_score:
        per_col = max(1, int(np.ceil(num/cols))) # Words per column; at least 1 so the range step is never 0
        # Partition the truncated dataframe into consecutive chunks of per_col rows
        chunks = [top_terms.iloc[x: x + per_col] for x in range(0, len(top_terms), per_col)]
        display_side_by_side(*chunks)
    else:
        print(top_terms.index.tolist()) # Print them out plainly if we want that for some reason.

#### Responses that most identify with a particular topic in order

In [None]:
@interact(
    topic = document_topic_matrix.columns, # Topic picker filled from the doc-topic matrix columns
    number_responses = [4,20,50,100,500], # Selectable numbers of responses
    include_topic_distributions = False # Toggle: show each response's doc-topic row as well
)
def top_resp(topic, number_responses = 4, include_topic_distributions = False):
    '''Shows a heading for {topic} and then its top responses via get_top_responses.

    When include_topic_distributions is set, the doc-topic matrix is handed over as
    metadata so each response is accompanied by its own row; otherwise no metadata
    is passed.'''
    display_html(f"<h2><u> Top Responses for Topic {topic} ({topic_means[topic]}):", raw = True)
    metadata = document_topic_matrix if include_topic_distributions else None
    return get_top_responses(
        topic_name = topic,
        number_responses = number_responses,
        doc_metadata = metadata,
    )

#### Randomly sample responses dominated by a particular topic

In [None]:
@interact(
    topic = document_topic_matrix.columns, # Topic picker filled from the doc-topic matrix columns
    number_responses = [1,5,10,20,50,100,500], # Selectable numbers of random responses
    include_topic_distributions = False, # Toggle: show each comment's topic distribution as well
    click_for_new = False # Checkbox to click to get new responses
)
def random_resp(topic,number_responses = 1, include_topic_distributions = False, click_for_new = False):
    '''Shows a heading for {topic} and then random responses via get_random_responses.

    When include_topic_distributions is set, the doc-topic matrix is handed over as
    metadata so each response is accompanied by its own row; otherwise no metadata
    is passed. click_for_new is not read in the body.'''
    display_html(f"<h2><u> Random Responses most represented by Topic {topic} ({topic_means[topic]}):", raw = True)
    metadata = document_topic_matrix if include_topic_distributions else None
    return get_random_responses(
        topic_name = topic,
        number_responses = number_responses,
        doc_metadata = metadata,
    )