# Setup
Perform common imports and data setup which is shared across notebooks.

**Note:** this notebook should be run as a part of another notebook using `%run` magic. 

In [None]:
# Warnings are imported first to filter out import warnings.
# NOTE: the 'ignore' filter is process-wide and has no category restriction,
# so it also hides every warning raised later in this notebook (including the
# `warnings.warn` calls made by the utility functions defined further down).
import warnings
warnings.filterwarnings('ignore')

In [15]:
try:
    import base64
    import boto3
    import csv
    import ftfy
    import gensim
    import hashlib
    import hdbscan
    import html
    import io
    import itertools
    import json
    import json_lines
    import langid
    import logging
    import math
    import matplotlib.pyplot as plt
    import mpld3
    import multiprocessing
    import networkx as nx
    import numpy as np
    import operator
    import os
    import pandas as pd
    import pickle
    import pprint as pp
    import pyarrow.parquet as pq
    import pytextrank
    import random
    import re
    import requests
    import scipy
    import seaborn as sns
    import shutil
    import six
    import smart_open
    import string
    import sys
    import tensorflow as tf
    import tensorflow_hub as hub
    import tensorflow_probability as tfp
    import time
    import traceback
    import urllib
    import xml.etree.ElementTree as etree


    from bokeh.io import output_notebook, show
    from bokeh.models import ColumnDataSource, HoverTool
    from bokeh.plotting import figure
    from botocore.exceptions import ClientError
    from bs4 import BeautifulSoup
    from collections import Counter, defaultdict, namedtuple
    from datetime import datetime
    from enum import Enum
    from functools import reduce
    from glob import glob
    from io import BytesIO, StringIO
    from IPython.display import display, HTML, Image
#     from Levenshtein import distance as levenshtein_distance
    from matplotlib import gridspec, rc
    from matplotlib.animation import FuncAnimation
    from MulticoreTSNE import MulticoreTSNE as TSNE
    from newspaper import Article
    from nltk import pos_tag
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    from nltk.stem.porter import PorterStemmer
    from nltk.tokenize import sent_tokenize, word_tokenize
    from pathlib import Path
    from PIL import Image as pil_img
    from pprint import pprint
    from scipy import spatial
    from scipy.cluster.hierarchy import ward, dendrogram
    from shutil import copyfile
    from sklearn import datasets, decomposition, feature_extraction, metrics, model_selection, preprocessing
    from sklearn.cluster import KMeans, DBSCAN
    from sklearn.decomposition import PCA, TruncatedSVD
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.manifold import MDS
    from sklearn.metrics import accuracy_score, average_precision_score, auc, f1_score, precision_score, precision_recall_curve, recall_score, roc_auc_score, roc_curve
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.model_selection import KFold, train_test_split
    from sklearn.utils import shuffle
    from skopt import gp_minimize
    from skopt.space import Categorical, Integer, Real
#     from skopt.utils import use_named_args
    from stringcase import camelcase, snakecase
    from tensor2tensor import models, problems
    from tensor2tensor.data_generators import problem, text_problems
    from tensor2tensor.utils import registry, trainer_lib
    from tensorflow.contrib.rnn import GRUCell, LSTMCell, LSTMStateTuple
    from tensorflow.python.feature_column import feature_column as fc_core
    from tensorflow.python.framework import ops
    from tensorflow.python.saved_model import constants
    from termcolor import colored, cprint
    from textblob import TextBlob as tb
    from tqdm import tqdm_notebook
    from urllib.parse import urlparse
    from urllib.request import urlopen
except ImportError as e:
    print('Import error: ', e)

# Short aliases for TensorFlow APIs used across notebooks.
# These run outside the `try` above, so they raise NameError if
# `import tensorflow as tf` itself did not succeed.
# NOTE(review): a single failing import inside the `try` block skips every
# import listed after it (see the recorded "Import error" output), so names
# imported later in that block may be undefined as well.
Modes = tf.estimator.ModeKeys
tfe = tf.contrib.eager

Import error:  This version of TensorFlow Probability requires TensorFlow version >= 1.12.0; Detected an installation of version 1.9.0. Please upgrade TensorFlow to proceed.


In [None]:
np.warnings.filterwarnings('ignore')

### TF Eager
Optionally enable eager execution mode (set `TF_EAGER = True` before this cell runs).

In [None]:
# `TF_EAGER` is presumably set by the including notebook before this one is
# run; fall back to graph mode (False) when it has not been defined.
try:
    TF_EAGER
except NameError:
    # Only "name not defined" should trigger the default; anything else
    # (e.g. KeyboardInterrupt) must not be swallowed.
    TF_EAGER = False
if TF_EAGER:
    tf.enable_eager_execution()

### Jupyter Notebook Settings

In [None]:
%%javascript
// Sets the notebook front-end's output auto-scroll threshold to 100
// (presumably the number of output lines before a cell's output area scrolls).
IPython.OutputArea.auto_scroll_threshold = 100;

In [None]:
# Notebook front-end settings (IPython magics): autosave interval, inline
# matplotlib rendering and 'retina' figure format.
%autosave 30
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Pandas display limits; floats are rendered with 3 decimal places.
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Seed the NumPy, Python and TensorFlow random number generators with the
# same fixed value so that runs are repeatable.
np.random.seed(42)
random.seed(42)
tf.set_random_seed(42)
# trainer_lib.set_random_seed(42)
print('Notebook & runtime settings applied...')

### Utility Functions

In [None]:
def prepare_directory(dir_path, rmdir=False):
    """Make sure `dir_path` exists as a directory and return it.

    Existing contents are kept by default. Only when `rmdir` is True is an
    already existing directory tree removed first and re-created empty.
    """
    if os.path.exists(dir_path):
        if rmdir:
            shutil.rmtree(dir_path)
    os.makedirs(dir_path, exist_ok=True)
    return dir_path

def get_n_grams(word_list, n=2):
    """Build the `n`-grams of `word_list`, each joined with an underscore.

    For `n` < 2 a warning is issued and `word_list` itself is returned.
    When `word_list` has fewer than `n` items the result is an empty list.
    """
    if n < 2:
        warnings.warn('Obtaining ngrams with n < 2.', Warning)
        return word_list

    # One n-gram per sliding-window start position.
    window_starts = range(len(word_list) - n + 1)
    return ['_'.join(word_list[start:start + n]) for start in window_starts]

def get_sentence_n_grams(sentence_word_list, n=2):
    """Collect the `n`-grams of every sentence into one flat list.

    `sentence_word_list` is an iterable of sentences, each given as a list of
    words; n-grams never span two sentences.
    """
    return [gram
            for sentence in sentence_word_list
            for gram in get_n_grams(sentence, n)]

def get_tf_idf_dict_for_text(text_str, vectorizer, tf_idf_words):
    """Map the tokens of a text to their TF-IDF scores.

    Args:
        text_str: text to score; it is split on single spaces to seed the
            result with a 0 score for every token.
        vectorizer: object whose `transform([text])` returns a sparse matrix
            supporting `.tocoo()` (presumably a fitted `TfidfVectorizer`).
        tf_idf_words: sequence translating a term (column) id into its word.

    Returns:
        Dict of word -> score. Tokens without a positive score keep 0; words
        reported by the vectorizer with a positive score are added/overwritten.
    """
    scores = dict.fromkeys(text_str.split(' '), 0)
    sparse_scores = vectorizer.transform([text_str]).tocoo()
    for term_id, score in zip(sparse_scores.col, sparse_scores.data):
        if score > 0:
            scores[tf_idf_words[term_id]] = score
    return scores

def strip_text(text):
    """Keep only the allowed characters of `text`.

    Every run of characters outside letters, digits and ``?.!,¿-'"()`` (this
    includes whitespace) is collapsed into one space; leading and trailing
    whitespace is then removed.
    """
    collapsed = re.sub(r'[^a-zA-Z0-9?.!,¿\-\'"()]+', ' ', text)
    return collapsed.strip()

def get_article_for_url(url):
    """Download and parse `url` with newspaper3k; return the `Article`.

    Images are not fetched and the request timeout is 5 (seconds, presumably).
    Errors raised by the download or parse step propagate to the caller.
    """
    article = Article(url=url, fetch_images=False, request_timeout=5)
    article.download()
    article.parse()
    return article

def get_html_for_url(url):
    """Returns HTML from URL (extracted via newspaper3k).

    Best-effort helper: if downloading or parsing fails, an empty string is
    returned instead of raising.
    """
    try:
        return get_article_for_url(url).html
    except Exception:
        # Deliberately broad (network, parsing, ...), but unlike a bare
        # `except:` it lets KeyboardInterrupt/SystemExit through, so a long
        # scraping loop can still be interrupted from the notebook.
        return ''

def get_text_for_url(url):
    """Returns text from URL (extracted via newspaper3k).

    Best-effort helper: if downloading or parsing fails, an empty string is
    returned instead of raising.
    """
    try:
        return get_article_for_url(url).text
    except Exception:
        # Deliberately broad (network, parsing, ...), but unlike a bare
        # `except:` it lets KeyboardInterrupt/SystemExit through, so a long
        # scraping loop can still be interrupted from the notebook.
        return ''

def find_all_substr(source_str, target_str):
    """Yield the start index of every non-overlapping occurrence of
    `target_str` in `source_str`, scanning left to right.

    An empty `target_str` yields nothing (previously it made the generator
    yield index 0 forever, because the search position never advanced).
    """
    if not target_str:
        return
    # Jump past each match; use a step of 1 instead to find overlapping matches.
    step = len(target_str)
    start = source_str.find(target_str)
    while start != -1:
        yield start
        start = source_str.find(target_str, start + step)
        
def find_all_sublists(source_list, target_list):
    """Return the start indices of all occurrences of `target_list` inside
    `source_list` (overlapping occurrences are included).

    Slices of `source_list` are compared with `==`, so both arguments should
    be of the same sequence type.
    """
    window = len(target_list)
    last_start = len(source_list) - window
    return [pos for pos in range(last_start + 1)
            if source_list[pos:pos + window] == target_list]

def get_host_from_url(url):
    """Return the network location (`netloc`) part of `url`.

    This includes a port or credentials when the URL carries them, and is an
    empty string when the URL has no `//` authority section.
    """
    parsed_url = urlparse(url)
    return parsed_url.netloc

def get_md5_hash_str(input_str):
    """Return the hexadecimal MD5 digest of `input_str` (encoded as UTF-8)."""
    digest = hashlib.md5()
    digest.update(input_str.encode('utf8'))
    return digest.hexdigest()

def flatten_2d_list(list_2d):
    """Flatten a list of lists by one level into a new list.

    `None` is passed through unchanged.
    """
    if list_2d is None:
        return None
    return [item for inner in list_2d for item in inner]

def get_language(input_str, max_str_length=300):
    """Return the language `langid` predicts for `input_str`.

    Only the first `max_str_length` characters are classified; the first
    element of `langid.classify`'s result is returned.
    """
    sample = input_str[:max_str_length]
    prediction = langid.classify(sample)
    return prediction[0]

def safe_str_to_int_cast(input_str):
    """Convert `input_str` with `int()`, returning 0 when that is not possible.

    Only conversion failures are absorbed (unsupported type, malformed number,
    non-finite float); KeyboardInterrupt and similar are no longer swallowed.
    """
    try:
        return int(input_str)
    except (TypeError, ValueError, OverflowError):
        return 0

LINK_REGEX = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,4}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)'
def extract_links_from_text(text):
    """Return all HTTP(S) links matched by `LINK_REGEX` in `text`, in order
    of appearance (an empty list when there are none)."""
    links = []
    for match in re.finditer(LINK_REGEX, text):
        links.append(match.group(0))
    return links

def strip_expr(source_str, strip_expr):
    """Remove `strip_expr` from the end of `source_str` when it is a suffix;
    otherwise return `source_str` unchanged."""
    suffix_len = len(strip_expr)
    if source_str[-suffix_len:] == strip_expr:
        return source_str[:-suffix_len]
    return source_str

def strip_exprs(source_str, strip_exprs_list):
    """Strip suffixes from `source_str`, trying each entry of
    `strip_exprs_list` once and in list order.

    Each suffix is checked against the result of the previous removals, so the
    order of the list matters.
    """
    remaining = source_str
    for suffix in strip_exprs_list:
        suffix_len = len(suffix)
        if remaining[-suffix_len:] == suffix:
            remaining = remaining[:-suffix_len]
    return remaining

def get_file_logger(log_file_path='custom_log.log', logger_name='custom_logger'):
    """Returns a logger which logs to a file with datetime-formatted messages.

    Args:
        log_file_path: file the INFO-level handler writes to.
        logger_name: name passed to `logging.getLogger`.

    Returns:
        The named logger, set to INFO level, after an INFO message
        'Log file created' has been emitted on it.

    Calling this repeatedly with the same arguments (e.g. when a notebook cell
    is re-run) reuses the handler already attached for that file instead of
    adding another one, so each message is written only once.
    """
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)

    # `getLogger` returns the same object for the same name, so handlers
    # accumulate across calls unless an existing one for this file is detected.
    # `FileHandler.baseFilename` holds the absolute path of its file.
    target_path = os.path.abspath(log_file_path)
    already_attached = any(
        isinstance(existing, logging.FileHandler)
        and existing.baseFilename == target_path
        for existing in logger.handlers)

    if not already_attached:
        handler = logging.FileHandler(log_file_path)
        handler.setLevel(logging.INFO)
        formatter = logging.Formatter(
            '%(levelname)s:%(asctime)s:%(message)s', datefmt='%m/%d/%Y %I:%M:%S')
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    logger.info('Log file created')
    return logger

### TF Utils
#### Logging Filters

In [None]:
class TFFilter(logging.Filter):
    """Logging filter that drops records whose formatted message starts with
    'Initialize variable' and lets every other record through."""

    def filter(self, record):
        message = record.getMessage()
        is_init_noise = message.startswith('Initialize variable')
        return not is_init_noise

# Attach the filter to the 'tensorflow' logger so that records starting with
# 'Initialize variable' are suppressed.
# NOTE: each execution of this cell adds a fresh TFFilter instance.
tf_logger = logging.getLogger('tensorflow')
tf_logger.addFilter(TFFilter())
print('TensorFlow logging filter applied...')
print('TensorFlow version: {}'.format(tf.__version__))

#### Checkpoints

In [None]:
def copy_weights_from_ckpt(source_ckpt, target_ckpt):
    """Overwrite variables of `target_ckpt` with same-named ones from `source_ckpt`.

    Every variable name present in the target checkpoint is kept: its value is
    taken from the source checkpoint when the name also exists there, and from
    the target checkpoint otherwise. The result is saved back to `target_ckpt`.
    The default graph is reset before the variables are created.
    """
    tf.reset_default_graph()

    target_reader = tf.train.NewCheckpointReader(target_ckpt)
    source_reader = tf.train.NewCheckpointReader(source_ckpt)
    source_names = source_reader.get_variable_to_shape_map()

    # Build one tf.Variable per target name, preferring the source value.
    merged_vars = {}
    for var_name in target_reader.get_variable_to_shape_map():
        reader = source_reader if var_name in source_names else target_reader
        merged_vars[var_name] = tf.Variable(reader.get_tensor(var_name))

    # Write the merged variables over the target checkpoint.
    init_op = tf.global_variables_initializer()
    saver = tf.train.Saver(merged_vars)

    with tf.Session() as sess:
        sess.run(init_op)
        saver.save(sess, target_ckpt)
        
        
def rename_vars_in_ckpt(name_map, in_file, out_file):
    """Write a copy of a checkpoint with some variables renamed.

    Adapted from: https://github.com/KranthiGV/Pretrained-Show-and-Tell-model/issues/7#issuecomment-309862894

    Args:
        name_map: dict of old variable name -> new variable name; variables
            not listed keep their name.
        in_file: checkpoint to read.
        out_file: path the renamed checkpoint is saved to.

    Example:
        OLD_CHECKPOINT_FILE = os.path.join(MODEL_PATH, 'checkpoint-329')
        NEW_CHECKPOINT_FILE = os.path.join(MODEL_PATH, 'checkpoint-329-fixed')
        vars_to_rename = {
            'lstm/basic_lstm_cell/weights': 'lstm/basic_lstm_cell/kernel',
            'lstm/basic_lstm_cell/biases': 'lstm/basic_lstm_cell/bias',
        }
        rename_vars_in_ckpt(vars_to_rename, OLD_CHECKPOINT_FILE, NEW_CHECKPOINT_FILE)
    """
    tf.reset_default_graph()
    reader = tf.train.NewCheckpointReader(in_file)

    renamed_vars = {}
    for old_name in reader.get_variable_to_shape_map():
        # Fall back to the original name when no mapping is given.
        target_name = name_map.get(old_name, old_name)
        renamed_vars[target_name] = tf.Variable(reader.get_tensor(old_name))

    init_op = tf.global_variables_initializer()
    saver = tf.train.Saver(renamed_vars)

    with tf.Session() as sess:
        sess.run(init_op)
        saver.save(sess, out_file)
        
        
def average_checkpoints(checkpoints='',
                        num_last_checkpoints=0,
                        prefix='',
                        output_path='/tmp/averaged.ckpt'):
    """Creates a TF checkpoint which is an average of provided checkpoints.

    Adapted from https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/avg_checkpoints.py.

    Args:
        checkpoints: Comma-separated list of checkpoints to average.
        num_last_checkpoints: Averages the last N saved checkpoints.
            Ignored when `checkpoints` is set.
        prefix: String prepended to every entry of `checkpoints`. When
            `checkpoints` is empty it is required, and its directory is
            searched for the saved checkpoint state. Its directory is also
            where `graph.pbtxt` is copied from.
        output_path: Path the averaged checkpoint is saved under (the saver
            is called with a `global_step` of 0); `graph.pbtxt` is copied
            into its directory.

    Raises:
        ValueError: if `checkpoints` contains no names, or if no checkpoints
            can be found in the directory of `prefix`.
        AssertionError: if `checkpoints` is empty and `num_last_checkpoints`
            is below 1 or `prefix` is empty.
    """
    if checkpoints:
        # Get the checkpoints list and run some basic checks.
        checkpoints = [c.strip() for c in checkpoints.split(',')]
        checkpoints = [c for c in checkpoints if c]
        if not checkpoints:
            raise ValueError('No checkpoints provided for averaging.')
        if prefix:
            checkpoints = [prefix + c for c in checkpoints]
    else:
        assert num_last_checkpoints >= 1, 'Must average at least one model'
        assert prefix, ('Prefix must be provided when averaging last'
                        ' N checkpoints')
        checkpoint_dir = os.path.dirname(prefix)
        checkpoint_state = tf.train.get_checkpoint_state(checkpoint_dir)
        # `get_checkpoint_state` returns None when the directory holds no
        # checkpoint state; report that the same way as an empty list instead
        # of failing with AttributeError on the attribute access below.
        if checkpoint_state is None:
            raise ValueError(f'Could not find checkpoints at {os.path.dirname(prefix)}')
        # Checkpoints are ordered from oldest to newest.
        checkpoints = checkpoint_state.all_model_checkpoint_paths[
            -num_last_checkpoints:]
        if not checkpoints:
            raise ValueError(f'Could not find checkpoints at {os.path.dirname(prefix)}')

    # Read variables from all checkpoints and average them.
    tf.logging.info('Reading variables and averaging checkpoints:')
    for c in checkpoints:
        tf.logging.info(f'{c} ')
    # The variable list of the first checkpoint defines what gets averaged;
    # `global_step` is excluded and re-created as 0 below.
    var_list = tf.contrib.framework.list_variables(checkpoints[0])
    var_values, var_dtypes = {}, {}
    for (name, shape) in var_list:
        if not name.startswith('global_step'):
            var_values[name] = np.zeros(shape)
    for checkpoint in checkpoints:
        reader = tf.contrib.framework.load_checkpoint(checkpoint)
        for name in var_values:
            tensor = reader.get_tensor(name)
            var_dtypes[name] = tensor.dtype
            var_values[name] += tensor
        tf.logging.info(f'Read from checkpoint {checkpoint}')
    for name in var_values:  # Average.
        var_values[name] /= len(checkpoints)

    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        tf_vars = [
            tf.get_variable(v, shape=var_values[v].shape, dtype=var_dtypes[v]) \
            for v in var_values]
    placeholders = [tf.placeholder(v.dtype, shape=v.shape) for v in tf_vars]
    assign_ops = [tf.assign(v, p) for (v, p) in zip(tf_vars, placeholders)]
    global_step = tf.Variable(0, name='global_step', trainable=False, dtype=tf.int64)
    saver = tf.train.Saver(tf.all_variables())

    # Build a model consisting only of variables, set them to the average values.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # `tf_vars` was built by iterating `var_values`, so the three
        # sequences below line up element by element.
        for p, assign_op, (name, value) in zip(
            placeholders, assign_ops, six.iteritems(var_values)):
            sess.run(assign_op, {p: value})
        # Use the built saver to save the averaged checkpoint.
        saver.save(sess, output_path, global_step=global_step)
    tf.logging.info(f'Averaged checkpoints saved in {output_path}')

    # Copy graph file.
    # NOTE(review): the source directory is derived from `prefix`; with an
    # empty `prefix` this resolves to 'graph.pbtxt' in the working directory.
    shutil.copy2(os.path.join(os.path.dirname(prefix), 'graph.pbtxt'), os.path.dirname(output_path))
    tf.logging.info(f'Copied graph.pbtxt file.')

### Histogram Overlays

In [None]:
# Default number of quantile bins per numerical feature.
BIN_CNT = 25
# Divisor applied to the uniform [0, 1) noise that is added to numerical
# features so that quantile bin edges are unique.
NOISE_REDUCTION_LEVEL = 1e4
# NOTE(review): not referenced by the plotting functions in this cell;
# `plot_bins_categorical` uses its own `max_category_cnt=7` default instead.
MAX_CATEGORICAL_FEATURES = 7
# Controls x-axis tick spacing: with more bins than this, ticks are placed
# every `bin_cnt // MAX_BIN_TICKS` bins.
MAX_BIN_TICKS = 7

def plot_bins_numerical(df, feature_names, class_column, class_values, class_colors=None, feature_bin_cnts=None, plot_kde=False):
    """Plots per-class distributions of quantile-binned numerical features.

    Each feature is split into equal-frequency bins with `pd.qcut`; for every
    class, the distribution of its examples over the bin numbers is drawn in
    one subplot per feature (three subplots per row).

    Args:
        df: DataFrame holding the feature columns and `class_column`; rows
            with missing values in any of these columns are dropped.
        feature_names: list of numerical column names to plot.
        class_column: name of the column holding the class label.
        class_values: class labels to plot, one overlay per label.
        class_colors: optional colors, parallel to `class_values`; defaults
            to colors sampled from the 'hsv' colormap.
        feature_bin_cnts: optional bin counts, parallel to `feature_names`;
            defaults to `BIN_CNT` for every feature.
        plot_kde: draw a kernel density estimate instead of a histogram.
    """
    fig = plt.figure(figsize=(14, int(len(feature_names)) * 1.5))
    gs = gridspec.GridSpec(math.ceil(len(feature_names) / 3), 3)
    df = df[feature_names + [class_column]].dropna()

    # Number of bins for each feature. If counts are not provided will use BIN_CNT bins.
    feature_bin_cnts = feature_bin_cnts if feature_bin_cnts else [BIN_CNT] * len(feature_names)

    # Use default colors if class colors are not provided.
    if not class_colors:
        cmap = plt.cm.get_cmap('hsv', len(class_values))
        class_colors = list(map(cmap, range(len(class_values))))

    # Add some noise so that bin edges are unique.
    df_numerical = df[feature_names].copy()
    df_numerical += np.random.random(df_numerical.shape) / NOISE_REDUCTION_LEVEL

    for i, (f, bin_cnt) in enumerate(zip(feature_names, feature_bin_cnts)):
        ax = plt.subplot(gs[i])
        ax.set_xlabel('Bin Number')
        # Thin the ticks out when there are many bins. The step is clamped to
        # at least 1: with `plot_kde=True` and fewer than MAX_BIN_TICKS bins
        # the integer division is 0, which `range` rejects with ValueError.
        # A step of 1 gives one tick per bin, as in the few-bins case.
        tick_step = max(1, bin_cnt // MAX_BIN_TICKS)
        ax.set_xticks(range(0, bin_cnt + 1, tick_step))
        ax.set_ylabel('Kernel Density')
        ax.set_title(f)

        # Split data in `bin_cnt` buckets of equal (by number of examples) size.
        bins = pd.qcut(df_numerical[f], bin_cnt, labels=False)

        # Plot distribution of examples into bins for each of the classes.
        for class_name, class_color in zip(class_values, class_colors):
            bins_s = bins[df[class_column] == class_name]

            # Nothing to draw for a class without examples.
            if not len(bins_s):
                continue

            if plot_kde:
                sns.kdeplot(bins_s.values,
                            shade=True,
                            cut=5,
                            color=class_color,
                            label=class_name)
            else:
                sns.distplot(bins_s.values,
                             bins=bin_cnt,
                             color=class_color,
                             kde=False,
                             norm_hist=True,
                             hist_kws=dict(edgecolor='dimgrey', linewidth=1),
                             label=class_name)
                plt.legend()

    gs.tight_layout(fig)
    plt.show()


def plot_bins_categorical(df, feature_names, class_column, class_values, class_colors=None, max_category_cnt=7):
    """Plots histograms for categorical features.

    It takes up to `max_category_cnt` top categories across all classes.
    Then for each class it plots (values[category == given_category] / values),
    where `values` are that class's rows restricted to the top categories.

    Args:
        df: DataFrame holding the feature columns and `class_column`.
        feature_names: categorical column names, one subplot each
            (three subplots per row).
        class_column: name of the column holding the class label.
        class_values: class labels; used only to size the default colormap.
            The classes actually plotted are the unique values found in
            `df[class_column]`.
        class_colors: optional palette passed to seaborn; defaults to colors
            sampled from the 'RdYlGn' colormap.
        max_category_cnt: number of most frequent categories kept per feature.
    """
    fig = plt.figure(figsize=(14, len(feature_names)))
    gs = gridspec.GridSpec(math.ceil(len(feature_names) / 3), 3)

    # Use default colors if class colors are not provided.
    if not class_colors:
        cmap = plt.cm.get_cmap('RdYlGn', len(class_values))
        class_colors = cmap(range(len(class_values)))

    for i, f in enumerate(feature_names):
        # Get top categories for the current feature.
        top_c = list(df[f].value_counts()[:max_category_cnt].index)
        df_c = df[df[f].isin(top_c)][[class_column, f]]

        # Get % which values of this category take of all values.
        value_cnt_pct, categories, classes = [], [], []
        for class_label in df[class_column].unique():
            # Work on the counts Series directly: wrapping it in a DataFrame
            # and selecting column `f` depends on how pandas names the
            # `value_counts()` result, which differs between pandas versions.
            counts = df_c[df_c[class_column] == class_label][f].value_counts()
            shares = counts / counts.sum()
            value_cnt_pct.extend(list(shares))
            categories.extend(list(counts.index))
            classes.extend([class_label] * len(counts))

        categories_df = pd.DataFrame({'value_cnt_pct': value_cnt_pct,
                                      'category': categories,
                                      'class': classes})

        # Plot. Heights of bars for each class sum up to one.
        # Note: having heights of bars within category sum up to one may be
        # misleading in cases of class imbalance.
        ax = plt.subplot(gs[i])
        sns.barplot(ax=ax, x='category', y='value_cnt_pct', hue='class', data=categories_df,
                    palette=class_colors, alpha=0.5, saturation=0.9, edgecolor='dimgrey', linewidth=1)
        ax.set_title(f)
        ax.set_xlabel('')
        ax.set_ylabel('% of Total')


    gs.tight_layout(fig)
    plt.show()

### Graph Visualisation Functions
Credit: http://nbviewer.jupyter.org/github/tensorflow/tensorflow/blob/master/tensorflow/examples/tutorials/deepdream/deepdream.ipynb

In [None]:
def show_graph(graph_def, max_const_size=32):
    """Visualize TensorFlow graph.

    Args:
        graph_def: a GraphDef, or any object exposing `as_graph_def()`
            (it is converted first).
        max_const_size: constants whose serialized content is larger than
            this many bytes are replaced by a placeholder via `strip_consts`.

    Renders an iframe in the notebook output; the page inside it loads the
    `tf-graph-basic` component from tensorboard.appspot.com.
    """
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    # Doubled braces are literal braces for `str.format`; a random suffix is
    # appended to the element id.
    code = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:1000px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))

    # The page is embedded through `srcdoc`, so its double quotes are escaped
    # to keep the attribute value intact.
    iframe = """
        <iframe seamless style="width:1000px;height:1000px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))

    
def strip_consts(graph_def, max_const_size=32):
    """Return a copy of `graph_def` with large constant payloads removed.

    For every 'Const' node whose `tensor_content` is longer than
    `max_const_size` bytes, the content is replaced by a short
    "<stripped N bytes>" marker. The input graph is not modified.
    """
    stripped_def = tf.GraphDef()
    for source_node in graph_def.node:
        node_copy = stripped_def.node.add()
        node_copy.MergeFrom(source_node)
        if node_copy.op != 'Const':
            continue
        tensor = node_copy.attr['value'].tensor
        content_size = len(tensor.tensor_content)
        if content_size > max_const_size:
            tensor.tensor_content = tf.compat.as_bytes("<stripped %d bytes>"%content_size)
    return stripped_def


def get_feature_dict(features):
    """Return `features` as a dict.

    A dict is returned as is (same object); anything else is wrapped as
    `{'': features}`.
    """
    return features if isinstance(features, dict) else {'': features}

### Shampoo

In [None]:
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""The Shampoo Optimizer.

Variant of Adagrad using one preconditioner matrix per variable dimension.
For details, see https://arxiv.org/abs/1802.09568
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import linalg_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.platform import tf_logging
from tensorflow.python.training import optimizer


def GetParam(var, timestep):
  """Resolves a hyper-parameter that may be constant or time-varying.

  If `var` is callable it is called with `timestep` and the result is
  returned; otherwise `var` itself is returned.
  """
  return var(timestep) if callable(var) else var


class ShampooOptimizer(optimizer.Optimizer):
  """The Shampoo Optimizer

  Variant of Adagrad using one preconditioner matrix per variable dimension.
  For details, see https://arxiv.org/abs/1802.09568

  gbar is time-weighted accumulated gradient:
  gbar[t] = gbar_decay[t] * gbar[t-1] + gbar_weight[t] * g[t]

  mat_gbar is time-weighted accumulated gradient square:
  mat_gbar_j[t] = mat_gbar_decay[t] * mat_gbar_j[t-1]
                  + mat_gbar_weight[t] * gg_j[t]
  where if g[t] = g_abcd then gg_a[t] = g_abcd g_a'bcd (Einstein notation)

  Update rule:
  w[t+1] = w[t] - learning_rate[t] * Prod_j mat_gbar_j[t]^(-alpha/n) gbar[t]
     Again, mat_gbar_j[t]^(-alpha) gbar[t] is a tensor contraction along the
     j'th dimension of gbar[t] with the first dimension of
     mat_gbar_j[t]^(-alpha/n), where alpha is a hyperparameter,
     and n = rank of the variable.
     Prod_j represents doing this contraction for all j in 0..n-1.

  Typically learning_rate is constant, but could be time dependent by passing
  a lambda function that depends on step.
  """

  def __init__(self,
               global_step=0,
               max_matrix_size=768,
               gbar_decay=0.0,
               gbar_weight=1.0,
               mat_gbar_decay=1.0,
               mat_gbar_weight=1.0,
               learning_rate=1.0,
               svd_interval=1,
               precond_update_interval=1,
               epsilon=0.1,
               alpha=0.5,
               use_iterative_root=False,
               use_locking=False,
               name="Shampoo"):
    """Default values of the various hyper-parameters.

    gbar_decay, gbar_weight etc. can be a float or a time varying parameter.
    For time-varying parameters use e.g. "lambda T: T / (T + 1.0)"
    where the expression in the lambda is a tensorflow expression

    Args:
      global_step: tensorflow variable indicating the step. It is converted
            to float and stored.
      max_matrix_size: We do not perform SVD for matrices larger than this.
      gbar_decay: Decay factor in the gbar update (see gbar_weight).
      gbar_weight:  Used to update gbar:
            gbar[t] = gbar_decay[t] * gbar[t-1] + gbar_weight[t] * g[t]
      mat_gbar_decay: Decay factor in the mat_gbar update
            (see mat_gbar_weight).
      mat_gbar_weight:  Used to update mat_gbar:
           mat_gbar_j[t] = mat_gbar_decay[t] * mat_gbar_j[t-1]
                           + mat_gbar_weight[t] * gg_j[t]
      learning_rate: Similar to SGD
      svd_interval: We should do SVD after this many steps. Default = 1, i.e.
                    every step. Usually 20 leads to no loss of accuracy, and
                    50 or 100 is also OK. May also want more often early,
                    and less often later - set in caller as for example:
                    "svd_interval = lambda(T): tf.cond(
                        T < 2000, lambda: 20.0, lambda: 1000.0)"
      precond_update_interval: We should update the preconditioners after
                               this many steps. Default = 1. Usually less than
                               svd_interval.
      epsilon:  epsilon * I_n is added to each mat_gbar_j for stability
      alpha:  total power of the preconditioners.
      use_iterative_root: should the optimizer use SVD (faster) or the
                          iterative root method (for TPU) for finding the
                          roots of PSD matrices.
      use_locking: Passed through to the base `optimizer.Optimizer`
                   constructor.
      name: name of optimizer.
    """

    super(ShampooOptimizer, self).__init__(use_locking, name)

    # The step is kept as a float tensor.
    self._global_step = math_ops.to_float(global_step)
    # The remaining hyper-parameters are stored as given (constant or
    # callable).
    self._max_matrix_size = max_matrix_size
    self._gbar_decay = gbar_decay
    self._gbar_weight = gbar_weight
    self._mat_gbar_decay = mat_gbar_decay
    self._mat_gbar_weight = mat_gbar_weight
    self._learning_rate = learning_rate
    self._svd_interval = svd_interval
    self._precond_update_interval = precond_update_interval
    self._epsilon = epsilon
    self._alpha = alpha
    self._use_iterative_root = use_iterative_root
    # `name` was also passed to the base constructor above.
    self._name = name

  def _create_slots(self, var_list):
    """Creates the optimizer slots for every variable in `var_list`.

    Per variable:
      * "gbar": zeros with the variable's shape.
      * "Gbar_i" for each dimension i of size d: a d x d zero matrix when
        d <= max_matrix_size, otherwise a length-d zero vector.
      * "H_i": a d x d identity matrix, created only for dimensions with
        d <= max_matrix_size and only when svd_interval > 1.
    """
    for v in var_list:
      with ops.colocate_with(v):
        _ = self._zeros_slot(v, "gbar", self._name)
        shape = np.array(v.get_shape())
        for i, d in enumerate(shape):
          d_tensor = ops.convert_to_tensor(d)
          if d <= self._max_matrix_size:
            # Small dimension: full matrix statistics.
            mat_g_init = array_ops.zeros_like(linalg_ops.eye(d_tensor))
            # NOTE(review): this compares svd_interval directly with an int,
            # although the constructor docs allow a callable — confirm that
            # callers relying on a callable svd_interval are handled.
            if self._svd_interval > 1:
              _ = self._get_or_make_slot(v, linalg_ops.eye(d_tensor),
                                         "H_" + str(i), self._name)
          else:
            # Large dimension: only a vector of length d is kept.
            mat_g_init = array_ops.zeros([d_tensor])

          _ = self._get_or_make_slot(v, mat_g_init, "Gbar_" + str(i),
                                     self._name)

  def _resource_apply_dense(self, grad, var):
    """Dense-gradient hook for resource variables; delegates to `_apply_dense`."""
    return self._apply_dense(grad, var)

  def _apply_dense(self, grad, var):
    """Dense-gradient hook; delegates to `_apply_gradient` without indices."""
    return self._apply_gradient(grad, var)

  def _resource_apply_sparse(self, grad_values, var, grad_indices):
    """Sparse-gradient hook for resource variables.

    Delegates to `_apply_sparse_shared` (note the different argument order).
    """
    return self._apply_sparse_shared(grad_values, grad_indices, var)

  def _apply_sparse(self, grad, var):
    """Sparse-gradient hook; unpacks `grad.values` / `grad.indices` and
    delegates to `_apply_sparse_shared`."""
    return self._apply_sparse_shared(grad.values, grad.indices, var)

  def _apply_sparse_shared(self, grad_values, grad_indices, var):
    """Applies a sparse gradient, densifying it first when appropriate.

    The gradient is scattered into a dense tensor of the variable's shape when
    the first dimension of `var` is at most `max_matrix_size`, or when
    `gbar_decay` is non-zero. Otherwise the values and indices are passed on
    to `_apply_gradient` as they are.
    """
    use_dense_update = (var.get_shape()[0] <= self._max_matrix_size or
                        self._gbar_decay != 0.0)
    if not use_dense_update:
      return self._apply_gradient(grad_values, var, grad_indices)

    # Scatter the sparse rows into a dense gradient and do a dense update.
    scatter_indices = array_ops.expand_dims(grad_indices, axis=1)
    dense_shape = array_ops.shape(var, out_type=grad_indices.dtype)
    dense_grad = array_ops.scatter_nd(scatter_indices, grad_values, dense_shape)
    return self._apply_gradient(dense_grad, var)

  def _weighted_average(self, var, weight, weight_t, rest):
    """Applies var <- weight_t * var + rest, short-cutting constant weights.

    `rest` must not depend on `var`; otherwise race conditions are possible
    in a distributed setting.

    Args:
      var: variable to be updated.
      weight: the configured parameter; inspected only to detect the constant
        special cases 0.0 and 1.0.
      weight_t: current value of the parameter, used as the multiplier in the
        general case.
      rest: the remaining tensor to be added.

    Returns:
      `rest` itself when weight == 0.0 (var is left untouched), otherwise the
      result of the op that updates `var`.
    """
    if weight == 0.0:
      # var is never used in this configuration, so it is not updated.
      return rest
    if weight == 1.0:
      # Common case: plain accumulation.
      return state_ops.assign_add(var, rest)
    # General case, written as an increment:
    #   var + ((weight_t - 1) * var + rest) == weight_t * var + rest.
    # This is not atomic: computing the increment takes time, during which
    # another worker may set var. Avoiding that would need a C++ op.
    increment = (weight_t - 1) * var + rest
    return var.assign_add(increment)

  def _update_mat_g(self, mat_g, grad, axes, mat_gbar_decay,
                    mat_gbar_weight, i):
    """Accumulates the gradient outer product for dimension `i` into `mat_g`.

    The outer product contracts `grad` with itself over `axes` (all
    dimensions except i). In Einstein notation, for i = 0:
        grad_outer_aa' = g_abcd g_a'bcd
    so grad_outer is a d_i x d_i matrix, d_i being the size of the i'th
    dimension of g. Equivalently, with mat_i(grad) the flattening of grad to
    a d_i x (d_1 d_2 ... d_{i-1} d_{i+1} ... d_k) matrix:
        grad_outer = mat_i(grad) mat_i(grad).transpose

    Args:
      mat_g: the matrix to be updated.
      grad: the gradient of the variable.
      axes: a list of k-1 integers 0 to k-1, except i.
      mat_gbar_decay: current decay value for the weighted average.
      mat_gbar_weight: current weight applied to the outer product.
      i: index of dimension to be updated.

    Returns:
      updated mat_g = mat_g * mat_gbar_decay + grad_outer * mat_gbar_weight
    """
    contraction_axes = (axes, axes)
    grad_outer = math_ops.tensordot(grad, grad, axes=contraction_axes,
                                    name="grad_outer_" + str(i))
    weighted_outer = mat_gbar_weight * grad_outer
    # The configured (possibly callable) decay selects the fast path; the
    # current decay value is the actual multiplier.
    return self._weighted_average(mat_g, self._mat_gbar_decay, mat_gbar_decay,
                                  weighted_outer)

  def _compute_power_svd(self, var, mat_g, mat_g_size, alpha, mat_h_slot_name):
    """Computes mat_h = mat_g^alpha using svd. mat_g is a symmetric PSD matrix.

    Args:
      var: the variable we are updating.
      mat_g: the symmetric PSD matrix whose power is to be computed
      mat_g_size: size of mat_g
      alpha: a real number
      mat_h_slot_name: name of slot to store the power, if needed.

    Returns:
      mat_h = mat_g^alpha, or the assign op that writes it into the slot when
      `mat_h_slot_name` is given.

    Stores mat_h in the appropriate slot, if it exists.
    Note that mat_g is PSD. So we could use linalg_ops.self_adjoint_eig.
    """
    if mat_g_size == 1:
      # Size-1 case: a plain power of the damped value, no decomposition.
      mat_h = math_ops.pow(mat_g + self._epsilon, alpha)
    else:
      # Add epsilon * I before the decomposition.
      damping = self._epsilon * linalg_ops.eye(math_ops.to_int32(mat_g_size))
      diag_d, mat_u, mat_v = linalg_ops.svd(mat_g + damping, full_matrices=True)
      # Rebuild the power from the factors; singular values are floored at
      # epsilon before being raised to alpha.
      mat_h = math_ops.matmul(
          mat_v * math_ops.pow(math_ops.maximum(diag_d, self._epsilon), alpha),
          array_ops.transpose(mat_u))
    if mat_h_slot_name is not None:
      return state_ops.assign(self.get_slot(var, mat_h_slot_name), mat_h)
    return mat_h

  def _compute_power_iter(self, var, mat_g, mat_g_size, alpha, mat_h_slot_name,
                          iter_count=100, epsilon=1e-6):
    """Computes mat_g^alpha, where alpha = -1/p, p a positive integer.

    We use an iterative Schur-Newton method from equation 3.2 on page 9 of:

    A Schur-Newton Method for the Matrix p-th Root and its Inverse
    by Chun-Hua Guo and Nicholas J. Higham
    SIAM Journal on Matrix Analysis and Applications,
    2006, Vol. 28, No. 3 : pp. 788-804
    https://pdfs.semanticscholar.org/0abe/7f77433cf5908bfe2b79aa91af881da83858.pdf

    Args:
      var: the variable we are updating.
      mat_g: the symmetric PSD matrix whose power is to be computed
      mat_g_size: size of mat_g.
      alpha: exponent, must be -1/p for p a positive integer.
      mat_h_slot_name: name of slot to store the power, if needed.
      iter_count: Maximum number of iterations.
      epsilon: accuracy indicator, useful for early termination. This is the
        convergence tolerance of the loop; it is distinct from self._epsilon,
        which is the damping added to mat_g.

    Returns:
      mat_g^alpha. When mat_h_slot_name is not None the value is assigned to
      that slot of var and the result of the assignment is returned.
    """

    # Identity matrix of the same size as mat_g; shared by the closures below.
    identity = linalg_ops.eye(math_ops.to_int32(mat_g_size))

    def MatPower(mat_m, p):
      """Computes mat_m^p, for p a positive integer.

      Power p is known at graph compile time, so no need for loop and cond.
      Args:
        mat_m: a square matrix
        p: a positive integer

      Returns:
        mat_m^p
      """
      # p is passed in as -1.0/alpha (see IterBody), so it may be a float with
      # an integral value; the check below accepts that.
      assert p == int(p) and p > 0
      power = None
      # Exponentiation by squaring: mat_m holds successive squarings and is
      # multiplied into the result whenever the current low bit of p is set.
      while p > 0:
        if p % 2 == 1:
          power = math_ops.matmul(mat_m, power) if power is not None else mat_m
        p //= 2
        mat_m = math_ops.matmul(mat_m, mat_m)
      return power

    def IterCondition(i, mat_m, _):
      # Keep iterating while the iteration budget is not exhausted and mat_m
      # still differs from the identity by more than epsilon in max-abs norm.
      return math_ops.logical_and(
          i < iter_count,
          math_ops.reduce_max(math_ops.abs(mat_m - identity)) > epsilon)

    def IterBody(i, mat_m, mat_x):
      # One step of the iteration, with p = -1/alpha:
      #   mat_m_i = (1 - alpha) * I + alpha * mat_m
      #   mat_m  <- mat_m_i^p * mat_m
      #   mat_x  <- mat_x * mat_m_i
      mat_m_i = (1 - alpha) * identity + alpha * mat_m
      return (i + 1, math_ops.matmul(MatPower(mat_m_i, -1.0/alpha), mat_m),
              math_ops.matmul(mat_x, mat_m_i))

    if mat_g_size == 1:
      # Single-entry statistic: an elementwise power is all that is needed.
      mat_h = math_ops.pow(mat_g + self._epsilon, alpha)
    else:
      # Damp mat_g with self._epsilon * I before starting the iteration.
      damped_mat_g = mat_g + self._epsilon * identity
      z = (1 - 1 / alpha) / (2 * linalg_ops.norm(damped_mat_g))
      # The best value for z is
      # (1 - 1/alpha) * (c_max^{-alpha} - c_min^{-alpha}) /
      #                 (c_max^{1-alpha} - c_min^{1-alpha})
      # where c_max and c_min are the largest and smallest singular values of
      # damped_mat_g.
      # The above estimate assumes that c_max > c_min * 2^p. (p = -1/alpha)
      # Can replace above line by the one below, but it is less accurate,
      # hence needs more iterations to converge.
      # z = (1 - 1/alpha) / math_ops.trace(damped_mat_g)
      # If we want the method to always converge, use z = 1 / norm(damped_mat_g)
      # or z = 1 / math_ops.trace(damped_mat_g), but these can result in many
      # extra iterations.
      # Loop variables are (iteration counter, mat_m, mat_x), initialised to
      # (0, z * damped_mat_g, z^{-alpha} * I); the final mat_x is the result.
      _, _, mat_h = control_flow_ops.while_loop(
          IterCondition, IterBody,
          [0, damped_mat_g * z, identity * math_ops.pow(z, -alpha)])
    if mat_h_slot_name is not None:
      return state_ops.assign(self.get_slot(var, mat_h_slot_name), mat_h)
    return mat_h

  def _compute_power(self, var, mat_g, mat_g_size, alpha, mat_h_slot_name=None):
    """Dispatches to the iterative or the SVD-based matrix power.

    The iterative routine is used when self._use_iterative_root is truthy,
    the SVD routine otherwise; all arguments are forwarded unchanged.
    """
    with ops.name_scope("matrix_iterative_power"):
      power_fn = (self._compute_power_iter if self._use_iterative_root
                  else self._compute_power_svd)
      return power_fn(var, mat_g, mat_g_size, alpha, mat_h_slot_name)

  def _apply_gradient(self, grad, var, indices=None):
    """The main function to update a variable.

    The update has three stages: (1) update the weighted average of the
    gradients (skipped for sparse updates), (2) for every dimension of var,
    update the statistic Gbar_i and contract / scale the gradient with its
    power, (3) subtract learning_rate * preconditioned gradient from var.

    Args:
      grad: A Tensor containing gradient to apply.
      var: A Tensor containing the variable to update.
      indices: An array of integers, for sparse update.

    Returns:
      Updated variable var = var - learning_rate * preconditioner * grad

    If the gradient is dense, var and grad have the same shape.
    If the update is sparse, then the first dimension of the gradient and var
    may differ, others are all the same. In this case the indices array
    provides the set of indices of the variable which are to be updated with
    each row of the gradient.
    """
    # All step-dependent hyperparameters below are evaluated at this value.
    global_step = self._global_step + 1

    # Update accumulated weighted average of gradients
    gbar = self.get_slot(var, "gbar")
    gbar_decay_t = GetParam(self._gbar_decay, global_step)
    gbar_weight_t = GetParam(self._gbar_weight, global_step)
    if indices is not None:
      # Note - the sparse update is not easily implemented, since the
      # algorithm needs all indices of gbar to be updated
      # if mat_gbar_decay != 1 or mat_gbar_decay != 0.
      # One way to make mat_gbar_decay = 1 is by rescaling.
      # If we want the update:
      #         G_{t+1} = a_{t+1} G_t + b_{t+1} w_t
      # define:
      #         r_{t+1} = a_{t+1} * r_t
      #         h_t = G_t / r_t
      # Then:
      #         h_{t+1} = h_t + (b_{t+1} / r_{t+1}) * w_t
      # So we get the mat_gbar_decay = 1 as desired.
      # We can implement this in a future version as needed.
      # However we still need gbar_decay = 0, otherwise all indices
      # of the variable will need to be updated.
      if self._gbar_decay != 0.0:
        tf_logging.warning("Not applying momentum for variable: %s" % var.name)
      # Sparse path: the raw gradient is used as is; the gbar slot is left
      # untouched.
      gbar_updated = grad
    else:
      gbar_updated = self._weighted_average(gbar, self._gbar_decay,
                                            gbar_decay_t,
                                            gbar_weight_t * grad)

    # Update the preconditioners and compute the preconditioned gradient
    shape = var.get_shape()
    # One statistic slot ("Gbar_<i>") per dimension of var.
    mat_g_list = []
    for i in range(len(shape)):
      mat_g_list.append(self.get_slot(var, "Gbar_" + str(i)))
    mat_gbar_decay_t = GetParam(self._mat_gbar_decay, global_step)
    mat_gbar_weight_t = GetParam(self._mat_gbar_weight, global_step)

    preconditioned_grad = gbar_updated
    v_rank = len(mat_g_list)
    # Exponent applied to every statistic: -alpha / rank of the variable.
    neg_alpha = - GetParam(self._alpha, global_step) / v_rank
    svd_interval = GetParam(self._svd_interval, global_step)
    precond_update_interval = GetParam(self._precond_update_interval,
                                       global_step)
    for i, mat_g in enumerate(mat_g_list):
      # axes is the list of indices to reduce - everything but the current i.
      axes = list(range(i)) + list(range(i+1, v_rank))
      if shape[i] <= self._max_matrix_size:
        # If the tensor size is sufficiently small perform full Shampoo update
        # Note if precond_update_interval > 1 and mat_gbar_decay_t != 1, this
        # is not strictly correct. However we will use it for now, and
        # fix if needed. (G_1 = aG + bg ==> G_n = a^n G + (1+a+..+a^{n-1})bg)

        # The statistic is refreshed only on steps where
        # global_step mod precond_update_interval < 1; on those steps the
        # weight is multiplied by the interval. Note that the statistic is
        # built from the raw `grad`, not from gbar_updated.
        # pylint: disable=g-long-lambda,cell-var-from-loop
        mat_g_updated = control_flow_ops.cond(
            math_ops.mod(global_step, precond_update_interval) < 1,
            lambda: self._update_mat_g(
                mat_g, grad, axes, mat_gbar_decay_t,
                mat_gbar_weight_t * precond_update_interval, i),
            lambda: mat_g)

        # With an interval of 1 the power is recomputed on every step and no
        # "H_<i>" slot is passed; otherwise it is recomputed and stored in the
        # slot on steps where global_step mod svd_interval < 1, and read back
        # from the slot on all other steps.
        if self._svd_interval == 1:
          mat_h = self._compute_power(var, mat_g_updated, shape[i], neg_alpha)
        else:
          mat_h = control_flow_ops.cond(
              math_ops.mod(global_step, svd_interval) < 1,
              lambda: self._compute_power(var, mat_g_updated, shape[i],
                                          neg_alpha, "H_" + str(i)),
              lambda: self.get_slot(var, "H_" + str(i)))

        # mat_h is a square matrix of size d_i x d_i
        # preconditioned_grad is a d_i x ... x d_n x d_0 x ... d_{i-1} tensor
        # After contraction with a d_i x d_i tensor
        # it becomes a d_{i+1} x ... x d_n x d_0 x ... d_i tensor
        # (the first dimension is contracted out, and the second dimension of
        # mat_h is appended).  After going through all the indices, it becomes
        # a d_0 x ... x d_n tensor again.
        preconditioned_grad = math_ops.tensordot(preconditioned_grad, mat_h,
                                                 axes=([0], [0]),
                                                 name="precond_" + str(i))
      else:
        # Tensor size is too large -- perform diagonal Shampoo update
        # Only the sum of squared gradient entries over all other dimensions
        # is kept for dimension i.
        grad_outer = math_ops.reduce_sum(grad * grad, axis=axes)
        if i == 0 and indices is not None:
          # Sparse update of the first dimension: only the rows named by
          # `indices` are accumulated, which requires a decay of exactly 1.
          assert self._mat_gbar_decay == 1.0
          mat_g_updated = state_ops.scatter_add(mat_g, indices,
                                                mat_gbar_weight_t * grad_outer)
          mat_h = math_ops.pow(
              array_ops.gather(mat_g_updated, indices) + self._epsilon,
              neg_alpha)
        else:
          mat_g_updated = self._weighted_average(mat_g,
                                                 self._mat_gbar_decay,
                                                 mat_gbar_decay_t,
                                                 mat_gbar_weight_t * grad_outer)
          mat_h = math_ops.pow(mat_g_updated + self._epsilon, neg_alpha)

        # Need to do the transpose to ensure that the tensor becomes
        # a d_{i+1} x ... x d_n x d_0 x ... d_i tensor as described above.
        # After the transpose dimension i is last, so the multiplication by
        # mat_h broadcasts along it.
        preconditioned_grad = array_ops.transpose(
            preconditioned_grad, perm=list(range(1, v_rank)) + [0]) * mat_h

    # Update the variable based on the Shampoo update
    learning_rate_t = GetParam(self._learning_rate, global_step)
    if indices is not None:
      # Sparse path: only the rows named by `indices` are modified.
      var_updated = state_ops.scatter_add(
          var, indices, -learning_rate_t * preconditioned_grad)
    else:
      var_updated = state_ops.assign_sub(var,
                                         learning_rate_t * preconditioned_grad)
    return var_updated

### S3 Client
Based on mlagi@'s implementation (linked below), with upload support, fast file-existence checking and some minor changes added.

Caching and logging were removed because the additional I/O slowed things down by ~30-40%.

https://git.hubteam.com/HubSpot/ProductionModels/blob/8d5196a59b256ff5a67caed3a7cce38cd39a393d/Searchbar/v1/prod/AWSGetter.py

In [None]:
import os
import posixpath

import boto3

class S3Client(object):
    """Download/upload files from/to Amazon Web Services S3.

    Keys passed to the methods below are resolved relative to `prefix`
    inside `bucket_name`. S3 keys are always joined with '/' (`posixpath`),
    independently of the path separator of the local operating system.

    If `aws_access_key_id` and `aws_secret_access_key` are not defined
    it will attempt loading credentials from ~/.aws/credentials.
    """

    def __init__(self, bucket_name, prefix, aws_access_key_id=None, aws_secret_access_key=None):
        """Store the bucket / prefix and create the boto3 S3 client.

        Args:
            bucket_name (str): bucket all operations are performed against.
            prefix (str): key prefix prepended to the keys given to the
                methods of this class.
            aws_access_key_id (str): optional explicit access key id.
            aws_secret_access_key (str): optional explicit secret key.
        """
        self.bucket_name = bucket_name
        self.prefix = prefix
        self.client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
        )

    @staticmethod
    def clean_subdir_path(subdir_path):
        """Removes '/' prefix if it exists.

        Joining an absolute path (one starting with '/') discards everything
        joined before it, e.g. os.path.join('/home', '/dir') gives '/dir'.
        Stripping the leading '/' keeps such a path relative.
        """
        return subdir_path[1:] if subdir_path.startswith('/') else subdir_path

    def _full_key(self, key):
        """Return the S3 object key of `key` below `self.prefix`."""
        return posixpath.join(self.prefix, key)

    def check_file(self, key):
        """Check if file exists on S3.

        Returns:
            bool: True if the HEAD request for the object succeeds, False if
                it fails for any reason.
        """
        aws_file_key = self._full_key(key)
        try:
            self.client.head_object(Bucket=self.bucket_name, Key=aws_file_key)
        except Exception:
            # Best effort: any failure is reported as "does not exist".
            # `Exception` (rather than a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            return False
        return True

    def get_file(self, key):
        """Read a file from S3 (without saving to disk).

        Returns:
            The 'Body' entry of the `get_object` response, or None (after
            printing the error) if the request fails.
        """
        aws_file_key = self._full_key(key)
        try:
            s3_object = self.client.get_object(Bucket=self.bucket_name, Key=aws_file_key)
            return s3_object['Body']
        except Exception as e:
            print(e)
            return None

    def put_file(self, body, key):
        """Upload `body` to S3 (without reading it from disk).

        Returns:
            bool: True if the upload succeeded, False otherwise.
        """
        aws_file_key = self._full_key(key)
        try:
            self.client.put_object(
                Body=body, Bucket=self.bucket_name, Key=aws_file_key)
        except Exception:
            # Best effort, see `check_file`.
            return False
        return True

    def download_file(self, key, destination_dir, truncate_path=False):
        """Download a file from S3.

        Args:
            key (str): key of the file, relative to the client prefix.
            destination_dir (str): local directory to download into.
            truncate_path (bool): if True only the last component of `key` is
                used as local file name, otherwise the whole key is recreated
                below `destination_dir`.
        """
        # Get file paths.
        local_name = os.path.split(key)[1] if truncate_path else key
        file_path = os.path.join(destination_dir, local_name)
        aws_file_key = self._full_key(key)

        # Create target subdirectories if needed.
        folder_path = os.path.dirname(file_path)
        if folder_path and (not os.path.exists(folder_path)):
            os.makedirs(folder_path, exist_ok=True)

        # Download file.
        self.client.download_file(self.bucket_name, aws_file_key, file_path)

    def upload_file(self, file_path, destination_dir):
        """Upload a file to S3.

        Args:
            file_path (str): path to a local file.
            destination_dir (str): directory to which file will be uploaded.
        """
        file_name = os.path.basename(file_path)
        key = posixpath.join(
            self.prefix, self.clean_subdir_path(destination_dir), file_name)
        self.client.upload_file(
            Filename=file_path, Bucket=self.bucket_name, Key=key)

    def download_files(self, keys, destination_dir, truncate_path=False):
        """Download a list of files from S3."""
        for key in keys:
            self.download_file(key, destination_dir, truncate_path)

    def upload_files(self, file_paths, destination_dir):
        """Upload a list of files to S3."""
        for file_path in file_paths:
            self.upload_file(file_path, destination_dir)

    def download_directory(self, prefix, destination_dir, original_prefix=None):
        """Download recursively all files from a S3 directory.

        Files that already exist locally are not downloaded again, and a
        file whose download fails with an OSError is skipped.

        Args:
            prefix (str): S3 directory path (e.g. "qa/dumps/searchbar/").
            destination_dir (str): local destination directory.
            original_prefix (str): first prefix in the recursion stack.
        """
        paginator = self.client.get_paginator('list_objects')
        if not prefix.startswith(self.prefix):
            # Join (instead of concatenating) so that a client prefix without
            # trailing '/' still yields a valid directory prefix.
            prefix = posixpath.join(
                self.prefix, self.clean_subdir_path(prefix))
        if original_prefix is None:
            original_prefix = prefix
        pages = paginator.paginate(
            Bucket=self.bucket_name, Delimiter='/', Prefix=prefix)
        for page in pages:
            for subdir in page.get('CommonPrefixes') or []:
                self.download_directory(
                    subdir.get('Prefix'), destination_dir,
                    original_prefix=original_prefix)
            for content in page.get('Contents') or []:
                key = content.get('Key')
                # Strip only the leading prefix; the same text may legitimately
                # occur again further down in the key.
                relative_key = key
                if key.startswith(original_prefix):
                    relative_key = key[len(original_prefix):]
                local_path = os.path.join(
                    destination_dir, self.clean_subdir_path(relative_key))
                local_dir = os.path.dirname(local_path)
                if local_dir and (not os.path.exists(local_dir)):
                    os.makedirs(local_dir, exist_ok=True)
                if os.path.exists(local_path):
                    continue
                try:
                    self.client.download_file(
                        self.bucket_name, key, local_path)
                except OSError:
                    # Deliberate best effort: skip files that cannot be
                    # written locally and carry on with the rest.
                    continue

    def upload_directory(self, source_dir, destination_dir):
        """Upload a directory and all its contents to S3.

        The folder structure below `source_dir` is recreated below
        `destination_dir`.

        Note: empty subfolders are not uploaded.
        """
        for path, _, files in os.walk(source_dir):
            # Path of the current folder relative to `source_dir`. Computing
            # it with relpath never produces a leading '/', which would make
            # the join below discard `destination_dir`.
            relative_dir = os.path.relpath(path, source_dir)
            if relative_dir == os.curdir:
                file_destination_dir = destination_dir
            else:
                file_destination_dir = posixpath.join(
                    destination_dir, relative_dir.replace(os.sep, '/'))
            for file_name in files:
                self.upload_file(
                    os.path.join(path, file_name), file_destination_dir)

In [None]:
class TopToolbar(mpld3.plugins.PluginBase):
    """Plugin for moving toolbar to top of figure

    The JavaScript below draws the figure toolbar itself, moves it to
    x=150 / y=400 and then replaces the toolbar's own draw function with a
    no-op so that it is not drawn a second time.
    NOTE(review): whether y=400 is the top of the figure depends on the
    figure size / coordinate system used by mpld3 - confirm.
    """

    # Client-side part of the plugin; registered under the name "toptoolbar".
    JAVASCRIPT = """
    mpld3.register_plugin("toptoolbar", TopToolbar);
    TopToolbar.prototype = Object.create(mpld3.Plugin.prototype);
    TopToolbar.prototype.constructor = TopToolbar;
    function TopToolbar(fig, props){
        mpld3.Plugin.call(this, fig, props);
    };

    TopToolbar.prototype.draw = function(){
      // the toolbar svg doesn't exist
      // yet, so first draw it
      this.fig.toolbar.draw();

      // then change the y position to be
      // at the top of the figure
      this.fig.toolbar.toolbar.attr("x", 150);
      this.fig.toolbar.toolbar.attr("y", 400);

      // then remove the draw function,
      // so that it is not called again
      this.fig.toolbar.draw = function() {}
    }
    """
    def __init__(self):
        """Set the plugin description; takes no options."""
        # "type" matches the name passed to mpld3.register_plugin above.
        # Presumably this dict is what mpld3 serialises to instantiate the
        # plugin in the browser - confirm against mpld3.plugins.PluginBase.
        self.dict_ = {"type": "toptoolbar"}
        

def plot_clusters(clusters, xs, ys):
    """Scatter-plot 2D points, one series per cluster, without axis ticks.

    Args:
        clusters: cluster label of every point; points are grouped by it.
        xs: x coordinate of every point.
        ys: y coordinate of every point.

    The three arguments are turned into the columns of one DataFrame, so
    they are expected to have the same length. The figure is shown with
    `plt.show()`; nothing is returned.
    """
    points_df = pd.DataFrame(dict(x=xs, y=ys, label=clusters))

    fig, ax = plt.subplots(figsize=(12, 6))
    for _, group in points_df.groupby('label'):
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=6, mec='none', alpha=0.3)

    # Axis set-up does not depend on the cluster, so it is done once.
    ax.set_aspect('auto')
    # tick_params expects booleans: the string 'off' is truthy and therefore
    # leaves the ticks and labels switched on in current matplotlib versions.
    ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)
    plt.show()
    
    
def plot_string_clusters_with_tooltip(strings, clusters, xs, ys,
                                      max_points=10000, random_state=None):
    """Scatter-plot clustered strings with an mpld3 tooltip per point.

    A random sample of at most `max_points` points is plotted, one series
    per cluster. Points labelled -1 are not drawn. Hovering over a point
    shows the corresponding string.

    Args:
        strings: text shown in the tooltip of every point.
        clusters: cluster label of every point.
        xs: x coordinate of every point.
        ys: y coordinate of every point.
        max_points (int): maximum number of points to plot; None plots all
            points. Inputs smaller than this are plotted completely.
        random_state: forwarded to `DataFrame.sample`; pass a value to make
            the sample reproducible.
    """
    text_df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, item_name=strings))
    # Never ask for more rows than are available: DataFrame.sample raises a
    # ValueError in that case.
    sample_size = len(text_df)
    if max_points is not None:
        sample_size = min(sample_size, max_points)
    text_df = text_df.sample(sample_size, random_state=random_state)

    css = """
    text.mpld3-text, div.mpld3-tooltip {
      font-family:Arial, Helvetica, sans-serif;
    }

    g.mpld3-xaxis, g.mpld3-yaxis {
    display: none; }
    """

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.set_aspect('auto')
    for name, group in text_df.groupby('label'):
        if name == -1:
            continue

        points = ax.plot(group.x, group.y, marker='o', linestyle='', ms=6, mec='none', alpha=0.3)
        tooltip = mpld3.plugins.PointHTMLTooltip(
            points[0], list(group.item_name), voffset=10, hoffset=10, css=css)
        mpld3.plugins.connect(fig, tooltip)

    # Figure-level set-up is independent of the cluster and is done once
    # instead of once per cluster.
    mpld3.plugins.connect(fig, TopToolbar())
    for axis in (ax.get_xaxis(), ax.get_yaxis()):
        axis.set_ticks([])
        axis.set_visible(False)


In [None]:
# Flag set after all the definitions above have been executed.
# NOTE(review): presumably checked by the notebooks that `%run` this one to
# see whether the setup has already been done - confirm against the callers.
SETUP_COMPLETE = True