# NLP Token Visualizer: demo

In [1]:
# Optional one-time setup: uncomment and run these lines if ipywidgets is not
# already installed and enabled in your Jupyter environment.
#!pip install ipywidgets
#!jupyter nbextension enable --py widgetsnbextension --sys-prefix
#!jupyter nbextension install --py widgetsnbextension --sys-prefix

In [2]:
import tokenviz
from tokenviz.visualization import process_text
import ipywidgets as widgets
from IPython.display import display, HTML

## Define encoding / decoding scheme

Here I'm borrowing some code from Andrej Karpathy's video lecture [Let's build GPT: from scratch, in code, spelled out.](https://www.youtube.com/watch?v=kCc8FmEb1nY) to build a simple character-level tokenizer. When using your own tokenizer, replace the `encode` and `decode` functions defined below with your tokenizer's equivalents.

In [3]:
# Character-level tokenizer, taken from:
# https://github.com/karpathy/ng-video-lecture/blob/master/bigram.py

# Load the demo corpus; the characters it contains define the vocabulary.
with open('_demo/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Every distinct character in the corpus, in sorted order. A character's
# position in this list is its integer token id.
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables: character -> id and id -> character.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Encode a string as a list of integer token ids, one per character.

    Raises:
        KeyError: if ``s`` contains a character that is not in ``stoi``
            (i.e. one that never occurs in the loaded corpus).
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decode a list of integer token ids back into a string.

    Raises:
        KeyError: if an id in ``l`` is not in ``itos``.
    """
    return ''.join(itos[i] for i in l)

## HTML example

Here's a minimal example that tokenizes a short string with the `encode`/`decode` functions defined above and renders the result as color-coded HTML.

In [4]:
# Sample string to tokenize and visualize.
text = 'Hello world!'

# Render into an Output widget. The widget is created fresh on every run of
# this cell, so it starts empty and there is nothing to clear beforehand.
output = widgets.Output()
with output:
    # Build the HTML markup for the tokenized text and show it rendered.
    processed_text = process_text(text, encode, decode, markup='html')
    display(HTML(processed_text))
display(output)

# Also print the raw markup so it can be copied elsewhere.
print(processed_text)

Output()

<span style="background-color: Khaki;">H</span><span style="background-color: AliceBlue;">e</span><span style="background-color: Aquamarine;">l</span><span style="background-color: Coral;">l</span><span style="background-color: Lavender;">o</span><span style="background-color: Ivory;"> </span><span style="background-color: DarkSalmon;">w</span><span style="background-color: Khaki;">o</span><span style="background-color: AliceBlue;">r</span><span style="background-color: Aquamarine;">l</span><span style="background-color: Coral;">d</span><span style="background-color: Lavender;">!</span>


## LaTeX example

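Add the following packages and style definition to your LaTeX document, then paste the output of `process_text(..., markup='latex')` in place of the placeholder line inside the `lstlisting` environment.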

```latex
\usepackage{listings}
\usepackage{xcolor}

% Define a custom style for listings
\lstdefinestyle{custom}{
    basicstyle=\small\ttfamily, % Small font size and typewriter style
    escapeinside={(*@}{@*)},    % Escape for inline LaTeX
}
\begin{lstlisting}[caption=My title, label=mylabel, style=custom]
% Your LaTeX code goes here
\end{lstlisting}
```

In [5]:
# Same sample string as before, this time emitted as LaTeX markup.
text = "Hello world!"

# Use the LaTeX colour names shipped with tokenviz for the token backgrounds.
processed_text = process_text(
    text,
    encode,
    decode,
    markup="latex",
    colors=tokenviz.visualization.LATEX_COLORS,
)

# Print the raw markup so it can be pasted into a LaTeX listing.
print(processed_text)

(*@\colorbox{yellow}{H}@*)(*@\colorbox{pink}{e}@*)(*@\colorbox{lightgray}{l}@*)(*@\colorbox{lime}{l}@*)(*@\colorbox{cyan}{o}@*)(*@\colorbox{magenta}{ }@*)(*@\colorbox{yellow}{w}@*)(*@\colorbox{pink}{o}@*)(*@\colorbox{lightgray}{r}@*)(*@\colorbox{lime}{l}@*)(*@\colorbox{cyan}{d}@*)(*@\colorbox{magenta}{!}@*)
