# Visualizing Active Recovery Protocol Text Data

Active Recovery Protocols:

* Breathwork
* Cryotherapy
* Exercise
* Heat Therapy
* Nature Immersion
* Sleeping

In [2]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell runs so edits to local .py files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [15]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
import string
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import altair as alt
import pandas as pd

In [3]:
import nltk
# Fetch the NLTK data packages that back the tokenizer, stop-word corpus and
# WordNet lemmatizer imported above. Each call returns True on success.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
# Part-of-speech tagger models: not used by the cells visible in this section,
# so left disabled.
#nltk.download('averaged_perceptron_tagger')
#nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# 1. How Related Are These Active Recovery Protocols?

Let's create an embedding representation for each protocol, calculate a similarity matrix, and then make a heatmap to see how much they overlap!

## Analysis

### Embeddings

First, each active recovery protocol needs to be its own document, so I combine its overview, execution, and benefits texts into a single string.

In [5]:
def read_text_files(file_dict):
    """Read several UTF-8 text files and return their lower-cased contents.

    Parameters
    ----------
    file_dict : dict
        Maps an arbitrary name to the path of the text file to read.

    Returns
    -------
    dict
        Maps each name from ``file_dict`` to the full text of its file,
        converted to lower case. Insertion order follows ``file_dict``.
    """
    contents = {}
    for name in file_dict:
        # The context manager closes each file before the next one is opened.
        with open(file_dict[name], "r", encoding="utf-8") as handle:
            raw_text = handle.read()
        contents[name] = raw_text.lower()
    return contents

# Every protocol folder under data/raw/ holds the same three section files,
# so the name -> path mappings are generated instead of being written out by
# hand six times.
RAW_DATA_DIR = "data/raw"
PROTOCOL_SECTIONS = ("overview", "execution", "benefits")


def build_protocol_file_dict(protocol):
    """Return the ``{"<protocol>_<section>_raw": path}`` mapping for one protocol.

    Parameters
    ----------
    protocol : str
        Folder name of the protocol under ``RAW_DATA_DIR`` (e.g. "breathwork").

    Returns
    -------
    dict
        One entry per section in ``PROTOCOL_SECTIONS``, keyed as
        ``"<protocol>_<section>_raw"`` and pointing at
        ``"<RAW_DATA_DIR>/<protocol>/<section>.txt"``.
    """
    return {
        f"{protocol}_{section}_raw": f"{RAW_DATA_DIR}/{protocol}/{section}.txt"
        for section in PROTOCOL_SECTIONS
    }


breathwork_file_dict = build_protocol_file_dict("breathwork")
cryotherapy_file_dict = build_protocol_file_dict("cryotherapy")
heat_therapy_file_dict = build_protocol_file_dict("heat_therapy")
exercise_file_dict = build_protocol_file_dict("exercise")
nature_file_dict = build_protocol_file_dict("nature")
sleep_file_dict = build_protocol_file_dict("sleep")

# Load the (lower-cased) text of every section, one dict per protocol.
breathwork_text_data = read_text_files(breathwork_file_dict)
cryotherapy_text_data = read_text_files(cryotherapy_file_dict)
heat_therapy_text_data = read_text_files(heat_therapy_file_dict)
exercise_text_data = read_text_files(exercise_file_dict)
nature_text_data = read_text_files(nature_file_dict)
sleep_text_data = read_text_files(sleep_file_dict)

In [7]:
# Pretrained sentence-embedding model ('all-MiniLM-L6-v2' is lightweight and effective).
model = SentenceTransformer('all-MiniLM-L6-v2')

# One document per protocol: overview, execution and benefits texts joined by
# single spaces. The order here fixes the row order of `embeddings`.
protocol_text_sources = [
    ("breathwork", breathwork_text_data),
    ("cryotherapy", cryotherapy_text_data),
    ("heat_therapy", heat_therapy_text_data),
    ("exercise", exercise_text_data),
    ("nature", nature_text_data),
    ("sleep", sleep_text_data),
]
documents = [
    " ".join(
        text_data[f"{protocol}_{section}_raw"]
        for section in ("overview", "execution", "benefits")
    )
    for protocol, text_data in protocol_text_sources
]

# NOTE(review): the model card for all-MiniLM-L6-v2 states that input longer
# than 256 word pieces is truncated. If the combined documents exceed that,
# only their beginning is embedded — TODO confirm document lengths against
# model.max_seq_length.
embeddings = model.encode(documents, convert_to_tensor=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Let's take a look at one of the embeddings!

In [10]:
# Shape of the first document's embedding vector.
embeddings[0].shape

torch.Size([384])

In [11]:
# First ten components of the first document's embedding.
embeddings[0][0:10]

tensor([-0.0725,  0.0533, -0.0570,  0.0121, -0.0242, -0.0174, -0.0125, -0.0458,
        -0.0315,  0.0271])

### Similarity Matrix

In [13]:
# scikit-learn works on host-side NumPy arrays. `embeddings` was produced with
# convert_to_tensor=True, so on a GPU runtime it lives on the CUDA device and
# cannot be converted implicitly. Detach it and move it to the CPU first; fall
# back to np.asarray if the embeddings are already array-like.
if hasattr(embeddings, "cpu"):
    embeddings_np = embeddings.detach().cpu().numpy()
else:
    embeddings_np = np.asarray(embeddings)

# Pairwise cosine similarity between all document embeddings
# (row/column order follows `documents`).
similarity_matrix = cosine_similarity(embeddings_np)

print("Similarity Matrix:")
print(similarity_matrix)

Similarity Matrix:
[[1.0000002  0.35337472 0.55132973 0.5183076  0.53135264 0.5063235 ]
 [0.35337472 1.0000004  0.6192724  0.50839937 0.37734425 0.3363817 ]
 [0.55132973 0.6192724  1.         0.5834539  0.59140074 0.5238651 ]
 [0.5183076  0.50839937 0.5834539  0.99999994 0.6421325  0.63790226]
 [0.53135264 0.37734425 0.59140074 0.6421325  1.0000002  0.60041714]
 [0.5063235  0.3363817  0.5238651  0.63790226 0.60041714 0.9999996 ]]


## Visualization

### Heatmap

In [17]:
# Axis labels, in the same order as the documents that were embedded.
labels = ["breathwork", "cryotherapy", "heat_therapy", "exercise", "nature", "sleep"]

# Guard against the labels drifting out of sync with the matrix: a mismatch
# would otherwise silently mislabel cells or raise an opaque IndexError.
assert similarity_matrix.shape == (len(labels), len(labels)), (
    "similarity_matrix must be square with one row/column per label"
)

# Convert the matrix to a long-form DataFrame (one row per cell); the indices
# are kept so the triangle filter below does not depend on label text.
rows = [
    {
        'x': labels[col_idx],
        'y': labels[row_idx],
        'value': similarity_matrix[row_idx, col_idx],
        'row_idx': row_idx,
        'col_idx': col_idx,
    }
    for row_idx in range(similarity_matrix.shape[0])
    for col_idx in range(similarity_matrix.shape[1])
]
df = pd.DataFrame(rows)

# The matrix is symmetric, so show only the lower triangle (including diagonal).
df_half = df[df['row_idx'] >= df['col_idx']]

# Create the half heatmap
heatmap = alt.Chart(df_half).mark_rect().encode(
    x=alt.X('x:O', title='', sort=labels),
    y=alt.Y('y:O', title='', sort=labels),
    color=alt.Color('value:Q',
                    scale=alt.Scale(scheme='oranges', domain=[0, 1]),
                    legend=alt.Legend(title='Similarity')),
    tooltip=['x', 'y', 'value']
).properties(
    title='Active Recovery Protocol Similarity Heatmap',
    width=300,
    height=300
)

# Add the numeric value on top of every cell. The 'oranges' scheme runs from
# near-white (low) to dark brown (high), so dark text goes on the light cells
# and white text only on the darkest ones. The previous condition had this
# inverted, making labels on low-similarity cells hard to read.
text = heatmap.mark_text(baseline='middle').encode(
    text=alt.Text('value:Q', format='.2f'),
    color=alt.condition(
        alt.datum.value > 0.7,
        alt.value('white'),
        alt.value('black')
    )
)

# Combine and make interactive
interactive_half_heatmap = (heatmap + text).interactive()
interactive_half_heatmap

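Based on these results, we can immediately see that cryotherapy is the most distinct from the rest: it is the least similar protocol for breathwork, exercise, nature, and sleep (similarities of only 0.34–0.51).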

We can also see that exercise is very similar to both nature-based recovery (0.64) and sleeping (0.64).

What is fascinating is that cryotherapy and heat therapy are each other's closest match (0.62): each is more similar to the other than to any other protocol.

Here are the most related pairings:

* cryotherapy <-> heat therapy
* exercise <-> sleep 
* nature <-> exercise

Breathwork is the odd one out: it is not the closest match of any other protocol. Its own closest match is:

* breathwork -> heat therapy