# Reading data


## Compiling all annotations

The dataset provided by DN in `/Datasets/impactme/dylan_extract` was copied into the working directory as `data/labeled`. 

Currently, each label is split across multiple TSVs (27 subdomains + 8 domains + 1 ANY feature). To match the previous workflow, we want to create a table with the text, and each column corresponding to the annotation. E.g., one table with 1 + 27 + 8 + 1 columns

Below is an example of what the A1 annotations for the `all` dataset look like. 

In [None]:
# Demo: read the A1 annotations of the `all` dataset (data/labeled/all/a1.tsv)
import pandas as pd

df = pd.read_csv('data/labeled/all/a1.tsv', sep='\t')
# Last expression of the cell: shows the first rows in the notebook output
df.head()

In [None]:
# Sanity check before merging: the "text" column must be identical in every
# per-feature TSV under data/labeled/all/. If it is not, the data needs to be
# reprocessed before the label columns can be placed side by side.
import pandas as pd

features = [
    'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8',
    'b1', 'b2', 'b3',
    'c1', 'c2', 'c3', 'c4',
    'd1', 'd2', 'd3', 'd4', 'd5', 'd6',
    'e1', 'e2', 'e3', 'e4', 'e5',
    'f1', 'f2', 'f3',
    'g1', 'g2',
]

# One data frame per feature, keyed by the feature name
all_tsv = {}
for f in features:
    all_tsv[f] = pd.read_csv(f'data/labeled/all/{f}.tsv', sep='\t')

# Report (but do not stop on) any feature whose text differs from a1's
for f in features:
    if all_tsv['a1']['text'].equals(all_tsv[f]['text']):
        continue
    print(f'Column "text" in {f} is different from a1')

# Start from A1 (its 'positive' column becomes 'a1'), then attach the
# 'positive' column of every other feature under that feature's name
df = all_tsv['a1'].copy().rename(columns={'positive': 'a1'})
df['a1'] = df['a1'].astype(int)

for f in features[1:]:
    positive = all_tsv[f]['positive']
    df[f] = positive.astype(int)

In [None]:
# Return the unique values of 'text' that are shorter than 4 characters
df[df['text'].str.len() < 4]['text'].unique()

# Return unique combinations of subject and reporter from the above
# df[df['text'].str.len() == 2][['subject', 'reporter']].drop_duplicates()

In [None]:
# Shape before filtering
print(df.shape)
# Keep only rows whose text has at least 4 characters. Rows with missing text
# are dropped as well, because a NaN length never satisfies `>= 4`.
# `.copy()` makes the result an independent frame, so the columns added in the
# following cells do not trigger pandas' SettingWithCopyWarning.
df = df[df['text'].str.len() >= 4].copy()
# Shape after filtering
print(df.shape)

In [None]:
# (Re)build the domain columns 'a', 'b', 'c', 'd', 'e', 'f', 'g' and 'any'.
# They need to be remade, as there was a mistake in the data collection.
# Column 'a' is an ANY on a1, a2, ..., a8. Same rule applies to the other letters.

letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g']

for l in letters:
    # Select the subdomain columns from the known feature list rather than
    # from every column of df: a plain prefix match on df.columns would also
    # pick up any other column starting with the same letter (including the
    # aggregate column itself when this cell is re-run).
    cols = [c for c in features if c.startswith(l)]
    df[l] = df[cols].any(axis=1).astype(int)

# The overall column 'any' is an ANY on a, b, c, d, e, f, g
df['any'] = df[letters].any(axis=1).astype(int)

# Write as text_anno (without the index column)
df.to_csv('data/text_anno.tsv', sep='\t', index=False)

### Holdout list

The set of holdout patients was manually developed by Dylan based on picking a subset of holdouts that showed a similar distribution of symptom occurrence per reporter. Dylan's reasoning can be found in the Slack channel. 

In [None]:
# Subjects reserved for the test (holdout) set
holdouts = [
    'BEH2360',
    'TAV2103',
    'ISL2227',
    'BEH2357',
    'ISL2224',
    'BEH2303',
    'TAV2139',
    'TAV2101'
]

# Boolean mask of rows belonging to a holdout subject; `~` is the documented
# way to invert a boolean mask (unary minus is not defined for NumPy booleans)
is_holdout = df["subject"].isin(holdouts)
tst = df[is_holdout]
trn = df[~is_holdout]

# Sum each column for tst and trn
tst_sum = tst[features].sum()
trn_sum = trn[features].sum()

# Create a new data frame with the sums (one row per feature)
df_sum = pd.DataFrame([trn_sum, tst_sum], index=['trn', 'tst']).T
df_sum

In [None]:
# Write the holdout and training splits of the `all` dataset.
# No `index=False` here, so the row index is written as an unnamed first column.
tst.to_csv("data/tst/all.tsv", sep="\t")
trn.to_csv("data/trn/all.tsv", sep="\t")

### Repeating for other datasets

We will repeat basically the same steps as above on `pt_noshort` and `turns`.

In [None]:
# Subdomain labels; the cells below read one TSV per entry
features = [
    'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8',
    'b1', 'b2', 'b3',
    'c1', 'c2', 'c3', 'c4', 
    'd1', 'd2', 'd3', 'd4', 'd5', 'd6',
    'e1', 'e2', 'e3', 'e4', 'e5',
    'f1', 'f2', 'f3', 
    'g1', 'g2', 
    # Aggregate columns are intentionally left out of this list:
    # 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    # 'all'
]

# Domain prefixes; the cells below build one ANY column per letter
letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g']

def write_tsvs(df, holdouts, fname):
    """Split `df` by subject and write the two parts as TSV files.

    Rows whose "subject" is in `holdouts` are written to
    data/tst/{fname}.tsv, all remaining rows to data/trn/{fname}.tsv.
    The row index is written as the first (unnamed) column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least a "subject" column.
    holdouts : list-like
        Subject identifiers that make up the test set.
    fname : str
        File stem used for both output files.
    """
    # `~` is the supported operator for inverting a boolean mask
    is_holdout = df["subject"].isin(holdouts)
    tst = df[is_holdout]
    trn = df[~is_holdout]

    tst.to_csv(f"data/tst/{fname}.tsv", sep="\t")
    trn.to_csv(f"data/trn/{fname}.tsv", sep="\t")


#### pt_noshort

Even though this dataset is pronounced 'patient no short', it really takes the non-interviewers. 

The cutoff is 13 characters (that is, include everything with 13 characters or more), as this is the shortest positive example found. 

In [None]:
# One data frame per feature for the pt_noshort segmentation
ptns_tsv = {}
for f in features:
    ptns_tsv[f] = pd.read_csv(f'data/labeled/pt_noshort/{f}.tsv', sep='\t')

# Every per-feature TSV should list the same passages in the same order;
# report any feature whose "text" column differs from a1's
for f in features:
    if ptns_tsv['a1']['text'].equals(ptns_tsv[f]['text']):
        continue
    print(f'Column "text" in {f} is different from a1')

# Start the merged table from A1, with its 'positive' column renamed to 'a1'
df = ptns_tsv['a1'].copy().rename(columns={'positive': 'a1'})
df['a1'] = df['a1'].astype(int)

In [None]:
# Add the other subdomain columns
for f in features[1:]:
    df[f] = ptns_tsv[f]['positive'].astype(int)

# Domain columns: ANY over the subdomains sharing the letter. The columns are
# taken from `features` so that unrelated columns (or a domain column left
# over from a previous run of this cell) that merely start with the same
# letter are never included.
for l in letters:
    cols = [c for c in features if c.startswith(l)]
    df[l] = df[cols].any(axis=1).astype(int)

# For this dataset the overall label is read from all.tsv instead of derived
temp = pd.read_csv('data/labeled/pt_noshort/all.tsv', sep='\t')
df['any'] = temp['positive'].astype(int)

In [None]:
# Sum each column for tst and trn
# (`~` is the supported way to invert the boolean holdout mask)
is_holdout = df["subject"].isin(holdouts)
tst = df[is_holdout]
trn = df[~is_holdout]
tst_sum = tst[features].sum()
trn_sum = trn[features].sum()

# Create a new data frame with the sums (one row per feature)
df_sum = pd.DataFrame([trn_sum, tst_sum], index=['trn', 'tst']).T
df_sum

In [None]:
# Check sum of features
# Only the last expression of a cell is displayed, so the domain-level sums
# are printed explicitly; otherwise that table is built and thrown away.
print(pd.DataFrame([df[letters].sum()], index=['all']))
pd.DataFrame([df[features].sum()], index=['all'])

In [None]:
# (rows, columns) of the merged pt_noshort table
print(df.shape)

In [None]:
# Writes data/tst/pt_noshort.tsv and data/trn/pt_noshort.tsv
write_tsvs(df, holdouts, "pt_noshort")

#### Turns

A turn in conversation is, ideally, the interviewer and then the patient. Due to quirks in the data, in practice it's more like a chunk that starts with an interviewer. 

In [None]:
# Repeat the above, but 'turns' instead of 'pt_noshort'
turns_tsv = {
    f: pd.read_csv(f'data/labeled/turns/{f}.tsv', sep='\t') for f in features
}

# Report any feature whose "text" column differs from a1's
for f in features:
    if not turns_tsv['a1']['text'].equals(turns_tsv[f]['text']):
        print(f'Column "text" in {f} is different from a1')

# Start the merged table from A1, with its 'positive' column renamed to 'a1'
df = turns_tsv['a1'].copy()
df = df.rename(columns={'positive': 'a1'})
# Cast to int like every other label column (and like the 'all' and
# 'pt_noshort' tables), so that a1 is not the only column with another dtype
df['a1'] = df['a1'].astype(int)


In [None]:
# Show the unique texts that are shorter than 7 characters
df[df['text'].str.len() < 7]['text'].unique()

In [None]:
# Shape before filtering
print(df.shape)

# Remove rows whose 'text' ends with 'I: P: ' or 'I: P:'
# `.copy()` makes the result an independent frame, so the columns added in the
# next cell do not trigger pandas' SettingWithCopyWarning.
df = df[~df['text'].str.endswith(('I: P: ', 'I: P:'))].copy()

# Shape after filtering
print(df.shape)

In [None]:
# Add the other subdomain columns; assignment aligns on the row index, so the
# rows removed from df above are simply skipped
for f in features[1:]:
    df[f] = turns_tsv[f]['positive'].astype(int)

# Domain columns: ANY over the subdomains sharing the letter. The columns are
# taken from `features` so that unrelated columns (or a domain column left
# over from a previous run of this cell) that merely start with the same
# letter are never included.
for l in letters:
    cols = [c for c in features if c.startswith(l)]
    df[l] = df[cols].any(axis=1).astype(int)

# Overall label: ANY over the domain columns
df['any'] = df[letters].any(axis=1).astype(int)

In [None]:
# Check sum of features (disabled)
# pd.DataFrame([df[letters].sum()], index=['all']).T
# pd.DataFrame([df[features].sum()], index=['all']).T

# Writes data/tst/turns.tsv and data/trn/turns.tsv
write_tsvs(df, holdouts, "turns")

## Passage lengths

Previously performed in R but converted to Python for consistency. 

### Data summary

After rejoining the TSVs, we can look at a basic summary table of their min, max, range, etc.

In [None]:
import numpy as np

def rejoin_tsvs(fname):
    """Recombine the trn and tst TSVs of one dataset and add length columns.

    Reads data/trn/{fname}.tsv and data/tst/{fname}.tsv, stacks them (training
    rows first, original row indices kept) and adds:

    - 'nchar': number of characters in 'text'
    - 'log_nchar': log10 of 'nchar', with 0 used when 'nchar' is 0
    """
    def _log10_or_zero(n):
        # log10(0) is undefined; empty passages are mapped to 0
        if n == 0:
            return 0
        return np.log10(n)

    trn_part = pd.read_csv(f'data/trn/{fname}.tsv', sep='\t')
    tst_part = pd.read_csv(f'data/tst/{fname}.tsv', sep='\t')
    combined = pd.concat([trn_part, tst_part])

    lengths = combined['text'].str.len()
    combined['nchar'] = lengths
    combined['log_nchar'] = lengths.apply(_log10_or_zero)

    return combined

# Dataset (segmentation) names; each has a trn and a tst TSV
fnames = ['all', 'pt_noshort', 'turns']
# Recombined table for each dataset, keyed by dataset name
passages = {
    f: rejoin_tsvs(f) for f in fnames
}

In [None]:
# Print a short summary for each dataset: name, shape, number of passages
# with any feature, and the shortest / longest passage length

for f in fnames:
    current = passages[f]
    nchar = current['nchar']

    print(f)
    print(current.shape)
    print('Passages w/ any feature:', current['any'].sum())
    # Range of characters
    print('Range of characters:', nchar.min(), nchar.max())
    print()

### Make histograms

Histograms are faceted by the segmentation type. Additionally, positive and negative labels are plotted separately, using the feature 'any' for visual clarity. 

In [None]:
import matplotlib.pyplot as plt

# Create a 1 x 3 grid of plots (one panel per entry of `fnames`)
fig, axs = plt.subplots(1, 3, figsize=(12, 4))

# Paper-friendly names, in the same order as `fnames`
new_names = [
    'Original', 
    'Monologue', 
    'Turns'
]

for i, f in enumerate(fnames):
    # Plot histogram of 'log_nchar' for all passages (disabled)
    # passages[f]['log_nchar'].hist(ax=axs[i])
    
    # Plot histogram of 'log_nchar' where 'any' is 1
    passages[f][passages[f]['any'] == 1]['log_nchar'].hist(
        ax=axs[i], 
        bins=20, 
        alpha=0.5, 
        color='green',
        label='Any feature'
    )
    # Plot histogram of 'log_nchar' where 'any' is 0, on the same axes
    passages[f][passages[f]['any'] == 0]['log_nchar'].hist(
        ax=axs[i],
        bins=20, 
        alpha=0.5, 
        color='skyblue',
        label='No feature'
    )   

    # Fix the x-axis to log10 values 0.5 to 4.0
    axs[i].set_xlim(0.5, 4.0)

    # Label the ticks with 10^x so the axis reads in characters
    axs[i].set_xticks(np.arange(0.5, 4.1, 0.5))
    axs[i].set_xticklabels([f'{10 ** x:.0f}' for x in np.arange(0.5, 4.1, 0.5)])

    # Various labels
    axs[i].set_title(new_names[i])
    axs[i].set_xlabel('No. of characters')
    axs[i].set_ylabel('Frequency')

# Create legend at the bottom below the axis labels
fig.legend(['Any feature', 'No feature'], loc='lower center', ncol=2, bbox_to_anchor=(0.5, -0.05))

fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Create a 1 x 3 grid of plots (one panel per entry of `fnames`)
fig, axs = plt.subplots(1, 3, figsize=(12, 4))

# Paper-friendly names, in the same order as `fnames`
new_names = [
    'Original', 
    'Monologue-like', 
    'Turns'
]

# Passages longer than this many characters are left out of the plot
max_char = 1000

for i, f in enumerate(fnames):
    # Plot histogram of 'log_nchar' for all passages (disabled)
    # passages[f]['log_nchar'].hist(ax=axs[i])

    # Fix axes to max_char
    axs[i].set_xlim(0, max_char)
    # Filter df by max_char
    temp = passages[f][passages[f]['nchar'] <= max_char]
    
    # Plot histogram of 'nchar' (linear scale) where 'any' is 1
    temp[temp['any'] == 1]['nchar'].hist(
        ax=axs[i], 
        bins=20, 
        alpha=0.5, 
        color='green',
        label='Any feature'
    )
    # Plot histogram of 'nchar' where 'any' is 0, on the same axes
    temp[temp['any'] == 0]['nchar'].hist(
        ax=axs[i],
        bins=20, 
        alpha=0.5, 
        color='skyblue',
        label='No feature'
    )   

    # Various labels
    axs[i].set_title(new_names[i])
    axs[i].set_xlabel('No. of characters')
    axs[i].set_ylabel('Frequency')

# Create legend at the bottom below the axis labels
fig.legend(['Any feature', 'No feature'], loc='lower center', ncol=2, bbox_to_anchor=(0.5, -0.05))

fig.tight_layout()
plt.show()

## Label distribution

We can display the counts of each feature for each type of data in a simple wide table. 

In [None]:
# Wide table of counts of feature by each dataset
feats = features + ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'any']

# DataFrame.append was removed in pandas 2.0, so the table is built in one
# step: one column of sums per dataset, transposed to one row per dataset.
df = pd.DataFrame({f: passages[f][feats].sum() for f in fnames}).T
df.index.name = 'dataset'

# Replace names
df = df.rename(index={'all': 'Original', 'pt_noshort': 'Monologue-like', 'turns': 'Turns'})
df = df.rename(columns={f: f.upper() for f in feats})
# Show features as rows and datasets as columns
df.T.head()

#### Compared with training

We can also append columns corresponding to the number of annotations in the training set (non-holdout).

In [None]:
import pandas as pd
# Dataset (segmentation) names
fnames = ['all', 'pt_noshort', 'turns']
# Training-set (non-holdout) table for each dataset, keyed by dataset name
trn_passages = {
    f: pd.read_csv(f'data/trn/{f}.tsv', sep='\t') for f in fnames
}

In [None]:
# Wide table of counts of feature by each dataset (training set only)
feats = features + ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'any']

# DataFrame.append was removed in pandas 2.0, so the table is built in one
# step: one column of sums per dataset, transposed to one row per dataset.
trn_df = pd.DataFrame({f: trn_passages[f][feats].sum() for f in fnames}).T
trn_df.index.name = 'dataset'

# Replace names
trn_df = trn_df.rename(index={'all': 'Original', 'pt_noshort': 'Monologue-like', 'turns': 'Turns'})
trn_df = trn_df.rename(columns={f: f.upper() for f in feats})

In [None]:
# First rows of the transposed count table (features as rows)
df.T.head()

In [None]:
# Make trn_df.T new columns of df.T
# (columns coming from trn_df.T get the '_trn' suffix where the names overlap)
all_df = df.T.join(trn_df.T, rsuffix='_trn')

# Print as LaTeX
print(all_df.to_latex())

## Subject number distributions

Similar to above, we can create tables listing the number of unique subjects for each feature. 

In [None]:
segmentations = ['Orig', 'Mono', 'Turn']


def _subject_count_table(frames):
    """Count the unique subjects with a positive label, per feature and dataset.

    `frames` maps each name in `fnames` to a data frame. The result has one
    row per entry of `feats`, a leading 'Feature' column and one column per
    segmentation.
    """
    per_dataset = []
    for data in fnames:
        frame = frames[data]
        per_dataset.append(
            [frame.loc[frame[feat] == 1, 'subject'].nunique() for feat in feats]
        )
    # Rows are datasets at this point; transpose so that rows are features
    table = pd.DataFrame(np.array(per_dataset).T, columns=segmentations)
    # Insert feats as first column
    table.insert(0, 'Feature', feats)
    return table


# Full data first, then the training set only
df = _subject_count_table(passages)
trn_df = _subject_count_table(trn_passages)

# Merge data; training-set columns get the suffix '1'
all_df = df.merge(trn_df, on='Feature', suffixes=('', '1'))
# Sort by 'Feature'
all_df = all_df.sort_values('Feature')
# Make Feature all upper
all_df['Feature'] = all_df['Feature'].str.upper()

# Print as LaTeX
print(all_df.to_latex())

## Embeddings

The average of the last hidden state of the model will be provided as the inputs to logistic regression. The final hidden layer has dimension $n_{tokens} \times d_{hidden}$, where $d_{hidden} = 768$ for base BERT. The average produces a $1 \times 768$ tensor. Alternatively, a sum of the last few layers can also be used as the inputs. 

As this code takes some time to run, even with GPU acceleration, the embeds are saved in the `data/trn` directory and read from the file. Embeds are generated in the 00A and 00B executables. 

In [None]:
# Run this in a faster machine, like radium or europium
# ! python 00A-embeds.py

### BERT embedding distribution

To get an idea of the structure of the data, a heatmap of the embeddings is provided below. 

To be visually informative, the range of the heatmap is roughly the mean plus or minus 1.5 standard deviations. The heatmap shows banding; this may be the result of divisions between the patient IDs. 

In [None]:
import torch
import numpy as np

# map_location='cpu' lets tensors that were saved from a GPU be loaded on a
# machine without one; the data is moved to the CPU below in any case.
with open('data/trn/bert_base_uncased-last_avg-all.t','rb') as f:
    embeds = torch.load(f, map_location='cpu')

embeds_np = embeds.cpu().numpy()

# Overall min, max, mean and standard deviation of the embedding values
print(
    np.min(embeds_np), 
    np.max(embeds_np),
    np.mean(embeds_np),
    np.std(embeds_np)
)

Adding lines indicating the patient and reporter type boundaries does show that banding is correlated with the ID.

In [None]:
from itertools import groupby


def _change_points(labels):
    """Return every position i where labels[i] differs from labels[i - 1]."""
    neighbours = zip(labels, labels[1:])
    return [pos for pos, (prev, cur) in enumerate(neighbours, 1) if prev != cur]


text_anno = pd.read_csv("data/trn/all.tsv", sep="\t")
text_id = text_anno["subject"].values
text_type = text_anno["reporter"].values

# Row positions at which the subject / the reporter type changes
id_idx = _change_points(text_id)
type_idx = _change_points(text_type)

In [None]:
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(20, 10))
# Colour range is clipped to [-0.5, 0.5]
plt.imshow(embeds_np, vmin=-.5, vmax=.5, aspect=.05)
plt.colorbar()
# Subject boundaries are drawn in red on the left and right thirds, reporter
# boundaries in white in the middle.
# NOTE(review): the x-ranges (0-150, 151-617, 618-767) look chosen for a
# 768-wide embedding - confirm before reusing with other models.
plt.hlines(y=id_idx, xmin=0, xmax=150, color='r', linestyle='-', linewidth=0.5)
plt.hlines(y=id_idx, xmin=618, xmax=767, color='r', linestyle='-', linewidth=0.5)
plt.hlines(y=type_idx, xmin=151, xmax=617, color='w', linestyle='--', linewidth=1)
plt.show()

We can also group together all the reporters from the same ID. We build a dictionary and concatenate its values because using `np.argsort` directly scrambles the secondary grouping by type, rather than keeping transcripts from the same reporter in the same chunk. 

In [None]:
# Subject and reporter of every row, in file order
text_id = text_anno["subject"].values
text_type = text_anno["reporter"].values
unique_ids = text_anno['subject'].unique()

# Reporter types, in the order they should appear within each subject.
# NOTE(review): rows whose reporter is not one of these are left out of
# `id_grouped` - confirm that no other reporter value occurs.
type_list = ['y', 'p', 't']

# Row positions per subject and reporter type
id_dict = {}
for i in unique_ids:
    id_dict[i] = {}
    for t in type_list:
        id_dict[i][t] = np.where((text_id == i) & (text_type == t))[0]

# Row order: subject by subject, and within a subject one reporter after
# another. (A per-subject index list that was built here before and then
# immediately overwritten has been removed.)
id_grouped = []
for i in id_dict:
    temp = [id_dict[i][t] for t in type_list]
    temp = np.concatenate(temp)
    id_grouped.append(temp)
id_grouped = np.concatenate(id_grouped)

text_id = text_id[id_grouped]
text_type = text_type[id_grouped]

# Row positions at which the subject / the reporter type changes
id_idx = [i for i, (a, b) in enumerate(zip(text_id, text_id[1:]), 1) if a != b]
type_idx = [i for i, (a, b) in enumerate(zip(text_type, text_type[1:]), 1) if a != b]

embeds_grouped = embeds_np[id_grouped]

fig = plt.figure(figsize=(20, 10))
plt.imshow(embeds_grouped, vmin=-.5, vmax=.5, aspect=.05)
plt.colorbar()
plt.hlines(y=id_idx, xmin=0, xmax=150, color='r', linestyle='-', linewidth=0.5)
plt.hlines(y=id_idx, xmin=618, xmax=767, color='r', linestyle='-', linewidth=0.5)
plt.hlines(y=type_idx, xmin=151, xmax=450, color='w', linestyle='--', linewidth=1)
plt.show()

### Comparing embeddings from different models

We can repeat the code above to compare the embeddings across BERT base uncased, mental BERT, and mental longformer. 

In [None]:
models = [
    'bert_base_uncased',
    'mental_bert', 
    'roberta'
]
text_anno = pd.read_csv("data/trn/all.tsv", sep="\t")

# The row ordering depends only on text_anno, not on the model, so it is
# computed once instead of once per model. This also removes a leftover line
# that read `text_id` before this cell had assigned it.
text_id = text_anno["subject"].values
text_type = text_anno["reporter"].values
unique_ids = text_anno["subject"].unique()

# Reporter types, in the order they should appear within each subject
type_list = ["y", "p", "t"]

# Row positions per subject and reporter type
id_dict = {}
for i in unique_ids:
    id_dict[i] = {}
    for t in type_list:
        id_dict[i][t] = np.where((text_id == i) & (text_type == t))[0]

# Row order: subject by subject, and within a subject one reporter after another
id_grouped = []
for i in id_dict:
    temp = [id_dict[i][t] for t in type_list]
    temp = np.concatenate(temp)
    id_grouped.append(temp)
id_grouped = np.concatenate(id_grouped)

text_id = text_id[id_grouped]
text_type = text_type[id_grouped]

# Row positions at which the subject / the reporter type changes
id_idx = [i for i, (a, b) in enumerate(zip(text_id, text_id[1:]), 1) if a != b]
type_idx = [i for i, (a, b) in enumerate(zip(text_type, text_type[1:]), 1) if a != b]

embeds = {}

for model in models:
    # map_location='cpu' lets tensors saved from a GPU load on a CPU-only machine
    with open(f'data/trn/{model}-last_avg-all.t','rb') as f:
        x = torch.load(f, map_location='cpu')

    # Convert embeddings to numpy
    x = x.cpu().numpy()

    # Store the rows in grouped order
    embeds[model] = x[id_grouped]


In [None]:
# Table of summary statistics (min, max, mean, std) of the embedding values,
# one row per model
import numpy as np
import pandas as pd

# Statistic name -> function applied to the whole embedding array
summary_stats = {'min': np.min, 'max': np.max, 'mean': np.mean, 'std': np.std}

embeds_df = []

for model in models:
    embeds_np = embeds[model]
    row = {'model': model}
    for stat_name, stat_fn in summary_stats.items():
        row[stat_name] = [stat_fn(embeds_np)]
    embeds_df.append(pd.DataFrame(row))

embeds_df = pd.concat(embeds_df, ignore_index=True)
embeds_df

In [None]:
# Plot the embeddings side by side (one panel per model)
fig, axs = plt.subplots(1, 3, figsize=(30, 10))

for i, model in enumerate(models):
    axs[i].imshow(embeds[model], vmin=-.5, vmax=.5, aspect=.05)
    # Subject boundaries in red, reporter boundaries in white
    axs[i].hlines(y=id_idx, xmin=0, xmax=150, color='r', linestyle='-', linewidth=0.5)
    axs[i].hlines(y=id_idx, xmin=618, xmax=767, color='r', linestyle='-', linewidth=0.5)
    axs[i].hlines(y=type_idx, xmin=151, xmax=450, color='w', linestyle='--', linewidth=1)

    # Title
    axs[i].set_title(model)
    # Add colorbar
    cbar = axs[i].figure.colorbar(axs[i].images[0], ax=axs[i])

# tight_layout has to be called; the bare attribute reference did nothing
plt.tight_layout()
plt.show()

## Text examples

To get an idea of the components of the data, we will list unique passages (adjusted to be all lowercase) from each symptom (and also divided by positive and negative). These examples will be saved in `/Datasets/impactme/text_ex`.

In [None]:
import pandas as pd

# Merged annotation table of the `all` dataset
text_anno = pd.read_csv("data/text_anno.tsv", sep='\t')

In [None]:
# NOTE(review): columns[5:] is assumed to hold the label columns - confirm
# against the header of data/text_anno.tsv.
for s in text_anno.columns[5:]:
    # Unique lower-cased examples per label value, shortest first. Ties in
    # length are broken alphabetically: sorting a set by length alone leaves
    # equal-length strings in hash order, which changes from run to run.
    pos = text_anno[text_anno[s] == 1]["quote_t"].values
    pos = sorted({q.lower() for q in pos}, key=lambda q: (len(q), q))

    neg = text_anno[text_anno[s] == 0]["quote_t"].values
    neg = sorted({q.lower() for q in neg}, key=lambda q: (len(q), q))

    # Save text files in `/Datasets/impactme/text_ex`
    with open(f"/Datasets/impactme/text_ex/{s}_pos.txt", "w") as f:
        f.write("\n".join(pos))
    with open(f"/Datasets/impactme/text_ex/{s}_neg.txt", "w") as f:
        f.write("\n".join(neg))

    # Print some examples
    print(f"{s} pos: {pos[:5]}")
    print(f"{s} neg: {neg[:11]}")    
    

In [None]:
# Counting the most common short neg text
from collections import Counter

# Concatenate together the unique negative examples of every label column.
# Each text is counted once per column, so its count is the number of
# columns in which it occurs as a negative example.
# Ties in length are broken alphabetically so that the order of the list (and
# with it the order of equal counts in `most_common`) is the same on every run.
neg_text = []

for s in text_anno.columns[5:]:
    neg = text_anno[text_anno[s] == 0]["quote_t"].values
    neg = sorted({q.lower() for q in neg}, key=lambda q: (len(q), q))
    neg_text += neg

# Get the counts for each element
neg_counts = Counter(neg_text)

# Repeat for positive examples
pos_text = []

for s in text_anno.columns[5:]:
    pos = text_anno[text_anno[s] == 1]["quote_t"].values
    pos = sorted({q.lower() for q in pos}, key=lambda q: (len(q), q))
    pos_text += pos

pos_counts = Counter(pos_text)

In [None]:
# Write the counts to tab-separated text files, most common text first
# Headers: text, count
for out_path, counts in (
    ("/Datasets/impactme/text_ex/neg_counts.tsv", neg_counts),
    ("/Datasets/impactme/text_ex/pos_counts.tsv", pos_counts),
):
    with open(out_path, "w") as f:
        f.write("text\tcount\n")
        f.writelines(f"{text}\t{count}\n" for text, count in counts.most_common())

In [None]:
def _count_table(counts, label_value):
    """Turn a Counter of texts into a table with per-symptom membership flags.

    The table has the columns 'text', 'nchar' and 'total', plus one 0/1 column
    per symptom that tells whether the text occurs among that symptom's rows
    with label `label_value`. Rows are sorted by 'total' (descending), then by
    'nchar' (ascending).
    """
    table = pd.DataFrame.from_dict(counts, orient='index')
    table.reset_index(inplace=True)
    table.columns = ['text', 'total']

    # Create a new column for each symptom indicating whether the text appears
    for s in text_anno.columns[5:]:
        quotes = text_anno[text_anno[s] == label_value]["quote_t"].values
        lowered = {q.lower() for q in quotes}
        table[s] = table['text'].isin(lowered).astype(int)

    # Insert column for number of characters in the text at index 1
    table.insert(1, 'nchar', table['text'].str.len())
    return table.sort_values(by=['total', 'nchar'], ascending=[False, True])


# Positive counts as a pandas DataFrame, written to
# `/Datasets/impactme/text_ex/pos_counts_all.tsv`
pos_df = _count_table(pos_counts, 1)
with open("/Datasets/impactme/text_ex/pos_counts_all.tsv", "w") as f:
    pos_df.to_csv(f, sep="\t", index=False)

# Same for the negative counts
neg_df = _count_table(neg_counts, 0)
with open("/Datasets/impactme/text_ex/neg_counts_all.tsv", "w") as f:
    neg_df.to_csv(f, sep="\t", index=False)


In [None]:
# The six shortest negative texts
neg_df.sort_values(by='nchar', ascending=True).head(6)

In [None]:
# The six shortest positive texts
pos_df.sort_values(by='nchar', ascending=True).head(6)

## Notes on the other datasets

As of a meeting regarding BERT learning (October 4, 2023), the dataset will be adjusted in an effort to make training more informative. 

The following modifications have been proposed: 

- Only using passages with more than 12 characters
- Patient passages only
- Patient passages with more than 12 characters
- Interviewer-patient cycles instead of individual turns (no entries are eliminated here, to preserve the turns)

The cutoff of 12 characters was chosen because the shortest positively labeled passage is 13 characters long. 

The datasets without modifications are saved in `data/trn/all.tsv` and `data/tst/all.tsv`.

We will be using the same set of holdouts as identified previously. 

As of 2024 Feb 22, the code used to create these datasets has been deleted, as the task was outsourced to Dylan. 

The data with only non-interviewer entries more than 12 characters is `pt_noshort`. Labeling of full turns of conversation is `turns`.

## Llama embeds

Llama embeds were generated on the original, uncleaned data. We can gather the correct indices to drop for each data type (all, pt_noshort, and turns) using the following.

### Dropped rows

The basic premise is that Llama embeds need to be converted into tensors for training and test sets. 

We need to collect the indices for:

1. Dropped rows (empty lines in transcript)
2. Retained and holdout set

In [None]:
# Count the number of indices in a1.tsv 

In [None]:
import pandas as pd
import numpy as np

datas = [
    'all',
    'pt_noshort',
    'turns'
]

holdouts = [
    'BEH2360',
    'TAV2103',
    'ISL2227',
    'BEH2357',
    'ISL2224',
    'BEH2303',
    'TAV2139',
    'TAV2101'
]


def _split_indices(frame, keep):
    """Return the positional (trn, tst) indices of the retained rows.

    `keep` is a boolean array with one entry per row of `frame`; rows whose
    "subject" is in `holdouts` go to tst, all other retained rows to trn.
    Both results are sorted integer arrays.
    """
    keep = np.asarray(keep, dtype=bool)
    held_out = frame['subject'].isin(holdouts).to_numpy()
    return np.flatnonzero(keep & ~held_out), np.flatnonzero(keep & held_out)


# For each dataset: [training indices, test indices]
keep_indices = {d: [] for d in datas}

### 'all' ###

# Keep rows whose text has at least 4 characters. This is the same condition
# that was used when data/trn/all.tsv and data/tst/all.tsv were written, so
# rows with missing text (NaN length) are excluded here too; testing
# `len < 4` for the rows to drop would keep them and shift every later index.
df = pd.read_csv('data/labeled/all/a1.tsv', sep='\t')
keep = (df['text'].str.len() >= 4).to_numpy()

trn_ind, tst_ind = _split_indices(df, keep)
keep_indices['all'].append(trn_ind)
keep_indices['all'].append(tst_ind)

print('all:')
print(f'{len(trn_ind)} rows in trn, {len(tst_ind)} rows in tst')

### 'pt_noshort' ###

df = pd.read_csv('data/labeled/pt_noshort/a1.tsv', sep='\t')
# No need for drops because the data is already limited to 13+ characters
trn_ind, tst_ind = _split_indices(df, np.ones(len(df), dtype=bool))
keep_indices['pt_noshort'].append(trn_ind)
keep_indices['pt_noshort'].append(tst_ind)

print('pt_noshort:')
print(f'{len(trn_ind)} rows in trn, {len(tst_ind)} rows in tst')

### 'turns' ###

# Drop rows whose text ends with 'I: P: ' or 'I: P:'
df = pd.read_csv('data/labeled/turns/a1.tsv', sep='\t')
keep = ~np.asarray(df['text'].str.endswith(('I: P: ', 'I: P:')), dtype=bool)

trn_ind, tst_ind = _split_indices(df, keep)
keep_indices['turns'].append(trn_ind)
keep_indices['turns'].append(tst_ind)

print('turns:')
print(f'{len(trn_ind)} rows in trn, {len(tst_ind)} rows in tst')

### CSV to torch conversion

We now translate the very inefficient CSV format to the much more compact torch format. 

In [None]:
# Convert the Llama embedding CSVs to torch tensors, split into trn and tst
# NOTE(review): the CSVs are read from `llama/`, not `data/llama` - confirm
# which location is intended.
import pandas as pd
import torch
import os

datas = [
    'all',
    'pt_noshort', 
    'turns'
]

# NOTE(review): this rebinds `fnames`, which earlier cells use for the dataset
# names; re-run those cells' own definitions before re-running them.
fnames = [
    'Meta-Llama-3-8B', 
    'Meta-Llama-3-8B-Instruct'
]

save_names = [
    'llama3_8b', 
    'llama3_8b_instruct'
]
# sizes = ['70b']

# Takes around 1 m 10 s to run
for data in datas:
    print(f'Processing {data}...')
    for f, save_name in zip(fnames, save_names):
        trn_path = f'data/trn/{save_name}-last_avg-{data}.t'
        tst_path = f'data/tst/{save_name}-last_avg-{data}.t'

        # Skip if both tensors already exist (checked directly instead of
        # listing both directories on every iteration)
        if os.path.isfile(trn_path) and os.path.isfile(tst_path):
            print(f'File already exists: {save_name}-last_avg-{data}.t')
            continue

        print(f'    Processing {f}...')
        try:
            llama = pd.read_csv(f'llama/{f}_{data}.csv', header=None)
        except FileNotFoundError:
            print(f'File not found: llama/{f}_{data}.csv')
            continue

        print(llama.shape)

        # Convert to torch
        llama_tensor = torch.tensor(llama.values)

        # keep_indices[data] is [training rows, test rows]
        llama_trn = llama_tensor[keep_indices[data][0]]
        llama_tst = llama_tensor[keep_indices[data][1]]

        # The handle has its own name so it no longer shadows the loop
        # variable `f` (the model name)
        with open(trn_path, 'wb') as out_file:
            torch.save(llama_trn, out_file)

        with open(tst_path, 'wb') as out_file:
            torch.save(llama_tst, out_file)
        
        # Free the large intermediates before the next file is read
        del llama, llama_tensor, llama_trn, llama_tst

### Comparing embeddings

Similar to above, we can create heatmaps comparing the embeddings of BERT to the three Llama sizes. 

Note that this hasn't yet been done for the Llama 3 models. 

In [None]:
# Column names of the currently loaded annotation table
text_anno.columns

In [None]:
models = [
    'bert_base_uncased',
    'llama_7b',
    'llama_13b',
    'llama_70b',
]
data = 'turns'
text_anno = pd.read_csv(f"data/trn/{data}.tsv", sep="\t")

# The row ordering depends only on text_anno, not on the model, so it is
# computed once instead of once per model. (A per-subject index list that was
# built and then immediately overwritten has been removed.)
unique_ids = text_anno["subject"].unique()
text_id = text_anno["subject"].values
text_type = text_anno["reporter"].values

# Reporter types, in the order they should appear within each subject
type_list = ["y", "p", "t"]

# Row positions per subject and reporter type
id_dict = {}
for i in unique_ids:
    id_dict[i] = {}
    for t in type_list:
        id_dict[i][t] = np.where((text_id == i) & (text_type == t))[0]

# Row order: subject by subject, and within a subject one reporter after another
id_grouped = []
for i in id_dict:
    temp = [id_dict[i][t] for t in type_list]
    temp = np.concatenate(temp)
    id_grouped.append(temp)
id_grouped = np.concatenate(id_grouped)

text_id = text_id[id_grouped]
text_type = text_type[id_grouped]

# Row positions at which the subject / the reporter type changes
id_idx = [i for i, (a, b) in enumerate(zip(text_id, text_id[1:]), 1) if a != b]
type_idx = [i for i, (a, b) in enumerate(zip(text_type, text_type[1:]), 1) if a != b]

embeds = {}

for model in models:
    # map_location='cpu' lets tensors saved from a GPU load on a CPU-only machine
    with open(f'data/trn/{model}-last_avg-{data}.t','rb') as f:
        x = torch.load(f, map_location='cpu')

    # Convert embeddings to numpy
    x = x.cpu().numpy()

    # Store the rows in grouped order
    embeds[model] = x[id_grouped]

In [None]:
# Plot the embeddings side by side (one panel per model)
fig, axs = plt.subplots(1, 4, figsize=(30, 10))

# Shared colour range for all panels
vminlim = -1
vmaxlim = 1

# A different aspect ratio is passed for each model
axs[0].imshow(embeds['bert_base_uncased'], vmin=vminlim, vmax=vmaxlim, aspect=.1)
axs[1].imshow(embeds['llama_7b'], vmin=vminlim, vmax=vmaxlim, aspect=0.53)
axs[2].imshow(embeds['llama_13b'], vmin=vminlim, vmax=vmaxlim, aspect=0.67)
axs[3].imshow(embeds['llama_70b'], vmin=vminlim, vmax=vmaxlim, aspect=1.07)

for i, model in enumerate(models):
    # Subject boundaries in red, reporter boundaries in white.
    # NOTE(review): the x-ranges (0-150, 151-450, 618-767) are the ones used
    # for the BERT plots; confirm they are intended for wider embeddings too.
    axs[i].hlines(y=id_idx, xmin=0, xmax=150, color='r', linestyle='-', linewidth=0.5)
    axs[i].hlines(y=id_idx, xmin=618, xmax=767, color='r', linestyle='-', linewidth=0.5)
    axs[i].hlines(y=type_idx, xmin=151, xmax=450, color='w', linestyle='--', linewidth=1)

    # Title
    axs[i].set_title(model)
    # Add colorbar
    cbar = axs[i].figure.colorbar(axs[i].images[0], ax=axs[i])

# tight_layout has to be called; the bare attribute reference did nothing
plt.tight_layout()
plt.show()

In [None]:
# NOTE(review): this loop only rebinds `df` to each model's embedding array
# and does nothing with it; it looks like an unfinished stub.
for model in models:
    df = embeds[model]
    

In [None]:
# One stacked panel per model (4 panels, matching the 4 entries of `models`):
# standard deviation of each embedding dimension across passages
fig, axes = plt.subplots(4)
for ax, model in zip(axes, models):
    ax.plot(embeds[model].std(0))


## Hidden dimension and context length counts

In [None]:
import torch

models = [
    'llama_7b', 
    'llama3_8b',
    'bert_base_uncased',
    'mental_bert', 
    'mental_longformer'
]

# Print the shape of each model's training-set embedding tensor
for model in models:
    # map_location='cpu' lets tensors saved from a GPU load on a CPU-only
    # machine; only the shape is needed here
    with open(f'data/trn/{model}-last_avg-all.t','rb') as f:
        x = torch.load(f, map_location='cpu')
    print(model, x.shape)