In [5]:
# !pip install tensorflow sentence-transformers numpy pandas scikit-learn
# !pip install ipywidgets

In [17]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Bidirectional, Dense, Dropout, Masking
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
tqdm.pandas() 

In [7]:
class AttentionLayer(Layer):
    """Mask-aware, parameter-free attention pooling over the time axis.

    Reduces a ``(batch, timesteps, features)`` tensor to ``(batch, features)``.
    Scores are ``tanh`` of the inputs, normalised with a softmax along the
    time axis independently for every feature channel; the result is the
    weighted sum of the inputs over time. The layer has no trainable weights.

    When the incoming tensor carries a Keras mask (e.g. from an upstream
    ``Masking`` layer), masked timesteps are excluded from the softmax so that
    padding does not receive any attention weight.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Declare mask support so Keras hands the upstream mask to `call`
        # instead of dropping it (or warning about it).
        self.supports_masking = True

    def call(self, lstm_output, mask=None):
        """Pool ``lstm_output`` over axis 1.

        Args:
            lstm_output: tensor of shape ``(batch, timesteps, features)``.
            mask: optional boolean tensor of shape ``(batch, timesteps)``;
                ``False`` marks timesteps to ignore.

        Returns:
            Tensor of shape ``(batch, features)``.
        """
        score = tf.nn.tanh(lstm_output)

        if mask is not None:
            # Add a trailing axis so the per-timestep mask broadcasts over features.
            keep = tf.expand_dims(tf.cast(mask, tf.bool), axis=-1)
            # tanh is bounded to [-1, 1]; the dtype's most negative value drives
            # the softmax weight of masked steps to zero without producing -inf
            # (which would yield NaN if an entire sequence were masked).
            floor = tf.constant(score.dtype.min, dtype=score.dtype)
            score = tf.where(keep, score, floor)

        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = tf.reduce_sum(attention_weights * lstm_output, axis=1)
        return context_vector

    def compute_mask(self, inputs, mask=None):
        """Stop mask propagation: the time axis no longer exists in the output."""
        return None


In [10]:
def split_utterances(text):
    """Split a conversation string into its non-empty utterances.

    Utterances are separated by the literal marker ``<uttr>``. Each piece is
    stripped of surrounding whitespace and pieces that end up empty are dropped.

    Args:
        text: conversation string. ``None`` or a float NaN (how pandas
            represents an empty spreadsheet cell) is treated as "no text".

    Returns:
        list[str]: the stripped utterances, in their original order; empty
        when there is no text.
    """
    # `text != text` is true only for NaN; guard it so missing cells do not
    # blow up on `.split`.
    if text is None or (isinstance(text, float) and text != text):
        return []

    stripped = (piece.strip() for piece in text.split("<uttr>"))
    return [piece for piece in stripped if piece]

In [11]:
# Load the pretrained "all-mpnet-base-v2" SentenceTransformer; later cells call
# `sbert.encode` on lists of utterances to obtain one embedding per utterance.
sbert = SentenceTransformer("all-mpnet-base-v2")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [18]:
# Read the utterance-level spreadsheet into a DataFrame. The path is relative,
# so the file must sit in the notebook's working directory.
df = pd.read_excel("data_for_hvp_rankings_utterance_level.xlsx")

In [19]:
# Preview the first rows: ID columns, the "VQ ..."/"SQ ..." columns and the
# "Message" text whose utterances are separated by "<uttr>".
df.head()

Unnamed: 0,Record ID#,Patient ID#,VQ A 6,VQ B 9,VQ C 10,VQ D 11,VQ E 13,VQ F 5,VQ G 17,VQ H 16,...,SQ J 4,SQ K 1,SQ L 18,SQ M 2,SQ N 14,SQ O 8,SQ P 15,SQ Q 3,SQ R 7,Message
0,5146,1977,7,9,8,11,15,6,16,14,...,14,17,13,8,15,1,9,2,16,No i'm driving actually I don't know if you ca...
1,5139,5422,6,7,18,16,17,8,15,14,...,3,9,10,11,15,14,13,16,12,i'm good. How the virus.<uttr>know. The day be...
2,5120,17915,1,9,8,10,12,2,11,13,...,3,10,18,4,9,5,15,6,7,hi. I always get stuck in those. Like which li...
3,5118,5422,1,8,9,10,11,7,18,17,...,7,8,15,9,14,10,13,11,12,i'm good. How the virus.<uttr>know. The day be...
4,5100,10859,7,6,10,11,13,8,17,16,...,4,5,10,7,13,6,14,8,9,Hello.<uttr>yeah I can hear you. mmm i'm Stan ...


In [20]:
# Encode every conversation into a fixed-size (max_uttrs, embed_dim) matrix of
# utterance embeddings. This is the slow step of the notebook (one SBERT pass
# per conversation).
embedded_sequences = []
max_uttrs = 100

# Ask the encoder for its output width instead of hard-coding it, so swapping
# the SBERT checkpoint does not silently break the padding shape.
embed_dim = sbert.get_sentence_embedding_dimension()

for txt in tqdm(df["Message"], desc="Encoding utterances"):
    # Empty spreadsheet cells come back as NaN (not str); treat them as
    # conversations with no utterances rather than crashing mid-run.
    utts = split_utterances(txt) if isinstance(txt, str) else []

    if utts:
        embeddings = sbert.encode(utts, convert_to_numpy=True)
    else:
        # Keeps row alignment with `df`; the row becomes pure padding.
        embeddings = np.zeros((0, embed_dim), dtype=np.float32)

    if len(embeddings) > max_uttrs:
        # Too long: keep max_uttrs utterances evenly spaced across the
        # conversation (first and last included) instead of truncating.
        idx = np.linspace(0, len(embeddings) - 1, max_uttrs).astype(int)
        embeddings = embeddings[idx]
    else:
        # Too short: append all-zero rows, which the model's
        # Masking(mask_value=0.0) layer treats as padding. Matching the dtype
        # avoids upcasting the whole array to float64.
        pad_len = max_uttrs - len(embeddings)
        padding = np.zeros((pad_len, embed_dim), dtype=embeddings.dtype)
        embeddings = np.vstack([embeddings, padding])

    embedded_sequences.append(embeddings)

# Stack into a single (n_conversations, max_uttrs, embed_dim) array.
embedded_sequences = np.array(embedded_sequences)

Encoding utterances:  57%|█████▋    | 377/667 [2:41:29<2:04:13, 25.70s/it]


KeyboardInterrupt: 

In [None]:
# Target columns: the same 18 lettered items (A-R), each paired with a fixed
# item number, appear once under the "VQ" prefix and once under "SQ",
# giving 36 columns in VQ-then-SQ order.
item_numbers = [6, 9, 10, 11, 13, 5, 17, 16, 12, 4, 1, 18, 2, 14, 8, 15, 3, 7]
item_suffixes = [
    f"{letter} {number}"
    for letter, number in zip("ABCDEFGHIJKLMNOPQR", item_numbers)
]
target_cols = [
    f"{prefix} {suffix}"
    for prefix in ("VQ", "SQ")
    for suffix in item_suffixes
]

# Raw target matrix, one row per record and one column per target.
y = df[target_cols].to_numpy()

# Standardise each target column to zero mean / unit variance.
# NOTE(review): the scaler is fitted here on every row, including rows that are
# later held out for validation and testing.
scaler_y = StandardScaler().fit(y)
y_scaled = scaler_y.transform(y)

In [None]:
# Split by patient, not by row. The data preview shows the same patient (and
# the same conversation text) on several rows; a plain row-wise shuffle would
# put near-duplicates of training conversations into validation/test and
# inflate every reported metric.
groups = df["Patient ID#"].to_numpy()
assert len(groups) == len(embedded_sequences) == len(y), (
    "features, targets and patient IDs are misaligned "
    "(was the encoding cell interrupted?)"
)

# Stage 1: hold out 30% of patients.
outer_split = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_idx, temp_idx = next(outer_split.split(embedded_sequences, groups=groups))

# Stage 2: of the held-out patients, 1/3 go to validation and 2/3 to test.
inner_split = GroupShuffleSplit(n_splits=1, test_size=2/3, random_state=42)
val_pos, test_pos = next(inner_split.split(temp_idx, groups=groups[temp_idx]))
val_idx, test_idx = temp_idx[val_pos], temp_idx[test_pos]

X_train, X_val, X_test = (
    embedded_sequences[idx] for idx in (train_idx, val_idx, test_idx)
)

# Fit the target scaler on training rows only so no validation/test statistics
# leak into training; later cells use this `scaler_y` to undo the scaling.
scaler_y = StandardScaler().fit(y[train_idx])
y_train, y_val, y_test = (
    scaler_y.transform(y[idx]) for idx in (train_idx, val_idx, test_idx)
)

# Result: ~70% train, 10% val, 20% test (proportions are of patients, so the
# record counts below are approximate).
print(f"Training: {len(X_train)} records")
print(f"Validation: {len(X_val)} records")
print(f"Testing: {len(X_test)} records")

In [None]:
# Take the tensor sizes from the prepared data instead of repeating literals,
# so the network always matches the encoder width, the padded sequence length
# and the number of target columns.
n_timesteps, n_features = X_train.shape[1:]
n_targets = y_train.shape[1]

input_layer = Input(shape=(n_timesteps, n_features))

# All-zero timesteps are padding; mask them so the recurrent layers skip them.
x = Masking(mask_value=0.0)(input_layer)

# Two stacked bidirectional LSTMs that keep the full output sequence.
x = Bidirectional(LSTM(256, return_sequences=True))(x)
x = Dropout(0.3)(x)

x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = Dropout(0.3)(x)

# Pool the sequence into one vector per conversation.
x = AttentionLayer()(x)

# Regression head: one linear output per standardised target.
x = Dense(128, activation="relu")(x)
x = Dropout(0.3)(x)

x = Dense(64, activation="relu")(x)

output = Dense(n_targets, activation="linear")(x)

model = Model(inputs=input_layer, outputs=output)

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss="mse",
    metrics=["mae"]
)

model.summary()


In [None]:
# Train with early stopping. Validation uses the dedicated (X_val, y_val)
# hold-out created by the split cell; `validation_split` would instead slice
# off the tail of the training data and leave that hold-out unused.
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=40,
    batch_size=8,
    callbacks=[
        # Stop after 5 epochs without val_loss improvement and roll back to
        # the best weights seen.
        tf.keras.callbacks.EarlyStopping(
            monitor="val_loss", patience=5, restore_best_weights=True
        )
    ]
)


In [None]:
# Score the model on the held-out test split. `y_test` holds standardised
# targets, so both numbers are in standardised units.
loss, mae = model.evaluate(X_test, y_test)
for label, value in (("Test MSE:", loss), ("Test MAE:", mae)):
    print(label, value)


In [None]:
# Predict on the test split; the model outputs values in the scaler's
# standardised space.
y_pred_scaled = model.predict(X_test)
# Undo the target scaling on both predictions and ground truth so the metrics
# computed in later cells are in the original target units.
y_pred = scaler_y.inverse_transform(y_pred_scaled)
y_true = scaler_y.inverse_transform(y_test)

In [None]:
def predict_traits(conversation_text):
    """Predict the target vector for one raw conversation string.

    The text is preprocessed the same way as the training data: split on
    ``<uttr>``, each utterance embedded with ``sbert``, then brought to exactly
    ``max_uttrs`` rows — long conversations by evenly spaced subsampling,
    short ones by zero-padding.

    Args:
        conversation_text: conversation with utterances separated by ``<uttr>``.

    Returns:
        1-D array with one value per target column, mapped back to the
        original target scale via ``scaler_y``.

    Raises:
        ValueError: if the text contains no non-empty utterance.
    """
    utts = split_utterances(conversation_text)
    if not utts:
        raise ValueError("conversation_text contains no non-empty utterances")

    emb = sbert.encode(utts, convert_to_numpy=True)

    if len(emb) > max_uttrs:
        # Same strategy as the training-time encoding: sample evenly across
        # the whole conversation rather than keeping only its beginning.
        idx = np.linspace(0, len(emb) - 1, max_uttrs).astype(int)
        emb = emb[idx]
    else:
        pad_len = max_uttrs - len(emb)
        # Pad with all-zero rows of the encoder's own width and dtype.
        padding = np.zeros((pad_len, emb.shape[1]), dtype=emb.dtype)
        emb = np.vstack([emb, padding])

    # Add the batch axis, predict, and take the single result row.
    pred = model.predict(emb[np.newaxis, :])[0]
    pred_unscaled = scaler_y.inverse_transform([pred])[0]
    return pred_unscaled


In [None]:
import numpy as np

# Mean absolute error per target column (averaged over test records).
abs_errors = np.abs(y_pred - y_true)
trait_mae = abs_errors.mean(axis=0)
print("Trait-wise MAE:\n", trait_mae)


In [None]:
# Root-mean-squared error per target column (averaged over test records).
mean_sq_errors = np.square(y_pred - y_true).mean(axis=0)
trait_rmse = np.sqrt(mean_sq_errors)
print("Trait-wise RMSE:\n", trait_rmse)


In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination per target column. The column count is read
# from the data so this stays correct if the target list changes.
n_traits = y_true.shape[1]
trait_r2 = [r2_score(y_true[:, i], y_pred[:, i]) for i in range(n_traits)]
print("Trait-wise R²:\n", trait_r2)


In [None]:
# Average each per-trait metric over all target columns.
summary_rows = (
    ("Mean MAE:", trait_mae.mean()),
    ("Mean RMSE:", trait_rmse.mean()),
    ("Mean R²:", np.mean(trait_r2)),
)
for label, value in summary_rows:
    print(label, value)


In [None]:
import matplotlib.pyplot as plt

# True-vs-predicted scatter for a single target column.
i = 0  # trait index 0–35
fig, ax = plt.subplots()
ax.scatter(y_true[:, i], y_pred[:, i])
ax.set_xlabel("True Trait Value")
ax.set_ylabel("Predicted Trait Value")
ax.set_title(f"Trait {i} - True vs Predicted")
plt.show()


In [None]:
from scipy.stats import pearsonr

# Pearson correlation between true and predicted values, per target column.
# `pearsonr` returns (statistic, p-value); only the statistic is kept. The
# column count is read from the data rather than hard-coded.
n_traits = y_true.shape[1]
trait_corr = [pearsonr(y_true[:, i], y_pred[:, i])[0] for i in range(n_traits)]
print("Trait-wise Pearson Correlation:\n", trait_corr)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Radar chart comparing the per-trait mean of the true and predicted values
# over the test set.
mean_true = y_true.mean(axis=0)
mean_pred = y_pred.mean(axis=0)

# One spoke per trait, evenly spaced around the circle.
angles = np.linspace(0, 2 * np.pi, len(mean_true), endpoint=False).tolist()
# Repeat the first point at the end so each outline closes into a polygon.
mean_true = np.concatenate((mean_true, [mean_true[0]]))
mean_pred = np.concatenate((mean_pred, [mean_pred[0]]))
angles += angles[:1]

plt.figure(figsize=(10, 10))
ax = plt.subplot(111, polar=True)
# Label the curves: without a legend the two outlines are indistinguishable.
ax.plot(angles, mean_true, linewidth=2, label="True")
ax.plot(angles, mean_pred, linewidth=2, label="Predicted")
ax.fill(angles, mean_true, alpha=0.2)
ax.fill(angles, mean_pred, alpha=0.2)
ax.legend(loc="upper right", bbox_to_anchor=(1.15, 1.1))
plt.title("Mean Trait Pattern: True vs Predicted")
plt.show()
