In [None]:
import pandas as pd
import numpy as np
import glob
import os
from scipy import stats
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, LeaveOneOut
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

In [None]:
def plot_time_series(therapist_resampled, patient_resampled, feature, filename):
    """Plot one resampled feature for therapist and patient and save it as a PNG.

    Parameters
    ----------
    therapist_resampled, patient_resampled : pandas.DataFrame
        Frames whose index is drawn on the x axis and whose ``feature``
        column is drawn on the y axis.
    feature : str
        Column to plot; also used for the title, the y label and the file name.
    filename : str
        Label used in the title and in the output file name.

    The figure is shown and written to
    ``~/WAI/TimeSeriesFigs/{filename}_{feature}_timeseries.png``.
    """
    # matplotlib hands the path to open() unchanged, and open() does not
    # expand "~"; resolve it here and make sure the directory exists.
    output_dir = os.path.expanduser('~/WAI/TimeSeriesFigs')
    os.makedirs(output_dir, exist_ok=True)

    fig, ax = plt.subplots(figsize=(15, 6))
    ax.plot(therapist_resampled.index, therapist_resampled[feature], label='Therapist', alpha=0.7)
    ax.plot(patient_resampled.index, patient_resampled[feature], label='Patient', alpha=0.7)
    ax.set_title(f'{feature} Time Series for {filename}')
    ax.set_xlabel('Time (seconds)')
    ax.set_ylabel(feature)
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, f'{filename}_{feature}_timeseries.png'))
    plt.show()
    # Close this specific figure so repeated calls do not accumulate open figures.
    plt.close(fig)

In [None]:
def _speaker_summary(speaker, speaker_df):
    """Return the descriptive features of one speaker, keyed as '<speaker>_<feature>'."""
    summary = {}
    # Insertion order matters: it becomes the column order of the feature table.
    for label, statistic in (('avg', 'mean'), ('var', 'var'), ('max', 'max'), ('min', 'min')):
        for column in ('Arousal', 'Valence'):
            summary[f'{speaker}_{label}_{column.lower()}'] = speaker_df[column].agg(statistic)

    summary[f'{speaker}_total_time'] = (speaker_df['End Time'] - speaker_df['Start Time']).sum()
    summary[f'{speaker}_num_utterances'] = len(speaker_df)

    sentiment_share = speaker_df['Sentiment'].value_counts(normalize=True)
    summary[f'{speaker}_positive_sentiment_ratio'] = sentiment_share.get('Positive', 0)
    summary[f'{speaker}_negative_sentiment_ratio'] = sentiment_share.get('Negative', 0)
    return summary


def _turn_taking_features(df):
    """Return the mean gap between the end of one speaker's row and the start of the next row."""
    ordered = df.sort_values('Start Time')
    # Gap to the following row; NaN for the last row.
    response_time = ordered['Start Time'].shift(-1) - ordered['End Time']
    speaker = ordered['Speaker']
    next_speaker = speaker.shift(-1)
    return {
        'avg_response_time_therapist_to_patient':
            response_time[(speaker == 'therapist') & (next_speaker == 'patient')].mean(),
        'avg_response_time_patient_to_therapist':
            response_time[(speaker == 'patient') & (next_speaker == 'therapist')].mean(),
    }


def _resample_to_grid(data, grid):
    """Sample ``data`` at every grid time, taking the latest row at or before that time.

    ``data`` must be indexed by ascending time. Grid times that precede the
    first row yield NaN rows. Column dtypes are preserved (numeric columns
    stay numeric), unlike filling an empty object-dtype frame row by row.
    """
    positions = np.searchsorted(data.index.to_numpy(), grid, side='right') - 1
    has_row = positions >= 0
    resampled = data.iloc[positions[has_row]].copy()
    resampled.index = grid[has_row]
    # reindex adds the leading all-NaN rows; ffill carries earlier values over
    # NaN cells that came from the data itself.
    return resampled.reindex(grid).ffill()


def _pearson_or_nan(first, second):
    """Pearson r of two equally long series, or NaN when r is undefined."""
    # r is undefined for fewer than two points or when either series is constant.
    if len(first) < 2 or first.nunique() < 2 or second.nunique() < 2:
        return np.nan
    correlation, _ = stats.pearsonr(first, second)
    return correlation


def _plot_sentiment_series(therapist_sentiment, patient_sentiment, filename):
    """Plot the 0/1 sentiment series of both speakers and save the figure as a PNG."""
    sentiment_df = pd.DataFrame({
        'Therapist': therapist_sentiment,
        'Patient': patient_sentiment
    }, index=therapist_sentiment.index)

    # savefig does not expand "~"; resolve it and make sure the directory exists.
    output_dir = os.path.expanduser('~/WAI/TimeSeriesFigs')
    os.makedirs(output_dir, exist_ok=True)

    plt.figure(figsize=(15, 6))
    sns.lineplot(data=sentiment_df)
    plt.title(f'Sentiment Time Series for {filename}')
    plt.xlabel('Time (seconds)')
    plt.ylabel('Sentiment (0: Negative, 1: Positive)')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f'{filename}_Sentiment_timeseries.png'))
    plt.show()
    plt.close()


def calculate_features(csv_file, plot=True):
    """Compute per-speaker, turn-taking and synchrony features for one session CSV.

    Parameters
    ----------
    csv_file : str
        Path to a ';'-delimited CSV with the columns 'Speaker', 'Start Time',
        'End Time', 'Arousal', 'Valence' and 'Sentiment'.
    plot : bool, default True
        When True, show and save the arousal, valence and sentiment time-series
        figures. Pass False to compute the features without any plotting.

    Returns
    -------
    dict
        'filename' (base name of ``csv_file``), one group of descriptive
        features per speaker, the two average response times, and
        'arousal_synchrony', 'valence_synchrony', 'sentiment_synchrony'
        (Pearson r between therapist and patient on a common time grid with a
        step of 1.0; NaN when undefined).
    """
    df = pd.read_csv(csv_file, delimiter=';')
    # Drop rows without a usable speaker: both the literal 'NAN' label and real missing values.
    df = df[df['Speaker'].notna() & (df['Speaker'] != 'NAN')]

    features = {'filename': os.path.basename(csv_file)}
    for speaker in df['Speaker'].unique():
        features.update(_speaker_summary(speaker, df[df['Speaker'] == speaker]))
    features.update(_turn_taking_features(df))

    # Time alignment: place every row at its midpoint and sample both speakers on a shared grid.
    timeline = df.assign(**{'Mid Time': (df['Start Time'] + df['End Time']) / 2}).sort_values('Mid Time')
    start_time = timeline['Mid Time'].min()
    end_time = timeline['Mid Time'].max()
    step = 1.0
    if np.isfinite(start_time) and np.isfinite(end_time):
        common_times = np.arange(start_time, end_time + step, step)
    else:
        # No usable rows: an empty grid makes every synchrony value NaN instead of crashing in arange.
        common_times = np.array([], dtype=float)

    therapist_resampled = _resample_to_grid(
        timeline[timeline['Speaker'] == 'therapist'].set_index('Mid Time'), common_times)
    patient_resampled = _resample_to_grid(
        timeline[timeline['Speaker'] == 'patient'].set_index('Mid Time'), common_times)

    # Keep only grid points where arousal and valence are finite for both speakers.
    valid = np.ones(len(common_times), dtype=bool)
    for frame in (therapist_resampled, patient_resampled):
        for column in ('Arousal', 'Valence'):
            frame[column] = pd.to_numeric(frame[column], errors='coerce')
            valid &= np.isfinite(frame[column].to_numpy(dtype=float))
    therapist_resampled = therapist_resampled.loc[valid]
    patient_resampled = patient_resampled.loc[valid]

    # Sentiment as 0/1 (1 = 'Positive'); used for the synchrony value and the figure.
    therapist_sentiment = (therapist_resampled['Sentiment'] == 'Positive').astype(int)
    patient_sentiment = (patient_resampled['Sentiment'] == 'Positive').astype(int)

    # Synchrony as the Pearson correlation between the aligned series.
    features['arousal_synchrony'] = _pearson_or_nan(
        therapist_resampled['Arousal'], patient_resampled['Arousal'])
    features['valence_synchrony'] = _pearson_or_nan(
        therapist_resampled['Valence'], patient_resampled['Valence'])
    features['sentiment_synchrony'] = _pearson_or_nan(therapist_sentiment, patient_sentiment)

    if plot:
        plot_time_series(therapist_resampled, patient_resampled, 'Arousal', features['filename'])
        plot_time_series(therapist_resampled, patient_resampled, 'Valence', features['filename'])
        _plot_sentiment_series(therapist_sentiment, patient_sentiment, features['filename'])

    return features

In [None]:
# Folder holding the per-session CSV files that are globbed when the features are built.
# NOTE(review): glob.glob and os.makedirs do not expand "~"; confirm the path is expanded before use.
data_folder = "~/output/SpeakerSentAnalysis"

In [None]:
# Neither os.makedirs nor glob.glob expands "~", so the home-relative paths are resolved explicitly.
# The literal directory is still created as well: any code that passes the same unexpanded
# string to matplotlib resolves it relative to the current working directory.
os.makedirs("~/WAI/TimeSeriesFigs/", exist_ok=True)
os.makedirs(os.path.expanduser("~/WAI/TimeSeriesFigs/"), exist_ok=True)

# Prefer the expanded folder; fall back to the literal path in case the data really lives there.
csv_files = (glob.glob(os.path.join(os.path.expanduser(data_folder), '*.csv'))
             or glob.glob(os.path.join(data_folder, '*.csv')))
if not csv_files:
    # Fail here with a clear message instead of a KeyError on an empty frame further down.
    raise FileNotFoundError(f"No CSV files found in {data_folder!r}")

# One feature dict per session file; build the frame once from the list of records.
all_features = [calculate_features(csv_file) for csv_file in csv_files]
features_df = pd.DataFrame(all_features)

In [None]:
# Show the feature table (one row per processed CSV file).
features_df

In [None]:
#needed to align analysed transcripts with WAI scores
def standardize_session_id(filename):
    """Turn a feature-file name into the '<id> sessie NN' key used by the WAI tables.

    Steps: strip the '_SentArVal.csv' suffix, strip a trailing date
    (d-m-yy, d-m-yyyy or yyyy-mm-dd), expand the short form 's8' to
    'sessie 08', zero-pad the session number to two digits and collapse
    runs of whitespace.

    Parameters
    ----------
    filename : str
        Base name of the analysed transcript file.

    Returns
    -------
    str
        The normalised session identifier.
    """
    session_id = re.sub(r'_SentArVal\.csv$', '', filename)  # remove csv suffix

    # Remove a trailing date from the name.
    session_id = re.sub(r'\s+\d{1,2}-\d{1,2}-(\d{2}|\d{4})$', '', session_id)
    session_id = re.sub(r'\s+\d{4}-\d{2}-\d{2}$', '', session_id)

    # Replace the short form, e.g. "s8" -> " sessie 08".
    session_id = re.sub(r'\bs(\d+)', lambda m: f' sessie {int(m.group(1)):02d}', session_id)

    # Ensure the session number is zero-padded.
    session_id = re.sub(r'sessie\s*(\d+)', lambda m: f'sessie {int(m.group(1)):02d}', session_id)

    # The short-form replacement inserts its own leading space, so "12 s8" would
    # otherwise become "12  sessie 08" (two spaces) and never match the WAI key.
    session_id = re.sub(r'\s+', ' ', session_id)

    return session_id.strip()
    
# Derive the merge key for every feature row from its file name.
session_ids = features_df['filename'].map(standardize_session_id)
features_df['session_id'] = session_ids

In [None]:
# Show the feature table again, now including the derived session_id column.
features_df

In [None]:
# WAI ratings from the three rater perspectives.
patient_wai = pd.read_csv('~/WAI/patient_btg.csv')
observer_wai = pd.read_csv('~/WAI/observer_btg.csv')

# The therapist file carries an extra 'Unnamed: 0' column, which is removed.
therapist_wai = pd.read_csv('~/WAI/Therapist_ratings.csv')
therapist_wai = therapist_wai.drop(columns=['Unnamed: 0'])

In [None]:
# Build the '<ppnr> sessie NN' merge key on each WAI table (session number zero-padded to 2).
for wai_table in (patient_wai, therapist_wai, observer_wai):
    participant = wai_table['ppnr'].astype(str)
    padded_session = wai_table['session'].astype(str).str.zfill(2)
    wai_table['session_id'] = participant + ' sessie ' + padded_session

In [None]:
# Show the patient WAI table including the new session_id key.
patient_wai

In [None]:
# Show the therapist WAI table including the new session_id key.
therapist_wai

In [None]:
# Show the observer WAI table including the new session_id key.
observer_wai

In [None]:
# Attach the WAI scores of each rater to the features, one inner merge per rater.
# The score columns are renamed to '<score>_<rater>' before merging so no suffix handling is needed.
wai_score_names = ['bond', 'goal', 'task', 'wai']
merged_df = features_df
for rater, rater_table in (('patient', patient_wai),
                           ('therapist', therapist_wai),
                           ('observer', observer_wai)):
    rater_scores = rater_table[['session_id'] + wai_score_names].rename(
        columns={score: f'{score}_{rater}' for score in wai_score_names})
    merged_df = merged_df.merge(rater_scores, on='session_id')

In [None]:
# Keep the first row for every file name; later repeats of the same file are discarded.
is_repeated_file = merged_df.duplicated(subset='filename', keep='first')
merged_df = merged_df[~is_repeated_file]

In [None]:
# Show the merged features + WAI scores table.
merged_df

In [None]:
#remove duplicate patients, keep more recent entry (later session)
# Derived from session_id ('<patient> sessie NN') instead of hard-coded row labels:
# row labels depend on the order in which files were globbed and merged, and
# dropping fixed labels raises a KeyError when the cell is run a second time.
session_parts = merged_df['session_id'].str.extract(r'^(?P<patient>.*) sessie (?P<session>\d+)$')
# Rows whose id does not follow the pattern are treated as their own patient and kept.
patient_key = session_parts['patient'].fillna(merged_df['session_id'])
session_number = pd.to_numeric(session_parts['session']).fillna(-1)
latest_row_labels = session_number.groupby(patient_key).idxmax()
# sorted() keeps the surviving rows in their original order.
merged_df = merged_df.loc[sorted(latest_row_labels)]

In [None]:
# Show the merged table after removing duplicate patients.
merged_df

In [None]:
# Persist the merged table without the row index.
merged_df.to_csv('~/WAI/merged_df.csv', index=False)

In [None]:
# WAI target columns: every score for every rater, grouped by rater.
wai_raters = ('patient', 'therapist', 'observer')
wai_scores = ('bond', 'goal', 'task', 'wai')
wai_columns = [f'{score}_{rater}' for rater in wai_raters for score in wai_scores]

# Everything that is neither a WAI target nor an identifier is a model feature.
non_feature_columns = set(wai_columns) | {'filename', 'session_id'}
feature_columns = [col for col in merged_df.columns if col not in non_feature_columns]

In [None]:
def descriptive(merged_df):
    """Print summary statistics and save histogram grids of the WAI scores and features.

    Reads the notebook-level lists ``wai_columns`` and ``feature_columns``.

    Side effects: updates matplotlib's global rcParams (font sizes), shows every
    figure, and writes ``{participant}_wai_score_distributions.png`` and
    ``{feature_type}_feature_distributions.png`` into ``~/WAI``.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Table containing the WAI columns and the feature columns.
    """
    print("Summary")
    print(merged_df.describe())

    #readability font sizes
    plt.rcParams.update({
        'font.size': 24,
        'axes.titlesize': 22,
        'axes.labelsize': 22,
        'xtick.labelsize': 18,
        'ytick.labelsize': 18,
        'legend.fontsize': 22,
        'figure.titlesize': 24
    })

    # savefig does not expand "~"; resolve it once and make sure the directory exists.
    output_dir = os.path.expanduser('~/WAI')
    os.makedirs(output_dir, exist_ok=True)

    participant_types = ['patient', 'therapist', 'observer']
    for participant in participant_types:
        wai_cols = [col for col in wai_columns if col.endswith(participant)]
        fig, axes = plt.subplots(2, 2, figsize=(16, 16))
        fig.suptitle(f"{participant.capitalize()} WAI Score Distributions", fontsize=24)
        for ax, col in zip(axes.flat, wai_cols):
            sns.histplot(merged_df[col], kde=True, ax=ax)
            ax.set_title(col, fontsize=18)
            # Re-apply the automatic labels with a larger font.
            ax.set_xlabel(ax.get_xlabel(), fontsize=20)
            ax.set_ylabel(ax.get_ylabel(), fontsize=20)
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        plt.savefig(os.path.join(output_dir, f'{participant}_wai_score_distributions.png'),
                    dpi=300, bbox_inches='tight')
        plt.show()
        plt.close(fig)

    #divide into separate figures
    feature_types = {
        'therapist': [col for col in feature_columns if col.startswith('therapist')],
        'patient': [col for col in feature_columns if col.startswith('patient')],
        'miscellaneous': [col for col in feature_columns if not (col.startswith('therapist') or col.startswith('patient'))]
    }

    num_cols = 4
    for feature_type, features in feature_types.items():
        num_features = len(features)
        if num_features == 0:
            # plt.subplots cannot build a grid with zero rows; nothing to draw for this group.
            continue
        num_rows = (num_features + num_cols - 1) // num_cols
        # squeeze=False keeps `axes` two-dimensional even when a single row is enough.
        fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, 6*num_rows), squeeze=False)
        flat_axes = axes.ravel()
        fig.suptitle(f"{feature_type.capitalize()} Feature Distributions", fontsize=24)
        for ax, feature in zip(flat_axes, features):
            sns.histplot(merged_df[feature], kde=True, ax=ax)
            ax.set_title(feature, fontsize=16)
            ax.set_xlabel(ax.get_xlabel(), fontsize=16)
            ax.set_ylabel(ax.get_ylabel(), fontsize=16)
        # Remove the unused cells of the grid.
        for ax in flat_axes[num_features:]:
            fig.delaxes(ax)
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        plt.savefig(os.path.join(output_dir, f'{feature_type}_feature_distributions.png'),
                    dpi=300, bbox_inches='tight')
        plt.show()
        plt.close(fig)

In [None]:
# Print the summary statistics and produce the distribution figures.
descriptive(merged_df)

In [None]:
def correlation_calc(df, wai_columns, feature_columns):
    """Pearson-correlate every WAI column with every feature column.

    Infinite values are treated as missing, and rows with a missing value in
    either column of a pair are dropped before correlating. Pairs with fewer
    than two remaining rows get NaN for both entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding all requested columns.
    wai_columns, feature_columns : list of str
        Column names to pair up.

    Returns
    -------
    pandas.DataFrame
        One row per pair, indexed '<wai_col>_<feature_col>', with the columns
        'correlation' and 'p_value'.
    """
    def _pearson_pair(wai_col, feature_col):
        # Complete, finite observations of this pair only.
        pair = df[[wai_col, feature_col]].replace([np.inf, -np.inf], np.nan).dropna()
        if len(pair) <= 1:
            return {'correlation': np.nan, 'p_value': np.nan}
        corr, p_value = stats.pearsonr(pair[wai_col], pair[feature_col])
        return {'correlation': corr, 'p_value': p_value}

    pair_results = {
        f"{wai_col}_{feature_col}": _pearson_pair(wai_col, feature_col)
        for wai_col in wai_columns
        for feature_col in feature_columns
    }
    return pd.DataFrame(pair_results).T

In [None]:
# Correlate every WAI score with every feature.
correlations = correlation_calc(merged_df, wai_columns, feature_columns)

In [None]:
# Rank the pairs by correlation strength regardless of sign.
correlations['abs_correlation'] = correlations['correlation'].abs()
correlations_sorted = correlations.sort_values(by='abs_correlation', ascending=False)

In [None]:
# Print the 20 pairs with the largest absolute correlation.
print(correlations_sorted.head(20))

In [None]:
# Persist the sorted correlation table (the pair names are stored as the index column).
correlations_sorted.to_csv('~/WAI/wai_feature_correlations.csv')

In [None]:
# Feature matrix for the regression models (all non-WAI, non-identifier columns).
# NOTE(review): calculate_features can emit NaN (e.g. the *_synchrony fields); scikit-learn
# estimators reject NaN input by default — confirm X has no missing values or impute first.
X = merged_df[feature_columns]

In [None]:
# Show the feature matrix.
X

In [None]:
def final_models(X, y, target_name, max_k=5):
    """Compare a mean baseline, linear regression, RBF SVR and KNN with leave-one-out CV.

    Every model is refitted on each leave-one-out training split and scored on
    the single held-out row. The reported score is the leave-one-out RMSE,
    i.e. the square root of the mean squared held-out error. (Averaging a
    per-fold RMSE would give the mean absolute error, because each fold holds
    exactly one sample.)

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one row per sample.
    y : pandas.Series
        Target values aligned with ``X``.
    target_name : str
        Label stored under 'target' in the result.
    max_k : int, default 5
        Largest number of neighbours tried for KNN. It is capped at the size
        of the training split (``len(X) - 1``).

    Returns
    -------
    final_results : dict
        {'target', 'Mean', 'Linear', 'SVR', 'KNN': {'best_k', 'best_rmse'}}.
    knn_results : dict
        Leave-one-out RMSE for every evaluated k.
    """
    # KNN cannot use more neighbours than there are training rows.
    k_values = range(1, min(max_k, len(X) - 1) + 1)

    estimators = {
        'Linear': LinearRegression(),
        'SVR': SVR(kernel='rbf')
    }
    for k in k_values:
        estimators[f'KNN (k={k})'] = KNeighborsRegressor(n_neighbors=k)

    squared_errors = {model_name: [] for model_name in ['Mean', *estimators]}

    for train_index, test_index in LeaveOneOut().split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train = y.iloc[train_index]
        y_true = y.iloc[test_index].iloc[0]

        # Baseline: predict the mean of the TRAINING targets only, so the
        # held-out value never leaks into its own prediction.
        squared_errors['Mean'].append((y_true - y_train.mean()) ** 2)

        for model_name, estimator in estimators.items():
            estimator.fit(X_train, y_train)
            y_pred = estimator.predict(X_test)[0]
            squared_errors[model_name].append((y_true - y_pred) ** 2)

    rmse = {model_name: np.sqrt(np.mean(errors)) for model_name, errors in squared_errors.items()}

    #best knn
    knn_results = {k: rmse[f'KNN (k={k})'] for k in k_values}
    best_k = min(knn_results, key=knn_results.get)

    final_results = {
        'target': target_name,
        'Mean': rmse['Mean'],
        'Linear': rmse['Linear'],
        'SVR': rmse['SVR'],
        'KNN': {
            'best_k': best_k,
            'best_rmse': knn_results[best_k]
        }
    }

    return final_results, knn_results

In [None]:
# Evaluate all models once per WAI target and collect the summaries.
model_results, knnr_results = [], {}
for target in wai_columns:
    target_summary, knn_rmse_by_k = final_models(X, merged_df[target], target)
    model_results.append(target_summary)
    knnr_results[target] = knn_rmse_by_k

model_results_df = pd.DataFrame(model_results)
print(model_results_df)

In [None]:
# Persist the per-target model comparison.
model_results_df.to_csv('~/WAI/model_rmse.csv')

In [None]:
# One panel per WAI target showing the KNN score as a function of k.
fig, axes = plt.subplots(3, 4, figsize=(20, 15))
fig.suptitle('KNN Regression Performance for WAI Targets', fontsize=22)

for ax, (target, results) in zip(axes.flat, knnr_results.items()):
    k_values = list(results.keys())
    rmse_values = [results[k] for k in k_values]

    ax.plot(k_values, rmse_values, marker='s', label='RMSE')
    ax.set_title(target)
    ax.set_xlabel('k')
    ax.set_ylabel('Score')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
# savefig does not expand "~"; resolve the path and make sure the directory exists.
knn_figure_path = os.path.expanduser("~/WAI/KNNWAI.png")
os.makedirs(os.path.dirname(knn_figure_path), exist_ok=True)
plt.savefig(knn_figure_path, dpi=300, bbox_inches='tight')
plt.show()