#### Load libraries

In [20]:
import pandas as pd
import numpy as np
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    Trainer, 
    TrainingArguments,
    EarlyStoppingCallback
)
from datasets import Dataset, DatasetDict
from sklearn.metrics import classification_report, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import zipfile
import os

In [21]:
# Check GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


#### Download AnnoMI data

In [22]:
## ## 2. Download AnnoMI Dataset


# %%
def download_annomi_dataset():
    """Download and extract AnnoMI dataset"""
    url = "https://github.com/uccollab/AnnoMI/archive/refs/heads/main.zip"
    zip_path = "annomi.zip"
    extract_path = "./data"
   
    # Download
    if not os.path.exists(extract_path):
        print("Downloading AnnoMI dataset...")
        response = requests.get(url)
        with open(zip_path, 'wb') as f:
            f.write(response.content)
       
        # Extract
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
        os.remove(zip_path)
        print("Dataset downloaded and extracted!")
    else:
        print("Dataset already exists!")
   
    return extract_path


In [23]:
data_path = download_annomi_dataset()

Dataset already exists!


#### Read in the data set

In [42]:
# %% [markdown]
## ## 3. Load and Preprocess Data

# %%
def load_annomi_data(data_path):
    """Load and preprocess AnnoMI conversations"""
    # The AnnoMI dataset has a simple CSV file
    annomi_csv_path = os.path.join(data_path, "AnnoMI-main", "AnnoMI-simple.csv")
    
    # Check if file exists
    if not os.path.exists(annomi_csv_path):
        # Try alternative path
        annomi_csv_path = os.path.join(data_path, "AnnoMI-main", "data", "AnnoMI-simple.csv")
        
    if not os.path.exists(annomi_csv_path):
        print(f"Error: Could not find AnnoMI-simple.csv at {annomi_csv_path}")
        print(f"Available files in {data_path}:")
        for root, dirs, files in os.walk(data_path):
            for file in files:
                print(os.path.join(root, file))
        return pd.DataFrame()
    
    print(f"Loading data from: {annomi_csv_path}")
    
    # Load the CSV file
    df = pd.read_csv(annomi_csv_path)
    
    # Display column names to understand structure
    #print(f"\nColumns in dataset: {df.columns.tolist()}")
    #print(f"Dataset shape: {df.shape}")
    #print(f"\nFirst few rows:")
    #print(df.head())
    
    # Typical AnnoMI structure has columns like:
    # - utterance_id, session_id, interlocutor (speaker role)
    # - utterance_text, main_therapist_behaviour_code, etc.
    
    # Rename columns for consistency
    column_mapping = {
        'interlocutor': 'role',
        'utterance_text': 'text',
        'main_therapist_behaviour_code': 'code'
    }
    
    # Apply mapping for columns that exist
    df = df.rename(columns={k: v for k, v in column_mapping.items() if k in df.columns})
    
    # Filter out rows without behavior codes (clients don't have codes)
    if 'code' in df.columns:
        df = df.dropna(subset=['code'])
    
    # Keep only therapist utterances for forecasting
    if 'role' in df.columns:
        print(f"\nRole distribution:")
        print(df['role'].value_counts())
    
    # Clean and standardize
    if 'text' in df.columns:
        df['text'] = df['text'].fillna('').astype(str).str.strip()
    
    if 'code' in df.columns:
        df['code'] = df['code'].fillna('').astype(str).str.strip()
        # Remove empty codes
        df = df[df['code'] != '']
    
    #print(f"\nAfter preprocessing: {len(df)} utterances")
    
    return df


# %% [markdown]

In [44]:
# Load data
df = load_annomi_data(data_path)

Loading data from: ./data\AnnoMI-main\AnnoMI-simple.csv

Role distribution:
role
therapist    4882
client       4817
Name: count, dtype: int64


In [None]:
df.head(5)

Unnamed: 0,transcript_id,mi_quality,video_title,video_url,topic,utterance_id,role,timestamp,text,main_therapist_behaviour,client_talk_type
0,0,high,"NEW VIDEO: Brief intervention: ""Barbara""",https://www.youtube.com/watch?v=PaSKcfTmFEk,reducing alcohol consumption,0,therapist,00:00:13,Thanks for filling it out. We give this form t...,question,
1,0,high,"NEW VIDEO: Brief intervention: ""Barbara""",https://www.youtube.com/watch?v=PaSKcfTmFEk,reducing alcohol consumption,1,client,00:00:24,Sure.,,neutral
2,0,high,"NEW VIDEO: Brief intervention: ""Barbara""",https://www.youtube.com/watch?v=PaSKcfTmFEk,reducing alcohol consumption,2,therapist,00:00:25,"So, let's see. It looks that you put-- You dri...",therapist_input,
3,0,high,"NEW VIDEO: Brief intervention: ""Barbara""",https://www.youtube.com/watch?v=PaSKcfTmFEk,reducing alcohol consumption,3,client,00:00:34,Mm-hmm.,,neutral
4,0,high,"NEW VIDEO: Brief intervention: ""Barbara""",https://www.youtube.com/watch?v=PaSKcfTmFEk,reducing alcohol consumption,4,therapist,00:00:34,-and you usually have three to four drinks whe...,therapist_input,
