#### Load libraries

In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    Trainer, 
    TrainingArguments,
    EarlyStoppingCallback
)
from datasets import Dataset, DatasetDict
from sklearn.metrics import classification_report, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import zipfile
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Check GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


#### Download AnnoMI data

In [3]:
## ## 2. Download AnnoMI Dataset


# %%
def download_annomi_dataset():
    """Download and extract AnnoMI dataset"""
    url = "https://github.com/uccollab/AnnoMI/archive/refs/heads/main.zip"
    zip_path = "annomi.zip"
    extract_path = "./data"
   
    # Download
    if not os.path.exists(extract_path):
        print("Downloading AnnoMI dataset...")
        response = requests.get(url)
        with open(zip_path, 'wb') as f:
            f.write(response.content)
       
        # Extract
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
        os.remove(zip_path)
        print("Dataset downloaded and extracted!")
    else:
        print("Dataset already exists!")
   
    return extract_path


In [4]:
data_path = download_annomi_dataset()

Dataset already exists!


#### Step 1: Read in the data set

In [5]:
# %% [markdown]
## ## 3. Load and Preprocess Data

# %%
def load_annomi_data(data_path):
    """Load and preprocess AnnoMI conversations"""
    # The AnnoMI dataset has a simple CSV file
    annomi_csv_path = os.path.join(data_path, "AnnoMI-main", "AnnoMI-simple.csv")
    
    # Check if file exists
    if not os.path.exists(annomi_csv_path):
        # Try alternative path
        annomi_csv_path = os.path.join(data_path, "AnnoMI-main", "data", "AnnoMI-simple.csv")
        
    if not os.path.exists(annomi_csv_path):
        print(f"Error: Could not find AnnoMI-simple.csv at {annomi_csv_path}")
        print(f"Available files in {data_path}:")
        for root, dirs, files in os.walk(data_path):
            for file in files:
                print(os.path.join(root, file))
        return pd.DataFrame()
    
    print(f"Loading data from: {annomi_csv_path}")
    
    # Load the CSV file
    df = pd.read_csv(annomi_csv_path)
    
    # Display column names to understand structure
    #print(f"\nColumns in dataset: {df.columns.tolist()}")
    #print(f"Dataset shape: {df.shape}")
    #print(f"\nFirst few rows:")
    #print(df.head())
    
    # Typical AnnoMI structure has columns like:
    # - utterance_id, session_id, interlocutor (speaker role)
    # - utterance_text, main_therapist_behaviour_code, etc.
    
    # Rename columns for consistency
    column_mapping = {
        'interlocutor': 'role',
        'utterance_text': 'text',
        'main_therapist_behaviour_code': 'code'
    }
    
    # Apply mapping for columns that exist
    df = df.rename(columns={k: v for k, v in column_mapping.items() if k in df.columns})
    
    # Filter out rows without behavior codes (clients don't have codes)
    if 'code' in df.columns:
        df = df.dropna(subset=['code'])
    
    # Keep only therapist utterances for forecasting
    if 'role' in df.columns:
        print(f"\nRole distribution:")
        print(df['role'].value_counts())
    
    # Clean and standardize
    if 'text' in df.columns:
        df['text'] = df['text'].fillna('').astype(str).str.strip()
    
    if 'code' in df.columns:
        df['code'] = df['code'].fillna('').astype(str).str.strip()
        # Remove empty codes
        df = df[df['code'] != '']
    
    #print(f"\nAfter preprocessing: {len(df)} utterances")
    
    return df


# %% [markdown]

In [6]:
# Load data
df = load_annomi_data(data_path)

Loading data from: ./data\AnnoMI-main\AnnoMI-simple.csv

Role distribution:
role
therapist    4882
client       4817
Name: count, dtype: int64


In [7]:
df.head(5)

Unnamed: 0,transcript_id,mi_quality,video_title,video_url,topic,utterance_id,role,timestamp,text,main_therapist_behaviour,client_talk_type
0,0,high,"NEW VIDEO: Brief intervention: ""Barbara""",https://www.youtube.com/watch?v=PaSKcfTmFEk,reducing alcohol consumption,0,therapist,00:00:13,Thanks for filling it out. We give this form t...,question,
1,0,high,"NEW VIDEO: Brief intervention: ""Barbara""",https://www.youtube.com/watch?v=PaSKcfTmFEk,reducing alcohol consumption,1,client,00:00:24,Sure.,,neutral
2,0,high,"NEW VIDEO: Brief intervention: ""Barbara""",https://www.youtube.com/watch?v=PaSKcfTmFEk,reducing alcohol consumption,2,therapist,00:00:25,"So, let's see. It looks that you put-- You dri...",therapist_input,
3,0,high,"NEW VIDEO: Brief intervention: ""Barbara""",https://www.youtube.com/watch?v=PaSKcfTmFEk,reducing alcohol consumption,3,client,00:00:34,Mm-hmm.,,neutral
4,0,high,"NEW VIDEO: Brief intervention: ""Barbara""",https://www.youtube.com/watch?v=PaSKcfTmFEk,reducing alcohol consumption,4,therapist,00:00:34,-and you usually have three to four drinks whe...,therapist_input,


#### Step 2: Train-test split

In [8]:
# %% [markdown]
#### Prepare Data for BERT Classification

# %%
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Filter to therapist utterances only
therapist_df = df[df['role'] == 'therapist'].copy()

# Remove rows with missing behavior codes
therapist_df = therapist_df.dropna(subset=['main_therapist_behaviour'])

print(f"Total therapist utterances with behavior codes: {len(therapist_df)}")



Total therapist utterances with behavior codes: 4882


In [9]:
# Map behavior codes to focus on main categories
# Adjust this mapping based on your actual codes
def map_behavior_codes(code):
    """Map behavior codes to main categories"""
    code = str(code).upper()
    
    if 'REFLECTION' in code or code.startswith('R'):
        return 'Reflection'
    elif 'QUESTION' in code or code.startswith('Q'):
        return 'Question'
    elif 'THERAPIST_INPUT' in code or code.startswith('T'):
        return 'Input'
    else:
        return 'Other'

In [10]:
# Apply mapping (adjust based on actual column values)
therapist_df['behavior_category'] = therapist_df['main_therapist_behaviour'].apply(map_behavior_codes)
therapist_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4882 entries, 0 to 9698
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   transcript_id             4882 non-null   int64 
 1   mi_quality                4882 non-null   object
 2   video_title               4882 non-null   object
 3   video_url                 4882 non-null   object
 4   topic                     4882 non-null   object
 5   utterance_id              4882 non-null   int64 
 6   role                      4882 non-null   object
 7   timestamp                 4882 non-null   object
 8   text                      4882 non-null   object
 9   main_therapist_behaviour  4882 non-null   object
 10  client_talk_type          0 non-null      object
 11  behavior_category         4882 non-null   object
dtypes: int64(2), object(10)
memory usage: 495.8+ KB


In [11]:
print("\n4-Class Distribution:")
print(therapist_df['behavior_category'].value_counts())
print("\nPercentages:")
print(therapist_df['behavior_category'].value_counts(normalize=True) * 100)


4-Class Distribution:
behavior_category
Other         1586
Question      1386
Reflection    1296
Input          614
Name: count, dtype: int64

Percentages:
behavior_category
Other         32.486686
Question      28.390004
Reflection    26.546497
Input         12.576813
Name: proportion, dtype: float64


In [12]:
# Prepare features and labels
X = therapist_df['text'].values
y = therapist_df['behavior_category'].values

In [13]:
# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print(f"\nLabel Mapping:")
for idx, label in enumerate(label_encoder.classes_):
    print(f"  {label}: {idx}")


Label Mapping:
  Input: 0
  Other: 1
  Question: 2
  Reflection: 3


In [14]:
# Train/Test split (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, 
    test_size=0.2, 
    random_state=42, 
    stratify=y_encoded
)

print(f"\nTrain size: {len(X_train)}")
print(f"Test size: {len(X_test)}")

print("\nTrain label distribution:")
train_dist = pd.Series(y_train).value_counts().sort_index()
for idx, count in train_dist.items():
    print(f"  {label_encoder.classes_[idx]}: {count}")

print("\nTest label distribution:")
test_dist = pd.Series(y_test).value_counts().sort_index()
for idx, count in test_dist.items():
    print(f"  {label_encoder.classes_[idx]}: {count}")


Train size: 3905
Test size: 977

Train label distribution:
  Input: 491
  Other: 1268
  Question: 1109
  Reflection: 1037

Test label distribution:
  Input: 123
  Other: 318
  Question: 277
  Reflection: 259


#### Step 3: Create HuggingFace Datasets and Tokenize

In [15]:
# %% [markdown]
#### Tokenize Data for BERT

# %%
from datasets import Dataset

# Create HuggingFace datasets
train_data = Dataset.from_dict({
    'text': X_train,
    'label': y_train
})

test_data = Dataset.from_dict({
    'text': X_test,
    'label': y_test
})



In [16]:
# Load BERT tokenizer
MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=128  # Adjust based on text length analysis
    )

# Tokenize datasets
print("Tokenizing datasets...")
train_dataset = train_data.map(tokenize_function, batched=True)
test_dataset = test_data.map(tokenize_function, batched=True)

# Set format for PyTorch
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

print("Tokenization complete!")
print(f"Train dataset: {len(train_dataset)} examples")
print(f"Test dataset: {len(test_dataset)} examples")

Tokenizing datasets...


Map: 100%|██████████| 3905/3905 [00:00<00:00, 17598.42 examples/s]
Map: 100%|██████████| 977/977 [00:00<00:00, 15130.93 examples/s]

Tokenization complete!
Train dataset: 3905 examples
Test dataset: 977 examples





#### Step 4: Train BERT Model

In [17]:
# %% [markdown]
#### Train BERT Classification Model

# %%
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import numpy as np

# Load pre-trained BERT model
num_labels = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels
)
model.to(device)

# Define metrics computation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    # Calculate F1 scores
    f1_macro = f1_score(labels, predictions, average='macro')
    f1_weighted = f1_score(labels, predictions, average='weighted')
    
    # Per-class F1 scores
    f1_per_class = f1_score(labels, predictions, average=None)
    
    metrics = {
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
    }
    
    # Add per-class F1 scores
    for idx, class_name in enumerate(label_encoder.classes_):
        metrics[f'f1_{class_name}'] = f1_per_class[idx]
    
    return metrics



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# ...existing code...
import sys, transformers, accelerate, datasets
print("python:", sys.executable)
print("transformers:", transformers.__version__, transformers.__file__)
print("accelerate:", accelerate.__version__, accelerate.__file__)
print("datasets:", datasets.__version__, datasets.__file__)

from transformers.utils.versions import require_version
require_version("accelerate>=0.26.0")

python: c:\Users\lnbco\OneDrive\Documents\GitHub\interspeech2022-motivational-interviewing\venv\Scripts\python.exe
transformers: 4.57.1 c:\Users\lnbco\OneDrive\Documents\GitHub\interspeech2022-motivational-interviewing\venv\Lib\site-packages\transformers\__init__.py
accelerate: 1.11.0 c:\Users\lnbco\OneDrive\Documents\GitHub\interspeech2022-motivational-interviewing\venv\Lib\site-packages\accelerate\__init__.py
datasets: 4.3.0 c:\Users\lnbco\OneDrive\Documents\GitHub\interspeech2022-motivational-interviewing\venv\Lib\site-packages\datasets\__init__.py


In [19]:
# ...existing code...
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',     # <-- use eval_strategy here
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro',
    greater_is_better=True,
    logging_dir='./logs',
    logging_steps=50,
    save_total_limit=2,
    report_to='none'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

print("TRAINING BERT MODEL")
trainer.train()
# ...existing code...

TRAINING BERT MODEL




Epoch,Training Loss,Validation Loss,F1 Macro,F1 Weighted,F1 Input,F1 Other,F1 Question,F1 Reflection
1,0.5659,0.5101,0.782481,0.817664,0.590476,0.896774,0.866779,0.775895
2,0.4376,0.502822,0.797099,0.827459,0.635659,0.906149,0.867725,0.778865
3,0.3091,0.525517,0.793981,0.823168,0.639405,0.898502,0.867647,0.77037
4,0.2615,0.547037,0.799663,0.828359,0.648438,0.904685,0.870337,0.775194




TrainOutput(global_step=980, training_loss=0.4374250402255934, metrics={'train_runtime': 5352.5453, 'train_samples_per_second': 2.918, 'train_steps_per_second': 0.183, 'total_flos': 1027467121274880.0, 'train_loss': 0.4374250402255934, 'epoch': 4.0})

#### Step 5: Evaluate on test set

In [None]:
# %% [markdown]
#### Evaluate Model Performance

# %%
# Evaluate on test set
print("\n" + "="*60)
print("TEST SET EVALUATION")
print("="*60)

test_results = trainer.evaluate(test_dataset)

print("\n1. Overall Metrics:")
print(f"   F1-Macro: {test_results['eval_f1_macro']:.4f}")
print(f"   F1-Weighted: {test_results['eval_f1_weighted']:.4f}")
print(f"   Loss: {test_results['eval_loss']:.4f}")

print("\n2. Per-Class F1 Scores:")
for class_name in label_encoder.classes_:
    f1_key = f'eval_f1_{class_name}'
    if f1_key in test_results:
        print(f"   {class_name}: {test_results[f1_key]:.4f}")

# Get detailed predictions
predictions_output = trainer.predict(test_dataset)
y_pred = np.argmax(predictions_output.predictions, axis=-1)
y_true = predictions_output.label_ids

# Detailed classification report
print("\n3. Detailed Classification Report:")
print("="*60)
print(classification_report(
    y_true, 
    y_pred,
    target_names=label_encoder.classes_,
    digits=4
))

# Confusion Matrix
print("\n4. Confusion Matrix:")
cm = confusion_matrix(y_true, y_pred)
cm_df = pd.DataFrame(
    cm,
    index=label_encoder.classes_,
    columns=label_encoder.classes_
)
print(cm_df)