In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.layers import Dense, GlobalAveragePooling2D, Input
from transformers import AutoTokenizer, TFAutoModel
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, f1_score
from keras.callbacks import EarlyStopping

# --- Configuration ---
# Path to the CSV file; it must provide 'text' and 'presence_of_dyslexia' columns.
CSV_FILE_PATH = 'dyslexia_data_with_text_duplicates_removed.csv'

# Batch size for training
BATCH_SIZE = 16

# Maximum number of epochs for training the classifier head
EPOCHS = 50

# Learning rate for the classifier head
LEARNING_RATE = 1e-4

# Seed used for splitting datasets
seed = 24

# --- Load and Prepare Data ---
print("Loading data from CSV...")
try:
    df = pd.read_csv(CSV_FILE_PATH)
except FileNotFoundError:
    # Raise SystemExit instead of calling exit(): exit() is injected by the
    # `site` module for interactive use, is not guaranteed to exist, and
    # reports success (status 0). SystemExit with a message writes the
    # message to stderr and exits with a non-zero status.
    raise SystemExit(
        f"Error: CSV file not found at {CSV_FILE_PATH}. Please check the path."
    ) from None

# Check CSV has 'text' and 'presence_of_dyslexia' columns
if 'text' not in df.columns or 'presence_of_dyslexia' not in df.columns:
    raise SystemExit(
        "Error: CSV must contain 'text' and 'presence_of_dyslexia' columns for text-only model."
    )

# Missing text becomes an empty string so every row can be tokenized as str.
df['text'] = df['text'].fillna('').astype(str)

# Convert labels to integer type (raises if a label is missing or non-numeric).
df['presence_of_dyslexia'] = df['presence_of_dyslexia'].astype(int)

print(f"Total samples: {len(df)}")

# --- Text Tokenization (Pre-tokenizing all text) ---
print("Loading BERT tokenizer and tokenizing all text...")
# bert-base-cased is used because letter case is meaningful for this task.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Every sample is padded or truncated to this many tokens. The model's Input
# layers further down the file are declared with shape (128,), so the two
# values must stay in sync.
MAX_SEQ_LENGTH = 128

# Tokenize the whole column in a single batched call instead of one call per
# row: the tokenizer accepts a list of strings and returns arrays that already
# have shape (num_samples, MAX_SEQ_LENGTH), so no per-row indexing or
# list-to-array conversion is needed.
encoded_inputs = tokenizer(
    df['text'].tolist(),
    padding='max_length',
    truncation=True,
    max_length=MAX_SEQ_LENGTH,  # Consistent max length
    return_tensors='np'  # Return NumPy arrays directly
)

all_input_ids = np.asarray(encoded_inputs['input_ids'])
all_attention_masks = np.asarray(encoded_inputs['attention_mask'])
print(f"All text tokenized. Input IDs shape: {all_input_ids.shape}, Attention Masks shape: {all_attention_masks.shape}")

# --- Split Preprocessed Data ---
# Target proportions are 70% train / 15% validation / 15% test, reached in
# two stratified steps: first hold out 30% of the data, then cut that holdout
# in half. Both steps use the same seed so the partition is reproducible.
(X_input_ids_train, X_input_ids_holdout,
 X_attention_masks_train, X_attention_masks_holdout,
 y_train, y_holdout) = train_test_split(
    all_input_ids,
    all_attention_masks,
    df['presence_of_dyslexia'].values,
    test_size=0.3,
    random_state=seed,
    stratify=df['presence_of_dyslexia'].values
)

# --- Split Holdout Data into Validation and Test ---
# The holdout gets its own names so the validation arrays are only ever the
# final validation split, never the intermediate 30% chunk.
(X_input_ids_val, X_input_ids_test,
 X_attention_masks_val, X_attention_masks_test,
 y_val, y_test) = train_test_split(
    X_input_ids_holdout,
    X_attention_masks_holdout,
    y_holdout,
    test_size=0.5,
    random_state=seed,
    stratify=y_holdout
)

print(f"Training samples: {len(y_train)}")
print(f"Validation samples: {len(y_val)}")
# Report the test split too, so all three partition sizes can be checked.
print(f"Test samples: {len(y_test)}")

# --- Create TensorFlow Datasets from NumPy arrays (for text only) ---
def create_tf_text_dataset_from_np(input_ids, attention_masks, labels, shuffle=False):
    """
    Creates a batched, prefetched TensorFlow Dataset from NumPy arrays for text data.

    Args:
        input_ids: Token id array; its first axis indexes samples.
        attention_masks: Attention mask array aligned with ``input_ids``.
        labels: Label array aligned with ``input_ids``.
        shuffle: When True, samples are reshuffled on every pass over the
            dataset (seeded with the module-level ``seed``). Defaults to
            False, which keeps the original sample order.

    Returns:
        A ``tf.data.Dataset`` yielding
        ``({'input_ids': ..., 'attention_mask': ...}, labels)`` batches of
        ``BATCH_SIZE`` samples.
    """
    ds = tf.data.Dataset.from_tensor_slices(
        (
            {'input_ids': input_ids, 'attention_mask': attention_masks},
            labels
        )
    )
    if shuffle:
        # Shuffle before batching so batch composition changes each epoch.
        # A buffer covering the whole split gives a uniform shuffle; max()
        # keeps the buffer size valid even for an empty split.
        ds = ds.shuffle(
            buffer_size=max(len(labels), 1),
            seed=seed,
            reshuffle_each_iteration=True
        )
    ds = ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    return ds

# Only the training data is shuffled: Keras does not shuffle tf.data inputs
# itself, and without this every epoch would see identical batches.
# Validation and test keep their order so predictions stay aligned with labels.
train_dataset_text = create_tf_text_dataset_from_np(
    X_input_ids_train, X_attention_masks_train, y_train, shuffle=True
)
val_dataset_text = create_tf_text_dataset_from_np(X_input_ids_val, X_attention_masks_val, y_val)
test_dataset_text = create_tf_text_dataset_from_np(X_input_ids_test, X_attention_masks_test, y_test)

print("TensorFlow Datasets created from preprocessed NumPy arrays for text.")

# --- Custom Keras Layer for BERT Model (re-used from multi-view) ---
class BertEmbeddingLayer(tf.keras.layers.Layer):
    """Keras layer wrapping a frozen pretrained transformer as a feature extractor.

    The wrapped model is loaded with ``TFAutoModel.from_pretrained`` and marked
    non-trainable. Calling the layer returns the hidden state at sequence
    position 0 (the [CLS] token for BERT-style tokenizers) for each sample.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, model_name, **kwargs):
        super().__init__(**kwargs)
        # Remember the checkpoint name so get_config() can report the model
        # this instance was actually built with.
        self.model_name = model_name
        self.bert_model = TFAutoModel.from_pretrained(model_name, from_pt=True)
        self.bert_model.trainable = False  # Keep BERT layers frozen for feature extraction

    def call(self, inputs):
        """Return the position-0 hidden state for a dict of token tensors.

        Args:
            inputs: Mapping with 'input_ids' and 'attention_mask' entries.
        """
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        # training=False is passed explicitly to prevent issues with symbolic tensors
        bert_output = self.bert_model(input_ids, attention_mask=attention_mask, training=False)
        # The [CLS] token output is typically at index 0 of the last_hidden_state
        return bert_output.last_hidden_state[:, 0, :]

    def get_config(self):
        """Return the layer config, including the checkpoint name used at construction."""
        config = super().get_config()
        # Use the stored name rather than a hard-coded one, so a layer built
        # from a different checkpoint round-trips through serialization.
        config.update({"model_name": self.model_name})
        return config

# --- Define the Single-View Text Model ---
print("Building single-view text model architecture...")

# Text branch: two int32 inputs of 128 tokens each, matching the tokenizer output.
bert_input_ids = keras.Input(shape=(128,), dtype=tf.int32, name='input_ids')
bert_attention_mask = keras.Input(shape=(128,), dtype=tf.int32, name='attention_mask')
# The same name -> tensor mapping feeds the BERT layer and declares the model inputs.
text_inputs = {
    'input_ids': bert_input_ids,
    'attention_mask': bert_attention_mask,
}

# Frozen BERT encoder wrapped in the custom layer; checkpoint matches the tokenizer.
bert_embedding_extractor = BertEmbeddingLayer(
    "bert-base-cased",
    name="bert_feature_extractor",
)
text_features = bert_embedding_extractor(text_inputs)
print("BERT model wrapped in custom layer and layers frozen.")

# Trainable classifier head: one hidden dense layer, then a sigmoid output unit.
# A deeper alternative that could be tried instead of the single hidden layer:
#   Dense(256, relu) -> Dropout(0.3) -> Dense(128, relu) -> Dropout(0.3)
classifier_head = layers.Dense(768, activation='relu')(text_features)
output_layer = layers.Dense(1, activation='sigmoid', name='output')(classifier_head)

# Assemble the final single-view text model.
text_model = keras.Model(inputs=text_inputs, outputs=output_layer)

# --- Compile and Train the Text Model ---
print("Compiling text model...")
adam_optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
binary_crossentropy = keras.losses.BinaryCrossentropy()
text_model.compile(
    optimizer=adam_optimizer,
    loss=binary_crossentropy,
    metrics=['accuracy'],
)

text_model.summary()

# Stop when validation loss has not improved for 10 consecutive epochs and
# roll the model back to the weights from its best epoch.
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    patience=10,
    verbose=1,
)

print("Training text model...")
fit_options = {
    'epochs': EPOCHS,
    'validation_data': val_dataset_text,
    'callbacks': [early_stopping_callback],
}
history_text = text_model.fit(train_dataset_text, **fit_options)

# --- Evaluation for Text Model ---
print("\nEvaluating text model on test set...")
test_loss_text, test_accuracy_text = text_model.evaluate(test_dataset_text)
# These numbers come from the held-out test split, so they are labelled as
# test metrics (they were previously printed as "Validation").
print(f"Text Model Test Loss: {test_loss_text:.4f}")
print(f"Text Model Test Accuracy: {test_accuracy_text:.4f}")

# Generate predictions and classification report for text model.
# The prediction dataset is built here, unshuffled, from the test arrays so
# that the i-th prediction corresponds to y_test[i].
predict_dataset_text = tf.data.Dataset.from_tensor_slices(
    (
        {'input_ids': X_input_ids_test, 'attention_mask': X_attention_masks_test},
        y_test
    )
).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# A single predict() call iterates the whole dataset, instead of invoking
# predict() once per batch from a Python loop. Labels are taken directly from
# y_test, which is in the same order as the dataset.
all_preds_text = text_model.predict(predict_dataset_text).flatten().tolist()
all_labels_text = np.asarray(y_test).flatten().tolist()

# Convert probabilities to binary predictions (0 or 1)
binary_preds_text = (np.array(all_preds_text) > 0.5).astype(int)

print("\nClassification Report (Text Model):")
print(classification_report(all_labels_text, binary_preds_text))

print("\nConfusion Matrix (Text Model):")
# Fix the label order so the matrix is always 2x2, even if one class is absent
# from both the labels and the predictions; otherwise the four-way unpacking
# below would fail. Assumes labels are encoded as 0 (negative) / 1 (positive).
cm = confusion_matrix(all_labels_text, binary_preds_text, labels=[0, 1])
print(cm)

# --- Calculate and Print Additional Metrics for Test Results ---
print("\nAdditional Metrics for Test Results:")

# Extract values from the confusion matrix
# cm = [[TN, FP], [FN, TP]]
tn, fp, fn, tp = cm.ravel()

# Sensitivity (Recall); undefined (NaN) when the test set has no positives.
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else float('nan')
print(f"Sensitivity (Recall): {sensitivity:.4f}")

# Specificity; undefined (NaN) when the test set has no negatives.
specificity = tn / (tn + fp) if (tn + fp) > 0 else float('nan')
print(f"Specificity: {specificity:.4f}")

# AUC-ROC
# roc_auc_score requires probabilities, not binary predictions for `y_score`;
# all_preds_text contains the raw probabilities (0-1). The score is undefined
# when only one class is present, in which case roc_auc_score would raise.
if len(set(all_labels_text)) < 2:
    auc_roc = float('nan')
else:
    auc_roc = roc_auc_score(all_labels_text, all_preds_text)
print(f"AUC-ROC: {auc_roc:.4f}")

# F1 Score
f1 = f1_score(all_labels_text, binary_preds_text)
print(f"F1 Score: {f1:.4f}")

print("\nText single-view model training and evaluation complete.")

2025-08-21 18:43:19.386833: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-21 18:43:20.039864: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX512F AVX512_VNNI AVX512_BF16, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


Loading data from CSV...
Total samples: 95
Loading BERT tokenizer and tokenizing all text...




All text tokenized. Input IDs shape: (95, 128), Attention Masks shape: (95, 128)
Training samples: 66
Validation samples: 14
TensorFlow Datasets created from preprocessed NumPy arrays for text.
Building single-view text model architecture...


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

BERT model wrapped in custom layer and layers frozen.
Compiling text model...


Training text model...
Epoch 1/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 629ms/step - accuracy: 0.5581 - loss: 0.6945 - val_accuracy: 0.7143 - val_loss: 0.6279
Epoch 2/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 407ms/step - accuracy: 0.8655 - loss: 0.5882 - val_accuracy: 0.8571 - val_loss: 0.5718
Epoch 3/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 406ms/step - accuracy: 0.9339 - loss: 0.5143 - val_accuracy: 0.7143 - val_loss: 0.5328
Epoch 4/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 409ms/step - accuracy: 0.9339 - loss: 0.4558 - val_accuracy: 0.7143 - val_loss: 0.4971
Epoch 5/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 406ms/step - accuracy: 0.9451 - loss: 0.4004 - val_accuracy: 0.8571 - val_loss: 0.4645
Epoch 6/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 411ms/step - accuracy: 0.9837 - loss: 0.3526 - val_accuracy: 0.7857 - val_loss: 0.4407
Epoch 7/50
[1m5/

2025-08-21 18:45:48.652316: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
