In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# --- 1. CONFIGURATION ---
EMBEDDING_DIM = 64     # The size of the shared embedding space
MARGIN = 0.5           # Similarity Margin: Pairs with Similarity > 0.5 are considered a match
EPOCHS = 30            # Training epochs
BATCH_SIZE = 4         # Batch size (small due to the small dataset)
LEARNING_RATE = 1e-4   # Learning rate
TEXT_MODEL_NAME = "distilbert-base-uncased" # Pre-trained language model

# --- UPDATED CONFIGURATION FOR WISDOM DATA ---
SENSOR_FEATURES_COUNT = 3 # **UPDATED: Only X_Acc, Y_Acc, Z_Acc are present**
TEXT_COLUMN_NAME = 'Semantic_Interpretation'
SENSOR_COLUMNS = ['X_Acc', 'Y_Acc', 'Z_Acc']
SENSOR_MODEL_PATH = 'sensor_encoder_wisdom_3col.pth'
TEXT_MODEL_PATH = 'text_encoder_wisdom_3col.pth'
DATA_FILE = "./data/WISDM_with_semantic_interpretation.csv" # **UPDATED data file**

# --- 2. DATASET CLASS ---

class SensorTextDataset(Dataset):
    def __init__(self, sensor_data, text_data, labels):
        # Convert sensor data to tensor
        self.sensor_data = torch.tensor(sensor_data, dtype=torch.float32)
        self.text_data = text_data
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.sensor_data[idx], self.text_data[idx], self.labels[idx]

# --- 3. DUAL-ENCODER ARCHITECTURE ---

# Encoder for Sensor Features (Simple Multi-Layer Perceptron)
class SensorEncoder(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim)
        )
    def forward(self, x):
        return self.encoder(x)

# Encoder for Text Descriptions (Pre-trained DistilBERT)
class TextEncoder(nn.Module):
    def __init__(self, model_name, output_dim):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        # Projection layer aligns BERT output (e.g., 768 dim) to the shared embedding space (64 dim)
        self.projection = nn.Linear(self.model.config.hidden_size, output_dim)
    def forward(self, texts):
        encoded_input = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
        output = self.model(**encoded_input)
        mean_pooled = torch.mean(output.last_hidden_state, dim=1) # Mean-pooling for sentence embedding
        return self.projection(mean_pooled)

# --- 4. CONTRASTIVE LOSS FUNCTION (Cosine Similarity) ---

class ContrastiveSimilarityLoss(nn.Module):
    """
    Contrastive Loss based on Cosine Similarity (S).
    L = y * (1 - S) + (1 - y) * max(0, S - margin)
    """
    def __init__(self, margin=0.5):
        super(ContrastiveSimilarityLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        # Calculate Cosine Similarity
        similarity = F.cosine_similarity(output1, output2).unsqueeze(1)
        
        # Prepare tensors for loss calculation
        device = output1.device
        zero_tensor = torch.tensor(0.0).to(device)
        margin_tensor = torch.tensor(self.margin).to(device)

        # Positive pairs (label == 1.0): Loss is 1 - S (Minimized when S is high)
        loss_positive = label * (1 - similarity)
        
        # Negative pairs (label == 0.0): Loss is max(0, S - margin) (Minimized when S is low)
        loss_negative = (1 - label) * torch.max(zero_tensor, similarity - margin_tensor)
        
        loss = torch.mean(loss_positive + loss_negative)
        return loss

# --- 5. TRAINING FUNCTION ---

def train_contrastive_model():
    print("--- Starting Training (Cosine Similarity Contrastive Loss) ---")
    df = pd.read_csv(DATA_FILE)
    
    # --- UPDATED SENSOR COLUMN SELECTION LOGIC ---
    # Use the explicitly defined sensor columns (X_Acc, Y_Acc, Z_Acc)
    sensor_cols = SENSOR_COLUMNS 

    # Safety check
    if set(sensor_cols).issubset(df.columns) is False:
        raise ValueError(f"Required sensor columns {sensor_cols} not found in {DATA_FILE}")

    if len(sensor_cols) != SENSOR_FEATURES_COUNT:
        raise ValueError(f"Feature count mismatch! Expected {SENSOR_FEATURES_COUNT} features, but detected {len(sensor_cols)} in the data file: {DATA_FILE}")

    sensor_data = df[sensor_cols].values
    text_data = df[TEXT_COLUMN_NAME].tolist()

    # Create POSITIVE and NEGATIVE training samples (by rolling/mismatching the text)
    pos_sensor, pos_text, pos_labels = sensor_data, text_data, np.ones(len(df))
    
    # Simple rolling for negative samples
    neg_sensor = sensor_data
    neg_text = np.roll(text_data, 1).tolist()
    neg_labels = np.zeros(len(df))

    all_sensor = np.concatenate([pos_sensor, neg_sensor])
    all_text = pos_text + neg_text
    all_labels = np.concatenate([pos_labels, neg_labels])

    # Split into training and test sets
    X_train_s, X_test_s, X_train_t, X_test_t, y_train, y_test = train_test_split(
        all_sensor, all_text, all_labels, test_size=0.3, random_state=42
    )

    sensor_encoder = SensorEncoder(SENSOR_FEATURES_COUNT, EMBEDDING_DIM)
    text_encoder = TextEncoder(TEXT_MODEL_NAME, EMBEDDING_DIM)
    criterion = ContrastiveSimilarityLoss(margin=MARGIN)
    
    params = list(sensor_encoder.parameters()) + list(text_encoder.parameters())
    optimizer = optim.Adam(params, lr=LEARNING_RATE)

    train_dataset = SensorTextDataset(X_train_s, X_train_t, y_train)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

    print(f"Total training pairs: {len(train_dataset)}")

    for epoch in range(EPOCHS):
        total_loss = 0
        for sensor_batch, text_batch, label_batch in train_loader:
            optimizer.zero_grad()
            sensor_embedding = sensor_encoder(sensor_batch)
            text_embedding = text_encoder(text_batch)
            loss = criterion(sensor_embedding, text_embedding, label_batch.unsqueeze(1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        if (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch+1}/{EPOCHS}, Average Loss: {avg_loss:.4f}")

    print("\nTraining complete.")
    
    # Save the trained encoders
    torch.save(sensor_encoder.state_dict(), SENSOR_MODEL_PATH)
    torch.save(text_encoder.state_dict(), TEXT_MODEL_PATH)
    print(f"Models saved: {SENSOR_MODEL_PATH} and {TEXT_MODEL_PATH}")

    return X_test_s, X_test_t, y_test

# --- 6. EVALUATION FUNCTION ---

def evaluate_contrastive_model(X_test_s, X_test_t, y_test):
    print("\n--- Starting Evaluation on Test Data ---")
    
    sensor_encoder = SensorEncoder(SENSOR_FEATURES_COUNT, EMBEDDING_DIM)
    text_encoder = TextEncoder(TEXT_MODEL_NAME, EMBEDDING_DIM)

    # Note: These lines will only work if the user has saved these files locally after training
    sensor_encoder.load_state_dict(torch.load(SENSOR_MODEL_PATH))
    text_encoder.load_state_dict(torch.load(TEXT_MODEL_PATH))

    sensor_encoder.eval()
    text_encoder.eval()
    
    test_dataset = SensorTextDataset(X_test_s, X_test_t, y_test)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
    
    similarities = []
    true_labels = []

    with torch.no_grad():
        for sensor_batch, text_batch, label_batch in test_loader:
            sensor_embedding = sensor_encoder(sensor_batch)
            text_embedding = text_encoder(text_batch)
            sim = F.cosine_similarity(sensor_embedding, text_embedding)
            similarities.extend(sim.cpu().numpy())
            true_labels.extend(label_batch.cpu().numpy())

    similarities = np.array(similarities)
    true_labels = np.array(true_labels)
    
    pos_similarities = similarities[true_labels == 1.0]
    neg_similarities = similarities[true_labels == 0.0]

    print(f"Test Set Size: {len(true_labels)}")
    print(f"Mean Similarity (Positive Pairs): {np.mean(pos_similarities):.4f}")
    print(f"Mean Similarity (Negative Pairs): {np.mean(neg_similarities):.4f}")
    
    # Prediction: A pair is POSITIVE if its similarity is greater than the MARGIN
    predictions = (similarities > MARGIN).astype(int)
    
    accuracy = accuracy_score(true_labels, predictions)
    print(f"Matching Accuracy (Similarity > MARGIN={MARGIN}): {accuracy * 100:.2f}%")


if __name__ == "__main__":
    # The script runs training and then evaluates the model on the held-out test data
    try:
        X_test_s, X_test_t, y_test = train_contrastive_model()
        evaluate_contrastive_model(X_test_s, X_test_t, y_test)
    except Exception as e:
        print("\n----------------------------------------------------------------------")
        print("ðŸ›‘ **EXECUTION REMINDER** ðŸ›‘")
        print(f"An error occurred: {e}")
        print("\n**ACTION REQUIRED:** This code is for sensor-text alignment. Please copy the entire code block above and run it in your local Python environment where **PyTorch, Transformers, and scikit-learn** are installed.")
        print("----------------------------------------------------------------------")

--- Starting Training (Cosine Similarity Contrastive Loss) ---
Total training pairs: 8400
Epoch 5/30, Average Loss: 0.2513
Epoch 10/30, Average Loss: 0.2503
Epoch 15/30, Average Loss: 0.2496
Epoch 20/30, Average Loss: 0.2486
Epoch 25/30, Average Loss: 0.2491
Epoch 30/30, Average Loss: 0.2484

Training complete.
Models saved: sensor_encoder_wisdom_3col.pth and text_encoder_wisdom_3col.pth

--- Starting Evaluation on Test Data ---


  sensor_encoder.load_state_dict(torch.load(SENSOR_MODEL_PATH))
  text_encoder.load_state_dict(torch.load(TEXT_MODEL_PATH))


Test Set Size: 3600
Mean Similarity (Positive Pairs): 0.5715
Mean Similarity (Negative Pairs): 0.5805
Matching Accuracy (Similarity > MARGIN=0.5): 50.28%
