## Active Learning Pipeline for Hybrid Models

---



## Notebook Setup and File Loading

In [1]:
import pickle
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from peft import PeftModel, LoraConfig, get_peft_model, TaskType
from scipy.stats import entropy
from tqdm.auto import tqdm
import os
from google.colab import drive

# Mount Google Drive at /content/drive so the files stored there can be read.
# `drive` comes from `google.colab`, so this cell only runs inside a Colab runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 🏗️ Hybrid Model Architecture


This class defines the `HybridClassificationModel`. It's a multi-modal model that integrates three different types of features for classification: text, categorical data, and continuous data.


* **Text Branch**: Uses a DistilBERT model. LoRA (Low-Rank Adaptation) is applied to fine-tune it efficiently without modifying all the model parameters.

* **Categorical Branch**: Each categorical feature is passed through its own embedding layer. The embeddings are then concatenated and processed by a small feed-forward network.

* **Continuous Branch**: Continuous features are processed by a simple feed-forward network.

* **Classifier Head**: The outputs from all three branches are concatenated and fed into a final classifier to produce the prediction.

In [2]:
class HybridClassificationModel(nn.Module):
    """Multi-modal classifier fusing text, categorical and continuous features.

    Three branches each produce a vector of the DistilBERT hidden size:

    * text: DistilBERT wrapped with LoRA adapters; the first-token hidden
      state is used as the sentence representation,
    * categorical: one embedding table per categorical column, concatenated
      and projected by a small feed-forward block,
    * continuous: a two-layer feed-forward block.

    The three vectors are concatenated and passed to the classifier head.

    Args:
        num_labels: number of output classes.
        categorical_feature_dim: accepted for interface compatibility; it is
            not read here because the number of categorical columns is taken
            from ``len(categorical_vocab_sizes)``.
        continuous_feature_dim: number of continuous input columns.
        categorical_vocab_sizes: vocabulary size per categorical column.
            Defaults to ``[5, 7, 24, 10]`` when omitted.
        dropout_rate: dropout probability used in every feed-forward block.
    """

    def __init__(self, num_labels, categorical_feature_dim, continuous_feature_dim,
                 categorical_vocab_sizes=None, dropout_rate=0.1):
        super().__init__()

        # Text branch (DistilBERT with LoRA)
        self.distilbert = AutoModel.from_pretrained('distilbert-base-uncased')

        lora_config = LoraConfig(
            r=8,
            lora_alpha=16,
            target_modules=["q_lin", "k_lin", "v_lin"],
            lora_dropout=0.05,
            bias="none",
            task_type=TaskType.FEATURE_EXTRACTION
        )
        self.distilbert = get_peft_model(self.distilbert, lora_config)

        distilbert_hidden_size = self.distilbert.config.hidden_size

        # Dynamic categorical embeddings
        if categorical_vocab_sizes is None:
            categorical_vocab_sizes = [5, 7, 24, 10]

        # Embedding width grows with the vocabulary but is capped at 50.
        self.categorical_embeddings = nn.ModuleList([
            nn.Embedding(vocab_size, min(50, vocab_size // 2 + 10))
            for vocab_size in categorical_vocab_sizes
        ])

        total_cat_embed_dim = sum(emb.embedding_dim for emb in self.categorical_embeddings)

        # NOTE: the layer order inside each nn.Sequential defines the state-dict
        # keys (e.g. "continuous_ffn.0.weight"); keep it stable so saved
        # checkpoints continue to load.
        self.categorical_ffn = nn.Sequential(
            nn.Linear(total_cat_embed_dim, distilbert_hidden_size),
            nn.BatchNorm1d(distilbert_hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        self.continuous_ffn = nn.Sequential(
            nn.Linear(continuous_feature_dim, distilbert_hidden_size // 2),
            nn.BatchNorm1d(distilbert_hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(distilbert_hidden_size // 2, distilbert_hidden_size),
            nn.BatchNorm1d(distilbert_hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        combined_size = distilbert_hidden_size * 3
        self.classifier = nn.Sequential(
            nn.Linear(combined_size, distilbert_hidden_size),
            nn.BatchNorm1d(distilbert_hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(distilbert_hidden_size, distilbert_hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(distilbert_hidden_size // 2, num_labels)
        )

        self._init_weights()

    def _init_weights(self):
        """Xavier-initialise the task-specific heads only.

        The text encoder is deliberately skipped: iterating over
        ``self.modules()`` would also visit every Linear/Embedding inside the
        pretrained DistilBERT backbone and its LoRA adapters and overwrite
        the weights that ``from_pretrained`` / ``get_peft_model`` just set up.
        """
        task_heads = (
            self.categorical_embeddings,
            self.categorical_ffn,
            self.continuous_ffn,
            self.classifier,
        )
        for head in task_heads:
            for module in head.modules():
                if isinstance(module, nn.Linear):
                    nn.init.xavier_uniform_(module.weight)
                    if module.bias is not None:
                        nn.init.zeros_(module.bias)
                elif isinstance(module, nn.Embedding):
                    nn.init.xavier_uniform_(module.weight)

    def forward(self, input_ids, attention_mask, categorical_features, continuous_features, labels=None):
        """Run all three branches and classify the fused representation.

        Args:
            input_ids: token ids passed to DistilBERT.
            attention_mask: attention mask passed to DistilBERT.
            categorical_features: integer tensor indexed as ``[:, i]`` for the
                i-th categorical column.
            continuous_features: float tensor fed to the continuous branch.
            labels: optional class indices; when given, a label-smoothed
                cross-entropy loss is computed.

        Returns:
            dict with ``"loss"`` (``None`` when ``labels`` is ``None``) and
            ``"logits"``.
        """
        # Text branch: hidden state of the first token.
        text_outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        text_hidden = text_outputs.last_hidden_state[:, 0, :]

        # Categorical branch: embed each column with its own table.
        cat_embeddings = [
            embedding_layer(categorical_features[:, i])
            for i, embedding_layer in enumerate(self.categorical_embeddings)
        ]
        cat_combined = torch.cat(cat_embeddings, dim=-1)
        cat_processed = self.categorical_ffn(cat_combined)

        # Continuous branch
        cont_processed = self.continuous_ffn(continuous_features)

        # Combine all branches
        combined_features = torch.cat((text_hidden, cat_processed, cont_processed), dim=-1)
        logits = self.classifier(combined_features)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(label_smoothing=0.1)
            loss = loss_fct(logits, labels)

        return {"loss": loss, "logits": logits}

## Hybrid Model Wrapper for Inference

This class is designed to handle the loading of a pre-trained HybridClassificationModel and prepare it for making predictions on new data. It manages the tokenizer, model weights, and feature processing.

In [3]:
class HybridModelWrapper:
    """Inference wrapper around a saved ``HybridClassificationModel``.

    Loads the checkpoint and tokenizer from ``model_path``, rebuilds the model
    with the feature dimensions recorded in the checkpoint, and exposes a
    pipeline-like ``__call__`` that returns ``LABEL_<i>`` / score dicts.
    """

    def __init__(self, model_path, config):
        """Load checkpoint, model weights and tokenizer.

        Args:
            model_path: directory containing ``hybrid_model.bin`` and the
                tokenizer files.
            config: accepted for interface compatibility but not read; the
                ``'config'`` entry stored inside the checkpoint is used.
        """
        # Load the hybrid model data.
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        hybrid_data_path = os.path.join(model_path, 'hybrid_model.bin')
        self.hybrid_data = torch.load(hybrid_data_path, map_location='cpu')

        # Extract necessary information
        self.config = self.hybrid_data['config']
        self.categorical_vocab_sizes = self.hybrid_data['categorical_vocab_sizes']
        self.feature_metadata = self.hybrid_data['feature_metadata']

        # Get the actual feature dimensions from the saved metadata
        print("Feature metadata keys:", self.feature_metadata.keys())

        if 'categorical_features' in self.feature_metadata:
            cat_meta = self.feature_metadata['categorical_features']
            categorical_feature_dim = cat_meta['feature_dim']
            # Per-column vocabulary sizes from the saved encoders take
            # precedence over the top-level list read above.
            self.categorical_vocab_sizes = [
                cat_meta['encoders'][col]['n_classes']
                for col in cat_meta['feature_names']
            ]
        else:
            categorical_feature_dim = len(self.categorical_vocab_sizes)

        if 'continuous_features' in self.feature_metadata:
            continuous_feature_dim = self.feature_metadata['continuous_features']['feature_dim']
            print(f"Continuous feature dimension from metadata: {continuous_feature_dim}")
        else:
            # Fallback: inspect the model state dict to get the correct dimension
            continuous_feature_dim = self.hybrid_data['full_model_state_dict']['continuous_ffn.0.weight'].shape[1]
            print(f"Continuous feature dimension from model weights: {continuous_feature_dim}")

        print(f"Creating model with categorical_dim={categorical_feature_dim}, continuous_dim={continuous_feature_dim}")

        # Create the hybrid model with correct dimensions
        self.model = HybridClassificationModel(
            num_labels=self.config['num_labels'],
            categorical_feature_dim=categorical_feature_dim,
            continuous_feature_dim=continuous_feature_dim,
            categorical_vocab_sizes=self.categorical_vocab_sizes
        )

        # Get the state dict from the hybrid_data
        model_state_dict = self.hybrid_data['full_model_state_dict']

        # "class_weights" is not a parameter of HybridClassificationModel, so
        # it has to be dropped before the strict load below.
        if "class_weights" in model_state_dict:
            del model_state_dict["class_weights"]
            print("Successfully removed 'class_weights' from the state_dict.")

        # Load the cleaned state dict into the model
        self.model.load_state_dict(model_state_dict)
        self.model.eval()

        # Store the feature dimensions for inference
        self.categorical_feature_dim = categorical_feature_dim
        self.continuous_feature_dim = continuous_feature_dim

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

        # Move to GPU if available
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)

        print(f"Hybrid model loaded on device: {self.device}")

    def extract_features_from_engineered_data(self, texts, all_features_df):
        """Build categorical and continuous feature tensors for ``texts``.

        Each text is matched against ``all_features_df['text']``; when several
        rows share the same text the first one is used. Texts with no match,
        missing columns, and missing/unparseable values all fall back to 0.

        Args:
            texts: sequence of raw text strings.
            all_features_df: DataFrame with a ``'text'`` column plus the
                feature columns named in the checkpoint metadata.

        Returns:
            ``(categorical_features, continuous_features)``: a ``torch.long``
            and a ``torch.float32`` tensor on ``self.device``, one row per
            text in the order given.
        """
        texts = list(texts)
        batch_size = len(texts)

        # Get feature names from the model's metadata
        categorical_feature_names = self.feature_metadata['categorical_features']['feature_names']
        continuous_feature_names = self.feature_metadata['continuous_features']['feature_names']

        # One hashed membership test + reindex per batch instead of scanning
        # the whole frame once per text. keep='first' mirrors "first matching
        # row wins"; reindex yields all-NaN rows for texts with no match.
        matched = all_features_df[all_features_df['text'].isin(texts)]
        matched = matched.drop_duplicates(subset='text', keep='first').set_index('text')
        features_df = matched.reindex(texts)

        # Extract categorical features and clip values
        categorical_data = []
        for i, col in enumerate(categorical_feature_names):
            if col in features_df.columns:
                # Get the vocabulary size for this specific feature
                vocab_size = self.categorical_vocab_sizes[i]

                # Missing values become index 0; everything is clipped into
                # the valid embedding range [0, vocab_size - 1].
                values = (
                    pd.to_numeric(features_df[col])
                    .fillna(0)
                    .astype(int)
                    .clip(0, vocab_size - 1)
                    .values
                )
                categorical_data.append(values)
            else:
                categorical_data.append(np.zeros(batch_size, dtype=int))

        categorical_features = torch.tensor(
            np.column_stack(categorical_data),
            dtype=torch.long
        ).to(self.device)

        # Extract continuous features
        continuous_data = []
        for col in continuous_feature_names:
            if col in features_df.columns:
                values = pd.to_numeric(features_df[col], errors='coerce').fillna(0).astype(float).values
                continuous_data.append(values)
            else:
                continuous_data.append(np.zeros(batch_size, dtype=float))

        continuous_features = torch.tensor(
            np.column_stack(continuous_data),
            dtype=torch.float32
        ).to(self.device)

        assert categorical_features.shape[1] == self.categorical_feature_dim, \
            f"Expected {self.categorical_feature_dim} categorical features, got {categorical_features.shape[1]}"
        assert continuous_features.shape[1] == self.continuous_feature_dim, \
            f"Expected {self.continuous_feature_dim} continuous features, got {continuous_features.shape[1]}"

        return categorical_features, continuous_features

    def __call__(self, texts, all_features_df, batch_size=32, return_all_scores=True):
        """Pipeline-like interface for batch inference.

        Args:
            texts: a single string or a sequence of strings.
            all_features_df: engineered-feature DataFrame used for lookup.
            batch_size: number of texts per forward pass.
            return_all_scores: if True each result is a list with one
                ``{'label', 'score'}`` dict per class; otherwise a single dict
                for the top class.

        Returns:
            list with one result per input text, in input order.
        """
        if isinstance(texts, str):
            texts = [texts]
        # Materialise so positional slicing and the tokenizer both see a list.
        texts = list(texts)

        all_results = []

        # Process in batches with a progress bar
        for start in tqdm(range(0, len(texts), batch_size), desc="Getting predictions"):
            batch_texts = texts[start:start + batch_size]
            all_results.extend(self._process_batch(batch_texts, all_features_df, return_all_scores))

        return all_results

    def _process_batch(self, texts, all_features_df, return_all_scores):
        """Tokenize, featurize and score a single batch of texts."""
        # Tokenize texts
        encoded = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.config.get('max_length', 512),
            return_tensors='pt'
        ).to(self.device)

        categorical_features, continuous_features = self.extract_features_from_engineered_data(texts, all_features_df)

        with torch.no_grad():
            outputs = self.model(
                input_ids=encoded['input_ids'],
                attention_mask=encoded['attention_mask'],
                categorical_features=categorical_features,
                continuous_features=continuous_features
            )
            # Convert to probabilities
            probabilities = torch.softmax(outputs['logits'], dim=-1)

        # Single device-to-host transfer instead of one sync per score.
        prob_rows = probabilities.cpu().tolist()

        # Format output similar to pipeline
        if return_all_scores:
            return [
                [{'label': f'LABEL_{j}', 'score': score} for j, score in enumerate(row)]
                for row in prob_rows
            ]

        # Plain ints are required here: formatting a tensor would produce
        # labels such as "LABEL_tensor(1)".
        best_indices = probabilities.argmax(dim=-1).cpu().tolist()
        return [
            {'label': f'LABEL_{best_idx}', 'score': row[best_idx]}
            for row, best_idx in zip(prob_rows, best_indices)
        ]

## Active Learning Pipeline Logic

In [4]:
def calculate_uncertainty_from_pipeline(predictions):
    """Return the Shannon entropy of each sample's class-score distribution.

    ``predictions`` is a sequence in which every item is a list of
    ``{'label': ..., 'score': ...}`` dicts (one per class). The result is a
    1-D NumPy array with one entropy value per item; higher means the model
    is less certain.
    """
    per_sample_entropy = [
        entropy(np.array([entry['score'] for entry in class_scores]))
        for class_scores in predictions
    ]
    return np.array(per_sample_entropy)

In [5]:
def get_predicted_labels(predictions):
    """Map each sample's highest-scoring ``LABEL_<i>`` to its class name.

    For every item in ``predictions`` (a list of ``{'label', 'score'}`` dicts)
    the entry with the largest score is picked and its label translated via a
    fixed table; labels missing from the table are passed through unchanged.
    Returns a NumPy array of the resulting names.
    """
    readable_names = {
        'LABEL_0': 'advertisement',
        'LABEL_1': 'irrelevant',
        'LABEL_2': 'rant_without_visit',
        'LABEL_3': 'relevant_and_quality'
    }

    top_raw_labels = (
        max(class_scores, key=lambda entry: entry['score'])['label']
        for class_scores in predictions
    )
    return np.array([readable_names.get(raw, raw) for raw in top_raw_labels])

In [6]:
def class_aware_active_sampling(classifier, unlabeled_df, all_features_df, total_samples=3000,
                                samples_per_class=2500, max_candidates=20000, random_state=None):
    """Select samples for labeling using class-aware uncertainty sampling.

    A random subset of the unlabeled pool is scored by ``classifier``; for each
    predicted class the ``samples_per_class`` most uncertain (highest entropy)
    rows are kept, or all rows of that class when fewer are available.

    Args:
        classifier: callable invoked as
            ``classifier(texts, all_features_df=..., batch_size=64)`` that
            returns, per text, a list of ``{'label', 'score'}`` dicts.
        unlabeled_df: DataFrame with ``'text'`` and ``'user_id'`` columns.
        all_features_df: engineered-feature DataFrame forwarded to
            ``classifier``.
        total_samples: not used by the selection logic; kept so existing
            callers keep working. The per-class quota is ``samples_per_class``.
        samples_per_class: quota for every predicted class.
        max_candidates: upper bound on how many pool rows are scored.
        random_state: seed forwarded to ``DataFrame.sample`` so the candidate
            subset is reproducible; ``None`` keeps it random.

    Returns:
        list of ``user_id`` values of the selected rows, grouped by class.
    """
    if len(unlabeled_df) == 0:
        # Nothing to score; avoid calling the model on an empty batch.
        return []

    target_samples = {
        'advertisement': samples_per_class,
        'irrelevant': samples_per_class,
        'rant_without_visit': samples_per_class,
        'relevant_and_quality': samples_per_class
    }

    # Sample a subset for prediction if dataset is very large. The fresh
    # positional index keeps the label-based lookups below unambiguous even
    # when the caller's index contains duplicates.
    prediction_batch = unlabeled_df.sample(
        n=min(max_candidates, len(unlabeled_df)),
        random_state=random_state
    ).reset_index(drop=True)
    texts = prediction_batch['text'].tolist()

    print("Getting model predictions...")
    # Pass the all_features_df to the classifier
    all_predictions = classifier(texts, all_features_df=all_features_df, batch_size=64)
    print(f"Example predictions: {all_predictions[:3]}")

    print("Calculating uncertainties...")
    uncertainty = calculate_uncertainty_from_pipeline(all_predictions)
    predicted_labels = get_predicted_labels(all_predictions)

    print(f"Predicted labels counts: {pd.Series(predicted_labels).value_counts()}")

    selected_indices = []
    print("Selecting samples per class...")

    for label, n_samples in target_samples.items():
        class_mask = predicted_labels == label
        class_df = prediction_batch[class_mask].copy()

        if len(class_df) < n_samples:
            print(f"Only {len(class_df)} samples predicted as class '{label}', taking all available.")
            selected_df = class_df
        else:
            class_df['uncertainty'] = uncertainty[class_mask]
            selected_df = class_df.nlargest(n_samples, 'uncertainty')

        selected_indices.extend(selected_df.index.tolist())

    # Return user_ids instead of text_ids
    return prediction_batch.loc[selected_indices, 'user_id'].tolist()

## Main Execution

This final section brings all the pieces together. The main function loads all the necessary resources, filters out any data that has already been labeled, and then calls the class_aware_active_sampling function to select the next batch of samples for manual annotation.

In [12]:
def load_resources():
    """Load everything the sampling step needs from Google Drive.

    Returns:
        tuple ``(all_features, labeled_user_ids, classifier, CONFIG)``:
        the engineered-feature DataFrame, the set of IDs found in the
        train/val/test pickles, the loaded ``HybridModelWrapper`` and the
        configuration dict.

    Raises:
        FileNotFoundError: if the engineered-features pickle is missing.
        Exception: any error raised while loading the model is re-raised.
    """
    CONFIG = {
        'base_path': '/content/drive/MyDrive/Tiktok_Hackaton/',
        'model_name': 'distilbert-base-uncased',
        'num_labels': 4,
        'max_length': 512
    }

    # Load full engineered features from the specified Google Drive path
    engineered_path = os.path.join(CONFIG['base_path'], 'preprocessed_data/engineered_features.pkl')
    try:
        all_features = pd.read_pickle(engineered_path)
        print(f"Successfully loaded engineered features from {engineered_path}")
    except FileNotFoundError:
        print(f"Error: The file {engineered_path} was not found.")
        print("Please ensure the file exists at this location in your Google Drive.")
        raise # Re-raise the exception to stop execution

    # Load existing labeled data user_ids to exclude them
    labeled_user_ids = set()
    try:
        file_paths = [
            os.path.join(CONFIG['base_path'], 'preprocessed_data/processed_train.pkl'),
            os.path.join(CONFIG['base_path'], 'preprocessed_data/processed_val.pkl'),
            os.path.join(CONFIG['base_path'], 'preprocessed_data/processed_test.pkl')
        ]

        for path in file_paths:
            try:
                # NOTE(review): pickle executes code on load; these files must
                # come from a trusted source.
                # The context manager closes the handle even if unpickling fails.
                with open(path, 'rb') as pickle_file:
                    data_dict = pickle.load(pickle_file)

                # Look for user_id in the data (assuming it was saved with this key)
                if isinstance(data_dict, dict):
                    # Try different possible keys for user_id
                    user_id_key = None
                    for key in ['user_id', 'review_id', 'user_ids']:
                        if key in data_dict:
                            user_id_key = key
                            break

                    if user_id_key:
                        labeled_user_ids.update(data_dict[user_id_key])
                        print(f"Loaded {len(data_dict[user_id_key])} user IDs from {os.path.basename(path)}")
                    else:
                        print(f"Warning: No user_id field found in {os.path.basename(path)}. Available keys: {list(data_dict.keys())}")
                else:
                    print(f"Warning: File {os.path.basename(path)} has unexpected format. Skipping.")

            except (FileNotFoundError, TypeError, KeyError) as e:
                print(f"Warning loading {os.path.basename(path)}: {e}. Skipping.")

        if labeled_user_ids:
            print(f"Total labeled user IDs found: {len(labeled_user_ids):,}")
        else:
            print("No labeled data files were found or could be loaded.")

    except Exception as e:
        # Deliberate best-effort: labeled-ID loading must not block sampling.
        print(f"An unexpected error occurred during data loading: {e}")
        print("Assuming this is the first run and sampling from the entire dataset.")

    # Load trained model using the hybrid wrapper
    TRAINED_MODEL_PATH = os.path.join(CONFIG['base_path'], 'final_model_enhanced')

    try:
        # Load the HybridModelWrapper (it selects the device itself)
        classifier = HybridModelWrapper(TRAINED_MODEL_PATH, CONFIG)
        print("Successfully loaded hybrid model with custom wrapper")
    except Exception as e:
        print(f"Error loading hybrid model: {e}")
        raise

    return all_features, labeled_user_ids, classifier, CONFIG

In [13]:
def main():
    """Select the next batch of samples to label and save their user IDs.

    Loads the resources, removes rows whose ``user_id`` is already labeled,
    runs class-aware uncertainty sampling on the remainder and pickles the
    selected user IDs under ``CONFIG['base_path']``.
    """
    print("Starting Active Learning Sample Selection...")

    all_features, labeled_user_ids, classifier, CONFIG = load_resources()

    # Use user_id to filter out already labeled data
    already_labeled = all_features['user_id'].isin(labeled_user_ids)
    unlabeled_pool = all_features[~already_labeled].copy()

    print(f"Total samples: {len(all_features):,}")
    print(f"Already labeled user IDs: {len(labeled_user_ids):,}")
    print(f"Available for selection: {len(unlabeled_pool):,}")

    # If labeled IDs were loaded but none of them appear in the feature table,
    # the exclusion filter is a silent no-op (e.g. ID type mismatch) and
    # already-labeled rows could be selected again. Surface that explicitly.
    if labeled_user_ids and not already_labeled.any():
        print("Warning: none of the labeled user IDs matched 'user_id' in the feature table; "
              "no rows were excluded. Check that both sides use the same ID type and values.")

    if unlabeled_pool.empty:
        print("No unlabeled data to select. Exiting.")
        return

    selected_user_ids = class_aware_active_sampling(classifier, unlabeled_pool, all_features)

    output_path = os.path.join(CONFIG['base_path'], 'preprocessed_data/active_learning_sample_user_ids.pkl')
    with open(output_path, 'wb') as f:
        pickle.dump(selected_user_ids, f)

    print(f"Selected {len(selected_user_ids)} user samples for labeling.")
    print(f"Saved list of user IDs to: {output_path}")
    print("Ready to pass to labeling pipeline!")

# Entry point: run the full selection pipeline when this cell/script is
# executed directly (not when the code is imported as a module).
if __name__ == "__main__":
    main()

Starting Active Learning Sample Selection...
Successfully loaded engineered features from /content/drive/MyDrive/Tiktok_Hackaton/preprocessed_data/engineered_features.pkl
Loaded 4382 user IDs from processed_train.pkl
Loaded 939 user IDs from processed_val.pkl
Loaded 939 user IDs from processed_test.pkl
Total labeled user IDs found: 6,260
Feature metadata keys: dict_keys(['version', 'creation_timestamp', 'pipeline_info', 'ira_analysis', 'categorical_features', 'continuous_features', 'labels', 'dataset_stats', 'processing_stats', 'text_processing', 'quality_metrics'])
Continuous feature dimension from metadata: 55
Creating model with categorical_dim=3, continuous_dim=55


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Successfully removed 'class_weights' from the state_dict.
Hybrid model loaded on device: cuda
Successfully loaded hybrid model with custom wrapper
Total samples: 50,000
Already labeled user IDs: 6,260
Available for selection: 50,000
Getting model predictions...


Getting predictions:   0%|          | 0/313 [00:00<?, ?it/s]

Example predictions: [[{'label': 'LABEL_0', 'score': 4.0898022325285103e-38}, {'label': 'LABEL_1', 'score': 1.0}, {'label': 'LABEL_2', 'score': 6.109869935902739e-16}, {'label': 'LABEL_3', 'score': 0.0}], [{'label': 'LABEL_0', 'score': 3.132205527388819e-14}, {'label': 'LABEL_1', 'score': 0.9999998807907104}, {'label': 'LABEL_2', 'score': 1.0968638264330366e-07}, {'label': 'LABEL_3', 'score': 4.6162542761754975e-21}], [{'label': 'LABEL_0', 'score': 4.259854665983297e-28}, {'label': 'LABEL_1', 'score': 1.0}, {'label': 'LABEL_2', 'score': 2.8021475764750114e-11}, {'label': 'LABEL_3', 'score': 3.629363022601276e-42}]]
Calculating uncertainties...
Predicted labels counts: irrelevant              15601
rant_without_visit       3317
advertisement             640
relevant_and_quality      442
Name: count, dtype: int64
Selecting samples per class...
Only 640 samples predicted as class 'advertisement', taking all available.
Only 442 samples predicted as class 'relevant_and_quality', taking all 