In [None]:
import pandas as pd
import torch
from sklearn.metrics import accuracy_score

# Both training files are JSON Lines (one record per line); read them with the same call.
df_features, df_labels = (
    pd.read_json(source_path, lines=True)
    for source_path in (
        '/kaggle/input/indoml-phase2/train.features',
        '/kaggle/input/indoml-phase2/train.labels',
    )
)

In [None]:
from huggingface_hub import login

import os

# Read the Hugging Face token from the environment instead of embedding it in the
# notebook. A token committed to a notebook is a leaked credential: revoke it and
# provide a new one through the HF_TOKEN environment variable.
huggingface_token = os.environ.get("HF_TOKEN")
if huggingface_token:
    login(token=huggingface_token)
else:
    # 'xlm-roberta-base' is a public checkpoint, so an anonymous session is enough to
    # download it; only log in when a token has actually been supplied.
    print("HF_TOKEN is not set; skipping Hugging Face login.")

In [None]:
# Attach the labels to the features by joining the two frames on their shared record id.
df = df_features.merge(df_labels, on="indoml_id")

In [None]:
# df = df[:1000]

In [None]:
from sklearn.preprocessing import LabelEncoder
# One encoder per target column; they stay in scope so that predicted ids can be
# turned back into label names later in the notebook.
group_encoder = LabelEncoder()
supergroup_encoder = LabelEncoder()
module_encoder = LabelEncoder()
brand_encoder = LabelEncoder()

# Replace each label column with its integer ids and append the retailer name to the
# description text (separated by a single space).
df = df.assign(
    group=group_encoder.fit_transform(df['group']),
    supergroup=supergroup_encoder.fit_transform(df['supergroup']),
    module=module_encoder.fit_transform(df['module']),
    brand=brand_encoder.fit_transform(df['brand']),
    description=df['description'] + ' ' + df['retailer'],
)

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import torch.nn.functional as F
from transformers import AutoTokenizer, XLMRobertaModel
from torch.distributions import Categorical
import numpy as np
from typing import Dict, List, Union, Tuple
import os

In [None]:
# Load the tokenizer that belongs to the 'xlm-roberta-base' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

# Alternative (disabled): load a previously saved tokenizer from model_dir instead.
# model_dir = "/kaggle/input/new-transformer-experiment-12-emb-xlm-roberta-tmp/New_model"
# tokenizer = AutoTokenizer.from_pretrained(model_dir)

In [None]:
# Hyperparameters and runtime settings shared by the cells below.
MAX_LENGTH = 13 # Tokens per example after padding/truncation; includes +1 for the appended retailer
BATCH_SIZE = 64 # Examples per DataLoader batch
LEARNING_RATE = 5e-5 # NOTE(review): not referenced by the visible cells -- confirm it is still needed
NUM_EPOCHS = 20 # Number of epochs requested from the training run
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Prefer the GPU when one is available
device=DEVICE # Lowercase alias of DEVICE
PATIENCE = 5  # Early stopping patience
PATIENCE_LR = 3  # Reduce LR on plateau patience

In [None]:
from sklearn.metrics import accuracy_score
import torch

In [None]:
class ProductDataset(Dataset):
    """Tokenized product descriptions paired with four integer label columns.

    All texts are tokenized once in ``__init__`` with the notebook-level ``tokenizer``
    and padded/truncated to ``MAX_LENGTH``. In this notebook ``labels1``..``labels4``
    carry the supergroup, group, module and brand ids respectively.

    Items are returned as CPU tensors. Moving data to the accelerator is left to the
    training/evaluation loop, which already calls ``.to(device)`` on every batch; this
    avoids one host-to-device copy per sample and keeps the dataset usable with
    multi-process ``DataLoader`` workers.

    Args:
        texts: List of input strings.
        labels1, labels2, labels3, labels4: Sequences of integer class ids, each the
            same length as ``texts``.
    """

    def __init__(self, texts, labels1, labels2, labels3, labels4):
        self.texts = texts
        self.labels1 = labels1
        self.labels2 = labels2
        self.labels3 = labels3
        self.labels4 = labels4
        # Tokenize everything up front so __getitem__ is a cheap tensor lookup.
        self.encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=MAX_LENGTH, return_tensors="pt")

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs and the four labels for example ``idx``."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels1'] = torch.tensor(self.labels1[idx], dtype=torch.long)
        item['labels2'] = torch.tensor(self.labels2[idx], dtype=torch.long)
        item['labels3'] = torch.tensor(self.labels3[idx], dtype=torch.long)
        item['labels4'] = torch.tensor(self.labels4[idx], dtype=torch.long)
        return item
        
def compute_accuracy(preds, labels):
    """Compute per-head accuracies plus the joint (all heads correct) accuracy.

    Args:
        preds: Sequence of 1-D tensors with predicted class ids, one tensor per head.
        labels: Sequence of 1-D tensors with the true class ids, aligned with ``preds``.

    Returns:
        list[float]: One accuracy per head, followed by the fraction of examples for
        which every head is correct.

    Raises:
        ValueError: If no heads are given, if ``preds`` and ``labels`` differ in
            length, or if a prediction tensor and its label tensor differ in shape.
    """
    if len(preds) == 0 or len(preds) != len(labels):
        raise ValueError("preds and labels must be non-empty and contain one tensor per head")

    # detach() so tensors that still carry gradients can be converted to numpy.
    preds_np = [p.detach().cpu().numpy() for p in preds]
    labels_np = [l.detach().cpu().numpy() for l in labels]

    matches = []
    for head_preds, head_labels in zip(preds_np, labels_np):
        if head_preds.shape != head_labels.shape:
            raise ValueError("each prediction tensor must have the same shape as its labels")
        matches.append(head_preds == head_labels)

    # Accuracy of each head on its own.
    accuracies = [float(np.mean(head_match)) for head_match in matches]
    # An example counts as jointly correct only when every head matches.
    overall_accuracy = float(np.mean(np.all(matches, axis=0)))

    return accuracies + [overall_accuracy]

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
(
    train_texts, val_texts,
    train_labels1, val_labels1,
    train_labels2, val_labels2,
    train_labels3, val_labels3,
    train_labels4, val_labels4,
) = train_test_split(
    df['description'], df['supergroup'], df['group'], df['module'], df['brand'],
    test_size=0.2,
    random_state=42,
)

# labels1..labels4 follow the order supergroup, group, module, brand.
train_dataset = ProductDataset(
    *(part.tolist() for part in (train_texts, train_labels1, train_labels2, train_labels3, train_labels4))
)
val_dataset = ProductDataset(
    *(part.tolist() for part in (val_texts, val_labels1, val_labels2, val_labels3, val_labels4))
)

# Shuffle only the training batches; validation order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class EnhancedHierarchicalClassifier(nn.Module):
    """XLM-RoBERTa encoder with cascaded heads for supergroup, group, module and brand.

    Encoder token states pass through one multi-head self-attention layer and are
    mean-pooled over the real (non-padding) tokens into one vector per example. Each
    level's heads receive that vector concatenated with the softmax outputs of all
    coarser levels, so information flows supergroup -> group -> module -> brand.

    Besides the main classifiers the module exposes policy heads, deeper "expert"
    classifiers, scalar confidence heads, two projection heads and a prototype matrix
    that yields distance-based logits.

    Args:
        num_supergroups: Number of supergroup classes.
        num_groups: Number of group classes.
        num_modules: Number of module classes.
        num_brands: Number of brand classes.
        hidden_size: Width of the encoder output; must match the loaded encoder.
        projection_dim: Output width of the main projection head; the auxiliary
            projection head uses ``projection_dim // 2``.
    """

    def __init__(self, num_supergroups, num_groups, num_modules, num_brands, hidden_size=768, projection_dim=128):
        super().__init__()
        self.model = XLMRobertaModel.from_pretrained("xlm-roberta-base")
        self.hidden_size = hidden_size

        # Input width seen by each level: pooled features plus the probability vectors
        # of every coarser level.
        group_dim = hidden_size + num_supergroups
        module_dim = group_dim + num_groups
        brand_dim = module_dim + num_modules

        # Main classifiers (layers are created in a fixed order so that parameter
        # initialisation and state-dict keys stay stable).
        self.supergroup_classifier = nn.Linear(hidden_size, num_supergroups)
        self.group_classifier = nn.Linear(group_dim, num_groups)
        self.module_classifier = nn.Linear(module_dim, num_modules)
        self.brand_classifier = nn.Linear(brand_dim, num_brands)

        # Policy heads, same input/output widths as the classifiers.
        self.supergroup_policy = nn.Linear(hidden_size, num_supergroups)
        self.group_policy = nn.Linear(group_dim, num_groups)
        self.module_policy = nn.Linear(module_dim, num_modules)
        self.brand_policy = nn.Linear(brand_dim, num_brands)

        # Projection head used by the contrastive loss.
        self.projection = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, projection_dim)
        )

        # One learnable prototype per class, across all four levels.
        self.prototypes = nn.Parameter(torch.randn(num_supergroups + num_groups + num_modules + num_brands, hidden_size))

        # Self-attention over the encoder's token states (expects [seq, batch, hidden]).
        self.attention = nn.MultiheadAttention(hidden_size, num_heads=8)

        self.expert_classifiers = nn.ModuleDict({
            'supergroup': self._make_expert_classifier(hidden_size, num_supergroups),
            'group': self._make_expert_classifier(group_dim, num_groups),
            'module': self._make_expert_classifier(module_dim, num_modules),
            'brand': self._make_expert_classifier(brand_dim, num_brands)
        })

        # Each confidence head emits one scalar per example.
        self.confidence_heads = nn.ModuleDict({
            'supergroup': nn.Linear(hidden_size, 1),
            'group': nn.Linear(group_dim, 1),
            'module': nn.Linear(module_dim, 1),
            'brand': nn.Linear(brand_dim, 1)
        })

        self.auxiliary_projection = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, projection_dim // 2)
        )

    def _make_expert_classifier(self, input_dim, output_dim):
        """Build a two-layer MLP head with LayerNorm, ReLU and dropout in between."""
        return nn.Sequential(
            nn.Linear(input_dim, input_dim // 2),
            nn.LayerNorm(input_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(input_dim // 2, output_dim)
        )

    @staticmethod
    def _masked_mean_pool(token_states, attention_mask):
        """Average token states over the positions where ``attention_mask`` is non-zero.

        Args:
            token_states: Tensor of shape [batch, seq_len, hidden].
            attention_mask: Tensor of shape [batch, seq_len]; non-zero marks real tokens.

        Returns:
            Tensor of shape [batch, hidden]. Rows whose mask is entirely zero yield
            zeros instead of dividing by zero.
        """
        weights = attention_mask.unsqueeze(-1).to(token_states.dtype)
        token_counts = weights.sum(dim=1).clamp(min=1.0)
        return (token_states * weights).sum(dim=1) / token_counts

    def forward(self, input_ids, attention_mask):
        """Run the encoder and every head.

        Args:
            input_ids: Token ids, shape [batch, seq_len].
            attention_mask: 1 for real tokens and 0 for padding, shape [batch, seq_len].

        Returns:
            A 7-tuple:
            ``(classifier_logits, policy_logits, projection, few_shot_logits,
            expert_logits, confidences, aux_projection)`` where the three ``*_logits``
            groups are 4-tuples ordered (supergroup, group, module, brand) and
            ``confidences`` is a dict of sigmoid outputs keyed by level name.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state  # [batch, seq_len, hidden]

        # nn.MultiheadAttention (batch_first=False) wants [seq_len, batch, hidden].
        hidden_states = hidden_states.transpose(0, 1)

        # key_padding_mask uses True for positions to ignore, the inverse of the
        # tokenizer's attention mask.
        key_padding_mask = ~attention_mask.bool()

        attended_output, _ = self.attention(
            hidden_states,
            hidden_states,
            hidden_states,
            key_padding_mask=key_padding_mask
        )

        # Back to [batch, seq_len, hidden].
        attended_output = attended_output.transpose(0, 1)

        # Pool over real tokens only. A plain mean would also average the outputs at
        # padded positions, making the pooled vector depend on the amount of padding.
        pooled_output = self._masked_mean_pool(attended_output, attention_mask)

        # Cascade: every level sees the pooled features plus all coarser-level
        # probability vectors.
        supergroup_logits = self.supergroup_classifier(pooled_output)
        group_input = torch.cat([pooled_output, torch.softmax(supergroup_logits, dim=1)], dim=1)
        group_logits = self.group_classifier(group_input)
        module_input = torch.cat([group_input, torch.softmax(group_logits, dim=1)], dim=1)
        module_logits = self.module_classifier(module_input)
        brand_input = torch.cat([module_input, torch.softmax(module_logits, dim=1)], dim=1)
        brand_logits = self.brand_classifier(brand_input)

        supergroup_policy = self.supergroup_policy(pooled_output)
        group_policy = self.group_policy(group_input)
        module_policy = self.module_policy(module_input)
        brand_policy = self.brand_policy(brand_input)

        projection = self.projection(pooled_output)
        # Closer prototypes receive larger logits.
        prototype_distances = torch.cdist(pooled_output, self.prototypes)
        few_shot_logits = -prototype_distances

        expert_supergroup = self.expert_classifiers['supergroup'](pooled_output)
        expert_group = self.expert_classifiers['group'](group_input)
        expert_module = self.expert_classifiers['module'](module_input)
        expert_brand = self.expert_classifiers['brand'](brand_input)

        confidences = {
            'supergroup': torch.sigmoid(self.confidence_heads['supergroup'](pooled_output)),
            'group': torch.sigmoid(self.confidence_heads['group'](group_input)),
            'module': torch.sigmoid(self.confidence_heads['module'](module_input)),
            'brand': torch.sigmoid(self.confidence_heads['brand'](brand_input))
        }

        aux_projection = self.auxiliary_projection(pooled_output)

        return (
            (supergroup_logits, group_logits, module_logits, brand_logits),
            (supergroup_policy, group_policy, module_policy, brand_policy),
            projection,
            few_shot_logits,
            (expert_supergroup, expert_group, expert_module, expert_brand),
            confidences,
            aux_projection
        )

class EnhancedJointAccuracyTrainer:
    """Builds the combined training loss for the hierarchical classifier and steps it.

    The loss sums weighted cross-entropy on the main heads, a distillation + focal
    term on the expert heads, a confidence regression term, two contrastive terms and
    a prototype ("few-shot") term.

    Args:
        model: Module whose forward returns the 7-tuple consumed by
            ``compute_joint_loss``.
        supervised_lr: Learning rate of ``supervised_optimizer``.
        rl_lr: Learning rate of ``rl_optimizer`` (created here, not stepped by this class).
        contrastive_temperature: Divisor applied to the cosine similarities.
        loss_weights: Per-head weights for the main cross-entropy terms; defaults to
            ``[1.0, 1.0, 1.0, 1.0]``.
    """

    def __init__(self, model, supervised_lr=1e-5, rl_lr=1e-4, contrastive_temperature=0.07, loss_weights=None):
        self.model = model
        self.device = next(model.parameters()).device

        self.supervised_optimizer = torch.optim.NAdam(model.parameters(), lr=supervised_lr)
        self.rl_optimizer = torch.optim.NAdam(model.parameters(), lr=rl_lr)

        self.criterion = nn.CrossEntropyLoss()
        self.contrastive_temperature = contrastive_temperature
        self.loss_weights = loss_weights or [1.0, 1.0, 1.0, 1.0]

        # Softmax temperature for the distillation term.
        self.kd_temperature = 2.0

        # Exponent of the focal-loss modulating factor.
        self.focal_gamma = 2.0

        # Linear map that lifts the auxiliary projection to the main projection width.
        # Created lazily on first use and then reused for every batch.
        self._aux_alignment = None

    def _get_aux_alignment(self, in_dim, out_dim, device):
        """Return the cached auxiliary alignment layer, creating it when needed.

        A new layer is registered with ``supervised_optimizer`` so that it is trained
        instead of being re-drawn at random for every batch.
        """
        layer = self._aux_alignment
        if layer is None or layer.in_features != in_dim or layer.out_features != out_dim:
            layer = nn.Linear(in_dim, out_dim).to(device)
            self.supervised_optimizer.add_param_group({'params': list(layer.parameters())})
            self._aux_alignment = layer
        return layer

    def compute_joint_loss(self, all_outputs, true_labels, batch_size):
        """Combine all loss terms for one batch.

        Args:
            all_outputs: The 7-tuple returned by the model's forward pass.
            true_labels: Dict with keys ``'labels1'``..``'labelsN'`` (one per head)
                holding 1-D tensors of class ids.
            batch_size: Number of examples in the batch.

        Returns:
            Scalar tensor with the weighted sum of all loss terms.
        """
        supervised_logits, _policy_logits, projection, few_shot_logits, expert_logits, confidences, aux_projection = all_outputs

        # Look labels up by key so head i is always paired with 'labels{i+1}',
        # independent of the dict's insertion order.
        label_keys = [f'labels{i+1}' for i in range(len(supervised_logits))]

        # Cross-entropy of each main head, then the weighted sum.
        base_losses = [
            self.criterion(logits, true_labels[key])
            for key, logits in zip(label_keys, supervised_logits)
        ]
        original_loss = sum(w * l for w, l in zip(self.loss_weights, base_losses))

        # Expert heads: KL between temperature-softened main and expert distributions,
        # plus focal loss of the expert against the hard labels.
        expert_losses = []
        for key, base_logit, expert_logit in zip(label_keys, supervised_logits, expert_logits):
            true_label = true_labels[key]
            soft_expert = F.softmax(expert_logit / self.kd_temperature, dim=1)
            kd_loss = F.kl_div(
                F.log_softmax(base_logit / self.kd_temperature, dim=1),
                soft_expert,
                reduction='batchmean'
            ) * (self.kd_temperature ** 2)

            ce_loss = F.cross_entropy(expert_logit, true_label, reduction='none')
            pt = torch.exp(-ce_loss)
            focal_loss = ((1 - pt) ** self.focal_gamma) * ce_loss

            expert_losses.append(kd_loss + focal_loss.mean())

        # Confidence heads regress onto the main head's top probability. The confidence
        # tensor is [batch, 1] while the target is [batch]; flatten it so mse_loss
        # compares element-wise instead of broadcasting to [batch, batch].
        # NOTE(review): the target is not detached, so this term also pushes gradients
        # into the classifier logits -- confirm that is intended.
        confidence_loss = 0
        for logits, conf in zip(supervised_logits, confidences.values()):
            top_prob = F.softmax(logits, dim=1).max(dim=1).values
            confidence_loss = confidence_loss + F.mse_loss(conf.reshape(-1), top_prob)

        # Contrastive term: each example's own projection is the positive.
        proj_norm = F.normalize(projection, dim=1)
        similarity = torch.matmul(proj_norm, proj_norm.t()) / self.contrastive_temperature
        contrastive_labels = torch.arange(batch_size, device=similarity.device)
        contrastive_loss = F.cross_entropy(similarity, contrastive_labels)

        # Auxiliary contrastive term between the auxiliary and the main projection.
        aux_proj_norm = F.normalize(aux_projection, dim=1)
        if aux_proj_norm.size(1) != proj_norm.size(1):
            alignment = self._get_aux_alignment(aux_proj_norm.size(1), proj_norm.size(1), aux_proj_norm.device)
            aux_proj_norm = F.normalize(alignment(aux_proj_norm), dim=1)

        aux_similarity = torch.matmul(aux_proj_norm, proj_norm.t()) / self.contrastive_temperature
        aux_contrastive_loss = F.cross_entropy(aux_similarity, contrastive_labels)

        # Prototype logits are scored against the first head's labels.
        few_shot_loss = F.cross_entropy(few_shot_logits, true_labels['labels1'])

        total_loss = (
            original_loss +
            0.5 * sum(expert_losses) +
            0.1 * confidence_loss +
            0.1 * contrastive_loss +
            0.05 * aux_contrastive_loss +
            0.1 * few_shot_loss
        )

        return total_loss

    def supervised_step(self, batch, true_labels):
        """Run one optimisation step on ``batch`` and return the loss as a float.

        Gradients of the model parameters are clipped to a global norm of 5.0 before
        the optimizer step.
        """
        self.supervised_optimizer.zero_grad()
        batch_size = batch['input_ids'].size(0)
        all_outputs = self.model(batch['input_ids'], batch['attention_mask'])
        total_loss = self.compute_joint_loss(all_outputs, true_labels, batch_size)
        total_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=5.0)
        self.supervised_optimizer.step()
        return total_loss.item()

def train_and_evaluate(model, train_loader, val_loader, num_epochs=10, patience=5, checkpoint_path="best_model.pth"):
    """Train ``model`` with early stopping on the joint validation accuracy.

    After every epoch the model is evaluated on ``val_loader``. Whenever the joint
    accuracy (all four heads correct) improves, the state dict is written to
    ``checkpoint_path``. Training stops once ``patience`` consecutive epochs bring no
    improvement.

    Validation accuracies are computed over individual examples (correct / total), so
    a shorter final batch does not receive extra weight.

    Args:
        model: The classifier to train; its forward must return the main logits as
            the first element of its output tuple.
        train_loader: Iterable of training batches (dicts of tensors).
        val_loader: Iterable of validation batches (dicts of tensors).
        num_epochs: Maximum number of epochs.
        patience: Epochs without improvement tolerated before stopping.
        checkpoint_path: Destination of the best model's state dict.

    Returns:
        float: The best joint validation accuracy reached (0 if it never improved).
    """
    trainer = EnhancedJointAccuracyTrainer(model)
    best_accuracy = 0
    no_improve = 0
    label_keys = [f'labels{i+1}' for i in range(4)]

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for batch_idx, batch in enumerate(train_loader):
            batch = {k: v.to(trainer.device) for k, v in batch.items()}
            true_labels = {key: batch[key] for key in label_keys}
            loss = trainer.supervised_step(batch, true_labels)
            total_loss += loss

            if batch_idx % 100 == 0:
                print(f"Batch {batch_idx}, Loss: {loss:.4f}")

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        avg_loss = total_loss / max(len(train_loader), 1)
        print(f"\nEpoch {epoch + 1}/{num_epochs}")
        print(f"Average Training Loss: {avg_loss:.4f}")

        # Validation
        model.eval()
        correct_counts = [0, 0, 0, 0, 0]  # four heads + joint
        num_examples = 0
        val_loss = 0

        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(trainer.device) for k, v in batch.items()}
                outputs = model(batch['input_ids'], batch['attention_mask'])
                supervised_logits = outputs[0]
                true_labels = {key: batch[key] for key in label_keys}
                batch_size = batch['input_ids'].size(0)

                val_loss += trainer.compute_joint_loss(outputs, true_labels, batch_size).item()

                # hits[i, j] is True when head i is correct on example j.
                hits = torch.stack([
                    torch.argmax(logits, dim=1) == batch[key]
                    for key, logits in zip(label_keys, supervised_logits)
                ])
                for i in range(hits.size(0)):
                    correct_counts[i] += hits[i].sum().item()
                correct_counts[4] += hits.all(dim=0).sum().item()
                num_examples += batch_size

        val_accuracies = [count / max(num_examples, 1) for count in correct_counts]
        avg_val_loss = val_loss / max(len(val_loader), 1)

        print(f"\nValidation Results:")
        print(f"Average Validation Loss: {avg_val_loss:.4f}")
        print(f"Supergroup Accuracy: {val_accuracies[0]:.4f}")
        print(f"Group Accuracy: {val_accuracies[1]:.4f}")
        print(f"Module Accuracy: {val_accuracies[2]:.4f}")
        print(f"Brand Accuracy: {val_accuracies[3]:.4f}")
        print(f"Joint Accuracy: {val_accuracies[4]:.4f}")

        # Save best model and early stopping
        if val_accuracies[4] > best_accuracy:
            best_accuracy = val_accuracies[4]
            torch.save(model.state_dict(), checkpoint_path)
            print(f"New best model saved with joint accuracy: {best_accuracy:.4f}")
            no_improve = 0
        else:
            no_improve += 1
            if no_improve >= patience:
                print("Early stopping triggered after", patience, "epochs without improvement")
                break

    return best_accuracy

# Usage
# Size every head from the fitted label encoders so the output layers always cover the
# label ids present in the data (the counts were previously hard-coded as
# 32 / 228 / 449 / 5679).
model = EnhancedHierarchicalClassifier(
    num_supergroups=len(supergroup_encoder.classes_),
    num_groups=len(group_encoder.classes_),
    num_modules=len(module_encoder.classes_),
    num_brands=len(brand_encoder.classes_),
).to(DEVICE)

# For loading the model saved after pre-training the saved dict
# NOTE(review): a saved state dict only loads when it was trained with the same class counts.
# model_path = os.path.join(model_dir, "model.pth")
# model.load_state_dict(torch.load(model_path))

# Train with early stopping. The best-scoring weights are written to "best_model.pth";
# `model` itself keeps the weights of the last epoch that ran.
train_and_evaluate(model, train_loader, val_loader, num_epochs=NUM_EPOCHS)

In [None]:
import os

# Directory that receives both the model weights and the tokenizer files.
save_directory = "New_model"

# exist_ok=True creates the directory when missing and is a no-op otherwise, which
# replaces the separate exists() check.
os.makedirs(save_directory, exist_ok=True)

# Save the model's state dictionary
model_save_path = os.path.join(save_directory, "model.pth")
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

# Save the tokenizer next to the weights so both can be reloaded from one place.
tokenizer.save_pretrained(save_directory)
print(f"Tokenizer saved to {save_directory}")

In [None]:
# Load the test-set features (JSON Lines: one record per line).
df_test_feat = pd.read_json('/kaggle/input/indoml-phase2/final_test_data.features',lines=True)

In [None]:
# Build the model input text: description followed by the retailer, separated by a space.
df_test_feat = df_test_feat.assign(
    description=df_test_feat['description'] + ' ' + df_test_feat['retailer']
)
df_test_feat.head()

In [None]:
def predict(model, tokenizer, text, device="cuda" if torch.cuda.is_available() else "cpu", max_length=None):
    """
    Make predictions for a single text using the enhanced hierarchical classifier.

    Args:
        model: The trained EnhancedHierarchicalClassifier model
        tokenizer: The tokenizer used for preprocessing
        text: Input text to classify
        device: Device to run inference on
        max_length: Sequence length used for padding/truncation. Defaults to the
            notebook-level MAX_LENGTH so inference tokenizes exactly like training
            (padding to a different length changes what the model sees).

    Returns:
        dict: Dictionary with four sub-dicts keyed by 'supergroup', 'group', 'module'
        and 'brand':
            'predictions'        - argmax class id of the main classifier
            'expert_predictions' - argmax class id of the expert classifier
            'confidence_scores'  - output of the confidence head
            'probabilities'      - top softmax probability of the main classifier
    """
    if max_length is None:
        max_length = MAX_LENGTH

    categories = ('supergroup', 'group', 'module', 'brand')

    # Ensure model is in eval mode (disables dropout)
    model.eval()

    # Tokenize input with the same padding/truncation scheme as the training data
    inputs = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        outputs = model(inputs['input_ids'], inputs['attention_mask'])

        # Only the main logits, expert logits and confidences are needed here.
        logits, _, _, _, expert_logits, confidences, _ = outputs

        predictions = {
            name: torch.argmax(logit, dim=1).item()
            for name, logit in zip(categories, logits)
        }
        expert_predictions = {
            name: torch.argmax(logit, dim=1).item()
            for name, logit in zip(categories, expert_logits)
        }
        confidence_scores = {name: confidences[name].item() for name in categories}
        probabilities = {
            name: F.softmax(logit, dim=1).max(1).values.item()
            for name, logit in zip(categories, logits)
        }

    return {
        'predictions': predictions,
        'expert_predictions': expert_predictions,
        'confidence_scores': confidence_scores,
        'probabilities': probabilities
    }

In [None]:
def reconcile_predictions(
    model, 
    tokenizer, 
    text: str,
    confidence_threshold: float = 0.8,
    probability_threshold: float = 0.7,
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
) -> Tuple[int, int, int, int]:
    """
    Pick, per category, between the main classifier's and the expert's prediction.

    For each of supergroup, group, module and brand:
      * if both heads agree, that class is used;
      * otherwise the main prediction is kept only when BOTH its confidence score and
        its top probability reach their thresholds; in every other case the expert
        prediction is used.

    The previous three-way branch had a final "weighted ensemble" case that could
    only be reached with NaN scores (its ``elif`` was the exact negation of the
    ``if``), and in that case it also returned the expert prediction. The two-way
    rule below therefore yields the same result for every input.

    Args:
        model: The trained EnhancedHierarchicalClassifier model
        tokenizer: The tokenizer used for preprocessing
        text: Input text to classify
        confidence_threshold: Minimum confidence score to trust main prediction
        probability_threshold: Minimum probability score to trust main prediction
        device: Device to run inference on

    Returns:
        Tuple[int, int, int, int]: Final predictions for (supergroup, group, module, brand)
    """
    predictions = predict(model, tokenizer, text, device)

    final_predictions = []
    for category in ('supergroup', 'group', 'module', 'brand'):
        main_pred = predictions['predictions'][category]
        expert_pred = predictions['expert_predictions'][category]
        confidence = predictions['confidence_scores'][category]
        probability = predictions['probabilities'][category]

        if main_pred == expert_pred:
            final_pred = main_pred
        else:
            # On disagreement the main head wins only if it clears both thresholds.
            main_is_trusted = (
                confidence >= confidence_threshold
                and probability >= probability_threshold
            )
            final_pred = main_pred if main_is_trusted else expert_pred

        final_predictions.append(final_pred)

    return tuple(final_predictions)

def _weighted_ensemble_decision(
    main_pred: int,
    expert_pred: int,
    confidence: float,
    probability: float
) -> int:
    """
    Make a weighted decision between main and expert predictions.
    
    Args:
        main_pred: Prediction from main classifier
        expert_pred: Prediction from expert classifier
        confidence: Confidence score from main classifier
        probability: Probability score from main classifier
        
    Returns:
        int: Final prediction
    """
    main_weight = (confidence + probability) / 2
    return main_pred if main_weight >= 0.5 else expert_pred

# Single prediction
# Join description and retailer with a single space, the same separator used when the
# training and test descriptions are built.
text = "Sample product description" + " " + "retailer name"
supergroup, group, module, brand = reconcile_predictions(model, tokenizer, text)
print(supergroup, group, module, brand)

In [None]:
# df_test_feat = df_test_feat[:50]

In [None]:
def _decode_labels(encoder, indices, level_name, unknown='Unknown'):
    """Map predicted class ids back to label names, one element at a time.

    Ids outside ``encoder.classes_`` become ``unknown`` instead of failing the whole
    column; a short warning is printed when that happens.
    """
    classes = encoder.classes_
    num_classes = len(classes)
    names = [classes[i] if 0 <= i < num_classes else unknown for i in indices]
    num_unknown = sum(1 for i in indices if not 0 <= i < num_classes)
    if num_unknown:
        print(f"Error in {level_name}: {num_unknown} predicted ids are outside the encoder's classes")
    return names


def make_test_pred_and_save(df_test_feat, output_path='/kaggle/working/predictions.predict'):
    """Predict all four label levels for every test row and save them as JSON Lines.

    Relies on the notebook-level ``model``, ``tokenizer``, ``reconcile_predictions``
    and the four fitted label encoders.

    Args:
        df_test_feat: DataFrame with a ``description`` column holding the model input
            text. When it also has an ``indoml_id`` column those ids are written to
            the output; otherwise rows are numbered 0..n-1.
        output_path: Destination file, one JSON record per line.
    """
    supergroups_list = []
    groups_list = []
    modules_list = []
    brands_list = []
    length_df = df_test_feat.shape[0]

    # Keep the ids that came with the data so predictions stay attached to the right
    # records even if the frame was filtered or reordered.
    if 'indoml_id' in df_test_feat.columns:
        indoml_id_list = df_test_feat['indoml_id'].tolist()
    else:
        indoml_id_list = list(range(length_df))

    with torch.no_grad():
        for i, description in enumerate(df_test_feat['description']):
            if i % 1000 == 0:
                print(f"Processing {i} of {length_df - 1}")
            predictions = reconcile_predictions(model, tokenizer, description)

            # Append predictions to respective lists
            supergroups_list.append(predictions[0])
            groups_list.append(predictions[1])
            modules_list.append(predictions[2])
            brands_list.append(predictions[3])

    # Create a DataFrame with the decoded predictions
    predictions_df = pd.DataFrame({
        'indoml_id': indoml_id_list,
        'supergroup': _decode_labels(supergroup_encoder, supergroups_list, 'supergroups'),
        'group': _decode_labels(group_encoder, groups_list, 'groups'),
        'module': _decode_labels(module_encoder, modules_list, 'modules'),
        'brand': _decode_labels(brand_encoder, brands_list, 'brands')
    })
    predictions_df.to_json(output_path, orient='records', lines=True)
    print("predictions.predict saved")
# Run inference over the test features and write the predictions file.
make_test_pred_and_save(df_test_feat)