In [1]:
!pip install huggingface_hub[hf_xet]

Collecting hf-xet>=0.1.4 (from huggingface_hub[hf_xet])
  Downloading hf_xet-1.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Downloading hf_xet-1.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: hf-xet
Successfully installed hf-xet-1.0.3


In [2]:
import math
import pandas as pd
import torch
import numpy as np
from torch.optim import AdamW
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, roc_curve, auc
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup

# Your own Hugging Face token. Export HF_TOKEN in the environment rather than
# pasting the secret into the notebook; '...' is kept as the placeholder default.
import os
HF_TOKEN = os.environ.get('HF_TOKEN', '...')

In [3]:
import matplotlib.colors as mcolors

class ModelMetrics:
    """Stateless helpers for scoring and plotting cross-validation folds.

    All methods call `.numpy()` on their inputs, so `preds` and `labels` must be
    CPU tensors. `preds` holds one row of per-class scores per sample and
    `labels` holds the matching integer class ids.
    """

    @staticmethod
    def calculate_metrics(preds, labels):
        """Calculate accuracy and macro F1-score for a single fold.

        Args:
            preds: 2-D tensor of per-class scores; the predicted class is the
                argmax along dim 1.
            labels: 1-D tensor of true class ids, same length as `preds`.

        Returns:
            dict with keys 'accuracy' (float) and 'f1_score' (macro-averaged F1).
        """
        pred_labels = torch.argmax(preds, dim=1)
        correct = torch.sum(pred_labels == labels).item()
        total = len(labels)
        accuracy = correct / total
        f1 = f1_score(labels.numpy(), pred_labels.numpy(), average='macro')
        return {'accuracy': accuracy, 'f1_score': f1}

    @staticmethod
    def calculate_final_metrics(all_preds, all_labels):
        """Calculate mean and standard deviation of metrics across all folds.

        Args:
            all_preds: sequence of per-fold prediction tensors.
            all_labels: sequence of per-fold label tensors, aligned with `all_preds`.

        Returns:
            dict of 0-dim tensors: 'mean_accuracy', 'std_accuracy', 'mean_f1', 'std_f1'.
            The deviations come from `torch.std` with its default settings.
        """
        accuracy_scores = []
        f1_scores = []

        for preds, labels in zip(all_preds, all_labels):
            metrics = ModelMetrics.calculate_metrics(preds, labels)
            accuracy_scores.append(metrics['accuracy'])
            f1_scores.append(metrics['f1_score'])

        accuracy_scores = torch.tensor(accuracy_scores)
        f1_scores = torch.tensor(f1_scores)

        return {
            'mean_accuracy': torch.mean(accuracy_scores),
            'std_accuracy': torch.std(accuracy_scores),
            'mean_f1': torch.mean(f1_scores),
            'std_f1': torch.std(f1_scores)
        }

    @staticmethod
    def plot_mean_roc_curve(all_folds_preds, all_folds_labels, n_classes):
        """Plot the one-vs-rest mean ROC curve per class across all folds.

        Each fold's ROC curve is interpolated onto a shared 100-point FPR grid so
        the curves can be averaged; a ±1 SD band is shaded around every mean curve.

        Args:
            all_folds_preds: sequence of per-fold score tensors (column i = class i).
            all_folds_labels: sequence of per-fold label tensors.
            n_classes: number of classes (columns of the score tensors) to plot.
        """
        mean_fpr = np.linspace(0, 1, 100)
        tprs = [[] for _ in range(n_classes)]
        aucs = [[] for _ in range(n_classes)]
        # Cycle through the Tableau palette so that more classes than palette
        # entries reuse colors instead of raising IndexError.
        palette = list(mcolors.TABLEAU_COLORS.values())
        colors = [palette[i % len(palette)] for i in range(n_classes)]

        for fold_preds, fold_labels in zip(all_folds_preds, all_folds_labels):
            for i in range(n_classes):
                # One-vs-rest: class i is the positive class.
                fpr, tpr, _ = roc_curve(fold_labels.numpy() == i, fold_preds[:, i].numpy())
                interp_tpr = np.interp(mean_fpr, fpr, tpr)
                interp_tpr[0] = 0.0  # force every curve to start at the origin
                tprs[i].append(interp_tpr)
                aucs[i].append(auc(fpr, tpr))

        mean_tpr = [np.mean(tprs[i], axis=0) for i in range(n_classes)]
        mean_auc = [np.mean(aucs[i]) for i in range(n_classes)]
        std_auc = [np.std(aucs[i]) for i in range(n_classes)]

        plt.figure(figsize=(10, 8))
        for i in range(n_classes):
            plt.plot(mean_fpr, mean_tpr[i], color=colors[i],
                     label=f'Class {i} (Mean AUC = {mean_auc[i]:.4f} ± {std_auc[i]:.4f})')
            std_tpr = np.std(tprs[i], axis=0)
            # Clamp the ±1 SD band to the valid [0, 1] TPR range.
            tprs_upper = np.minimum(mean_tpr[i] + std_tpr, 1)
            tprs_lower = np.maximum(mean_tpr[i] - std_tpr, 0)
            plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color=colors[i], alpha=0.2)

        plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Mean ROC Curve Across All Folds (±1 SD)')
        plt.legend(loc="lower right")
        plt.grid(True)
        plt.show()

In [4]:
import os
import torch.amp
import time

class SentimentAnalysisModel:
    def __init__(self, checkpoint, num_labels=3, epochs=2, batch_size=32, n_splits=5, save_path=None):
        """Initialize the sentiment analysis model for single-GPU training."""
        os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
        self.checkpoint = checkpoint
        self.num_labels = num_labels
        self.epochs = epochs
        self.batch_size = batch_size
        self.n_splits = n_splits
        self.save_path = save_path or f"sentiment_model_{checkpoint.split('/')[-1]}.pt"
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        # Load model (tokenizer not needed since pre-tokenized)
        self.model = self.load_model()
        
        print(f"Using device: {self.device} (Single GPU)")
        self.optimizer = AdamW(self.model.parameters(), lr=5e-5)
        self.scheduler = None

    def load_model(self):
        """Load model with error handling."""
        try:
            model = AutoModelForSequenceClassification.from_pretrained(
                self.checkpoint, num_labels=self.num_labels
            )
            model.to(self.device)
            return model
        except Exception as e:
            raise RuntimeError(f"Failed to load model: {str(e)}")

    def prepare_dataloaders(self, input_ids, attention_mask, labels, train_idx, validation_idx):
        train_dataset = TensorDataset(input_ids[train_idx], attention_mask[train_idx], labels[train_idx])
        validation_dataset = TensorDataset(input_ids[validation_idx], attention_mask[validation_idx], labels[validation_idx])
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=2, pin_memory=True)
        validation_loader = DataLoader(validation_dataset, batch_size=256, num_workers=0, pin_memory=True)  # Larger batch size for validation
        return train_loader, validation_loader
    
    def train_model(self, train_loader):
        self.model.train()
        total_steps = len(train_loader) * self.epochs
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=0, num_training_steps=total_steps)
        scaler = torch.amp.GradScaler('cuda')
        denominator = math.ceil(len(train_loader.dataset) / self.batch_size)
        accumulation_steps = 2  # Effective batch size = 64 * 2 = 128
    
        for epoch in range(self.epochs):
            epoch_start = time.time()
            for i, batch in enumerate(train_loader):
                input_ids, attention_mask, labels = batch
                input_ids, attention_mask, labels = input_ids.to(self.device), attention_mask.to(self.device), labels.to(self.device)
                inputs = {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}
    
                self.optimizer.zero_grad(set_to_none=True)  # Optimize memory
                with torch.amp.autocast('cuda'):
                    outputs = self.model(**inputs)
                    loss = outputs.loss / accumulation_steps  # Normalize loss
                scaler.scale(loss).backward()
    
                if (i + 1) % accumulation_steps == 0:
                    # scaler.unscale_(self.optimizer)
                    if (i + 1) % 50 == 0:
                        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                    scaler.step(self.optimizer)
                    scaler.update()
                    self.scheduler.step()
    
                if (i + 1) % 100 == 0:
                    print(f'[Epoch: {epoch + 1}] -> Batch: [{i + 1}/{denominator}]')
            print(f'[Epoch: {epoch + 1}] -> Batch: [{denominator}/{denominator}]')
            print(f'Epoch {epoch + 1} took {time.time() - epoch_start:.2f} seconds')

    def evaluate_model(self, validation_loader):
        """Evaluate the model and move tensors to CPU to manage memory."""
        self.model.eval()
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in validation_loader:
                # Move batch to GPU after DataLoader fetches it
                inputs = {k: v.to(self.device) for k, v in zip(['input_ids', 'attention_mask'], batch[:-1])}
                labels_batch = batch[-1].to(self.device)
                outputs = self.model(**inputs)
                probs = torch.softmax(outputs.logits, dim=1)
                all_preds.append(probs.cpu())
                all_labels.append(labels_batch.cpu())

        return torch.cat(all_preds, dim=0), torch.cat(all_labels, dim=0)

    def kfold_cross_validation(self, data_dict):
        """Perform k-fold cross-validation with pre-tokenized data."""
        train_data = data_dict['data']
        # Keep tensors on CPU for DataLoader; move to GPU in train/evaluate
        input_ids = data_dict['input_ids']  # Already on CPU from load_data
        attention_mask = data_dict['attention_mask']  # Already on CPU
        labels = torch.tensor(train_data['labels'].tolist())  # Keep on CPU

        print("Unique labels in dataset:", torch.unique(labels))
        print("Label value range:", labels.min(), labels.max())
        assert labels.min() >= 0 and labels.max() < self.num_labels, \
            f"Labels must be between 0 and {self.num_labels - 1}"

        kfold = KFold(n_splits=self.n_splits, shuffle=True, random_state=42)
        all_preds_by_fold = []
        all_labels_by_fold = []

        for fold, (train_idx, validation_idx) in enumerate(kfold.split(input_ids)):
            print(f"\nFold {fold + 1}/{self.n_splits}")
            train_loader, validation_loader = self.prepare_dataloaders(
                input_ids, attention_mask, labels, train_idx, validation_idx
            )
            self.train_model(train_loader)
            fold_preds, fold_labels = self.evaluate_model(validation_loader)
            all_preds_by_fold.append(fold_preds)
            all_labels_by_fold.append(fold_labels)

            metrics = ModelMetrics.calculate_metrics(fold_preds, fold_labels)
            print(f"Validation Accuracy: {metrics['accuracy']:.4f}")
            print(f"Validation F1-Score: {metrics['f1_score']:.4f}")
            print('=' * 30)

        final_metrics = ModelMetrics.calculate_final_metrics(all_preds_by_fold, all_labels_by_fold)
        print(f"\nFinal Results:")
        print(f"Mean accuracy: {final_metrics['mean_accuracy']:.4f} ± {final_metrics['std_accuracy']:.4f}")
        print(f"Mean F1-score: {final_metrics['mean_f1']:.4f} ± {final_metrics['std_f1']:.4f}")

        ModelMetrics.plot_mean_roc_curve(all_preds_by_fold, all_labels_by_fold, self.num_labels)

        try:
            torch.save(self.model.state_dict(), self.save_path)
            print(f"\nModel saved to {self.save_path}!")
        except Exception as e:
            print(f"Failed to save model: {str(e)}")

In [5]:
import os
# Print the full path of every file available under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

/kaggle/input/yelp-corpus-aspected/aspected_yelp_train_corpus.csv


In [None]:
import os
import time

# Process-wide settings applied before any training starts:
#   CUDA_VISIBLE_DEVICES   -> expose only GPU 0 to this process
#   TOKENIZERS_PARALLELISM -> off, to avoid warnings during DataLoader forking
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "0",
    "TOKENIZERS_PARALLELISM": "false",
})

def load_data(file_path, tokenizer, nrows=None, label_column='labels',
              n_samples_per_group=50000, max_length=128):
    """Load, balance, and pre-tokenize data from a CSV file.

    Args:
        file_path (str): Path to the CSV file.
        tokenizer: Callable tokenizer; invoked as
            tokenizer(texts, padding=True, max_length=..., truncation=True,
            return_tensors='pt') and expected to return a mapping with
            'input_ids' and 'attention_mask'.
        nrows (int, optional): Number of rows to read.
        label_column (str): Name of the column containing labels.
        n_samples_per_group (int): Maximum number of rows sampled per label.
            A label with fewer rows contributes all of them, so the result is
            only balanced when every label has at least this many rows.
        max_length (int): Truncation length passed to the tokenizer.

    Returns:
        dict: 'data' (sampled DataFrame with labels mapped to 0/1),
        'input_ids', 'attention_mask', and 'label_column' (the name of the
        label column inside 'data').

    Raises:
        SystemExit: with code 1 on a missing/empty file, missing columns,
            unexpected label values, or any other loading error. The reason is
            printed before exiting.
    """
    try:
        # Read CSV
        data = pd.read_csv(file_path, nrows=nrows, encoding='utf-8')

        # Validate required columns
        required_columns = ['text', label_column]
        missing_columns = [col for col in required_columns if col not in data.columns]
        if missing_columns:
            print(f"Error: Missing columns {missing_columns}. Available columns: {list(data.columns)}")
            raise SystemExit(1)

        # A header-only file parses fine but leaves nothing to tokenize.
        if data.empty:
            print(f"Error: No rows found in {file_path}")
            raise SystemExit(1)

        # Check unique label values
        unique_labels = data[label_column].unique()
        print(f"Unique labels in dataset: {unique_labels}")

        # Map labels explicitly for binary classification.
        # NOTE(review): a sample containing only the value 1 matches the first
        # branch and is left unmapped, even if the file uses the 1/2 scheme.
        label_values = set(unique_labels)
        if label_values.issubset({0, 1}):
            print("Labels are already 0 and 1; no mapping needed.")
        elif label_values.issubset({1, 2}):
            print("Mapping labels: 1 -> 0, 2 -> 1")
            data[label_column] = data[label_column].map({1: 0, 2: 1})
        else:
            print(f"Error: Unexpected label values {unique_labels}. Expected 0/1 or 1/2 for binary classification.")
            raise SystemExit(1)

        # Balance dataset: sample up to n_samples_per_group rows per label
        sampled_indices = []
        for _, group in data.groupby(label_column):
            sample_size = min(n_samples_per_group, len(group))
            sampled_indices.extend(group.sample(n=sample_size, random_state=42).index)
        # Create balanced DataFrame using the sampled indices
        balanced_data = data.loc[sampled_indices].reset_index(drop=True)

        # Pre-tokenize texts
        texts = balanced_data['text'].tolist()
        encodings = tokenizer(
            texts,
            padding=True,
            max_length=max_length,
            truncation=True,
            return_tensors='pt'
        )
        input_ids = encodings['input_ids']
        attention_mask = encodings['attention_mask']

        print("Balanced data info:")
        # DataFrame.info() prints its report itself and returns None, so it
        # must not be wrapped in print().
        balanced_data.info()
        print("Label distribution:")
        print(balanced_data[label_column].value_counts())
        print(f"Input IDs shape: {input_ids.shape}")
        print(f"Attention mask shape: {attention_mask.shape}")

        return {
            'data': balanced_data,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label_column': label_column
        }

    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        raise SystemExit(1)
    except pd.errors.EmptyDataError:
        print(f"Error: Empty CSV file at {file_path}")
        raise SystemExit(1)
    except Exception as e:
        # SystemExit raised above is not an Exception subclass, so it passes through.
        print(f"Error loading data: {e}")
        raise SystemExit(1)

def main():
    """Drive the whole pipeline: tokenize the corpus, then cross-validate on one GPU."""
    started_at = time.time()

    # Run settings
    settings = dict(
        file_path='../input/yelp-corpus-aspected/aspected_yelp_train_corpus.csv',
        nrows=None,
        checkpoint='FacebookAI/roberta-base',
        num_labels=2,
        epochs=2,
        batch_size=192,
        n_splits=5,
        save_path='../working/trained_model_roberta-base_k5.pt',
        label_column='labels',
    )

    # The tokenizer is only needed up front, for pre-tokenization
    tokenizer = AutoTokenizer.from_pretrained(settings['checkpoint'])
    data_dict = load_data(
        settings['file_path'], tokenizer, settings['nrows'], settings['label_column']
    )

    # Sanity-check what the loader handed back before spending GPU time on it
    expected_keys = ('data', 'input_ids', 'attention_mask')
    well_formed = isinstance(data_dict, dict) and all(key in data_dict for key in expected_keys)
    if not well_formed:
        print("Error: load_data must return a dictionary with 'data', 'input_ids', and 'attention_mask' keys")
        raise SystemExit(1)
    if not isinstance(data_dict['data'], pd.DataFrame):
        print("Error: data_dict['data'] must be a pandas DataFrame")
        raise SystemExit(1)

    print(f"Data loading took {time.time() - started_at:.2f} seconds")

    # Cross-validate the classifier
    trainer_options = {
        option: settings[option]
        for option in ('checkpoint', 'num_labels', 'epochs', 'batch_size', 'n_splits', 'save_path')
    }
    trainer = SentimentAnalysisModel(**trainer_options)
    trainer.kfold_cross_validation(data_dict)
    print(f"Total runtime: {time.time() - started_at:.2f} seconds")


if __name__ == "__main__":
    # Report why the run stopped, then let the original exception propagate.
    try:
        main()
    except SystemExit as exit_request:
        print(f"Script exited with code: {exit_request.code}")
        raise
    except Exception as failure:
        print(f"Unexpected error: {failure}")
        raise

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Unique labels in dataset: [1 2]
Mapping labels: 1 -> 0, 2 -> 1
Balanced data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    100000 non-null  object
 1   labels  100000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.5+ MB
None
Label distribution:
labels
0    50000
1    50000
Name: count, dtype: int64
Input IDs shape: torch.Size([100000, 128])
Attention mask shape: torch.Size([100000, 128])
Data loading took 70.69 seconds


2025-04-21 08:45:32.277057: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745225132.507292      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745225132.572308      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda (Single GPU)
Unique labels in dataset: tensor([0, 1])
Label value range: tensor(0) tensor(1)

Fold 1/5
[Epoch: 1] -> Batch: [100/417]
[Epoch: 1] -> Batch: [200/417]
