In [1]:
# COMPLETE KAGGLE SUBMISSION CODE - DISASTER TWEETS CLASSIFICATION (FIXED)
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding, EarlyStoppingCallback
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
import os
import gc
import warnings
import logging

# Suppress all warnings and logging
warnings.filterwarnings("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Suppress transformers warnings
logging.getLogger("transformers").setLevel(logging.ERROR)
logging.getLogger("datasets").setLevel(logging.ERROR)

# Disable progress bars to reduce output clutter
os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'
os.environ['DATASETS_VERBOSITY'] = 'error'

print("🚀 COMPLETE KAGGLE DISASTER TWEETS SOLUTION (FIXED)")
print(f"Device: {'GPU' if torch.cuda.is_available() else 'CPU'}")

# =============================================================================
# CONFIGURATION - OPTIMIZED FOR KAGGLE DISK SPACE
# =============================================================================
CONFIG = {
    # Data paths - EXACT Kaggle dataset paths
    'train_file': '/kaggle/input/nlp-getting-started/train.csv',
    'test_file': '/kaggle/input/nlp-getting-started/test.csv',
    'sample_submission': '/kaggle/input/nlp-getting-started/sample_submission.csv',
    
    # Model settings
    'model_name': 'roberta-base',
    'max_length': 128,
    'num_labels': 2,
    
    # Training settings - OPTIMIZED FOR DISK SPACE
    'train_batch_size': 16,
    'eval_batch_size': 32,
    'learning_rate': 2e-5,
    'num_epochs': 2,  # Reduced to save space
    'weight_decay': 0.01,
    'warmup_steps': 300,
    
    # Output settings
    'model_output_dir': '/kaggle/working/disaster_model',
    'submission_file': '/kaggle/working/submission.csv',
    'run_name': 'disaster_roberta_final'
}

# Set seeds for reproducibility
import random
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

# =============================================================================
# DISK SPACE MANAGEMENT
# =============================================================================
def clean_disk_space():
    """Clean up disk space"""
    print("🧹 Cleaning disk space...")
    
    # Force garbage collection
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    # Remove any existing model files
    if os.path.exists(CONFIG['model_output_dir']):
        import shutil
        shutil.rmtree(CONFIG['model_output_dir'])
    
    print("✅ Disk space cleaned")

def check_disk_space():
    """Check available disk space"""
    import shutil
    total, used, free = shutil.disk_usage("/kaggle/working")
    free_gb = free // (1024**3)
    print(f"💾 Available disk space: {free_gb} GB")
    return free_gb

# =============================================================================
# DATA LOADING AND PREPROCESSING
# =============================================================================
def load_and_preprocess_data():
    """Load and preprocess training data"""
    print("📂 Loading training data...")
    
    # Load data
    train_df = pd.read_csv(CONFIG['train_file'])
    test_df = pd.read_csv(CONFIG['test_file'])
    sample_sub = pd.read_csv(CONFIG['sample_submission'])
    
    print(f"Train shape: {train_df.shape}")
    print(f"Test shape: {test_df.shape}")
    print(f"Sample submission shape: {sample_sub.shape}")
    
    # Show data info
    print(f"Train columns: {train_df.columns.tolist()}")
    print(f"Test columns: {test_df.columns.tolist()}")
    print(f"Target distribution:\n{train_df['target'].value_counts()}")
    
    # Clean training data
    train_df = train_df[['text', 'target']].dropna()
    train_df['text'] = train_df['text'].fillna('').astype(str).str.strip()
    train_df = train_df[train_df['text'].str.len() > 0]
    
    # Clean test data
    test_df['text'] = test_df['text'].fillna('').astype(str).str.strip()
    
    print(f"After cleaning - Train: {train_df.shape[0]}, Test: {test_df.shape[0]}")
    
    return train_df, test_df, sample_sub

# =============================================================================
# DATASET CREATION
# =============================================================================
def create_datasets(train_df, tokenizer):
    """Create train and validation datasets"""
    print("🔀 Creating datasets...")
    
    # Split data
    train_data, val_data = train_test_split(
        train_df, 
        test_size=0.15, 
        stratify=train_df['target'], 
        random_state=42
    )
    
    # Rename target to labels for HuggingFace
    train_data = train_data.rename(columns={'target': 'labels'})
    val_data = val_data.rename(columns={'target': 'labels'})
    
    print(f"Train size: {len(train_data)}, Val size: {len(val_data)}")
    
    def tokenize_function(examples):
        return tokenizer(
            examples["text"], 
            truncation=True, 
            padding=False,
            max_length=CONFIG['max_length']
        )
    
    # Convert to HuggingFace datasets
    train_dataset = Dataset.from_pandas(train_data.reset_index(drop=True))
    val_dataset = Dataset.from_pandas(val_data.reset_index(drop=True))
    
    # Tokenize
    print("🔤 Tokenizing datasets...")
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)
    
    # Set format
    train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    
    return train_dataset, val_dataset

# =============================================================================
# MODEL TRAINING - OPTIMIZED FOR DISK SPACE
# =============================================================================
def compute_metrics(eval_pred):
    """Compute evaluation metrics using sklearn"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    
    return {
        "accuracy": accuracy,
        "f1": f1
    }

def train_model(train_dataset, val_dataset, tokenizer):
    """Train the RoBERTa model with disk space optimization"""
    print("🤖 Initializing model...")
    
    # Clean disk space before training
    clean_disk_space()
    check_disk_space()
    
    # Load model
    model = RobertaForSequenceClassification.from_pretrained(
        CONFIG['model_name'], 
        num_labels=CONFIG['num_labels'],
        problem_type="single_label_classification"
    )
    
    # Suppress the warning about newly initialized weights
    print("✅ Model loaded successfully")
    
    # Training arguments - OPTIMIZED FOR DISK SPACE
    training_args = TrainingArguments(
        output_dir=CONFIG['model_output_dir'],
        run_name=CONFIG['run_name'],
        num_train_epochs=CONFIG['num_epochs'],
        per_device_train_batch_size=CONFIG['train_batch_size'],
        per_device_eval_batch_size=CONFIG['eval_batch_size'],
        learning_rate=CONFIG['learning_rate'],
        weight_decay=CONFIG['weight_decay'],
        warmup_steps=CONFIG['warmup_steps'],
        
        # Evaluation - REDUCED FREQUENCY TO SAVE SPACE
        eval_strategy="epoch",  # Changed from steps to epoch
        logging_steps=100,
        
        # Model saving - MINIMAL SAVES TO PRESERVE DISK SPACE
        save_strategy="no",  # Don't save intermediate checkpoints
        save_total_limit=1,  # Keep only 1 checkpoint
        load_best_model_at_end=False,  # Disabled to save space
        
        # Performance
        dataloader_num_workers=0,
        remove_unused_columns=True,
        
        # Reproducibility
        seed=42,
        data_seed=42,
        
        # Disable logging to save space
        report_to=[],
        logging_first_step=False,
        
        # Additional space-saving options
        prediction_loss_only=True,
        dataloader_pin_memory=False,
    )
    
    # Data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    
    # Initialize trainer without early stopping to save space
    print("🏋️ Setting up trainer...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    
    # Train
    print("🚀 Starting training...")
    trainer.train()
    
    # Evaluate
    print("📊 Final evaluation...")
    eval_results = trainer.evaluate()
    print(f"Final results: {eval_results}")
    
    # Save only the final model (no optimizer state)
    print("💾 Saving final model...")
    model.save_pretrained(CONFIG['model_output_dir'])
    tokenizer.save_pretrained(CONFIG['model_output_dir'])
    
    # Clean up trainer to free memory
    del trainer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    return model, tokenizer

# =============================================================================
# PREDICTION AND SUBMISSION
# =============================================================================
def create_submission(model, tokenizer, test_df):
    """Create submission file"""
    print("🔮 Creating submission predictions...")
    
    # Prepare model for inference
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    predictions = []
    batch_size = 32
    
    print(f"Processing {len(test_df)} test samples...")
    
    # Process in batches
    for i in range(0, len(test_df), batch_size):
        batch_texts = test_df['text'].iloc[i:i+batch_size].tolist()
        
        # Tokenize batch
        inputs = tokenizer(
            batch_texts,
            truncation=True,
            padding=True,
            max_length=CONFIG['max_length'],
            return_tensors='pt'
        ).to(device)
        
        # Predict
        with torch.no_grad():
            outputs = model(**inputs)
            batch_predictions = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            predictions.extend(batch_predictions)
        
        # Clean up GPU memory
        del inputs, outputs
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        
        # Progress update
        if (i // batch_size + 1) % 10 == 0:
            processed = min(i + batch_size, len(test_df))
            print(f"   Processed {processed}/{len(test_df)} samples")
    
    predictions = np.array(predictions)
    
    # Create submission DataFrame
    submission_df = pd.DataFrame({
        'id': test_df['id'],
        'target': predictions
    })
    
    # Save submission file
    print(f"💾 Saving submission to {CONFIG['submission_file']}")
    submission_df.to_csv(CONFIG['submission_file'], index=False)
    
    # Verify file was created
    if os.path.exists(CONFIG['submission_file']):
        file_size = os.path.getsize(CONFIG['submission_file'])
        print(f"✅ Submission file created successfully!")
        print(f"   File: {CONFIG['submission_file']}")
        print(f"   Size: {file_size} bytes")
        print(f"   Rows: {len(submission_df)}")
    else:
        print("❌ Error: Submission file was not created!")
    
    # Show prediction summary
    print(f"\n📈 Prediction Summary:")
    print(f"   Total predictions: {len(predictions)}")
    print(f"   Disaster tweets (1): {np.sum(predictions == 1)} ({np.mean(predictions == 1)*100:.1f}%)")
    print(f"   Non-disaster tweets (0): {np.sum(predictions == 0)} ({np.mean(predictions == 0)*100:.1f}%)")
    
    # Show first 10 predictions
    print(f"\n📋 First 10 predictions:")
    print(submission_df.head(10))
    
    return submission_df

# =============================================================================
# VALIDATION
# =============================================================================
def validate_submission(submission_df, sample_sub):
    """Validate submission format"""
    print("\n🔍 Validating submission format...")
    
    # Check shape
    if len(submission_df) != len(sample_sub):
        print(f"❌ Wrong number of predictions: {len(submission_df)} vs {len(sample_sub)} expected")
        return False
    
    # Check columns
    expected_cols = ['id', 'target']
    if not all(col in submission_df.columns for col in expected_cols):
        print(f"❌ Wrong columns: {submission_df.columns.tolist()} vs {expected_cols}")
        return False
    
    # Check target values
    unique_targets = submission_df['target'].unique()
    if not all(target in [0, 1] for target in unique_targets):
        print(f"❌ Invalid target values: {unique_targets}")
        return False
    
    # Check for missing values
    if submission_df.isnull().any().any():
        print("❌ Found missing values")
        return False
    
    # Check IDs match
    if not submission_df['id'].equals(sample_sub['id']):
        print("❌ ID mismatch with sample submission")
        return False
    
    print("✅ Submission format is PERFECT!")
    return True

# =============================================================================
# MAIN EXECUTION - OPTIMIZED
# =============================================================================
def main():
    """Main execution function with disk space management"""
    print("=" * 80)
    print("🎯 KAGGLE DISASTER TWEETS - COMPLETE SOLUTION (OPTIMIZED)")
    print("=" * 80)
    
    # Initial disk space check
    check_disk_space()
    
    # Step 1: Load data
    train_df, test_df, sample_sub = load_and_preprocess_data()
    
    # Step 2: Initialize tokenizer
    print("🔤 Loading tokenizer...")
    tokenizer = RobertaTokenizer.from_pretrained(CONFIG['model_name'])
    
    # Suppress tokenizer warnings
    tokenizer.model_max_length = CONFIG['max_length']
    
    # Step 3: Create datasets
    train_dataset, val_dataset = create_datasets(train_df, tokenizer)
    
    # Step 4: Train model (optimized)
    model, tokenizer = train_model(train_dataset, val_dataset, tokenizer)
    
    # Clean up datasets to free memory
    del train_dataset, val_dataset
    gc.collect()
    
    # Step 5: Create submission
    submission_df = create_submission(model, tokenizer, test_df)
    
    # Step 6: Validate submission
    is_valid = validate_submission(submission_df, sample_sub)
    
    # Step 7: Final cleanup and status
    del model, tokenizer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    print("\n" + "=" * 80)
    if is_valid and os.path.exists(CONFIG['submission_file']):
        print("🎉 SUCCESS! SUBMISSION FILE IS READY!")
        print(f"📁 File location: {CONFIG['submission_file']}")
        print(f"📊 Total predictions: {len(submission_df)}")
        print("🚀 You can now submit this file to Kaggle!")
    else:
        print("❌ FAILED! Please check the errors above.")
    print("=" * 80)
    
    return submission_df

# =============================================================================
# RUN THE COMPLETE SOLUTION
# =============================================================================
if __name__ == "__main__":
    # Create output directory
    os.makedirs('/kaggle/working', exist_ok=True)
    
    # Run the complete solution
    final_submission = main()
    
    # Additional file verification
    print(f"\n📁 Final file check:")
    submission_path = CONFIG['submission_file']
    if os.path.exists(submission_path):
        print(f"✅ {submission_path} EXISTS!")
        print(f"   File size: {os.path.getsize(submission_path)} bytes")
        
        # Read and verify the file
        verify_df = pd.read_csv(submission_path)
        print(f"   Rows in file: {len(verify_df)}")
        print(f"   Columns: {verify_df.columns.tolist()}")
        print(f"   Sample content:")
        print(verify_df.head())
    else:
        print(f"❌ {submission_path} NOT FOUND!")
    
    print(f"\n🏁 COMPLETE! Check /kaggle/working/submission.csv for your submission file!")

2025-08-04 04:36:00.219626: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-08-04 04:36:00.219755: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-08-04 04:36:00.374921: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🚀 COMPLETE KAGGLE DISASTER TWEETS SOLUTION (FIXED)
Device: GPU
🎯 KAGGLE DISASTER TWEETS - COMPLETE SOLUTION (OPTIMIZED)
💾 Available disk space: 19 GB
📂 Loading training data...
Train shape: (7613, 5)
Test shape: (3263, 4)
Sample submission shape: (3263, 2)
Train columns: ['id', 'keyword', 'location', 'text', 'target']
Test columns: ['id', 'keyword', 'location', 'text']
Target distribution:
target
0    4342
1    3271
Name: count, dtype: int64
After cleaning - Train: 7613, Test: 3263
🔤 Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

🔀 Creating datasets...
Train size: 6471, Val size: 1142
🔤 Tokenizing datasets...


Map:   0%|          | 0/6471 [00:00<?, ? examples/s]

Map:   0%|          | 0/1142 [00:00<?, ? examples/s]

🤖 Initializing model...
🧹 Cleaning disk space...
✅ Disk space cleaned
💾 Available disk space: 19 GB


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

✅ Model loaded successfully
🏋️ Setting up trainer...
🚀 Starting training...
{'loss': 0.6788, 'grad_norm': 3.154034376144409, 'learning_rate': 6.666666666666667e-06, 'epoch': 0.24691358024691357}
{'loss': 0.5088, 'grad_norm': 14.518596649169922, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.49382716049382713}
{'loss': 0.4472, 'grad_norm': 10.06059455871582, 'learning_rate': 2e-05, 'epoch': 0.7407407407407407}
{'loss': 0.4231, 'grad_norm': 17.37221336364746, 'learning_rate': 1.607843137254902e-05, 'epoch': 0.9876543209876543}
{'eval_loss': 0.48333728313446045, 'eval_runtime': 2.0508, 'eval_samples_per_second': 556.855, 'eval_steps_per_second': 17.554, 'epoch': 1.0}
{'loss': 0.3813, 'grad_norm': 13.979466438293457, 'learning_rate': 1.215686274509804e-05, 'epoch': 1.2345679012345678}
{'loss': 0.3619, 'grad_norm': 17.51183319091797, 'learning_rate': 8.23529411764706e-06, 'epoch': 1.4814814814814814}
{'loss': 0.3729, 'grad_norm': 13.8571195602417, 'learning_rate': 4.313725490196079e-06