In [None]:
# Dataset: https://data.cincinnati-oh.gov/Thriving-Neighborhoods/Fleet-Preventative-Maintenance-Repair-Work-Orders/2a8x-bxjm/about_data

In [None]:
!pip install mlflow>=2.11.0
!pip install peft==0.8.2
!pip install bitsandbytes==0.42.0
!pip install datasets==2.17.1
!pip install --upgrade transformers>=4.44.0
!pip install --upgrade torch
!pip install --upgrade accelerate

# For data processing and utilities
!pip install pandas numpy
!pip install torchvision torchaudio
!pip install scikit-learn
!pip install wandb  # For experiment tracking
!pip install requests  # For data loading
!pip install psutil  # For monitoring system resources

!pip install --force-reinstall -v "triton==3.1.0"

Collecting triton==3.2.0 (from torch>=1.13.0->peft==0.8.2)
  Using cached triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Using cached triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (253.2 MB)
Installing collected packages: triton
  Attempting uninstall: triton
    Found existing installation: triton 3.1.0
    Uninstalling triton-3.1.0:
      Successfully uninstalled triton-3.1.0
Successfully installed triton-3.2.0
Collecting torch
  Using cached torch-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.6.77 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl

In [None]:
import os

# Set these BEFORE importing any other packages.
# "123" selects the CUDA 12.3 bitsandbytes binaries.
os.environ["BNB_CUDA_VERSION"] = "123"

# Add the CUDA library directory to LD_LIBRARY_PATH.
# Empty components are dropped (an unset variable previously produced a leading ":"), and the
# directory is only added once so re-running this cell does not keep growing the variable.
_CUDA_LIB_DIR = "/usr/local/cuda/lib64"
_ld_entries = [entry for entry in os.environ.get("LD_LIBRARY_PATH", "").split(":") if entry]
if _CUDA_LIB_DIR not in _ld_entries:
    _ld_entries.append(_CUDA_LIB_DIR)
os.environ["LD_LIBRARY_PATH"] = ":".join(_ld_entries)

# Now import your packages
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, TaskType

In [None]:
import torch
# Report whether PyTorch sees a CUDA device, how many there are, and the device name.
_cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {_cuda_ready}")
print(f"CUDA device count: {torch.cuda.device_count()}")
if _cuda_ready:
    print(f"CUDA device: {torch.cuda.get_device_name()}")

CUDA available: True
CUDA device count: 1
CUDA device: NVIDIA A100-SXM4-40GB


In [None]:
# Shell escape: print the NVIDIA driver / CUDA versions and current GPU utilisation and memory.
!nvidia-smi

Sun Jun  8 02:06:17 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   30C    P0             44W /  400W |       5MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
import os

# Hugging Face token. Read from the HF_TOKEN environment variable so the secret is never
# stored in the notebook; falls back to an empty string when the variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# Optional: run Weights & Biases offline by uncommenting one of the following.
# os.environ["WANDB_MODE"] = "offline" # 
# %env WANDB_MODE=offline

In [None]:
# Add this at the very beginning of your notebook
import torch
import gc

# Clear any existing GPU memory.
# Collect garbage first so tensors that are only kept alive by unreachable Python objects are
# released, then hand the allocator's unused cached blocks back to the driver.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Check available memory
print(f"GPU memory before loading: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

GPU memory before loading: 0.00 GB
GPU memory reserved: 0.00 GB


In [None]:
# CUDA allocator settings; these are meant to be in place BEFORE transformers is imported.
# NOTE(review): earlier cells in this notebook already import torch — confirm the settings
# still take effect at this point.
import os

os.environ.update(
    {
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True,max_split_size_mb:128",
        # Debug aid: disables CUDA memory caching.
        "PYTORCH_NO_CUDA_MEMORY_CACHING": "1",
    }
)

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer, DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
import requests
import json
import logging
from sklearn.model_selection import train_test_split
import wandb
from typing import Dict, List, Any, Optional
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries installed with
# --upgrade above — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Configure logging: INFO level on the root logger; `logger` is the module-level logger
# used by the classes and functions defined below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class CincinnatiFleetDataProcessor:
    """Load, explore and serialize the Cincinnati Fleet Maintenance work-order dataset.

    Typical use: ``load_data()`` -> ``explore_data()`` -> ``create_training_examples()``.
    """

    # Cincinnati open data API endpoint (JSON) for the work-order dataset.
    API_URL = "https://data.cincinnati-oh.gov/resource/2a8x-bxjm.json"
    # Upper bound on how long the download may block before falling back to sample data.
    REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self, dataset_url: str):
        """
        Args:
            dataset_url: URL of the dataset page. Stored for reference only; the download
                itself always uses ``API_URL``.
        """
        self.dataset_url = dataset_url
        self.data = None  # DataFrame of raw records, set by load_data()
        self.processed_data = None  # list of {"text": ...} dicts, set by create_training_examples()

    def load_data(self, limit: Optional[int] = None) -> pd.DataFrame:
        """Download the dataset, falling back to built-in sample rows on any failure.

        Args:
            limit: Optional row limit sent to the API as the ``$limit`` query parameter.
                ``None`` (default) sends no parameter, as before.

        Returns:
            The loaded DataFrame, which is also stored on ``self.data`` in both the success
            and the fallback case so later steps can run either way.
        """
        try:
            params = {"$limit": limit} if limit is not None else None
            response = requests.get(
                self.API_URL, params=params, timeout=self.REQUEST_TIMEOUT_SECONDS
            )
            response.raise_for_status()

            self.data = pd.DataFrame(response.json())
            logger.info(f"Successfully loaded {len(self.data)} total records")

            # Print actual columns for debugging
            logger.info(f"Actual columns: {list(self.data.columns)}")
        except Exception as e:
            logger.error(f"Failed to load data: {str(e)}")
            # Keep the pipeline usable offline: continue with a tiny synthetic table.
            self.data = self._create_sample_data()

        return self.data

    def _create_sample_data(self) -> pd.DataFrame:
        """Return a three-row synthetic work-order table used when the download fails.

        The columns and values mirror the sample prompts used by the inference cell of this
        notebook (work order number, equipment number, work type, priority, cost, department).
        """
        return pd.DataFrame(
            [
                {
                    "wo_no": "WO-1001",
                    "equipment_no": "EQ-201",
                    "work_type": "Preventive Maintenance",
                    "priority": "High",
                    "cost": 2500.0,
                    "department": "Police",
                },
                {
                    "wo_no": "WO-1002",
                    "equipment_no": "EQ-202",
                    "work_type": "Repair",
                    "priority": "Medium",
                    "cost": 1200.5,
                    "department": "Fire",
                },
                {
                    "wo_no": "WO-1003",
                    "equipment_no": "EQ-203",
                    "work_type": "Inspection",
                    "priority": "Low",
                    "cost": 300.0,
                    "department": "Public Works",
                },
            ]
        )

    def explore_data(self) -> Dict[str, Any]:
        """Summarize the loaded data.

        Returns:
            Dict with ``shape``, ``columns``, ``dtypes``, per-column ``missing_values`` counts
            and the first three rows as ``sample_data``.

        Raises:
            ValueError: if ``load_data()`` has not been called yet.
        """
        if self.data is None:
            raise ValueError("Data not loaded. Call load_data() first.")

        exploration = {
            "shape": self.data.shape,
            "columns": list(self.data.columns),
            "dtypes": self.data.dtypes.to_dict(),
            "missing_values": self.data.isnull().sum().to_dict(),
            "sample_data": self.data.head(3).to_dict('records')
        }

        logger.info(f"Dataset shape: {exploration['shape']}")
        logger.info(f"Columns: {exploration['columns']}")

        return exploration

    def advanced_serialization(self, row: pd.Series) -> str:
        """Turn one record into sentences of the form ``The <column> is <value>.``

        Column names are lower-cased with underscores replaced by spaces; floats are rounded
        to three decimals; missing scalar values (None/NaN/NaT) are skipped.

        Args:
            row: One record, e.g. a row yielded by ``DataFrame.iterrows()``.

        Returns:
            The sentences joined by ``". "`` with a trailing period, or an empty string when
            the row has no non-missing values.
        """
        serialized_parts = []
        for col, val in row.items():
            # Only scalars can be tested for missingness: pd.isna() on a list/array returns an
            # array whose truth value is ambiguous. Non-scalar cells are serialized via str().
            if pd.api.types.is_scalar(val) and pd.isna(val):
                continue
            if isinstance(val, float):
                val = round(val, 3)
            # str() guards against non-string column labels.
            label = str(col).lower().replace('_', ' ')
            serialized_parts.append(f"The {label} is {val}")

        if not serialized_parts:
            return ""
        return ". ".join(serialized_parts) + "."

    def create_training_examples(
        self, prefix: str = "Maintenance: ", max_examples: Optional[int] = None
    ) -> List[Dict[str, str]]:
        """Build the ``{"text": ...}`` examples consumed by ``FleetMaintenanceDataset``.

        Args:
            prefix: Text placed before each serialized record. The default matches the
                prompt format used by the inference cell of this notebook.
            max_examples: Use only the first ``max_examples`` rows; ``None`` uses all rows.

        Returns:
            One dict per row that has at least one non-missing value. The list is also
            stored on ``self.processed_data``.

        Raises:
            ValueError: if ``load_data()`` has not been called yet.
        """
        if self.data is None:
            raise ValueError("Data not loaded. Call load_data() first.")

        frame = self.data if max_examples is None else self.data.head(max_examples)
        examples = []
        for _, row in frame.iterrows():
            body = self.advanced_serialization(row)
            if body:
                examples.append({"text": f"{prefix}{body}"})

        self.processed_data = examples
        return examples

class FleetMaintenanceDataset(Dataset):
    """Torch dataset that tokenizes fleet-maintenance text examples on access.

    Each example is a dict with a ``"text"`` entry; items are returned as a dict holding the
    flattened ``input_ids`` and ``attention_mask`` tensors produced by the tokenizer.
    """

    def __init__(self, examples: List[Dict[str, str]], tokenizer, max_length: int = 256):
        self.examples, self.tokenizer, self.max_length = examples, tokenizer, max_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        text = self.examples[idx]["text"]

        # No padding here; examples are truncated to max_length and returned as torch tensors.
        tokenized = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding=False,
            return_tensors="pt",
        )

        # Flatten each tensor to 1-D so one item represents a single sequence.
        return {
            field: tokenized[field].flatten()
            for field in ("input_ids", "attention_mask")
        }

class AdvancedLLMFineTuner:
    """Fine-tune a causal language model in full float32 precision with the HF ``Trainer``.

    Typical use: ``setup_model_and_tokenizer()`` -> ``create_training_args()`` -> ``train()``.
    """

    def __init__(self, model_name: str = "microsoft/DialoGPT-medium", trust_remote_code: bool = False):
        """
        Args:
            model_name: Hub id or local path passed to ``from_pretrained``.
            trust_remote_code: Forwarded to ``AutoModelForCausalLM.from_pretrained``. Defaults
                to ``False`` so loading a checkpoint never executes repository-supplied code
                unless the caller opts in explicitly.
        """
        self.model_name = model_name
        self.trust_remote_code = trust_remote_code
        self.tokenizer = None  # set by setup_model_and_tokenizer()
        self.model = None  # set by setup_model_and_tokenizer()
        self.trainer = None  # set by train()

    def setup_model_and_tokenizer(self):
        """Load tokenizer and model (float32) and make sure a padding token is configured."""

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # Add padding token if not present (reuse EOS so no new embedding row is needed)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Load model in float32 to avoid FP16 gradient issues
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float32,
            device_map="auto" if torch.cuda.is_available() else None,
            trust_remote_code=self.trust_remote_code,
        )

        # Resize token embeddings if needed
        self.model.resize_token_embeddings(len(self.tokenizer))

        # Keep the model config in step with the tokenizer so the saved checkpoint carries
        # the padding token id as well.
        self.model.config.pad_token_id = self.tokenizer.pad_token_id

        logger.info(f"Model loaded: {self.model_name} (float32)")

    def create_training_args(
        self,
        output_dir: str = "./results",
        num_train_epochs: int = 200,
        per_device_batch_size: int = 4,
        gradient_accumulation_steps: int = 32,
        learning_rate: float = 3e-5,
    ) -> TrainingArguments:
        """Build ``TrainingArguments`` for float32 training without evaluation.

        Args:
            output_dir: Where checkpoints and the final model are written.
            num_train_epochs: Number of passes over the training set.
                NOTE(review): the default of 200 is high for a small dataset and will likely
                memorize it — confirm this is intended.
            per_device_batch_size: Train and eval batch size per device.
            gradient_accumulation_steps: Batches accumulated per optimizer step; with the
                defaults one optimizer step covers 4 * 32 = 128 examples per device.
            learning_rate: Peak learning rate of the linear schedule.

        Returns:
            The configured ``TrainingArguments``.
        """

        return TrainingArguments(
            output_dir=output_dir,

            # Training length and batch shape
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=per_device_batch_size,
            per_device_eval_batch_size=per_device_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,

            # Optimization settings
            learning_rate=learning_rate,
            weight_decay=0.01,
            warmup_ratio=0.1,
            lr_scheduler_type="linear",

            # Mixed precision disabled: the model is loaded and trained in float32
            fp16=False,
            bf16=False,
            dataloader_pin_memory=False,
            gradient_checkpointing=False,

            # Evaluation and saving: no eval set; keep only the latest per-epoch checkpoint
            eval_strategy="no",
            save_strategy="epoch",
            save_total_limit=1,
            load_best_model_at_end=False,

            # Logging: report to wandb only when a run is already active
            logging_steps=5,
            report_to="wandb" if wandb.run is not None else "none",
            run_name="fleet-maintenance-training",

            # Advanced settings
            remove_unused_columns=True,
            seed=42,

            dataloader_num_workers=0,  # Avoid multiprocessing issues
            prediction_loss_only=True,  # Simplify loss computation
        )

    def train(self, train_dataset: Dataset, training_args: TrainingArguments):
        """Run fine-tuning on ``train_dataset`` (no evaluation set) and save the final model.

        Args:
            train_dataset: Dataset yielding ``input_ids`` / ``attention_mask`` items.
            training_args: Arguments from ``create_training_args()``.

        Raises:
            RuntimeError: if ``setup_model_and_tokenizer()`` has not been called.
            ValueError: if ``train_dataset`` reports a length of zero.
            Exception: any error raised by ``Trainer.train()`` is logged and re-raised.
        """
        import inspect

        if self.model is None or self.tokenizer is None:
            raise RuntimeError("Model not initialized. Call setup_model_and_tokenizer() first.")
        if hasattr(train_dataset, "__len__") and len(train_dataset) == 0:
            raise ValueError("train_dataset is empty; nothing to fine-tune on.")

        # Causal-LM collator (mlm=False) that pads each batch to its longest sequence
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
            pad_to_multiple_of=None  # Avoid alignment issues
        )

        # Newer transformers releases take the tokenizer as `processing_class` and deprecate
        # the `tokenizer` keyword; pick whichever this installed version accepts.
        trainer_params = inspect.signature(Trainer.__init__).parameters
        tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

        # Initialize trainer with minimal configuration
        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            data_collator=data_collator,
            **{tokenizer_kwarg: self.tokenizer},
        )

        # Clear any existing gradients before training starts
        self.model.zero_grad()

        # Start training
        logger.info("Starting fine-tuning...")
        try:
            self.trainer.train()
            logger.info("Fine-tuning completed successfully!")
        except Exception as e:
            logger.error(f"Training failed: {e}")
            raise

        # Save the final model
        self.trainer.save_model()
        logger.info("Model saved!")

def main():
    """Run the training pipeline: load data, build examples, fine-tune, log to wandb.

    Requires ``CincinnatiFleetDataProcessor.create_training_examples()`` to be defined.

    Returns:
        Tuple ``(fine_tuner, eval_results)``: the ``AdvancedLLMFineTuner`` holding the trained
        model, and a small status dict that is also logged to wandb.

    Raises:
        ValueError: if no training examples could be built.
        Exception: anything raised by a pipeline stage; the wandb run is closed either way.
    """

    # Initialize wandb
    wandb.init(project="cincinnati-fleet-maintenance", name="fleet-maintenance-fp32")

    try:
        # Stage 1: Dataset Preparation
        logger.info("Stage 1: Dataset Preparation")
        processor = CincinnatiFleetDataProcessor(
            "https://data.cincinnati-oh.gov/Thriving-Neighborhoods/Fleet-Preventative-Maintenance-Repair-Work-Orders/2a8x-bxjm"
        )

        # Load and explore data (both are called for their logging side effects)
        processor.load_data()
        processor.explore_data()

        # Create training examples; all of them are used for training (no eval split)
        train_examples = processor.create_training_examples()
        if not train_examples:
            raise ValueError("No training examples were produced from the dataset.")

        logger.info(f"Training examples: {len(train_examples)}")

        # Stage 2: Model Initialization
        logger.info("Stage 2: Model Initialization")
        fine_tuner = AdvancedLLMFineTuner(
            model_name="microsoft/DialoGPT-medium"
        )
        fine_tuner.setup_model_and_tokenizer()

        # Stage 3: Training Environment Setup
        logger.info("Stage 3: Training Environment Setup")
        train_dataset = FleetMaintenanceDataset(train_examples, fine_tuner.tokenizer)

        training_args = fine_tuner.create_training_args("./cincinnati_fleet_model")

        # Stage 4: Fine-Tuning
        logger.info("Stage 4: Fine-Tuning")
        fine_tuner.train(train_dataset, training_args)

        # Stage 5: Completion
        logger.info("Stage 5: Training completed successfully")
        eval_results = {"status": "completed", "model": "DialoGPT-medium", "precision": "fp32"}

        wandb.log(eval_results)

        return fine_tuner, eval_results
    finally:
        # Always close the wandb run, including when a stage above raised.
        wandb.finish()

if __name__ == "__main__":
    # Execute the complete pipeline
    model, results = main()
    print("Fine-tuning completed successfully!")
    print(f"Final results: {results}")

Step,Training Loss
5,6.2852
10,5.1996
15,3.94
20,2.8782
25,1.9671
30,1.2169
35,0.7959
40,0.5617
45,0.4438
50,0.3701


0,1
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/grad_norm,█▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▂▄▆████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,█▇▅▄▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
model,DialoGPT-medium
precision,fp32
status,completed
total_flos,2786102083584000.0
train/epoch,200
train/global_step,200
train/grad_norm,0.23157
train/learning_rate,0.0
train/loss,0.1379
train_loss,0.71982


Fine-tuning completed successfully!
Final results: {'status': 'completed', 'model': 'DialoGPT-medium', 'precision': 'fp32'}


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import numpy as np
import os

# CUDA debugging aid for clearer error traces, plus transformers logging limited to errors.
os.environ.update(CUDA_LAUNCH_BLOCKING='1', TRANSFORMERS_VERBOSITY='error')

class FixedFleetMaintenanceInference:
    """Load the fine-tuned fleet-maintenance model and run smoke tests and a small benchmark.

    If the fine-tuned checkpoint cannot be loaded, the base model is loaded instead and
    ``using_fallback`` is set to ``True`` so results are not mistaken for the fine-tuned model.
    """

    # Base checkpoint used when the fine-tuned model cannot be loaded.
    FALLBACK_MODEL_NAME = "microsoft/DialoGPT-medium"
    # Position limit assumed when the model config does not expose one.
    DEFAULT_MAX_POSITIONS = 1024

    def __init__(self, model_path="./cincinnati_fleet_model"):
        """
        Args:
            model_path: Directory (or hub id) of the fine-tuned model and tokenizer.
        """
        self.model_path = model_path
        self.tokenizer = None
        self.model = None
        self.device = "cpu"  # Start with CPU for safety
        self.using_fallback = False
        self.load_model()

    def load_model(self):
        """Load tokenizer and model from ``model_path``; fall back to the base model on error."""
        try:
            print("🔄 Loading fine-tuned model...")
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
            self._ensure_pad_token()

            # Load on CPU first, then move to GPU if available
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_path,
                torch_dtype=torch.float32,
                device_map=None  # Load on CPU first
            )

            self._place_model_on_device()
            print(f"✅ Model loaded on {self.device}")

            self.model.eval()
            self.using_fallback = False

        except Exception as e:
            print(f"❌ Error loading fine-tuned model: {e}")
            print("🔄 Falling back to base model...")
            self._load_fallback_model()

    def _load_fallback_model(self):
        """Load the base model as fallback; re-raises if that fails too."""
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(self.FALLBACK_MODEL_NAME)
            self.model = AutoModelForCausalLM.from_pretrained(self.FALLBACK_MODEL_NAME)

            self._ensure_pad_token()
            self._place_model_on_device()
            self.model.eval()
            self.using_fallback = True

            print(f"✅ Fallback model loaded on {self.device}")

        except Exception as e:
            print(f"❌ Failed to load fallback model: {e}")
            raise

    def _ensure_pad_token(self):
        """Reuse the EOS token for padding when the tokenizer defines no pad token."""
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

    def _place_model_on_device(self):
        """Move ``self.model`` to CUDA when available, reverting to CPU on failure.

        Always leaves ``self.model`` and ``self.device`` consistent with each other.
        """
        if not torch.cuda.is_available():
            self.device = "cpu"
            return
        try:
            self.model = self.model.to("cuda")
            self.device = "cuda"
        except Exception as cuda_error:
            print(f"⚠️ CUDA loading failed: {cuda_error}")
            print("🔄 Using CPU instead...")
            self.model = self.model.to("cpu")
            self.device = "cpu"

    def _max_positions(self):
        """Return the model's position limit from its config, or ``DEFAULT_MAX_POSITIONS``."""
        config = getattr(self.model, "config", None)
        value = getattr(config, "max_position_embeddings", None)
        if isinstance(value, int) and value > 0:
            return value
        return self.DEFAULT_MAX_POSITIONS

    def generate_response(self, input_text, max_new_tokens=1000, temperature=0.7):
        """Generate a continuation for ``input_text`` and return only the newly generated text.

        Args:
            input_text: Prompt string.
            max_new_tokens: Requested number of new tokens. It is capped so that prompt plus
                continuation never exceeds the model's position limit.
            temperature: Sampling temperature; ``None`` or a value <= 0 selects greedy decoding.

        Returns:
            The decoded continuation, ``"No response generated."`` when it is empty, or an
            ``"Error generating response: ..."`` string when generation fails. Never raises.
        """

        try:
            max_positions = self._max_positions()

            # Truncate the prompt so at least one position remains for generation.
            inputs = self.tokenizer(
                input_text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max(1, max_positions - 1),
                add_special_tokens=True
            )

            input_ids = inputs["input_ids"]
            attention_mask = inputs["attention_mask"]

            # Out-of-range token IDs would index past the embedding table; clamp them.
            vocab_size = self.tokenizer.vocab_size
            if torch.any(input_ids >= vocab_size):
                print("⚠️ Invalid token IDs detected, cleaning...")
                input_ids = torch.clamp(input_ids, 0, vocab_size - 1)

            if torch.any(input_ids < 0):
                print("⚠️ Negative token IDs detected, cleaning...")
                input_ids = torch.clamp(input_ids, 0, vocab_size - 1)

            # Move tensors to the model's device, dropping to CPU if that fails.
            try:
                input_ids = input_ids.to(self.device)
                attention_mask = attention_mask.to(self.device)
            except RuntimeError as cuda_error:
                print(f"⚠️ CUDA tensor error: {cuda_error}")
                print("🔄 Falling back to CPU...")
                self.device = "cpu"
                self.model = self.model.to("cpu")
                input_ids = input_ids.to("cpu")
                attention_mask = attention_mask.to("cpu")

            # Cap the generation length to the positions left after the prompt.
            prompt_len = input_ids.shape[-1]
            token_budget = max(1, min(int(max_new_tokens), max_positions - prompt_len))

            sampling = temperature is not None and temperature > 0
            gen_kwargs = dict(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=token_budget,
                do_sample=sampling,
                repetition_penalty=1.2,
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
                num_return_sequences=1,
            )
            if sampling:
                # Sampling-only flags are passed only when sampling is enabled.
                gen_kwargs.update(temperature=temperature, top_p=0.9, top_k=50)

            with torch.no_grad():
                try:
                    outputs = self.model.generate(**gen_kwargs)
                except RuntimeError as gen_error:
                    print(f"⚠️ Generation error: {gen_error}")
                    # Fallback to short greedy decoding
                    outputs = self.model.generate(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        max_new_tokens=min(50, token_budget),
                        do_sample=False,
                        pad_token_id=self.tokenizer.eos_token_id
                    )

            # The output sequence starts with the prompt tokens; decode only what follows them.
            generated_ids = outputs[0][prompt_len:]
            generated_part = self.tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
            return generated_part if generated_part else "No response generated."

        except Exception as e:
            print(f"❌ Error in generate_response: {e}")
            return f"Error generating response: {str(e)}"

    def _run_cases(self, cases):
        """Print prompt and generated text for each ``(title, prompt)`` pair in ``cases``."""
        for i, (title, prompt) in enumerate(cases, 1):
            print(f"\n{i}. {title}")
            print(f"Input: {prompt}")

            try:
                response = self.generate_response(prompt)
                print(f"Generated: {response}")
            except Exception as e:
                print(f"❌ Error: {e}")

            print("-" * 40)

    def test_custom_inputs(self):
        """Test model on custom fleet maintenance scenarios"""

        print("\n🔧 Testing Custom Fleet Maintenance Inputs:")
        print("=" * 60)

        custom_tests = [
            {
                "name": "Police Vehicle Maintenance",
                "input": "Maintenance: The wo no is PD-2024-001. The equipment no is POLICE-CAR-15. The work type is Preventive Maintenance. The priority is High. The cost is 1500.75. The department is Police."
            },
            {
                "name": "Fire Truck Emergency Repair",
                "input": "Maintenance: The wo no is FD-2024-002. The equipment no is FIRE-TRUCK-3. The work type is Repair. The priority is Critical. The cost is 8500.00. The department is Fire."
            },
            {
                "name": "Public Works Equipment",
                "input": "Maintenance: The wo no is PW-2024-003. The equipment no is SNOW-PLOW-7. The work type is Inspection. The priority is Medium. The cost is 250.00. The department is Public Works."
            }
        ]

        self._run_cases([(test["name"], test["input"]) for test in custom_tests])

    def test_dataset_examples(self):
        """Test model on fixed example prompts written in the dataset-style format.

        NOTE(review): these prompts are hard-coded strings, not rows read from the dataset.
        """

        print("\n📊 Testing Dataset Examples:")
        print("=" * 60)

        dataset_examples = [
            "Maintenance: The wo no is WO-1001. The equipment no is EQ-201. The work type is Preventive Maintenance. The priority is High. The cost is 2500.0. The department is Police.",
            "Maintenance: The wo no is WO-1002. The equipment no is EQ-202. The work type is Repair. The priority is Medium. The cost is 1200.5. The department is Fire.",
            "Maintenance: The wo no is WO-1003. The equipment no is EQ-203. The work type is Inspection. The priority is Low. The cost is 300.0. The department is Public Works."
        ]

        self._run_cases(
            [(f"Dataset Example {i}", example) for i, example in enumerate(dataset_examples, 1)]
        )

    def benchmark_performance(self):
        """Time three 50-token generations on a fixed prompt and print summary statistics.

        ``generate_response`` reports failures through its return value rather than raising,
        so a failed generation is still timed here.
        """

        print("\n⚡ Performance Benchmark:")
        print("=" * 60)

        test_input = "Maintenance: The wo no is BENCH-001. The equipment no is TEST-VEHICLE. The work type is Preventive Maintenance. The priority is High."

        import time

        # Warm-up
        try:
            _ = self.generate_response(test_input, max_new_tokens=50)
        except Exception:
            print("⚠️ Warm-up failed, continuing with benchmark...")

        # Benchmark
        times = []
        responses = []

        for i in range(3):  # Reduced iterations for safety
            try:
                # perf_counter is a monotonic, high-resolution clock suited to interval timing
                start_time = time.perf_counter()
                response = self.generate_response(test_input, max_new_tokens=50)
                end_time = time.perf_counter()
                times.append(end_time - start_time)
                responses.append(response)
            except Exception as e:
                print(f"❌ Benchmark iteration {i+1} failed: {e}")

        if times:
            avg_time = np.mean(times)
            std_time = np.std(times)

            print(f"Average Response Time: {avg_time:.3f}s ± {std_time:.3f}s")
            if avg_time > 0:  # avoid dividing by zero on a degenerate timing
                print(f"Responses per minute: ~{60/avg_time:.1f}")
            print(f"Device used: {self.device}")
            if responses:
                print(f"Sample response: {responses[0]}")
        else:
            print("❌ All benchmark iterations failed")

    def check_system_status(self):
        """Print PyTorch/CUDA versions, GPU memory usage and which model/device is in use."""
        print("\n🔍 System Status Check:")
        print("=" * 50)

        print(f"PyTorch Version: {torch.__version__}")
        print(f"CUDA Available: {torch.cuda.is_available()}")

        if torch.cuda.is_available():
            print(f"CUDA Version: {torch.version.cuda}")
            print(f"GPU Count: {torch.cuda.device_count()}")
            print(f"Current GPU: {torch.cuda.current_device()}")
            print(f"GPU Name: {torch.cuda.get_device_name()}")

            # Check GPU memory
            try:
                memory_allocated = torch.cuda.memory_allocated() / 1024**3
                memory_reserved = torch.cuda.memory_reserved() / 1024**3
                memory_total = torch.cuda.get_device_properties(0).total_memory / 1024**3

                print(f"GPU Memory - Allocated: {memory_allocated:.2f}GB")
                print(f"GPU Memory - Reserved: {memory_reserved:.2f}GB")
                print(f"GPU Memory - Total: {memory_total:.2f}GB")
            except Exception:
                print("⚠️ Could not get GPU memory info")

        print(f"Model Device: {self.device}")
        print(f"Model Path: {self.model_path}")
        print(f"Using fallback model: {getattr(self, 'using_fallback', False)}")

# Inference entry point: runs every check and reports any failure instead of raising.
# NOTE(review): this reuses the name `main` that the training cell also defines.
def main():
    """Build the inference helper, run its status check, tests and benchmark in order.

    Any exception is caught, reported with a traceback, and not re-raised.
    """
    import traceback

    print("🚀 Testing Fine-Tuned Fleet Maintenance Model")
    print("=" * 60)

    try:
        runner = FixedFleetMaintenanceInference()

        # System status first, then the two prompt suites, then the timing benchmark.
        for stage_name in (
            "check_system_status",
            "test_custom_inputs",
            "test_dataset_examples",
            "benchmark_performance",
        ):
            getattr(runner, stage_name)()

        print("\n✅ Testing completed successfully!")

    except Exception as err:
        print(f"❌ Fatal error in main: {err}")
        traceback.print_exc()

if __name__ == "__main__":
    main()

🚀 Testing Fine-Tuned Fleet Maintenance Model (FIXED)
🔄 Loading fine-tuned model...


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


⚠️ CUDA loading failed: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

🔄 Using CPU instead...

🔍 System Status Check:
PyTorch Version: 2.6.0+cu124
CUDA Available: True
CUDA Version: 12.4
GPU Count: 1
Current GPU: 0
GPU Name: NVIDIA A100-SXM4-40GB
GPU Memory - Allocated: 1.54GB
GPU Memory - Reserved: 1.61GB
GPU Memory - Total: 39.56GB
Model Device: cpu
Model Path: ./cincinnati_fleet_model

🔧 Testing Custom Fleet Maintenance Inputs:

1. Police Vehicle Maintenance
Input: Maintenance: The wo no is PD-2024-001. The equipment no is POLICE-CAR-15. The work type is Preventive Maintenance. The priority is High. The cost is 1500.75. The department is Police.


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated: The job type is PM. The eq equip no is 2023. The work order status is OPEN. The meter 1 reading is 97671. The datetime out service is 2023-----08-07T00:00:00.000Z. The datetime open is 2023----08-07T00:00:00.000Z. The datetime first labor is 2023-----08-07T00:00:
----------------------------------------

2. Fire Truck Emergency Repair
Input: Maintenance: The wo no is FD-2024-002. The equipment no is FIRE-TRUCK-3. The work type is Repair. The priority is Critical. The cost is 8500.00. The department is Fire.


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated: The job type is REPAIR. The eq equip no is 60954. The work order status is OPEN. The meter 1 reading is 97671. The datetime out service is 2023-08-002. The datetime open is 2023-08-003. The datetime first labor is 2023-08-003. The datetime unit in is 2023-08-003. The datetime due is 2023-08-004. The qty est hours
----------------------------------------

3. Public Works Equipment
Input: Maintenance: The wo no is PW-2024-003. The equipment no is SNOW-PLOW-7. The work type is Inspection. The priority is Medium. The cost is 250.00. The department is Public Works.


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated: The job type is REPAIR. The eq equip no is 30842. The work order status is WORK FINISHED. The meter 1 reading is 59384. The datetime out service is 2023-08-003. The datetime open is 2023-08-003. The datetime first labor is 2023-08-003. The datetime unit in is 2023-08-003. The datetime due is 2023-08-04T00:00
----------------------------------------

📊 Testing Dataset Examples:

1. Dataset Example 1
Input: Maintenance: The wo no is WO-1001. The equipment no is EQ-201. The work type is Preventive Maintenance. The priority is High. The cost is 2500.0. The department is Police.


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated: The job type is REPAIR. The eq equip no is 30842. The work order status is WORK FINISHED. The meter 1 reading is 171561. The datetime out service is 2023-08-07T00:00:00.000Z. The datetime open is 2023-08-07T00:00:00.000Z. The datetime first labor unit in is 2023-08-07T00:00:00.000
----------------------------------------

2. Dataset Example 2
Input: Maintenance: The wo no is WO-1002. The equipment no is EQ-202. The work type is Repair. The priority is Medium. The cost is 1200.5. The department is Fire.


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated: The job type is REPAIR. The eq equip no is 30842. The work order status is WORK FINISHED. The meter 1 reading is 171561. The datetime out service is 2023-08-07T00:00:00.000Z. The datetime open is 2023-08-07T00:00:00.000Z. The datetime first labor is 0. The datetime unit in is 2023-08-07T00
----------------------------------------

3. Dataset Example 3
Input: Maintenance: The wo no is WO-1003. The equipment no is EQ-203. The work type is Inspection. The priority is Low. The cost is 300.0. The department is Public Works.


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated: The job type is REPAIR. The eq equip no is 62590. The work order status is OPEN. The meter 1 reading is 171561. The datetime out service is 2023-08-07T00:00:00.000Z. The datetime open is 2023-08-07T00:00:00.000Z. The datetime unit in is 2023-08-07T00:00:00.000Z. The datetime
----------------------------------------

⚡ Performance Benchmark:


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Average Response Time: 2.468s ± 0.006s
Responses per minute: ~24.3
Device used: cpu
Sample response: The job type is REPAIR. The eq equip no is 37464. The work order status is CLOSED. The meter 1 reading is 171561. The datetime out service is 2023-08-07T00:00:

✅ Testing completed successfully!
