# In [None]:  (Jupyter/Colab cell prompt left over from the notebook export)
# ============================================================================
# SKIN LESION DESCRIPTION GENERATOR - GOOGLE COLAB
# Generates domain-specific medical descriptions for ISIC dataset
# ============================================================================

# ===== CELL 1: INSTALLATION =====
"""
Run this command first, in its own Colab cell, to install the dependencies:
!pip install -q kagglehub transformers torch torchvision pillow pandas scikit-learn tqdm accelerate
"""

# ===== CELL 2: IMPORTS =====

import os
import kagglehub
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from pathlib import Path
import json
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# ===== CELL 3: DOWNLOAD & SETUP DATASET =====

def setup_isic_dataset():
    """
    Fetch the ISIC skin cancer dataset from Kaggle and report its layout.

    Downloads the dataset through ``kagglehub``, points at its ``Train``
    folder, and prints one line per class directory with the number of
    image files (.jpg/.jpeg/.png, case-insensitive) found directly in it.

    Returns:
        tuple: ``(DATA_ROOT, classes)`` - the path of the ``Train`` folder
        and the sorted list of class directory names.
    """
    print("📥 Downloading ISIC Skin Cancer Dataset...")
    download_dir = kagglehub.dataset_download('nodoubttome/skin-cancer9-classesisic')

    DATA_ROOT = os.path.join(
        download_dir,
        "Skin cancer ISIC The International Skin Imaging Collaboration",
        "Train",
    )
    print(f"✅ Dataset downloaded to: {DATA_ROOT}")

    # Every sub-directory of the Train folder is treated as one class.
    classes = sorted(
        entry for entry in os.listdir(DATA_ROOT)
        if os.path.isdir(os.path.join(DATA_ROOT, entry))
    )

    image_suffixes = ('.jpg', '.jpeg', '.png')
    print(f"\n📊 Found {len(classes)} classes:")
    for position, class_name in enumerate(classes, start=1):
        n_images = sum(
            1 for file_name in os.listdir(os.path.join(DATA_ROOT, class_name))
            if file_name.lower().endswith(image_suffixes)
        )
        print(f"  {position}. {class_name}: {n_images} images")

    return DATA_ROOT, classes


def create_dataset_index(DATA_ROOT, classes):
    """
    Build an index of every image and split it into train/val/test sets.

    Scans ``DATA_ROOT/<class>`` for each class name, records one row per
    image file (.jpg/.jpeg/.png, case-insensitive), performs a stratified
    70/15/15 split with a fixed seed, and writes the three splits to
    ``train_index.csv``, ``val_index.csv`` and ``test_index.csv`` in the
    current working directory.

    Args:
        DATA_ROOT: Directory that contains one sub-directory per class.
        classes: Iterable of class (sub-directory) names to scan.

    Returns:
        tuple: ``(train_df, val_df, test_df)`` DataFrames with the columns
        ``image_name``, ``relative_path``, ``full_path`` and ``label``.

    Raises:
        ValueError: If no image files were found under ``DATA_ROOT``.
    """
    print("\n📝 Creating dataset index...")

    image_suffixes = ('.jpg', '.jpeg', '.png')
    data = []
    for cls in tqdm(classes, desc="Scanning directories"):
        class_dir = os.path.join(DATA_ROOT, cls)
        # os.listdir returns entries in arbitrary order. Sorting fixes the
        # row order, so the seeded split below yields the same partition on
        # every machine/filesystem.
        images = sorted(
            f for f in os.listdir(class_dir)
            if f.lower().endswith(image_suffixes)
        )
        data.extend(
            {
                'image_name': img,
                'relative_path': os.path.join(cls, img),
                'full_path': os.path.join(class_dir, img),
                'label': cls,
            }
            for img in images
        )

    df = pd.DataFrame(data)
    if df.empty:
        # Fail with an explicit message instead of a KeyError on df['label'].
        raise ValueError(f"No image files found under {DATA_ROOT!r}")
    print(f"✅ Total images: {len(df)}")

    # Create train/val/test splits: 70% train, then the remaining 30% is
    # halved into validation and test. Both steps are stratified by label.
    train_df, temp_df = train_test_split(
        df, test_size=0.3, stratify=df['label'], random_state=42
    )
    val_df, test_df = train_test_split(
        temp_df, test_size=0.5, stratify=temp_df['label'], random_state=42
    )

    print(f"\n📊 Dataset splits:")
    print(f"  Training:   {len(train_df)} images")
    print(f"  Validation: {len(val_df)} images")
    print(f"  Test:       {len(test_df)} images")

    # Save to CSV
    train_df.to_csv('train_index.csv', index=False)
    val_df.to_csv('val_index.csv', index=False)
    test_df.to_csv('test_index.csv', index=False)

    return train_df, val_df, test_df


# Run setup: download the dataset, then index it and build the splits.
# Both calls execute as soon as this cell/module runs. The resulting
# module-level names (DATA_ROOT, classes, train_df, val_df, test_df) are
# used by the cells below.
DATA_ROOT, classes = setup_isic_dataset()
train_df, val_df, test_df = create_dataset_index(DATA_ROOT, classes)

# ===== CELL 4: DOMAIN-SPECIFIC DESCRIPTION TEMPLATES =====

class DermatologyDescriptionGenerator:
    """
    Generates medical descriptions based on dermatological knowledge
    Uses ABCDE criteria and clinical features

    Descriptions are template-based: every image of a given class receives
    the same text, looked up by the lower-cased class label.
    """

    def __init__(self):
        """Populate ``self.templates``, keyed by lower-case class label."""
        # Comprehensive medical templates for each condition.
        # Each entry holds: 'description' (free text), 'abcde' (one sentence
        # per ABCDE criterion), 'features' (list of clinical features) and
        # 'risk' (risk statement).
        self.templates = {
            'actinic keratosis': {
                'description': 'A rough, scaly patch on chronically sun-exposed skin. The lesion displays a sandpaper-like texture with adherent scale. Color varies from erythematous pink to tan-brown. The surface is keratotic with possible surrounding solar elastosis.',
                'abcde': {
                    'asymmetry': 'Generally symmetric in configuration',
                    'border': 'Ill-defined borders blending with surrounding sun-damaged skin',
                    'color': 'Pink to red, sometimes brown, with overlying white or yellow scale',
                    'diameter': 'Typically small, ranging from 2-6mm, may coalesce into larger patches',
                    'evolution': 'Slowly progressive, may develop multiple lesions over time'
                },
                'features': [
                    'rough, dry, scaly surface texture',
                    'erythematous base with adherent keratotic scale',
                    'sandpaper-like feel on palpation',
                    'located on sun-exposed areas (face, scalp, forearms)',
                    'surrounding skin shows signs of chronic sun damage'
                ],
                'risk': 'Precancerous lesion with 5-10% risk of progression to SCC'
            },

            'basal cell carcinoma': {
                'description': 'A translucent, pearly nodule with visible telangiectasias. The lesion exhibits a rolled border with central depression. Surface appears shiny with a waxy quality. May show ulceration or crusting in advanced cases.',
                'abcde': {
                    'asymmetry': 'May show asymmetric growth pattern',
                    'border': 'Rolled, elevated, pearly borders with distinct margin',
                    'color': 'Translucent, pearly white, pink, or flesh-colored with telangiectasias',
                    'diameter': 'Variable size, often 5-10mm, slow-growing',
                    'evolution': 'Slowly progressive over months to years, rarely metastasizes'
                },
                'features': [
                    'pearly, translucent appearance',
                    'prominent telangiectatic vessels visible on surface',
                    'rolled or raised borders',
                    'central depression or ulceration (rodent ulcer)',
                    'firm consistency on palpation'
                ],
                'risk': 'Malignant but rarely metastasizes, locally destructive if untreated'
            },

            'dermatofibroma': {
                'description': 'A firm, dome-shaped dermal nodule with brown pigmentation. The lesion is well-circumscribed and indurated. Exhibits positive dimple sign when lateral pressure applied. Surface may show slight scale or be smooth.',
                'abcde': {
                    'asymmetry': 'Symmetric, round to oval configuration',
                    'border': 'Well-defined, distinct borders',
                    'color': 'Brown, tan, or reddish-brown, uniform pigmentation',
                    'diameter': 'Small, typically 5-10mm, stable in size',
                    'evolution': 'Stable, non-progressive, may persist indefinitely'
                },
                'features': [
                    'firm, hard consistency (fibrotic)',
                    'dome-shaped or slightly elevated',
                    'positive dimple sign (pathognomonic)',
                    'brown to reddish-brown coloration',
                    'fixed to dermis but mobile over underlying tissue'
                ],
                'risk': 'Benign fibrous histiocytoma, no malignant potential'
            },

            'melanoma': {
                'description': 'A pigmented lesion exhibiting multiple concerning features. Marked asymmetry with irregular, notched borders. Variegated coloration with shades of black, brown, red, white, and blue. Surface may be flat or elevated with irregular topography.',
                'abcde': {
                    'asymmetry': 'Pronounced asymmetry - one half distinctly different from the other',
                    'border': 'Irregular, notched, scalloped, or poorly defined borders',
                    'color': 'Multiple colors: black, brown, tan, red, white, blue, or depigmented areas',
                    'diameter': 'Often greater than 6mm (pencil eraser), but can be smaller',
                    'evolution': 'Changing size, shape, color, or symptoms (itching, bleeding)'
                },
                'features': [
                    'markedly asymmetric configuration',
                    'irregular, notched borders with geographic appearance',
                    'striking color variation within single lesion',
                    'may show regression (white areas) or inflammation (red areas)',
                    'surface may be ulcerated or bleeding in advanced cases'
                ],
                'risk': 'High-grade malignancy with significant metastatic potential - urgent evaluation required'
            },

            'nevus': {
                'description': 'A symmetric, uniformly pigmented lesion with regular borders. The nevus displays homogeneous color throughout, typically brown. Surface is smooth or slightly elevated. Overall appearance is bland and benign.',
                'abcde': {
                    'asymmetry': 'Symmetric - mirror image on both halves',
                    'border': 'Regular, smooth, well-circumscribed borders',
                    'color': 'Uniform color - single shade of tan, brown, or black',
                    'diameter': 'Stable size, typically less than 6mm',
                    'evolution': 'Stable over time, minimal to no changes'
                },
                'features': [
                    'symmetric round or oval shape',
                    'regular, well-defined borders',
                    'uniform tan to brown pigmentation',
                    'smooth surface, may be flat or slightly raised',
                    'stable appearance over time'
                ],
                'risk': 'Benign melanocytic nevus - routine monitoring sufficient'
            },

            'pigmented benign keratosis': {
                'description': 'A warty, stuck-on appearing lesion with well-demarcated borders. The keratosis has a cerebriform or verrucous surface. Pigmentation is uniform tan to dark brown. May contain keratin-filled cysts (horn cysts) visible on close inspection.',
                'abcde': {
                    'asymmetry': 'Generally symmetric with regular contours',
                    'border': 'Sharp, well-demarcated, appear "stuck-on" to skin',
                    'color': 'Tan, brown, or black - uniform pigmentation',
                    'diameter': 'Variable, 3-30mm, stable or slowly enlarging',
                    'evolution': 'Slowly progressive, stable once established'
                },
                'features': [
                    'stuck-on, waxy appearance',
                    'verrucous or cerebriform surface texture',
                    'sharply demarcated borders',
                    'horn cysts (keratin plugs) may be visible',
                    'uniform tan to brown pigmentation'
                ],
                'risk': 'Benign seborrheic keratosis - no malignant transformation risk'
            },

            'seborrheic keratosis': {
                'description': 'A benign, well-circumscribed lesion with a waxy, “stuck-on” appearance. The surface is verrucous or cerebriform with possible keratin plugs. Color ranges from light tan to dark brown or black. Common in older adults on the trunk, face, and extremities.',
                'abcde': {
                    'asymmetry': 'Usually symmetric in shape',
                    'border': 'Sharply demarcated, regular borders with a stuck-on appearance',
                    'color': 'Uniform tan, brown, or black pigmentation',
                    'diameter': 'Variable (3–30mm), may enlarge slowly',
                    'evolution': 'Stable or slowly enlarging, no malignant transformation'
                },
                'features': [
                    'stuck-on, waxy surface',
                    'verrucous or cerebriform texture',
                    'well-defined borders',
                    'presence of keratin plugs or horn cysts',
                    'common on sun-exposed areas in elderly individuals'
                ],
                'risk': 'Benign epidermal tumor (seborrheic keratosis) with no malignant potential'
            },

            'squamous cell carcinoma': {
                'description': 'A firm, indurated erythematous nodule or plaque with rough surface. The lesion exhibits hyperkeratosis with possible ulceration. Borders are ill-defined with induration extending beyond visible margin. Surface may be crusted or bleeding.',
                'abcde': {
                    'asymmetry': 'Asymmetric with irregular growth pattern',
                    'border': 'Ill-defined, irregular borders with induration',
                    'color': 'Red, pink, or erythematous with possible pigmentation',
                    'diameter': 'Variable, often 1-2cm, progressively enlarging',
                    'evolution': 'Rapid growth over weeks to months, may ulcerate or bleed'
                },
                'features': [
                    'firm, indurated consistency',
                    'rough, hyperkeratotic surface',
                    'erythematous to pink coloration',
                    'may show ulceration, crusting, or bleeding',
                    'palpable induration extends beyond visible lesion'
                ],
                'risk': 'Malignant with metastatic potential (2-5%), requires prompt treatment'
            },

            'vascular lesion': {
                'description': 'A lesion of vascular origin displaying red to purple coloration. Blanches partially or completely with pressure (diascopy positive). May be flat (macular) or raised (papular). Surface is smooth without scale.',
                'abcde': {
                    'asymmetry': 'Variable - may be symmetric or asymmetric',
                    'border': 'Well-defined or irregular depending on type',
                    'color': 'Red, purple, or violaceous - blanches with pressure',
                    'diameter': 'Highly variable, from pinpoint to several centimeters',
                    'evolution': 'Variable - some stable, others progressive'
                },
                'features': [
                    'red, purple, or violaceous coloration',
                    'blanches with direct pressure (diascopy)',
                    'may be flat (hemangioma) or raised (pyogenic granuloma)',
                    'smooth surface without scale',
                    'may show clustered or linear configuration'
                ],
                'risk': 'Typically benign vascular proliferation or malformation'
            }
        }

    def generate_description(self, label):
        """
        Generate comprehensive medical description

        Args:
            label: Class label; matched against the templates after
                stripping surrounding whitespace and lower-casing.

        Returns:
            dict: Always contains ``label``, ``full_description``,
            ``abcde_asymmetry``, ``abcde_border``, ``abcde_color``,
            ``abcde_diameter``, ``abcde_evolution``, ``clinical_features``,
            ``risk_assessment`` and ``comprehensive_description``. For a
            label without a template the values are generic placeholders
            and the legacy ``abcde_analysis`` key is included as well.
        """
        label_key = label.strip().lower()
        template = self.templates.get(label_key)

        if template is None:
            # Fallback for unknown labels. It carries the same keys as a
            # regular description so that callers indexing e.g.
            # desc['abcde_border'] do not fail with KeyError.
            full_description = f'A skin lesion classified as {label}. Detailed dermatological examination required.'
            abcde_note = 'Complete clinical assessment needed'
            risk_note = 'Consult dermatologist for proper diagnosis'
            return {
                'label': label,
                'full_description': full_description,
                'abcde_analysis': abcde_note,  # legacy key, kept for compatibility
                'abcde_asymmetry': abcde_note,
                'abcde_border': abcde_note,
                'abcde_color': abcde_note,
                'abcde_diameter': abcde_note,
                'abcde_evolution': abcde_note,
                'clinical_features': 'Requires professional evaluation',
                'risk_assessment': risk_note,
                'comprehensive_description': f"{full_description} {risk_note}"
            }

        abcde = template['abcde']

        # Build structured description
        description = {
            'label': label,
            'full_description': template['description'],
            'abcde_asymmetry': abcde['asymmetry'],
            'abcde_border': abcde['border'],
            'abcde_color': abcde['color'],
            'abcde_diameter': abcde['diameter'],
            'abcde_evolution': abcde['evolution'],
            'clinical_features': ' | '.join(template['features']),
            'risk_assessment': template['risk']
        }

        # Create comprehensive text version
        abcde_text = f"ABCDE Analysis: Asymmetry - {abcde['asymmetry']}. Border - {abcde['border']}. Color - {abcde['color']}. Diameter - {abcde['diameter']}. Evolution - {abcde['evolution']}."

        description['comprehensive_description'] = f"{template['description']} {abcde_text} Clinical features include: {'; '.join(template['features'])}. {template['risk']}"

        return description

def generate_descriptions_for_dataset(df, output_csv):
    """
    Generate domain-specific descriptions for entire dataset

    For every row of ``df`` the template description for ``row['label']``
    is looked up and merged with the row's path columns. The result is
    written to ``output_csv`` and a short sample is printed.

    Args:
        df: DataFrame with the columns ``image_name``, ``relative_path``,
            ``full_path`` and ``label``.
        output_csv: Path of the CSV file to write (without the index).

    Returns:
        pandas.DataFrame: One row per input row, with the four input
        columns followed by the description columns.
    """
    print(f"\n🔬 Generating medical descriptions for {len(df)} images...")

    generator = DermatologyDescriptionGenerator()

    # Columns copied from the input row, then columns taken from the
    # generated description. The order defines the CSV column order.
    row_columns = ('image_name', 'relative_path', 'full_path', 'label')
    desc_columns = (
        'full_description',
        'abcde_asymmetry',
        'abcde_border',
        'abcde_color',
        'abcde_diameter',
        'abcde_evolution',
        'clinical_features',
        'risk_assessment',
        'comprehensive_description',
    )

    descriptions = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing"):
        desc = generator.generate_description(row['label'])
        record = {col: row[col] for col in row_columns}
        record.update((col, desc[col]) for col in desc_columns)
        descriptions.append(record)

    # Save to CSV. Passing the columns explicitly keeps the header intact
    # even when there are no rows.
    desc_df = pd.DataFrame(descriptions, columns=[*row_columns, *desc_columns])
    desc_df.to_csv(output_csv, index=False)

    print(f"✅ Descriptions saved to: {output_csv}")
    if desc_df.empty:
        # iloc[0] would raise IndexError on an empty frame.
        print("⚠️ No rows to sample - the input DataFrame was empty.")
    else:
        print(f"\n📊 Sample description:")
        sample = desc_df.iloc[0]
        print(f"Label: {sample['label']}")
        print(f"Description: {sample['full_description'][:200]}...")

    return desc_df


# Generate descriptions for all splits
# Each call also writes the named CSV file; the resulting DataFrames are
# used by the display/summary cells further down.
train_descriptions = generate_descriptions_for_dataset(train_df, 'train_descriptions.csv')
val_descriptions = generate_descriptions_for_dataset(val_df, 'val_descriptions.csv')
test_descriptions = generate_descriptions_for_dataset(test_df, 'test_descriptions.csv')


class VisualDescriptionEnhancer:
    """
    Uses BLIP-2 to extract visual features from images
    Enhances template descriptions with actual visual analysis
    """

    def __init__(self, model_name="Salesforce/blip2-opt-2.7b"):
        """
        Load the BLIP-2 processor and model.

        The model is loaded in float16 when CUDA is available and in
        float32 otherwise.

        Args:
            model_name: Model identifier passed to ``from_pretrained``.
        """
        print(f"\n🤖 Loading BLIP-2 model: {model_name}")
        print("⚠️ This requires GPU and ~8GB VRAM")

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        self.processor = Blip2Processor.from_pretrained(model_name)
        self.model = Blip2ForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto"
        )

        print("✅ Model loaded successfully")

    def analyze_image(self, image_path):
        """
        Extract visual features from skin lesion image

        Args:
            image_path: Path of the image file to describe.

        Returns:
            str: The decoded model output, or ``""`` if anything failed
            (the error is printed; this is deliberately best-effort so one
            bad image does not abort a whole run).
        """
        try:
            # Close the file handle as soon as the RGB copy exists.
            with Image.open(image_path) as img:
                image = img.convert('RGB')

            # Dermatology-specific prompt
            prompt = "Question: Describe this skin lesion focusing on color, texture, border characteristics, and symmetry. Answer:"

            # Cast the inputs to the dtype the model was actually loaded
            # with. A hard-coded float16 mismatches the float32 model that
            # __init__ loads when CUDA is unavailable.
            inputs = self.processor(image, prompt, return_tensors="pt").to(
                self.device, self.model.dtype
            )

            with torch.no_grad():
                # Beam search without sampling; `temperature` only applies
                # when do_sample=True, so it is not passed here.
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=100,
                    num_beams=5
                )

            visual_desc = self.processor.decode(outputs[0], skip_special_tokens=True)
            return visual_desc

        except Exception as e:
            print(f"Error analyzing {image_path}: {str(e)}")
            return ""

    def enhance_dataset_descriptions(self, desc_df, output_csv, max_samples=None):
        """
        Enhance descriptions with visual analysis

        Args:
            desc_df: DataFrame with at least the columns ``full_path`` and
                ``comprehensive_description``.
            output_csv: Path of the CSV file to write (without the index).
            max_samples: If not ``None``, only the first ``max_samples``
                rows are processed.

        Returns:
            pandas.DataFrame: The processed rows plus the columns
            ``visual_analysis`` and ``enhanced_description``.
        """
        print(f"\n👁️ Enhancing descriptions with visual analysis...")

        # `is not None` so that max_samples=0 means "no rows" rather than
        # silently falling through to "all rows".
        if max_samples is not None:
            print(f"Processing {max_samples} samples (set max_samples=None for all)")
            desc_df = desc_df.head(max_samples)

        enhanced_data = []

        for _, row in tqdm(desc_df.iterrows(), total=len(desc_df), desc="Analyzing images"):
            visual_analysis = self.analyze_image(row['full_path'])

            # Combine template and visual descriptions
            enhanced_desc = f"{row['comprehensive_description']}\n\nVisual Analysis: {visual_analysis}"

            enhanced_data.append({
                **row.to_dict(),
                'visual_analysis': visual_analysis,
                'enhanced_description': enhanced_desc
            })

        enhanced_df = pd.DataFrame(enhanced_data)
        enhanced_df.to_csv(output_csv, index=False)

        print(f"✅ Enhanced descriptions saved to: {output_csv}")
        return enhanced_df


# ===== CELL 7: OPTIONAL - ENHANCE WITH VISUAL ANALYSIS =====

# To use visual enhancement, remove the triple quotes around the block below (it loads BLIP-2, so it requires GPU memory)
"""
enhancer = VisualDescriptionEnhancer()

# Enhance test set (limit to 100 samples for demo)
test_enhanced = enhancer.enhance_dataset_descriptions(
    test_descriptions,
    'test_enhanced_descriptions.csv',
    max_samples=100
)

print("\n📊 Sample enhanced description:")
sample = test_enhanced.iloc[0]
print(f"Label: {sample['label']}")
print(f"Enhanced: {sample['enhanced_description'][:300]}...")
"""

# ===== CELL 8: VIEW SAMPLE RESULTS =====

def display_sample_descriptions(desc_df, n_samples=3):
    """
    Print the first few descriptions in a readable layout.

    Args:
        desc_df: DataFrame with the columns ``label``, ``image_name``,
            ``full_description``, the five ``abcde_*`` columns and
            ``risk_assessment``.
        n_samples: Maximum number of leading rows to print.
    """
    banner = "=" * 80
    divider = '─' * 80
    # (printed heading, source column) for each ABCDE criterion, in order.
    abcde_fields = (
        ('Asymmetry', 'abcde_asymmetry'),
        ('Border', 'abcde_border'),
        ('Color', 'abcde_color'),
        ('Diameter', 'abcde_diameter'),
        ('Evolution', 'abcde_evolution'),
    )

    print("\n" + banner)
    print("📋 SAMPLE DESCRIPTIONS")
    print(banner)

    shown = min(n_samples, len(desc_df))
    for position in range(shown):
        record = desc_df.iloc[position]
        print(f"\n{divider}")
        print(f"🏷️  Label: {record['label']}")
        print(f"📸 Image: {record['image_name']}")
        print(f"\n📝 Description:")
        print(f"   {record['full_description']}")
        print(f"\n🔍 ABCDE Analysis:")
        for heading, column in abcde_fields:
            print(f"   • {heading}: {record[column]}")
        print(f"\n⚕️  Risk: {record['risk_assessment']}")
        print(f"{divider}")


# Display samples from each split
# Prints the first two rows of the training and test description tables;
# the validation split is not shown here.
print("\n🔍 TRAINING SET SAMPLES:")
display_sample_descriptions(train_descriptions, n_samples=2)

print("\n🔍 TEST SET SAMPLES:")
display_sample_descriptions(test_descriptions, n_samples=2)

# ===== CELL 9: STATISTICS & SUMMARY =====

def generate_summary_statistics(train_desc, val_desc, test_desc):
    """
    Print an overview of the three description tables.

    Reports the combined row count, the number of distinct labels, the
    per-label counts and percentages, the size of each split, the names of
    the generated CSV files and a list of suggested next steps.

    Args:
        train_desc: Training-split DataFrame with a ``label`` column.
        val_desc: Validation-split DataFrame with a ``label`` column.
        test_desc: Test-split DataFrame with a ``label`` column.
    """
    rule = "=" * 80

    print("\n" + rule)
    print("📊 DATASET SUMMARY")
    print(rule)

    combined = pd.concat([train_desc, val_desc, test_desc])
    n_total = len(combined)

    print(f"\n📈 Total Dataset:")
    print(f"   Total images: {n_total}")
    print(f"   Classes: {len(combined['label'].unique())}")

    print(f"\n📊 Class Distribution:")
    for label, count in combined['label'].value_counts().items():
        share = (count / n_total) * 100
        print(f"   {label:30s}: {count:5d} ({share:5.2f}%)")

    print(f"\n📦 Split Sizes:")
    splits = (
        ("Training:", train_desc),
        ("Validation:", val_desc),
        ("Test:", test_desc),
    )
    for caption, split in splits:
        # Captions are padded to a common width so the counts line up.
        print(f"   {caption:<11} {len(split):5d} images")

    print(f"\n📄 Generated Files:")
    for csv_name in (
        "train_descriptions.csv",
        "val_descriptions.csv",
        "test_descriptions.csv",
    ):
        print(f"   ✓ {csv_name}")

    print("\n" + rule)
    print("✅ DESCRIPTION GENERATION COMPLETE!")
    print(rule)
    print("\n💡 Next steps:")
    for step_line in (
        "   1. Review sample descriptions above",
        "   2. Optional: Run visual enhancement (Cell 7)",
        "   3. Use descriptions as input for LLM classification",
        "   4. Download CSV files for further processing",
    ):
        print(step_line)


# Print the summary for the three description tables generated above.
generate_summary_statistics(train_descriptions, val_descriptions, test_descriptions)

# ===== CELL 10: EXPORT & DOWNLOAD =====

# List the description CSVs, one bullet per file, followed by a hint on
# where to find them in Colab.
print("\n📥 Files ready for download:")
print("\n".join(
    f"   • {csv_name}"
    for csv_name in (
        "train_descriptions.csv",
        "val_descriptions.csv",
        "test_descriptions.csv",
    )
))
print("\nUse Colab's file browser (left sidebar) to download these files.")