<a href="https://colab.research.google.com/github/IyadSultan/IyadSultan/blob/main/breast_reports_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages (uncomment the pip line below and run it once per Colab session)
# !pip install openai pandas tqdm

import pandas as pd
import openai
import json
import time
from typing import Dict, Any
import logging
from google.colab import userdata
from tqdm import tqdm

# Set up logging.
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger. Without it basicConfig() does nothing when the root logger is already
# configured (as is typically the case in hosted notebooks), and the INFO-level
# messages emitted by this file would not be shown.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)
logger = logging.getLogger(__name__)

class MammographyExtractor:
    """
    Extract structured, per-breast findings from free-text mammography reports.

    Each report is sent to the OpenAI chat-completions API together with a fixed
    extraction prompt. The JSON reply is parsed and flattened into one
    ``right_*`` / ``left_*`` column per requested feature.
    """

    # Placeholder stored whenever a value is unavailable or extraction failed.
    _MISSING = "not mentioned"

    # Breast sides, in output-column order.
    _SIDES = ("right", "left")

    # Feature keys requested for each breast, in output-column order.
    _FIELDS = (
        "birads",
        "classification",
        "calcification_present",
        "previous_surgery",
        "normal_status",
        "benign_vs_malignant",
        "density",
        "cyst_present",
    )

    # Output columns are named "<side>_<field>" unless the field is renamed here.
    _COLUMN_SUFFIXES = {"normal_status": "normal"}

    def __init__(self):
        """
        Initialize the mammography data extractor using Colab secrets

        Raises:
            Exception: If the ``OPENAI_API_KEY`` secret cannot be read or the
                OpenAI client cannot be created. The underlying error is
                chained as ``__cause__``.
        """
        # Get API key from Colab secrets
        try:
            api_key = userdata.get('OPENAI_API_KEY')
            self.client = openai.OpenAI(api_key=api_key)
            logger.info("✅ OpenAI API key loaded successfully from Colab secrets")
        except Exception as e:
            logger.error(f"❌ Error loading API key: {e}")
            raise Exception("Please ensure OPENAI_API_KEY is set in Colab secrets") from e

        # Define the extraction prompt
        self.extraction_prompt = """
You are a medical AI assistant specialized in analyzing mammography reports. Extract the following information from the mammography report text for BOTH right and left breasts separately. If information is not mentioned or not applicable, use "not mentioned" or "N/A".

Extract the following data in JSON format:
{
  "right_breast": {
    "birads": "BI-RADS category (1, 2, 3, 4, 5, or 6)",
    "classification": "primary classification (negative, benign, probably benign, suspicious, malignant)",
    "calcification_present": "yes or no, and type if present (benign, dystrophic, scattered, clustered, microcalcifications, etc.)",
    "previous_surgery": "type of previous surgery (mastectomy, BCS, lumpectomy, none, or not mentioned)",
    "normal_status": "normal or abnormal findings",
    "benign_vs_malignant": "benign, malignant, suspicious, or normal",
    "density": "heterogenous, fatty, extremely dense, scattered fibroglandular, or not mentioned",
    "cyst_present": "yes or no (yes if any cysts mentioned, no if not mentioned or explicitly absent)"
  },
  "left_breast": {
    "birads": "BI-RADS category (1, 2, 3, 4, 5, or 6)",
    "classification": "primary classification (negative, benign, probably benign, suspicious, malignant)",
    "calcification_present": "yes or no, and type if present (benign, dystrophic, scattered, clustered, microcalcifications, etc.)",
    "previous_surgery": "type of previous surgery (mastectomy, BCS, lumpectomy, none, or not mentioned)",
    "normal_status": "normal or abnormal findings",
    "benign_vs_malignant": "benign, malignant, suspicious, or normal",
    "density": "heterogenous, fatty, extremely dense, scattered fibroglandular, or not mentioned",
    "cyst_present": "yes or no (yes if any cysts mentioned, no if not mentioned or explicitly absent)"
  }
}

Important notes:
- If the report mentions "bilateral" findings, apply to both breasts
- Look for terms like "heterogeneously dense", "extremely dense", "predominantly fat", "scattered fibroglandular"
- Previous surgery terms: "status post mastectomy", "status post BCS", "breast conserving surgery"
- BI-RADS categories: 1=negative, 2=benign, 3=probably benign, 4=suspicious, 5=highly suspicious, 6=malignant
- For cyst detection, look for terms: "cyst", "cysts", "cystic", "complicated cyst", "simple cyst"
- For calcification detection, look for: "calcification", "calcifications", "microcalcifications", "clustered", "scattered", "benign calcifications", "dystrophic calcifications"
- Return only valid JSON, no additional text
"""

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Return *text* without a surrounding Markdown code fence, if it has one.

        Handles both a bare fence and a ``json``-tagged fence. The closing fence
        is removed only when it is actually present, so a reply that lacks it
        keeps its final characters.
        """
        cleaned = text.strip()
        if not cleaned.startswith("```"):
            return cleaned
        cleaned = cleaned[3:]
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
        return cleaned.strip()

    @staticmethod
    def _is_blank(value: Any) -> bool:
        """Return True when *value* holds no report text (None, NaN/NA, or whitespace only)."""
        if value is None:
            return True
        if isinstance(value, str):
            return not value.strip()
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            # pd.isna() on a non-scalar has no single truth value; treat as content.
            return False

    @classmethod
    def _flatten_extraction(cls, extracted_data: Any) -> Dict[str, Any]:
        """Flatten a nested extraction result into ``right_*`` / ``left_*`` columns.

        Missing breasts, missing fields and null values become the
        "not mentioned" placeholder. List or dict values are serialised to a
        JSON string so that every cell stays hashable.
        """
        payload = extracted_data if isinstance(extracted_data, dict) else {}
        flat: Dict[str, Any] = {}
        for side in cls._SIDES:
            breast = payload.get(f"{side}_breast")
            if not isinstance(breast, dict):
                breast = {}
            for field in cls._FIELDS:
                value = breast.get(field)
                if value is None:
                    value = cls._MISSING
                elif isinstance(value, (dict, list)):
                    value = json.dumps(value, ensure_ascii=False)
                flat[f"{side}_{cls._COLUMN_SUFFIXES.get(field, field)}"] = value
        return flat

    def extract_from_text(self, report_text: str) -> Dict[str, Any]:
        """
        Extract mammography data from a single report text

        Args:
            report_text: The mammography report text

        Returns:
            Dictionary containing extracted data for both breasts. If the API
            call fails, or the reply is empty, not valid JSON, or not a JSON
            object, the error is logged and the "not mentioned" structure from
            ``_get_empty_structure`` is returned instead.
        """
        # Bound before the try so the decode handler can always log the raw reply.
        raw_reply = None
        try:
            response = self.client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": self.extraction_prompt},
                    {"role": "user", "content": f"Extract data from this mammography report:\n\n{report_text}"}
                ],
                temperature=0.1,
                max_tokens=1000
            )

            raw_reply = response.choices[0].message.content
            if raw_reply is None:
                raise ValueError("model returned an empty message")

            # Parse the JSON response after removing any markdown formatting
            extracted_data = json.loads(self._strip_code_fence(raw_reply))

        except json.JSONDecodeError as e:
            logger.error(f"JSON decode error: {e}")
            logger.error(f"Response was: {raw_reply}")
            return self._get_empty_structure()

        except Exception as e:
            # Best-effort by design: one failed report must not abort a long run.
            logger.error(f"Error processing text: {e}")
            return self._get_empty_structure()

        if not isinstance(extracted_data, dict):
            logger.error(f"Unexpected JSON payload type: {type(extracted_data).__name__}")
            return self._get_empty_structure()

        return extracted_data

    def _get_empty_structure(self) -> Dict[str, Any]:
        """Return empty structure when extraction fails"""
        # A fresh dict per breast so callers can mutate one side safely.
        return {
            f"{side}_breast": {field: self._MISSING for field in self._FIELDS}
            for side in self._SIDES
        }

    def process_dataframe(self, df: pd.DataFrame, text_column: str = '_text',
                         batch_size: int = 10, delay: float = 1.0) -> pd.DataFrame:
        """
        Process a DataFrame containing mammography reports

        Args:
            df: DataFrame with mammography reports
            text_column: Name of column containing report text
            batch_size: Number of reports to process before a delay
            delay: Delay in seconds between batches

        Returns:
            DataFrame with extracted features added. The original index is
            replaced by a 0..n-1 range.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be at least 1, got {batch_size}")

        results = []
        total_rows = len(df)

        print(f"🔄 Starting processing of {total_rows} mammography reports...")

        # Create progress bar
        with tqdm(total=total_rows, desc="Processing Reports", unit="report") as pbar:
            # Count rows by position: index labels may be non-contiguous or
            # non-numeric (e.g. after filtering), so they cannot drive batching.
            for position, (_, row) in enumerate(df.iterrows(), start=1):
                report_text = row[text_column]

                if self._is_blank(report_text):
                    # Nothing to analyse: skip the API call instead of sending "nan".
                    logger.warning(f"Row {position}: empty report text, skipping extraction")
                    extracted_data = self._get_empty_structure()
                else:
                    extracted_data = self.extract_from_text(str(report_text))

                # Create a flat structure for DataFrame columns
                results.append(self._flatten_extraction(extracted_data))

                # Update progress bar
                pbar.update(1)
                pbar.set_postfix({
                    'Current': f"{position}/{total_rows}",
                    'Batch': f"{position % batch_size}/{batch_size}"
                })

                # Add delay between batches to respect rate limits
                # (no delay is needed once the last report has been processed)
                if position % batch_size == 0 and position < total_rows:
                    pbar.set_description(f"Processing Reports - Sleeping {delay}s")
                    time.sleep(delay)
                    pbar.set_description("Processing Reports")

        # Create results DataFrame; explicit columns keep the schema stable
        # even when there were no rows to process.
        results_df = pd.DataFrame(results, columns=list(self._flatten_extraction({})))

        # Combine with original DataFrame
        final_df = pd.concat([df.reset_index(drop=True), results_df], axis=1)

        print("✅ Processing completed successfully!")
        return final_df

    def display_summary(self, df: pd.DataFrame):
        """Display a summary of the extracted data"""
        print("\n" + "="*50)
        print("📊 EXTRACTION SUMMARY")
        print("="*50)

        # (label, column suffix) pairs printed for each breast, in this order
        summary_items = (
            ("BI-RADS Distribution", "birads"),
            ("Density Distribution", "density"),
            ("Surgery History", "previous_surgery"),
            ("Calcification Present", "calcification_present"),
            ("Cyst Present", "cyst_present"),
        )
        headings = (
            ("right", "\n🟦 RIGHT BREAST SUMMARY:"),
            ("left", "\n🟩 LEFT BREAST SUMMARY:"),
        )

        for side, heading in headings:
            print(heading)
            for label, suffix in summary_items:
                print(f"{label}: {df[f'{side}_{suffix}'].value_counts().to_dict()}")

        print("\n" + "="*50)

# Quick setup and demo function for Colab
def run_mammography_extraction(df=None, text_column='_text'):
    """
    Main function to run mammography extraction in Colab

    Builds a MammographyExtractor, processes the reports, prints a summary,
    writes the result to ``mammography_extracted_data.csv`` in the current
    working directory and prints the first rows of the extracted columns.

    Args:
        df: DataFrame with mammography reports (optional - will create demo if None)
        text_column: Name of column containing report text

    Returns:
        The processed DataFrame (input columns plus the extracted columns).

    Raises:
        KeyError: If ``text_column`` is not a column of ``df``.
    """

    # Initialize the extractor
    print("🚀 Initializing Mammography Extractor...")
    extractor = MammographyExtractor()

    # Use provided DataFrame or create sample data for demo
    if df is None:
        print("📝 No DataFrame provided, creating sample data for demonstration...")
        sample_reports = [
            """bilateral diagnostic digital mammogram clinical history: 61 year-old female, had history of left breast cancer status post bcs. findings: the breast tissues are heterogeneously dense. postoperative changes at the left axilla and left subareolar aspect with coarse calcification suggestive of fat necrosis. scattered bilateral benign appearing calcifications. cysts at left breast. impression: postoperative changes in the left breast and left axilla. bilateral benign looking calcifications. birads 2- benign finding.""",
            """right diagnostic digital mammogram clinical indication: a 68-year-old female patient. left breast cancer in 2003, status post left mastectomy. findings: the breast tissues are extremely dense. scattered cysts of variable complexity. dystrophic calcifications present. impression: the right breast demonstrates dense parenchyma with marked fibrocystic change. birads 3 : probably benign findings""",
            """bilateral screening digital mammogram clinical history: 53-year-old lady. for screening mammography. findings: the breast tissue are heterogeneously dense. scattered benign looking calcifications. there are few small cysts bilateral mainly in the left breast. there is no evidence of suspicious clustered microcalcifications. impression: no mammographic evidence of malignancy. birads 2- benign findings."""
        ]
        # Key the demo data by the requested column so a custom text_column
        # also works in demo mode.
        df = pd.DataFrame({text_column: sample_reports})

    # Fail before any report is processed if the text column is absent.
    if text_column not in df.columns:
        raise KeyError(
            f"Column '{text_column}' not found in DataFrame. "
            f"Available columns: {list(df.columns)}"
        )

    # Process the DataFrame
    print(f"\n🔬 Processing {len(df)} mammography reports...")
    processed_df = extractor.process_dataframe(df, text_column)

    # Display summary
    extractor.display_summary(processed_df)

    # Save to CSV in Colab
    filename = "mammography_extracted_data.csv"
    processed_df.to_csv(filename, index=False)
    print(f"\n💾 Results saved to {filename}")

    # Display first few rows
    print("\n📋 SAMPLE OF EXTRACTED DATA:")
    print("="*80)

    # Show only the extracted columns for better readability
    extracted_cols = [col for col in processed_df.columns if col.startswith(('right_', 'left_'))]
    print(processed_df[extracted_cols].head())

    return processed_df

# Instructions for Colab users
# The pip command lists every third-party package this file imports
# (openai, pandas, tqdm).
print("""
🔧 COLAB SETUP INSTRUCTIONS:

1. First, set up your OpenAI API key in Colab secrets:
   - Click on the 🔑 key icon in the left sidebar
   - Add a new secret with name: OPENAI_API_KEY
   - Paste your OpenAI API key as the value

2. Install required packages by running:
   !pip install openai pandas tqdm

3. Run the extraction:

   # For demo with sample data:
   processed_df = run_mammography_extraction()

   # For your own data:
   # df = pd.read_csv('your_file.csv')  # Load your data
   # processed_df = run_mammography_extraction(df, text_column='_text')

4. The results will be saved as 'mammography_extracted_data.csv' with 16 extracted features:
   - 8 features per breast: BI-RADS, classification, calcification presence, surgery history, normal status, benign/malignant, density, and cyst presence
""")

# Load and process your data
# Loading and extraction are guarded separately so that each failure is
# reported with a message that matches what actually went wrong.
print("📂 Loading data from /content/datanew.csv...")
try:
    df = pd.read_csv('/content/datanew.csv')
except FileNotFoundError:
    print("❌ Error: File '/content/datanew.csv' not found")
    print("Please ensure your CSV file is uploaded to Colab at this path")
except Exception as e:
    print(f"❌ Error loading data: {e}")
else:
    print(f"✅ Data loaded successfully! Shape: {df.shape}")
    print(f"📋 Columns: {list(df.columns)}")

    # Check if _text column exists
    if '_text' in df.columns:
        print(f"✅ '_text' column found with {df['_text'].notna().sum()} non-null entries")

        # Show sample of first few characters from _text column
        print("📝 Sample text preview:")
        print("-" * 50)
        for row_number, raw_text in enumerate(df['_text'].head(3), start=1):
            text = str(raw_text)
            # Truncate long reports to 200 characters for the preview
            sample_text = text[:200] + "..." if len(text) > 200 else text
            print(f"Row {row_number}: {sample_text}")
            print()

        # Run the extraction
        print("🚀 Starting mammography data extraction...")
        try:
            processed_df = run_mammography_extraction(df, text_column='_text')
        except Exception as e:
            # Keep the cell from crashing, but do not mislabel this as a load error.
            print(f"❌ Error during extraction: {e}")

    else:
        print("❌ Error: '_text' column not found in the dataset")
        print(f"Available columns: {list(df.columns)}")

# Keep the demo function available for testing
def run_demo():
    """Run the extraction pipeline on the built-in sample reports and return its result."""
    demo_result = run_mammography_extraction()
    return demo_result


🔧 COLAB SETUP INSTRUCTIONS:

1. First, set up your OpenAI API key in Colab secrets:
   - Click on the 🔑 key icon in the left sidebar
   - Add a new secret with name: OPENAI_API_KEY
   - Paste your OpenAI API key as the value

2. Install required packages by running:
   !pip install openai pandas

3. Run the extraction:
   
   # For demo with sample data:
   processed_df = run_mammography_extraction()
   
   # For your own data:
   # df = pd.read_csv('your_file.csv')  # Load your data
   # processed_df = run_mammography_extraction(df, text_column='_text')

4. The results will be saved as 'mammography_extracted_data.csv' with 16 extracted features:
   - 8 features per breast: BI-RADS, classification, calcification presence, surgery history, normal status, benign/malignant, density, and cyst presence

📂 Loading data from /content/datanew.csv...
✅ Data loaded successfully! Shape: (2049, 15)
📋 Columns: ['MRN', 'EXAM DATE/TIME', 'VERIFIED DATE', 'DATE REPORT ENTERED', 'PROCEDURE', 'IMPRESSI

Processing Reports:   2%|▏         | 50/2049 [03:00<2:12:04,  3.96s/report, Current=50/2049, Batch=0/10]                