In [1]:
import pandas as pd

def merge_data_files():
    """
    Merge the ISIC_2020_Training_GroundTruth.csv with merged_sample.csv
    using image_name as the common key.

    Every row of the ISIC file is kept (left join). Columns that exist in both
    files are taken from the ISIC file. If merged_sample.csv repeats an
    image_name, only the first occurrence is used so the join cannot multiply
    ISIC rows. The result is written to complete_cancer_detection_dataset.csv
    in the current working directory.

    Returns:
    --------
    pd.DataFrame or None
        The merged dataset, or None when a key column is missing or any step
        raises (the error is printed rather than propagated).
    """
    print("Loading datasets...")

    try:
        # Load the original ISIC dataset
        isic_df = pd.read_csv('ISIC_2020_Training_GroundTruth.csv')
        print(f"Loaded ISIC dataset with {len(isic_df)} rows and {len(isic_df.columns)} columns")

        # Load the synthetic data
        synthetic_df = pd.read_csv('merged_sample.csv')
        print(f"Loaded synthetic dataset with {len(synthetic_df)} rows and {len(synthetic_df.columns)} columns")

        # Both inputs must expose the join key
        if 'image_name' not in isic_df.columns:
            print("Error: 'image_name' column not found in ISIC_2020_Training_GroundTruth.csv")
            return None

        if 'image_name' not in synthetic_df.columns:
            print("Error: 'image_name' column not found in merged_sample.csv")
            return None

        # Identify columns that exist in both datasets (to avoid duplication)
        duplicate_cols = [col for col in synthetic_df.columns if col in isic_df.columns and col != 'image_name']
        if duplicate_cols:
            print(f"Note: The following columns exist in both datasets and will be taken from the ISIC dataset: {duplicate_cols}")
            # Remove duplicate columns from synthetic_df to avoid _x, _y suffixes
            synthetic_df = synthetic_df.drop(columns=duplicate_cols)

        # A repeated key on the right side of a left join would emit one output
        # row per repetition, inflating the ISIC row count.
        repeated_keys = synthetic_df['image_name'].duplicated(keep='first')
        if repeated_keys.any():
            print(f"Warning: merged_sample.csv contains {int(repeated_keys.sum())} rows with a repeated image_name; keeping the first occurrence of each")
            synthetic_df = synthetic_df.loc[~repeated_keys]

        # Rows without a partner get NaN in every synthetic column; report them
        # separately from missing values that were already in the inputs.
        unmatched_count = int((~isic_df['image_name'].isin(synthetic_df['image_name'])).sum())
        if unmatched_count > 0:
            print(f"Warning: {unmatched_count} ISIC rows have no match in merged_sample.csv; their synthetic columns will be empty")

        # Merge the datasets on image_name
        merged_df = pd.merge(isic_df, synthetic_df, on='image_name', how='left')
        print(f"Successfully merged datasets. Result has {len(merged_df)} rows and {len(merged_df.columns)} columns")

        # Total missing cells in the result (includes values already missing in the inputs)
        missing_count = merged_df.isnull().sum().sum()
        if missing_count > 0:
            print(f"Warning: The merged dataset contains {missing_count} missing values")

        # Save the merged data
        output_filename = 'complete_cancer_detection_dataset.csv'
        merged_df.to_csv(output_filename, index=False)
        print(f"Merged dataset saved to {output_filename}")

        # Display the first few rows of the merged dataset
        print("\nFirst 5 rows of the merged dataset:")
        print(merged_df.head())

        return merged_df

    except Exception as e:
        # Best-effort by design: report the problem and signal failure with None
        print(f"Error during merge process: {str(e)}")
        return None

# Entry point for this cell: calls merge_data_files() when __name__ is "__main__".
# The returned DataFrame is not kept here; the merged CSV on disk is the output.
if __name__ == "__main__":
    merge_data_files()

Loading datasets...
Loaded ISIC dataset with 33126 rows and 8 columns
Loaded synthetic dataset with 33126 rows and 18 columns
Note: The following columns exist in both datasets and will be taken from the ISIC dataset: ['target', 'benign_malignant']
Successfully merged datasets. Result has 33126 rows and 23 columns
Merged dataset saved to complete_cancer_detection_dataset.csv

First 5 rows of the merged dataset:
     image_name  patient_id     sex  age_approx anatom_site_general_challenge  \
0  ISIC_2637011  IP_7279968    male        45.0                     head/neck   
1  ISIC_0015719  IP_3075186  female        45.0               upper extremity   
2  ISIC_0052212  IP_2842074  female        50.0               lower extremity   
3  ISIC_0068279  IP_6890425  female        45.0                     head/neck   
4  ISIC_0074268  IP_8723313  female        55.0               upper extremity   

  diagnosis benign_malignant  target  traditional_diagnosis_time  \
0   unknown           benign  

In [5]:
import pandas as pd
import numpy as np
from scipy import stats

def load_data(file_path='complete_cancer_detection_dataset.csv'):
    """
    Read the dataset from a CSV file and report its dimensions.

    Parameters:
    -----------
    file_path : str
        Path to the CSV file

    Returns:
    --------
    pd.DataFrame
        The loaded dataset
    """
    print("Loading dataset...")
    dataset = pd.read_csv(file_path)
    n_rows, n_cols = dataset.shape
    print(f"Dataset loaded with {n_rows} rows and {n_cols} columns")
    return dataset

def basic_eda(df):
    """
    Perform basic exploratory data analysis and print the results.

    Prints, in order: per-column dtype and missing-value counts, summary
    statistics of numeric columns, the distribution of the 'target' column,
    and value counts for every object-typed column.

    Parameters:
    -----------
    df : pd.DataFrame
        The dataset to analyze. Must contain a 'target' column.
    """
    print("\n===== BASIC DATASET EXPLORATION =====")

    n_rows = len(df)
    null_counts = df.isnull().sum()

    # Data types and missing values
    print("\nData types and missing values:")
    missing_info = pd.DataFrame({
        'Data Type': df.dtypes,
        'Missing Values': null_counts,
        'Missing Percentage': (null_counts / n_rows * 100).round(2)
    })
    print(missing_info)

    # Check for extreme values or outliers in numerical columns
    print("\nSummary statistics for numerical columns:")
    num_cols = df.select_dtypes(include=[np.number]).columns
    print(df[num_cols].describe().T)

    # Display info about target variable
    print("\nTarget variable distribution:")
    target_counts = df['target'].value_counts()
    target_pct = df['target'].value_counts(normalize=True) * 100
    # value_counts() omits classes that never occur, so look each class up with
    # a zero default instead of indexing (which would raise KeyError).
    for class_value, class_name in ((0, 'Benign'), (1, 'Malignant')):
        class_count = target_counts.get(class_value, 0)
        class_pct = target_pct.get(class_value, 0.0)
        print(f"{class_value} ({class_name}): {class_count} ({class_pct:.2f}%)")

    # Display unique values for categorical columns
    print("\nUnique values in categorical columns:")
    cat_cols = df.select_dtypes(include=['object']).columns
    for col in cat_cols:
        value_counts = df[col].value_counts()
        print(f"\n{col} - {len(value_counts)} unique values:")
        if len(value_counts) < 10:  # Only show everything if not too many unique values
            shown = value_counts
        else:
            print("  Top 5 values:")
            shown = value_counts.head()
        for val, count in shown.items():
            # Percentages are relative to all rows, including rows where col is missing
            pct = count / n_rows * 100
            print(f"  {val}: {count} ({pct:.2f}%)")

def clinical_analysis(df):
    """
    Print clinical breakdowns of the dataset: diagnosis, anatomical site and
    benign/malignant distributions, age statistics, and cross-tabulations of
    sex, site and diagnosis against outcome.

    Parameters:
    -----------
    df : pd.DataFrame
        The dataset to analyze
    """
    print("\n===== CLINICAL DATA ANALYSIS =====")

    def _report_distribution(heading, column):
        # One line per distinct value: count and share of all rows
        print(heading)
        for label, occurrences in df[column].value_counts().items():
            share = occurrences / len(df) * 100
            print(f"  {label}: {occurrences} ({share:.2f}%)")

    _report_distribution("\nDiagnosis distribution:", 'diagnosis')
    _report_distribution("\nAnatomical site distribution:", 'anatom_site_general_challenge')
    _report_distribution("\nBenign/malignant distribution:", 'benign_malignant')

    # Overall age statistics
    print("\nAge statistics:")
    ages = df['age_approx']
    print(f"  Mean age: {ages.mean():.2f}")
    print(f"  Median age: {ages.median():.2f}")
    print(f"  Min age: {ages.min():.2f}")
    print(f"  Max age: {ages.max():.2f}")

    # Age summary split by benign/malignant status
    print("\nAge by benign/malignant status:")
    print(df.groupby('benign_malignant')['age_approx'].agg(['mean', 'median', 'min', 'max']))

    # Share of benign/malignant within each sex (rows sum to 100)
    print("\nGender by benign/malignant status:")
    print(pd.crosstab(df['sex'], df['benign_malignant'], normalize='index') * 100)

    # Share of each target value within each anatomical site
    print("\nTarget distribution by anatomical site:")
    counts_by_site = pd.crosstab(df['anatom_site_general_challenge'], df['target'])
    row_totals = counts_by_site.sum(axis=1)
    print(counts_by_site.div(row_totals, axis=0) * 100)

    # Mean age per diagnosis, oldest first
    print("\nAverage age by diagnosis:")
    mean_age_by_diagnosis = df.groupby('diagnosis')['age_approx'].mean()
    print(mean_age_by_diagnosis.sort_values(ascending=False))

def ai_comparison_analysis(df):
    """
    Compare traditional and AI-assisted diagnosis on time, cost, survival
    probability and efficiency score, and print the results.

    For each metric the column means are compared, the per-row improvement
    column is summarised, and a paired t-test is run on the traditional/AI
    column pair. The t-test uses only rows where both values are present, so
    missing values cannot turn the result into NaN.

    Parameters:
    -----------
    df : pd.DataFrame
        The dataset to analyze
    """
    print("\n===== AI VS TRADITIONAL COMPARISON ANALYSIS =====")

    def _print_spread(column, unit):
        # Mean / median / min / max of one per-row improvement column
        values = df[column]
        print(f"  Mean: {values.mean():.2f}{unit}")
        print(f"  Median: {values.median():.2f}{unit}")
        print(f"  Min: {values.min():.2f}{unit}")
        print(f"  Max: {values.max():.2f}{unit}")

    def _print_paired_ttest(label, traditional_col, ai_col):
        # Paired t-test on complete pairs only; a NaN in either column would
        # otherwise make scipy return NaN for both statistic and p-value.
        pairs = df[[traditional_col, ai_col]].dropna()
        if len(pairs) < 2:
            print(f"  {label}: not enough complete pairs for a paired t-test")
            return
        t_stat, p_val = stats.ttest_rel(pairs[traditional_col], pairs[ai_col])
        if pd.isna(p_val):
            # e.g. every paired difference is identical, so the variance is zero
            verdict = 'Undetermined'
        elif p_val < 0.05:
            verdict = 'Significant'
        else:
            verdict = 'Not significant'
        print(f"  {label}: t={t_stat:.4f}, p={p_val:.8f} - {verdict}")

    # Diagnostic time comparison
    print("\nDiagnostic time comparison:")
    trad_time_mean = df['traditional_diagnosis_time'].mean()
    ai_time_mean = df['ai_diagnosis_time'].mean()
    time_reduction = trad_time_mean - ai_time_mean
    time_reduction_pct = (time_reduction / trad_time_mean) * 100

    print(f"  Traditional method average time: {trad_time_mean:.2f} hours")
    print(f"  AI-assisted method average time: {ai_time_mean:.2f} hours")
    print(f"  Time reduction: {time_reduction:.2f} hours ({time_reduction_pct:.2f}%)")

    # Time saved statistics
    print("\nTime saved percentage statistics:")
    _print_spread('time_saved_percentage', '%')

    # Cost comparison (positive difference means the AI method is cheaper)
    print("\nCost comparison:")
    trad_cost_mean = df['traditional_total_cost'].mean()
    ai_cost_mean = df['ai_total_cost'].mean()
    cost_diff = trad_cost_mean - ai_cost_mean
    cost_diff_pct = (cost_diff / trad_cost_mean) * 100

    print(f"  Traditional method average cost: {trad_cost_mean:.2f} EUR")
    print(f"  AI-assisted method average cost: {ai_cost_mean:.2f} EUR")
    print(f"  Cost difference: {cost_diff:.2f} EUR ({cost_diff_pct:.2f}%)")

    # Cost difference statistics
    print("\nCost difference percentage statistics:")
    _print_spread('cost_difference_percentage', '%')

    # Survival probability comparison
    print("\nSurvival probability comparison:")
    trad_survival_mean = df['traditional_survival_probability'].mean()
    ai_survival_mean = df['ai_survival_probability'].mean()
    survival_improvement = ai_survival_mean - trad_survival_mean

    print(f"  Traditional method average survival probability: {trad_survival_mean:.2f}%")
    print(f"  AI-assisted method average survival probability: {ai_survival_mean:.2f}%")
    print(f"  Survival probability improvement: {survival_improvement:.2f} percentage points")

    # Survival improvement statistics
    print("\nSurvival probability improvement statistics:")
    _print_spread('survival_probability_improvement', ' points')

    # Efficiency score comparison
    print("\nEfficiency score comparison:")
    trad_efficiency_mean = df['traditional_efficiency_score'].mean()
    ai_efficiency_mean = df['ai_efficiency_score'].mean()
    efficiency_improvement = ai_efficiency_mean - trad_efficiency_mean

    print(f"  Traditional method average efficiency score: {trad_efficiency_mean:.2f}")
    print(f"  AI-assisted method average efficiency score: {ai_efficiency_mean:.2f}")
    print(f"  Efficiency score improvement: {efficiency_improvement:.2f} points")

    # Statistical testing
    print("\nStatistical comparison of traditional vs AI methods:")
    _print_paired_ttest('Diagnostic time', 'traditional_diagnosis_time', 'ai_diagnosis_time')
    _print_paired_ttest('Cost', 'traditional_total_cost', 'ai_total_cost')
    _print_paired_ttest('Survival probability', 'traditional_survival_probability', 'ai_survival_probability')
    _print_paired_ttest('Efficiency score', 'traditional_efficiency_score', 'ai_efficiency_score')

    # Summary statistics
    print("\nSummary of improvements with AI-assisted diagnosis:")
    print(f"  Time reduction: {time_reduction_pct:.2f}%")
    print(f"  Cost change: {cost_diff_pct:.2f}%")
    print(f"  Survival probability improvement: {survival_improvement:.2f} percentage points")
    print(f"  Efficiency score improvement: {efficiency_improvement:.2f} points")

def geographic_analysis(df):
    """
    Print how technology readiness and AI benefits vary by country and by
    hospital tier, plus the correlation of readiness with each improvement
    metric.

    Parameters:
    -----------
    df : pd.DataFrame
        The dataset to analyze
    """
    print("\n===== GEOGRAPHIC AND HEALTHCARE SYSTEM ANALYSIS =====")

    def _ranked_group_means(heading, group_col, value_col, suffix=""):
        # Mean of value_col per group, highest first, one line per group
        print(heading)
        ranked = df.groupby(group_col)[value_col].mean().sort_values(ascending=False)
        for group_label, group_mean in ranked.items():
            print(f"  {group_label}: {group_mean:.2f}{suffix}")

    _ranked_group_means("\nTechnology readiness by country:", 'country', 'tech_readiness_score')
    _ranked_group_means("\nTime saved percentage by country:", 'country', 'time_saved_percentage', "%")
    _ranked_group_means("\nEfficiency improvement by hospital tier:", 'hospital_tier',
                        'efficiency_improvement', " points")

    # How each improvement metric moves with technology readiness
    print("\nCorrelation between technology readiness and improvement metrics:")
    readiness_and_metrics = [
        'tech_readiness_score',
        'time_saved_percentage',
        'cost_difference_percentage',
        'survival_probability_improvement',
        'efficiency_improvement',
    ]
    readiness_corr = df[readiness_and_metrics].corr()['tech_readiness_score']
    print(readiness_corr.sort_values(ascending=False))

    # Number and share of rows per hospital tier
    print("\nHospital tier distribution:")
    for tier_label, n_cases in df['hospital_tier'].value_counts().items():
        share = n_cases / len(df) * 100
        print(f"  {tier_label}: {n_cases} ({share:.2f}%)")

def hypothesis_testing(df):
    """
    Perform specific analyses to test the research hypothesis and print them.

    Covers: overall diagnostic time reduction with a one-sample t-test of AI
    diagnostic time against 24 hours, improvement by diagnosis category,
    correlation of technology readiness with benefits, outcomes for malignant
    cases, and a summary table.

    The input DataFrame is not modified: the derived diagnosis category is
    attached to a local copy only.

    Parameters:
    -----------
    df : pd.DataFrame
        The dataset to analyze
    """
    print("\n===== HYPOTHESIS TESTING =====")
    print("Research Hypothesis: Implementation of standardized AI-assisted diagnostic imaging systems in EU hospitals will increase cancer detection accuracy by 15% and reduce diagnostic time from 72 to 24 hours.")

    # 1. Check if diagnostic time is reduced from 72 to 24 hours
    mean_trad_time = df['traditional_diagnosis_time'].mean()
    mean_ai_time = df['ai_diagnosis_time'].mean()
    time_reduction_pct = ((mean_trad_time - mean_ai_time) / mean_trad_time) * 100

    print("\n1. Time Reduction Analysis:")
    print(f"   - Average traditional diagnostic time: {mean_trad_time:.2f} hours")
    print(f"   - Average AI-assisted diagnostic time: {mean_ai_time:.2f} hours")
    print(f"   - Time reduction: {mean_trad_time - mean_ai_time:.2f} hours ({time_reduction_pct:.2f}%)")

    # Statistical test for time reduction. Missing values are dropped first
    # because a single NaN makes the test return NaN.
    t_stat, p_val = stats.ttest_1samp(df['ai_diagnosis_time'].dropna(), 24)
    print(f"   - T-test comparing AI diagnostic time to 24 hours: t={t_stat:.4f}, p={p_val:.8f}")
    if pd.isna(p_val):
        # A NaN p-value compares False against 0.05 and would otherwise be
        # reported as a significant difference.
        print("   - Conclusion: the t-test is undefined for this data (too few or constant values)")
    else:
        print(f"   - Conclusion: AI diagnostic time is {'not ' if p_val > 0.05 else ''}significantly different from 24 hours")

    # 2. Analyze improvement by diagnosis category
    # Derive the category from the diagnosis text. Rules are applied in order,
    # so a later rule overrides an earlier one for the same row.
    diagnosis = df['diagnosis']
    category_rules = [
        ('Melanoma', diagnosis == 'melanoma'),
        ('Keratosis', diagnosis.str.contains('keratosis', case=False, na=False)),
        ('Nevus', diagnosis == 'nevus'),
        ('Lentigo', diagnosis.str.contains('lentigo', case=False, na=False)),
    ]
    category = pd.Series('Other', index=df.index)
    for category_name, matches in category_rules:
        category = category.mask(matches, category_name)

    # Work on a copy so the caller's DataFrame does not gain a column
    categorized = df.assign(diagnosis_category=category)

    # Calculate survival improvement by diagnosis category
    category_survival = categorized.groupby('diagnosis_category')['survival_probability_improvement'].mean().sort_values(ascending=False)

    # Calculate time saved by diagnosis category
    category_time = categorized.groupby('diagnosis_category')['time_saved_percentage'].mean().sort_values(ascending=False)

    # 3. Analyze relationship between adoption readiness and benefits
    # Calculate correlation between readiness and benefits
    correlation = df[['tech_readiness_score', 'time_saved_percentage',
                      'survival_probability_improvement']].corr()

    print("\n2. Improvement by Diagnosis Category:")
    print("   Survival probability improvement:")
    for category_name, improvement in category_survival.items():
        print(f"   - {category_name}: {improvement:.2f} percentage points")

    print("\n   Time saved percentage:")
    for category_name, time_saved in category_time.items():
        print(f"   - {category_name}: {time_saved:.2f}%")

    print("\n3. Technology Readiness and Benefits:")
    print(f"   Correlation between tech readiness and time saved: {correlation.loc['tech_readiness_score', 'time_saved_percentage']:.4f}")
    print(f"   Correlation between tech readiness and survival improvement: {correlation.loc['tech_readiness_score', 'survival_probability_improvement']:.4f}")

    # 4. Analyze malignant case outcomes specifically
    malignant_df = df[df['benign_malignant'] == 'malignant']

    malignant_trad_time = malignant_df['traditional_diagnosis_time'].mean()
    malignant_time_reduction = ((malignant_trad_time - malignant_df['ai_diagnosis_time'].mean()) /
                                malignant_trad_time * 100)

    malignant_survival_improvement = (malignant_df['ai_survival_probability'].mean() -
                                      malignant_df['traditional_survival_probability'].mean())

    print("\n4. Malignant Case Analysis:")
    print(f"   - Number of malignant cases: {len(malignant_df)}")
    print(f"   - Time reduction for malignant cases: {malignant_time_reduction:.2f}%")
    print(f"   - Survival probability improvement for malignant cases: {malignant_survival_improvement:.2f} percentage points")

    # Overall conclusion
    print("\nHypothesis Testing Conclusion:")
    if time_reduction_pct >= 66.67:  # Reduction from 72 to 24 hours is roughly 66.67%
        print("✓ The data supports the hypothesis about reducing diagnostic time from 72 to 24 hours.")
    else:
        print("✗ The data does not fully support the hypothesis about reducing diagnostic time from 72 to 24 hours.")
        print(f"  Actual reduction: {time_reduction_pct:.2f}% (vs. required 66.67%)")

    # Generate a summary table
    summary_data = {
        'Metric': [
            'Average Traditional Diagnostic Time (hours)',
            'Average AI-Assisted Diagnostic Time (hours)',
            'Time Reduction (%)',
            'Average Technology Readiness Score',
            'Average Survival Probability Improvement',
            'Average Cost Difference (%)'
        ],
        'Value': [
            mean_trad_time,
            mean_ai_time,
            time_reduction_pct,
            df['tech_readiness_score'].mean(),
            df['survival_probability_improvement'].mean(),
            df['cost_difference_percentage'].mean()
        ]
    }

    summary_df = pd.DataFrame(summary_data)
    print("\nSummary statistics:")
    print(summary_df)

def correlation_analysis(df):
    """
    Print correlations between the numeric variables of the dataset.

    Shows the correlation of every variable with four focus variables, then
    the ten variable pairs with the largest absolute correlation. The value
    printed for those pairs is the magnitude, so the sign is not shown.
    Pairs whose correlation is undefined (NaN, e.g. when a column is constant)
    are left out of the ranking.

    Parameters:
    -----------
    df : pd.DataFrame
        The dataset to analyze
    """
    print("\n===== CORRELATION ANALYSIS =====")

    # Select relevant numeric columns for correlation analysis
    numeric_cols = [
        'age_approx', 'target', 'traditional_diagnosis_time', 'ai_diagnosis_time',
        'time_saved_percentage', 'tech_readiness_score', 'traditional_total_cost',
        'ai_total_cost', 'cost_difference_percentage', 'traditional_survival_probability',
        'ai_survival_probability', 'survival_probability_improvement',
        'traditional_efficiency_score', 'ai_efficiency_score', 'efficiency_improvement'
    ]

    # Calculate correlation matrix
    corr_matrix = df[numeric_cols].corr()

    # One section per focus variable: every other variable, most positive first
    focus_sections = [
        ("\nCorrelations with target variable (cancer diagnosis):", 'target'),
        ("\nCorrelations with time saved percentage:", 'time_saved_percentage'),
        ("\nCorrelations with survival probability improvement:", 'survival_probability_improvement'),
        ("\nCorrelations with efficiency improvement:", 'efficiency_improvement'),
    ]
    for heading, focus_var in focus_sections:
        print(heading)
        ranked = corr_matrix[focus_var].sort_values(ascending=False)
        for var, corr in ranked.items():
            if var != focus_var:
                print(f"  {var}: {corr:.4f}")

    # Print strongest overall correlations
    print("\nTop 10 strongest correlations overall:")
    corr_pairs = []
    for position, first_var in enumerate(numeric_cols):
        # Only pairs above the diagonal, so each pair is listed once
        for second_var in numeric_cols[position + 1:]:
            strength = abs(corr_matrix.loc[first_var, second_var])
            # NaN compares False with everything, which would make the sort
            # order unreliable; undefined correlations are skipped instead.
            if pd.notna(strength):
                corr_pairs.append((first_var, second_var, strength))

    corr_pairs.sort(key=lambda pair: pair[2], reverse=True)
    for var1, var2, corr in corr_pairs[:10]:
        print(f"  {var1} & {var2}: {corr:.4f}")

def cost_benefit_analysis(df, dataset_days=180, staff_time_share=0.8):
    """
    Perform a cost-benefit analysis of AI implementation and print it.

    Compares average cost and diagnostic time of the two methods, estimates
    staff time saved, breaks costs down by hospital tier and country,
    extrapolates annual savings, and reports the incremental cost per
    percentage point of survival improvement.

    Parameters:
    -----------
    df : pd.DataFrame
        The dataset to analyze
    dataset_days : float, default 180
        Assumed number of days the dataset covers; used to scale the number
        of cases up to a full year.
    staff_time_share : float, default 0.8
        Assumed fraction of the diagnostic time saved that is staff time.
        Only used when the staff-hours columns are absent.
    """
    print("\n===== COST-BENEFIT ANALYSIS =====")

    # Calculate average costs (positive cost_diff means the AI method is cheaper)
    trad_cost = df['traditional_total_cost'].mean()
    ai_cost = df['ai_total_cost'].mean()
    cost_diff = trad_cost - ai_cost

    # Calculate average times
    trad_time = df['traditional_diagnosis_time'].mean()
    ai_time = df['ai_diagnosis_time'].mean()
    time_diff = trad_time - ai_time

    # Calculate staff time savings
    if 'traditional_staff_hours' in df.columns and 'ai_staff_hours' in df.columns:
        staff_time_trad = df['traditional_staff_hours'].mean()
        staff_time_ai = df['ai_staff_hours'].mean()
        staff_time_saved_pct = ((staff_time_trad - staff_time_ai) / staff_time_trad) * 100
    else:
        # No measured staff hours: estimate from the overall time reduction
        staff_time_saved_pct = (time_diff / trad_time) * 100 * staff_time_share

    cost_columns = ['traditional_total_cost', 'ai_total_cost', 'cost_difference_percentage']

    # Analyze by hospital tier
    tier_costs = df.groupby('hospital_tier')[cost_columns].mean()

    # Average costs by country
    country_costs = df.groupby('country')[cost_columns].mean()

    print("\nOverall cost comparison:")
    print(f"  Traditional method average cost: {trad_cost:.2f} EUR")
    print(f"  AI-assisted method average cost: {ai_cost:.2f} EUR")
    print(f"  Average cost difference: {cost_diff:.2f} EUR ({(cost_diff/trad_cost*100):.2f}%)")

    print("\nTime efficiency:")
    print(f"  Traditional method average time: {trad_time:.2f} hours")
    print(f"  AI-assisted method average time: {ai_time:.2f} hours")
    print(f"  Average time saved: {time_diff:.2f} hours ({(time_diff/trad_time*100):.2f}%)")

    print("\nStaff resource efficiency:")
    print(f"  Estimated staff time saved: {staff_time_saved_pct:.2f}%")

    print("\nCost analysis by hospital tier:")
    print(tier_costs)

    print("\nCost analysis by country:")
    print(country_costs)

    # Calculate potential savings for full implementation
    total_cases = len(df)
    annual_cases_est = total_cases * (365 / dataset_days)
    total_annual_savings = cost_diff * annual_cases_est

    print("\nEstimated annual cost impact:")
    print(f"  Estimated annual cases: {annual_cases_est:.0f}")
    print(f"  Estimated annual savings: {total_annual_savings:.2f} EUR")

    # Cost-effectiveness analysis
    if 'survival_probability_improvement' in df.columns:
        avg_survival_improvement = df['survival_probability_improvement'].mean()

        print("\nCost-effectiveness analysis:")
        print(f"  Average survival probability improvement: {avg_survival_improvement:.2f} percentage points")
        if avg_survival_improvement > 0:
            # Incremental cost of the AI method per point gained. One sign
            # convention for both outcomes: negative is a saving, positive is
            # an additional cost.
            incremental_cost = (ai_cost - trad_cost) / avg_survival_improvement
            outcome = "saving" if cost_diff > 0 else "additional cost"
            print(f"  Cost per percentage point improvement: {incremental_cost:.2f} EUR ({outcome})")
        else:
            # A ratio against a zero, negative or undefined improvement has no meaning
            print("  Cost per percentage point improvement: n/a (no average survival improvement)")

def run_complete_eda():
    """
    Load the dataset with load_data() and run every analysis step on it in a
    fixed order, then print a completion banner.
    """
    dataset = load_data()

    # Each step receives the same DataFrame; order matches the report layout
    analysis_steps = (
        basic_eda,
        clinical_analysis,
        ai_comparison_analysis,
        geographic_analysis,
        hypothesis_testing,
        correlation_analysis,
        cost_benefit_analysis,
    )
    for step in analysis_steps:
        step(dataset)

    print("\n===== EDA COMPLETE =====")
    print("All analyses have been completed successfully.")

# Run the full EDA pipeline when this code is executed as the main module
# (__name__ == "__main__"); importing the code from elsewhere would only
# define the functions above without running them.
if __name__ == "__main__":
    run_complete_eda()

Loading dataset...
Dataset loaded with 33126 rows and 23 columns

===== BASIC DATASET EXPLORATION =====

Data types and missing values:
                                 Data Type  Missing Values  Missing Percentage
image_name                          object               0                0.00
patient_id                          object               0                0.00
sex                                 object              65                0.20
age_approx                         float64              68                0.21
anatom_site_general_challenge       object             527                1.59
diagnosis                           object               0                0.00
benign_malignant                    object               0                0.00
target                               int64               0                0.00
traditional_diagnosis_time         float64               0                0.00
ai_diagnosis_time                  float64               0                

In [6]:
import pandas as pd
import numpy as np

def _print_value_distribution(series, sort_index=False):
    """Print one ``  value: count (percent%)`` line per distinct value of ``series``.

    Missing values are counted too and shown under the label ``Missing``.
    Percentages are relative to the full length of ``series``. Counts and
    percentages are derived together while iterating, so no label-based lookup
    (which is fragile for NaN labels) is needed.
    """
    counts = series.value_counts(dropna=False)
    if sort_index:
        counts = counts.sort_index()
    total = len(series)
    for value, count in counts.items():
        label = value if pd.notna(value) else 'Missing'
        print(f"  {label}: {count} ({(count / total) * 100:.2f}%)")


def _print_target_rates(df, column):
    """Print the row-normalised (percent) cross-tabulation of ``column`` vs ``target``."""
    table = pd.crosstab(df[column], df['target'], margins=True, normalize='index')
    print(table.multiply(100).round(2))


def perform_isic_eda(file_path='ISIC_2020_Training_GroundTruth.csv'):
    """
    Perform a basic exploratory data analysis on the ISIC 2020 dataset
    without any visualizations.

    The report is printed to stdout and covers: basic information, missing
    values, categorical distributions, age statistics, cross-tabulations
    against ``target``, age-group analysis, per-patient statistics and a short
    summary for the research hypothesis.

    Parameters:
    -----------
    file_path : str
        Path to the ISIC_2020_Training_GroundTruth.csv file

    Returns:
    --------
    pandas.DataFrame or None
        The loaded data with an added categorical ``age_group`` column, or
        ``None`` if the file could not be read (the error is printed).
    """
    print("Loading ISIC 2020 dataset...")

    # Load the data
    try:
        df = pd.read_csv(file_path)
        print(f"Dataset loaded successfully with {df.shape[0]} rows and {df.shape[1]} columns.")
    except Exception as e:
        print(f"Error loading the dataset: {e}")
        return None

    # 1. Basic dataset information
    print("\n===== BASIC DATASET INFORMATION =====")
    print("\nFirst 5 rows:")
    print(df.head())

    print("\nData types:")
    print(df.dtypes)

    print("\nBasic statistics:")
    print(df.describe(include='all').T)

    # 2. Missing values analysis
    print("\n===== MISSING VALUES ANALYSIS =====")
    missing_values = df.isnull().sum()
    missing_percent = (missing_values / len(df)) * 100

    missing_df = pd.DataFrame({
        'Missing Values': missing_values,
        'Percentage': missing_percent.round(2)
    })

    print(missing_df)

    # 3. Categorical variable analysis
    print("\n===== CATEGORICAL VARIABLES ANALYSIS =====")

    print("\nSex distribution:")
    _print_value_distribution(df['sex'])

    print("\nAnatomical site distribution:")
    _print_value_distribution(df['anatom_site_general_challenge'])

    print("\nDiagnosis distribution:")
    _print_value_distribution(df['diagnosis'])

    print("\nBenign/Malignant distribution:")
    _print_value_distribution(df['benign_malignant'])

    print("\nTarget distribution:")
    _print_value_distribution(df['target'])

    # 4. Numerical variable analysis
    print("\n===== NUMERICAL VARIABLES ANALYSIS =====")

    # Age analysis
    print("\nAge statistics:")
    age_stats = df['age_approx'].describe()
    print(f"  Count: {age_stats['count']}")
    print(f"  Mean: {age_stats['mean']:.2f}")
    print(f"  Std: {age_stats['std']:.2f}")
    print(f"  Min: {age_stats['min']:.2f}")
    print(f"  25%: {age_stats['25%']:.2f}")
    print(f"  50% (Median): {age_stats['50%']:.2f}")
    print(f"  75%: {age_stats['75%']:.2f}")
    print(f"  Max: {age_stats['max']:.2f}")

    # 5. Cross-tabulation analysis
    print("\n===== CROSS-TABULATION ANALYSIS =====")

    print("\nTarget distribution by sex:")
    _print_target_rates(df, 'sex')

    print("\nTarget distribution by anatomical site:")
    _print_target_rates(df, 'anatom_site_general_challenge')

    print("\nTarget distribution by diagnosis:")
    _print_target_rates(df, 'diagnosis')

    # Benign/Malignant vs. Target (should be perfectly correlated if benign=0, malignant=1)
    print("\nBenign/Malignant vs. Target:")
    bm_target = pd.crosstab(df['benign_malignant'], df['target'], margins=True)
    print(bm_target)

    # 6. Age distribution analysis
    print("\n===== AGE DISTRIBUTION ANALYSIS =====")

    # Age groups. Bins are right-closed, so include_lowest=True is required for
    # an age of exactly 0 to land in '0-18' instead of becoming NaN, and the
    # last edge is infinite so that '76+' really is open-ended.
    age_bins = [0, 18, 30, 45, 60, 75, float('inf')]
    age_labels = ['0-18', '19-30', '31-45', '46-60', '61-75', '76+']

    df['age_group'] = pd.cut(
        df['age_approx'], bins=age_bins, labels=age_labels, include_lowest=True
    )

    # Age group distribution (sorted by bin order rather than by frequency)
    print("\nAge group distribution:")
    _print_value_distribution(df['age_group'], sort_index=True)

    # Target by age group
    print("\nTarget distribution by age group:")
    _print_target_rates(df, 'age_group')

    # 7. Patient analysis
    print("\n===== PATIENT ANALYSIS =====")

    # Number of unique patients
    unique_patients = df['patient_id'].nunique()
    print(f"\nNumber of unique patients: {unique_patients}")

    # Images per patient
    images_per_patient = df.groupby('patient_id').size().describe()
    print("\nImages per patient statistics:")
    print(f"  Mean: {images_per_patient['mean']:.2f}")
    print(f"  Std: {images_per_patient['std']:.2f}")
    print(f"  Min: {images_per_patient['min']:.0f}")
    print(f"  25%: {images_per_patient['25%']:.0f}")
    print(f"  50% (Median): {images_per_patient['50%']:.0f}")
    print(f"  75%: {images_per_patient['75%']:.0f}")
    print(f"  Max: {images_per_patient['max']:.0f}")

    # Row masks are computed once and reused for the patient overlap and the summary.
    is_benign = df['benign_malignant'] == 'benign'
    is_malignant = df['benign_malignant'] == 'malignant'

    # Patients with both benign and malignant lesions
    patients_with_benign = set(df.loc[is_benign, 'patient_id'].unique())
    patients_with_malignant = set(df.loc[is_malignant, 'patient_id'].unique())
    patients_with_both = patients_with_benign & patients_with_malignant

    print(f"\nPatients with both benign and malignant lesions: {len(patients_with_both)}")

    # 8. Summary analysis for research hypothesis
    print("\n===== SUMMARY FOR HYPOTHESIS TESTING =====")
    print("For research hypothesis: Implementation of standardized AI-assisted diagnostic imaging systems in EU hospitals will increase cancer detection accuracy by 15% and reduce diagnostic time from 72 to 24 hours.")

    total_images = len(df)
    benign_count = int(is_benign.sum())
    malignant_count = int(is_malignant.sum())

    print("\nRelevant statistics from ISIC 2020 dataset:")
    print(f"  - Total number of images: {total_images}")
    print(f"  - Unique patients: {unique_patients}")
    print(f"  - Benign cases: {benign_count} ({benign_count/total_images*100:.2f}%)")
    print(f"  - Malignant cases: {malignant_count} ({malignant_count/total_images*100:.2f}%)")

    # Calculate class imbalance ratio; it is undefined without malignant cases,
    # so report that instead of raising ZeroDivisionError.
    if malignant_count > 0:
        imbalance_ratio = benign_count / malignant_count
        print(f"  - Class imbalance ratio (benign:malignant): {imbalance_ratio:.2f}:1")
    else:
        print("  - Class imbalance ratio (benign:malignant): undefined (no malignant cases)")

    return df

# Entry point: run the ISIC EDA with the default CSV path only when this code
# is executed as the main module (not when it is imported).
if __name__ == "__main__":
    perform_isic_eda()

Loading ISIC 2020 dataset...
Dataset loaded successfully with 33126 rows and 8 columns.

===== BASIC DATASET INFORMATION =====

First 5 rows:
     image_name  patient_id     sex  age_approx anatom_site_general_challenge  \
0  ISIC_2637011  IP_7279968    male        45.0                     head/neck   
1  ISIC_0015719  IP_3075186  female        45.0               upper extremity   
2  ISIC_0052212  IP_2842074  female        50.0               lower extremity   
3  ISIC_0068279  IP_6890425  female        45.0                     head/neck   
4  ISIC_0074268  IP_8723313  female        55.0               upper extremity   

  diagnosis benign_malignant  target  
0   unknown           benign       0  
1   unknown           benign       0  
2     nevus           benign       0  
3   unknown           benign       0  
4   unknown           benign       0  

Data types:
image_name                        object
patient_id                        object
sex                               object
a

In [7]:
import pandas as pd
import scipy.stats as stats

# Read the combined dataset that carries both diagnostic-time columns
df = pd.read_csv('complete_cancer_detection_dataset.csv')

# Pull out the two paired measurements and their per-row difference once
traditional_time = df['traditional_diagnosis_time']
ai_time = df['ai_diagnosis_time']
time_saved = traditional_time - ai_time

# Paired t-test: each row contributes one traditional and one AI measurement
t_stat, p_value = stats.ttest_rel(traditional_time, ai_time)

mean_traditional = traditional_time.mean()
mean_ai = ai_time.mean()
mean_saved = time_saved.mean()

# Significance is judged at the 0.05 level
if p_value < 0.05:
    verdict = 'Significant'
else:
    verdict = 'Not significant'

print("Paired t-test for Traditional vs AI Diagnostic Time")
print(f"t-statistic: {t_stat:.4f}")
print(f"p-value: {p_value:.8f}")
print(f"Mean traditional time: {mean_traditional:.2f} hours")
print(f"Mean AI time: {mean_ai:.2f} hours")
print(f"Mean difference: {mean_saved:.2f} hours")
print(f"Percentage reduction: {(mean_saved / mean_traditional * 100):.2f}%")
print(f"Statistical significance: {verdict}")

Paired t-test for Traditional vs AI Diagnostic Time
t-statistic: 594.4919
p-value: 0.00000000
Mean traditional time: 69.05 hours
Mean AI time: 23.60 hours
Mean difference: 45.45 hours
Percentage reduction: 65.82%
Statistical significance: Significant
