# Eye Disease Dataset Analysis
## Task 1: Data Exploration and Missing Values Analysis

**Team Members:** Isaac + Jonathan  
**Objective:** Explore dataset, identify missing values, and calculate percentage of missing data

### Dataset Overview
This dataset contains eye disease images categorized into:
- Cataract
- Diabetic Retinopathy  
- Glaucoma
- Normal (healthy eyes)

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import cv2
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Set up plotting style
plt.style.use('default')
sns.set_palette("husl")

print("Libraries imported successfully!")

ModuleNotFoundError: No module named 'cv2'

In [None]:
# Define dataset path
dataset_path = "dataset"

# Get all category folders
categories = [folder for folder in os.listdir(dataset_path) 
              if os.path.isdir(os.path.join(dataset_path, folder))]

print(f"Categories found: {categories}")
print(f"Total categories: {len(categories)}")

# Count images in each category
category_counts = {}
total_images = 0

for category in categories:
    category_path = os.path.join(dataset_path, category)
    # Count image files (jpg, jpeg, png)
    image_files = [f for f in os.listdir(category_path) 
                   if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
    category_counts[category] = len(image_files)
    total_images += len(image_files)
    print(f"{category}: {len(image_files)} images")

print(f"\nTotal images in dataset: {total_images}")

In [None]:
# Create a DataFrame for dataset overview
dataset_overview = pd.DataFrame({
    'Category': list(category_counts.keys()),
    'Image_Count': list(category_counts.values())
})

# Calculate percentages
dataset_overview['Percentage'] = (dataset_overview['Image_Count'] / total_images * 100).round(2)

print("Dataset Overview:")
print(dataset_overview)

# Visualize the distribution
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Bar plot
ax1.bar(dataset_overview['Category'], dataset_overview['Image_Count'], color='skyblue', edgecolor='navy')
ax1.set_title('Number of Images per Category')
ax1.set_xlabel('Eye Disease Category')
ax1.set_ylabel('Number of Images')
ax1.tick_params(axis='x', rotation=45)

# Add value labels on bars
for i, v in enumerate(dataset_overview['Image_Count']):
    ax1.text(i, v + 10, str(v), ha='center', va='bottom')

# Pie chart
ax2.pie(dataset_overview['Image_Count'], labels=dataset_overview['Category'], autopct='%1.1f%%', startangle=90)
ax2.set_title('Distribution of Images by Category')

plt.tight_layout()
plt.show()

In [None]:
# Check for missing/corrupted images and analyze image properties
missing_data_analysis = {
    'category': [],
    'total_files': [],
    'readable_images': [],
    'corrupted_images': [],
    'missing_percentage': [],
    'avg_file_size_mb': [],
    'min_file_size_mb': [],
    'max_file_size_mb': []
}

print("Analyzing images for missing/corrupted data...\n")

for category in categories:
    category_path = os.path.join(dataset_path, category)
    
    # Get all image files
    image_files = [f for f in os.listdir(category_path) 
                   if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
    
    readable_count = 0
    corrupted_count = 0
    file_sizes = []
    
    for image_file in image_files:
        image_path = os.path.join(category_path, image_file)
        
        try:
            # Try to open and verify the image
            with Image.open(image_path) as img:
                img.verify()  # Verify it's a valid image
                readable_count += 1
                
                # Get file size in MB
                file_size_mb = os.path.getsize(image_path) / (1024 * 1024)
                file_sizes.append(file_size_mb)
                
        except Exception as e:
            corrupted_count += 1
            print(f"Corrupted image found: {image_path} - Error: {str(e)}")
    
    # Calculate statistics
    total_files = len(image_files)
    missing_percentage = (corrupted_count / total_files * 100) if total_files > 0 else 0
    
    # Store results
    missing_data_analysis['category'].append(category)
    missing_data_analysis['total_files'].append(total_files)
    missing_data_analysis['readable_images'].append(readable_count)
    missing_data_analysis['corrupted_images'].append(corrupted_count)
    missing_data_analysis['missing_percentage'].append(missing_percentage)
    
    if file_sizes:
        missing_data_analysis['avg_file_size_mb'].append(np.mean(file_sizes))
        missing_data_analysis['min_file_size_mb'].append(np.min(file_sizes))
        missing_data_analysis['max_file_size_mb'].append(np.max(file_sizes))
    else:
        missing_data_analysis['avg_file_size_mb'].append(0)
        missing_data_analysis['min_file_size_mb'].append(0)
        missing_data_analysis['max_file_size_mb'].append(0)
    
    print(f"{category}:")
    print(f"  - Total files: {total_files}")
    print(f"  - Readable images: {readable_count}")
    print(f"  - Corrupted images: {corrupted_count}")
    print(f"  - Missing/Corrupted percentage: {missing_percentage:.2f}%")
    if file_sizes:
        print(f"  - Average file size: {np.mean(file_sizes):.3f} MB")
    print()

# Create DataFrame for missing data analysis
missing_data_df = pd.DataFrame(missing_data_analysis)
print("Missing Data Analysis Summary:")
print(missing_data_df.round(3))

In [None]:
# Analyze image dimensions
print("Analyzing image dimensions...\n")

dimension_analysis = {
    'category': [],
    'avg_width': [],
    'avg_height': [],
    'min_width': [],
    'max_width': [],
    'min_height': [],
    'max_height': [],
    'unique_dimensions': []
}

for category in categories:
    category_path = os.path.join(dataset_path, category)
    image_files = [f for f in os.listdir(category_path) 
                   if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
    
    widths = []
    heights = []
    dimensions = set()
    
    # Sample first 50 images for dimension analysis (to speed up)
    sample_files = image_files[:50] if len(image_files) > 50 else image_files
    
    for image_file in sample_files:
        image_path = os.path.join(category_path, image_file)
        try:
            with Image.open(image_path) as img:
                width, height = img.size
                widths.append(width)
                heights.append(height)
                dimensions.add((width, height))
        except Exception:
            continue
    
    if widths and heights:
        dimension_analysis['category'].append(category)
        dimension_analysis['avg_width'].append(np.mean(widths))
        dimension_analysis['avg_height'].append(np.mean(heights))
        dimension_analysis['min_width'].append(np.min(widths))
        dimension_analysis['max_width'].append(np.max(widths))
        dimension_analysis['min_height'].append(np.min(heights))
        dimension_analysis['max_height'].append(np.max(heights))
        dimension_analysis['unique_dimensions'].append(len(dimensions))
        
        print(f"{category} (sample of {len(sample_files)} images):")
        print(f"  - Average dimensions: {np.mean(widths):.0f} x {np.mean(heights):.0f}")
        print(f"  - Width range: {np.min(widths)} - {np.max(widths)}")
        print(f"  - Height range: {np.min(heights)} - {np.max(heights)}")
        print(f"  - Unique dimensions: {len(dimensions)}")
        print()

# Create dimension analysis DataFrame
dimension_df = pd.DataFrame(dimension_analysis)
print("Dimension Analysis Summary:")
print(dimension_df.round(0))

In [None]:
# Create comprehensive summary visualization
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

# 1. Missing/Corrupted data percentage
ax1.bar(missing_data_df['category'], missing_data_df['missing_percentage'], color='coral')
ax1.set_title('Missing/Corrupted Data Percentage by Category')
ax1.set_xlabel('Category')
ax1.set_ylabel('Percentage (%)')
ax1.tick_params(axis='x', rotation=45)
for i, v in enumerate(missing_data_df['missing_percentage']):
    ax1.text(i, v + 0.1, f'{v:.2f}%', ha='center', va='bottom')

# 2. File size distribution
ax2.bar(missing_data_df['category'], missing_data_df['avg_file_size_mb'], color='lightblue')
ax2.set_title('Average File Size by Category')
ax2.set_xlabel('Category')
ax2.set_ylabel('File Size (MB)')
ax2.tick_params(axis='x', rotation=45)
for i, v in enumerate(missing_data_df['avg_file_size_mb']):
    ax2.text(i, v + 0.01, f'{v:.3f}', ha='center', va='bottom')

# 3. Image dimensions comparison
if not dimension_df.empty:
    categories_dim = dimension_df['category']
    x_pos = np.arange(len(categories_dim))
    width = 0.35
    
    ax3.bar(x_pos - width/2, dimension_df['avg_width'], width, label='Width', color='skyblue')
    ax3.bar(x_pos + width/2, dimension_df['avg_height'], width, label='Height', color='lightcoral')
    ax3.set_title('Average Image Dimensions by Category')
    ax3.set_xlabel('Category')
    ax3.set_ylabel('Pixels')
    ax3.set_xticks(x_pos)
    ax3.set_xticklabels(categories_dim, rotation=45)
    ax3.legend()

# 4. Data quality summary
readable_images = missing_data_df['readable_images'].sum()
corrupted_images = missing_data_df['corrupted_images'].sum()
total_analyzed = readable_images + corrupted_images

ax4.pie([readable_images, corrupted_images], 
        labels=['Readable Images', 'Corrupted Images'], 
        autopct='%1.2f%%', 
        colors=['lightgreen', 'lightcoral'],
        startangle=90)
ax4.set_title('Overall Data Quality')

plt.tight_layout()
plt.show()

# Overall summary
print("\n" + "="*60)
print("TASK 1: DATA EXPLORATION SUMMARY")
print("="*60)
print(f"Total images analyzed: {total_analyzed}")
print(f"Readable images: {readable_images} ({(readable_images/total_analyzed*100):.2f}%)")
print(f"Corrupted/Missing images: {corrupted_images} ({(corrupted_images/total_analyzed*100):.2f}%)")
print(f"Overall data quality: {(readable_images/total_analyzed*100):.2f}% good")
print("\nCategory-wise missing data:")
for _, row in missing_data_df.iterrows():
    print(f"  - {row['category']}: {row['missing_percentage']:.2f}% missing/corrupted")
print("="*60)

## Task 1 Findings and Observations

### Key Findings:
1. **Dataset Composition**: The dataset contains 4 categories of eye disease images
2. **Missing Data Analysis**: Systematic check for corrupted or unreadable image files
3. **File Quality**: Assessment of image file integrity and readability
4. **Dimension Analysis**: Understanding of image size variations across categories

### Observations:
- **Data Quality**: [Results will show percentage of corrupted/missing images]
- **File Size Consistency**: [Results will show if file sizes are consistent across categories]
- **Image Dimensions**: [Results will show if images have consistent dimensions]
- **Category Balance**: [Results will show if categories are balanced]

### Next Steps for Task 2:
Based on the missing data analysis, we can now:
1. **Identify** the specific missing data handling method needed
2. **Choose** appropriate strategies (removal, interpolation, or data augmentation)
3. **Justify** the chosen method based on the percentage and type of missing data found

### Data Quality Recommendations:
- If corrupted images < 5%: Remove corrupted files
- If corrupted images 5-15%: Consider data augmentation to balance
- If corrupted images > 15%: Investigate data collection process

**Ready for Task 2: Choose and justify missing data handling method** ✅