In [17]:
# In data_preparation.ipynb
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm  # For progress bars
%matplotlib inline


In [18]:
# Define the paths.
# Google Drive is reached through /content/mydrive everywhere else in this
# notebook (raw image listing, Train.csv, augmentation input), so the processed
# images have to be written under that same mount point. The previous
# '/content/drive/...' value pointed at a different location, which meant the
# later cells that read '.../data/processed' from /content/mydrive could not
# see the files written here.
RAW_DATA_DIR = '/content/mydrive/MyDrive/Colab Notebooks/Malaria_Categorization_Project/data/raw/'
PROCESSED_DATA_DIR = '/content/mydrive/MyDrive/Colab Notebooks/Malaria_Categorization_Project/data/processed'


# Create processed data directory if it doesn't exist.
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)


In [19]:
# Collect the name of every entry found in the raw data directory and report
# how many there are (entries are not filtered by file type here).
raw_images = os.listdir(RAW_DATA_DIR)
raw_image_count = len(raw_images)
print(f"Number of raw images: {raw_image_count}")




Number of raw images: 3925


In [20]:
def preprocess_image(image_path, output_size=(224, 224), black_threshold=50):
    """Load an image, whiten near-black pixels, resize it and scale it to [0, 1].

    Args:
        image_path: Path of the image file, read with ``cv2.imread``.
        output_size: Target size handed to ``cv2.resize``; OpenCV interprets
            it as ``(width, height)``.
        black_threshold: Inclusive upper bound (0-255) tested against each of
            the R, G and B channels. A pixel whose three channels are all at
            or below this value is treated as non-cell background and painted
            white. The default of 50 is the value that used to be hard-coded.

    Returns:
        The resized image as a float array in RGB channel order with values
        in [0, 1], or ``None`` if OpenCV could not read the file (a message
        is printed in that case).

    Raises:
        ValueError: If ``black_threshold`` is outside 0-255.

    Note:
        Resizing changes the pixel grid. Any annotations expressed in the
        original image's pixel coordinates (e.g. bounding boxes) are NOT
        rescaled here and must be converted by the caller.
    """
    if not 0 <= black_threshold <= 255:
        raise ValueError(f"black_threshold must be within 0-255, got {black_threshold}")

    # Load the image (OpenCV returns None instead of raising on failure)
    img = cv2.imread(image_path)
    if img is None:
        print(f"Failed to load image: {image_path}")
        return None

    # OpenCV loads BGR; convert to RGB
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Remove non-cell parts: every pixel whose channels all fall inside
    # [0, black_threshold] is replaced by white.
    lower_black = np.array([0, 0, 0])
    upper_black = np.array([black_threshold] * 3)
    mask_black = cv2.inRange(img_rgb, lower_black, upper_black)
    img_rgb[mask_black > 0] = 255

    # Resize image
    img_resized = cv2.resize(img_rgb, output_size)

    # Normalize pixel values (0 to 1)
    return img_resized / 255.0


In [None]:
# Process and save all images.
# Read failures are already reported by preprocess_image; write failures were
# previously invisible because cv2.imwrite signals them through its boolean
# return value rather than an exception. Both are now collected and summarised.
failed_reads = []
failed_writes = []

for image_name in tqdm(raw_images):
    image_path = os.path.join(RAW_DATA_DIR, image_name)
    processed_image = preprocess_image(image_path)

    if processed_image is None:
        failed_reads.append(image_name)
        continue

    # preprocess_image returns floats in [0, 1]; convert back to uint8 for saving
    processed_image_uint8 = (processed_image * 255).astype(np.uint8)

    # Save processed image under the same file name (cv2.imwrite expects BGR)
    save_path = os.path.join(PROCESSED_DATA_DIR, image_name)
    saved = cv2.imwrite(save_path, cv2.cvtColor(processed_image_uint8, cv2.COLOR_RGB2BGR))
    if not saved:
        print(f"Failed to save image: {save_path}")
        failed_writes.append(image_name)

print(
    f"Processed {len(raw_images) - len(failed_reads) - len(failed_writes)} of {len(raw_images)} entries "
    f"({len(failed_reads)} unreadable, {len(failed_writes)} not saved)"
)


  1%|          | 40/3925 [00:28<28:30,  2.27it/s]

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the training annotations from Drive into a DataFrame
train_csv_path = '/content/mydrive/MyDrive/Colab Notebooks/Malaria_Categorization_Project/data/Train.csv'
train_labels = pd.read_csv(train_csv_path)

# Preview the first five rows as plain text
preview = train_labels.head()
print(preview)


            Image_ID        class  confidence  ymin  xmin  ymax  xmax
0  id_u3q6jdck4j.jpg  Trophozoite         1.0   712  1241   737  1270
1  id_a6cl90trri.jpg  Trophozoite         1.0   558  1566   600  1604
2  id_qvc2le9sm8.jpg  Trophozoite         1.0  1317  2788  1448  2914
3  id_w8xnbd5rvm.jpg  Trophozoite         1.0   925  1744  1041  1823
4  id_6dop09rk02.jpg          NEG         1.0     0     0     0     0


In [3]:
# Distinct values of the 'class' column, in order of first appearance
class_column = train_labels['class']
unique_classes = class_column.unique()
print(f"Unique classes: {unique_classes}")


Unique classes: ['Trophozoite' 'NEG' 'WBC']


In [4]:
# Number of annotation rows per class, most frequent first
class_counts = train_labels.loc[:, 'class'].value_counts()
print(class_counts)


class
Trophozoite    15838
WBC             7004
NEG              688
Name: count, dtype: int64


In [8]:
import albumentations as A
from albumentations.pytorch import ToTensorV2
import cv2
import os
import pandas as pd
from tqdm import tqdm

# Define paths: every input and output of this step lives under one Drive folder
data_dir = '/content/mydrive/MyDrive/Colab Notebooks/Malaria_Categorization_Project/data'
raw_images_path = os.path.join(data_dir, 'raw')          # original images the annotations refer to
images_path = os.path.join(data_dir, 'processed')        # resized images that get augmented
output_path = os.path.join(data_dir, 'augmented_images')
os.makedirs(output_path, exist_ok=True)

# Load the CSV file (one bounding-box annotation per row)
labels_df = pd.read_csv(os.path.join(data_dir, 'Train.csv'))

# Define augmentation pipeline with Albumentations.
# Boxes are handed over as fractions of the image size, which is the
# 'albumentations' format. ('pascal_voc' means absolute pixels; declaring it
# while passing fractions made the library shrink every box to almost nothing.)
transform = A.Compose([
    A.Rotate(limit=90, p=0.5),
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),
    A.RandomScale(scale_limit=0.1, p=0.5),
], bbox_params=A.BboxParams(format='albumentations', label_fields=['class_labels']))


def _clip_unit(value):
    """Clamp a fractional coordinate to the [0, 1] range the bbox format requires."""
    return min(max(float(value), 0.0), 1.0)


# The box coordinates in Train.csv go far beyond the 224x224 size of the
# processed images, so they are taken to be pixels of the ORIGINAL images.
# The original size is therefore needed to turn them into fractions; it is
# looked up once per image and cached (None marks an unreadable raw image).
raw_sizes = {}

# NOTE(review): each annotation row is augmented on its own, so an image with
# several boxes yields several augmented copies that each carry a single box.
# If the data feeds an object detector, group rows by Image_ID instead.
new_rows = []
skipped_rows = 0
for idx, row in tqdm(labels_df.iterrows(), total=len(labels_df)):
    image_id = row['Image_ID']
    xmin, ymin, xmax, ymax = row['xmin'], row['ymin'], row['xmax'], row['ymax']

    # Skip invalid boxes (e.g. the all-zero boxes of NEG rows) before any file I/O
    if xmin >= xmax or ymin >= ymax:
        skipped_rows += 1
        continue

    image_path = os.path.join(images_path, image_id)
    img = cv2.imread(image_path)
    if img is None:
        print(f"Failed to load image: {image_path}")
        skipped_rows += 1
        continue

    # Size of the original image the annotation was drawn on
    if image_id not in raw_sizes:
        raw_img = cv2.imread(os.path.join(raw_images_path, image_id))
        raw_sizes[image_id] = None if raw_img is None else raw_img.shape[:2]
    if raw_sizes[image_id] is None:
        print(f"Failed to load raw image for size lookup: {os.path.join(raw_images_path, image_id)}")
        skipped_rows += 1
        continue
    raw_height, raw_width = raw_sizes[image_id]

    # Normalize bounding box against the image it was annotated on
    norm_box = [
        _clip_unit(xmin / raw_width),
        _clip_unit(ymin / raw_height),
        _clip_unit(xmax / raw_width),
        _clip_unit(ymax / raw_height),
    ]
    if norm_box[0] >= norm_box[2] or norm_box[1] >= norm_box[3]:
        # Box lies entirely outside the image once clipped
        skipped_rows += 1
        continue

    # Apply augmentation
    augmented = transform(image=img, bboxes=[norm_box], class_labels=[row['class']])
    if len(augmented['bboxes']) == 0:
        # The box left the frame during augmentation; an image without a
        # label row would be unreachable from the CSV, so do not save it.
        skipped_rows += 1
        continue

    # RandomScale may have changed the image size, so boxes must be
    # denormalized with the size of the AUGMENTED image, not the input.
    aug_img = augmented['image']
    aug_height, aug_width = aug_img.shape[:2]

    # Save augmented image and update labels
    aug_img_name = f'aug_{idx}_{image_id}'
    aug_img_path = os.path.join(output_path, aug_img_name)
    if not cv2.imwrite(aug_img_path, aug_img):
        print(f"Failed to save image: {aug_img_path}")
        skipped_rows += 1
        continue

    # Denormalize the bounding boxes to save (pixels of the augmented image)
    for aug_box in augmented['bboxes']:
        new_row = row.to_dict()
        new_row['Image_ID'] = aug_img_name
        new_row['xmin'] = int(round(aug_box[0] * aug_width))
        new_row['ymin'] = int(round(aug_box[1] * aug_height))
        new_row['xmax'] = int(round(aug_box[2] * aug_width))
        new_row['ymax'] = int(round(aug_box[3] * aug_height))
        new_row['augmented'] = True
        new_rows.append(new_row)

print(f"Created {len(new_rows)} augmented annotations; skipped {skipped_rows} rows")

# Combine the original DataFrame with augmented rows. Original rows are flagged
# augmented=False (instead of being left as NaN) and keep their coordinates in
# original-image pixels; augmented rows are in pixels of their augmented image.
original_df = labels_df.assign(augmented=False)
if new_rows:
    augmented_df = pd.concat([original_df, pd.DataFrame(new_rows)], ignore_index=True)
else:
    augmented_df = original_df

# Save updated CSV with augmented bounding boxes
augmented_df.to_csv(os.path.join(data_dir, 'augmented_labels.csv'), index=False)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  & (clipped_box_areas / denormalized_box_areas >= min_visibility - epsilon)
  & (clipped_box_areas / denormalized_box_areas >= min_visibility - epsilon)
  & (clipped_box_areas / denormalized_box_areas >= min_visibility - epsilon)
  & (clipped_box_areas / denormalized_box_areas >= min_visibility - epsilon)
  & (clipped_box_areas / denormalized_box_areas >= min_visibility - epsilon)
  & (clipped_box_areas / denormalized_box_areas >= min_visibility - epsilon)
  & (clipped_box_areas / denormalized_box_areas >= min_visibility - epsilon)
  & (clipped_box_areas / denormalized_box_areas >= min_visibility - epsilon)
  & (clipped_box_areas / denormalized_box_areas >= min_visibility - epsilon)
  & (clipped_box_areas / denormalized_box_areas >= min_visibility - epsilon)
  & (clipped_box_areas / denormalized_box_areas >= min_visibility - epsilon)
  & (clipped_box_areas / denormalized_box_areas >= min_visibility - epsilon)
  & (clippe

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the combined (original + augmented) annotation table written by the
# augmentation step
augmented_csv_path = '/content/mydrive/MyDrive/Colab Notebooks/Malaria_Categorization_Project/data/augmented_labels.csv'
train_labels = pd.read_csv(augmented_csv_path)

# Preview the first five rows as plain text
print(train_labels.head())

# Summarise the 'class' column: distinct labels first, then rows per label
class_column = train_labels['class']

unique_classes = class_column.unique()
print(f"Unique classes: {unique_classes}")

class_counts = class_column.value_counts()
print(class_counts)

