In [1]:
# In data_preparation.ipynb
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm  # For progress bars
%matplotlib inline


In [2]:
from config import *

In [3]:
# Input and output image directories are taken from the shared config module
RAW_DATA_DIR = GLOBAL_CONFIG_DATA_RAW_IMAGE_PATH
PROCESSED_DATA_DIR = GLOBAL_CONFIG_DATA_PROCESSED_IMAGE_PATH

# Create the processed data directory if it doesn't exist.
# exist_ok=True replaces the separate exists() check, so there is no window
# between checking and creating, and it matches how the augmentation output
# directory is created later in this notebook.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)


In [4]:
# List the entries of the raw data directory.
# os.listdir returns names in arbitrary order, so sort them to make the
# processing order identical from run to run.
raw_images = sorted(os.listdir(RAW_DATA_DIR))
print(f"Number of raw images: {len(raw_images)}")




Number of raw images: 3925


In [5]:
def preprocess_image(image_path, output_size=(224, 224), black_threshold=50):
    """Load an image, whiten near-black pixels, resize it and scale it to [0, 1].

    Args:
        image_path: Path of the image file to read with ``cv2.imread``.
        output_size: Target size handed to ``cv2.resize`` (OpenCV interprets
            this as ``(width, height)``).
        black_threshold: Inclusive upper bound applied to each of the R, G and
            B channels; pixels whose three channels all lie in
            ``[0, black_threshold]`` are treated as background and painted
            white. Defaults to 50.

    Returns:
        The resized RGB image as a float array divided by 255, or ``None``
        (after printing a message) when the file could not be read.
    """
    # Load the image; cv2.imread signals failure by returning None
    img = cv2.imread(image_path)
    if img is None:
        print(f"Failed to load image: {image_path}")
        return None

    # OpenCV loads in BGR channel order; convert to RGB
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Remove non-cell parts (black areas): inRange marks matching pixels with
    # 255 in the mask, and those pixels are overwritten with white
    lower_black = np.array([0, 0, 0])
    upper_black = np.array([black_threshold] * 3)
    mask_black = cv2.inRange(img_rgb, lower_black, upper_black)
    img_rgb[mask_black == 255] = [255, 255, 255]

    # Resize image
    img_resized = cv2.resize(img_rgb, output_size)

    # Normalize pixel values (0 to 1)
    return img_resized / 255.0


In [6]:
# Process and save all images
for image_name in tqdm(raw_images):
    image_path = os.path.join(RAW_DATA_DIR, image_name)
    processed_image = preprocess_image(image_path)

    # preprocess_image already reported the failure; nothing to save
    if processed_image is None:
        continue

    # Convert back to uint8 for saving. Round first: a plain astype() truncates,
    # so any floating-point error in the /255 -> *255 round trip would drop a
    # pixel value by one level.
    processed_image_uint8 = np.rint(processed_image * 255).astype(np.uint8)

    # Save processed image (OpenCV expects BGR channel order on write)
    save_path = os.path.join(PROCESSED_DATA_DIR, image_name)
    written = cv2.imwrite(save_path, cv2.cvtColor(processed_image_uint8, cv2.COLOR_RGB2BGR))

    # cv2.imwrite reports failure through its return value instead of raising,
    # so surface it rather than silently producing an incomplete output folder
    if not written:
        print(f"Failed to save image: {save_path}")


100%|██████████| 3925/3925 [05:45<00:00, 11.35it/s]


In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the training labels CSV referenced by the project config
train_labels = pd.read_csv(GLOBAL_CONFIG_DATA_RAW_LABEL_PATH)

# Preview the top of the table (five rows, the head() default)
print(train_labels.head(n=5))


            Image_ID        class  confidence  ymin  xmin  ymax  xmax
0  id_u3q6jdck4j.jpg  Trophozoite         1.0   712  1241   737  1270
1  id_a6cl90trri.jpg  Trophozoite         1.0   558  1566   600  1604
2  id_qvc2le9sm8.jpg  Trophozoite         1.0  1317  2788  1448  2914
3  id_w8xnbd5rvm.jpg  Trophozoite         1.0   925  1744  1041  1823
4  id_6dop09rk02.jpg          NEG         1.0     0     0     0     0


In [9]:
# Distinct values of the 'class' column, in order of first appearance
unique_classes = pd.unique(train_labels['class'])
print(f"Unique classes: {unique_classes}")


Unique classes: ['Trophozoite' 'NEG' 'WBC']


In [10]:
# Number of label rows per class, most frequent class first
class_counts = train_labels.loc[:, 'class'].value_counts()
print(class_counts)


class
Trophozoite    15838
WBC             7004
NEG              688
Name: count, dtype: int64


In [11]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
import pandas as pd
import numpy as np
import os
from PIL import Image
import shutil
import matplotlib.pyplot as plt

# Define the data augmentation generator: random rotations of up to 90 degrees,
# shifts of up to 10% in each direction and random horizontal/vertical flips;
# pixels exposed by a transform are filled from the nearest edge pixel
datagen = ImageDataGenerator(
    rotation_range=90,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='nearest'
)

# Load your labels CSV
labels_df = pd.read_csv(GLOBAL_CONFIG_DATA_RAW_LABEL_PATH)

# Add 'augmented' column for original images
labels_df['augmented'] = False

# Paths setup
images_path = GLOBAL_CONFIG_DATA_PROCESSED_IMAGE_PATH
output_path = f'{GLOBAL_CONFIG_DATA_PATH}augmented_images/'

# Ensure the output directory exists
os.makedirs(output_path, exist_ok=True)

# Copy original images to output_path.
# Image_ID repeats across label rows (the table has far more rows than there
# are image files), so iterate over the distinct names to copy each file once.
for img_name in labels_df['Image_ID'].unique():
    src = os.path.join(images_path, img_name)
    dst = os.path.join(output_path, img_name)
    shutil.copy(src, dst)

def augment_class(images_path, class_df, class_name, target_count, output_path, seed=None):
    """Oversample one class by writing randomly transformed copies of its images.

    One augmented image is produced per step, cycling through the rows of
    ``class_df`` so that every source image contributes evenly, until the class
    would reach ``target_count`` rows. Each image is saved under exactly the
    file name stored in the returned label row.

    Args:
        images_path: Directory containing the source images named by
            ``class_df['Image_ID']``.
        class_df: Label rows of the class to augment; must have an
            ``Image_ID`` column.
        class_name: Class label; its lower-cased form is used in the generated
            file names (``aug_<class>_<n>.jpg``).
        target_count: Desired total number of rows for the class.
        output_path: Directory the augmented images are written to.
        seed: Optional integer. When given, augmentation ``n`` uses
            ``seed + n`` as its random seed so a re-run reproduces the same
            images. ``None`` (default) leaves the transforms unseeded.

    Returns:
        A list of pandas Series, one per generated image: a copy of the source
        row with ``Image_ID`` set to the new file name and ``augmented`` set
        to True. Empty when no augmentation is needed or possible.

    NOTE(review): every other column of the source row (e.g. the box
    coordinates) is copied unchanged even though the image is rotated, shifted
    and/or flipped -- confirm that downstream code does not rely on them for
    augmented rows.
    """
    augment_needed = target_count - class_df.shape[0]
    if augment_needed <= 0:
        return []  # No augmentation needed

    num_images = class_df.shape[0]
    if num_images == 0:
        # Nothing to sample from; cycling with modulo 0 would raise ZeroDivisionError
        print(f"No source images for class {class_name}; skipping augmentation")
        return []

    prefix = f'aug_{class_name.lower()}_'
    new_rows = []

    for aug_count in range(augment_needed):
        # Positional lookup cycles through the class rows in order
        row = class_df.iloc[aug_count % num_images]
        img = load_img(os.path.join(images_path, row['Image_ID']))
        x = img_to_array(img)

        # Apply a single random transform to this image. Iterating over
        # datagen.flow() is avoided on purpose: that generator never ends, so a
        # loop over it would keep producing variants of one image only.
        transform_seed = None if seed is None else seed + aug_count
        augmented = datagen.random_transform(x, seed=transform_seed)

        # Save explicitly under the name recorded in the labels; flow()'s
        # save_to_dir option appends its own index/hash suffix to file names,
        # which would not match the Image_ID written below.
        new_image_id = f'{prefix}{aug_count}.jpg'
        pixels = np.clip(augmented, 0, 255).astype(np.uint8)
        Image.fromarray(pixels).save(os.path.join(output_path, new_image_id))

        new_row = row.copy()
        new_row['Image_ID'] = new_image_id
        new_row['augmented'] = True
        new_rows.append(new_row)

    return new_rows



# Filter class DataFrames
wbc_df = labels_df[labels_df['class'] == 'WBC']
neg_df = labels_df[labels_df['class'] == 'NEG']

# Balance every class against the largest one. Deriving the target from the
# data (instead of hard-coding the row count) keeps the notebook correct when
# the label file changes.
target_count = int(labels_df['class'].value_counts().max())

# Augment WBC and NEG classes.
# The originals were copied into output_path, so it is both the source and the
# destination directory here.
new_wbc_rows = augment_class(output_path, wbc_df, 'WBC', target_count, output_path)
new_neg_rows = augment_class(output_path, neg_df, 'NEG', target_count, output_path)

# Combine all labels
new_df = pd.concat([labels_df, pd.DataFrame(new_wbc_rows), pd.DataFrame(new_neg_rows)], ignore_index=True)

# Save the updated labels DataFrame
new_df.to_csv(f'{GLOBAL_CONFIG_DATA_PATH}augmented_labels.csv', index=False)



In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read back the label table that was written after augmentation
train_labels = pd.read_csv(f'{GLOBAL_CONFIG_DATA_PATH}augmented_labels.csv')

# Preview the top of the table (five rows, the head() default)
print(train_labels.head(n=5))

# Distinct values of the 'class' column, in order of first appearance
unique_classes = pd.unique(train_labels['class'])
print(f"Unique classes: {unique_classes}")

# Number of label rows per class, most frequent class first
class_counts = train_labels.loc[:, 'class'].value_counts()
print(class_counts)



            Image_ID        class  confidence  ymin  xmin  ymax  xmax  \
0  id_u3q6jdck4j.jpg  Trophozoite         1.0   712  1241   737  1270   
1  id_a6cl90trri.jpg  Trophozoite         1.0   558  1566   600  1604   
2  id_qvc2le9sm8.jpg  Trophozoite         1.0  1317  2788  1448  2914   
3  id_w8xnbd5rvm.jpg  Trophozoite         1.0   925  1744  1041  1823   
4  id_6dop09rk02.jpg          NEG         1.0     0     0     0     0   

   augmented  
0      False  
1      False  
2      False  
3      False  
4      False  
Unique classes: ['Trophozoite' 'NEG' 'WBC']
class
Trophozoite    15838
NEG            15838
WBC            15838
Name: count, dtype: int64
