# Image Processing

library or package installation

In [3]:
'''!pip install fastparquet pyarrow
!pip install imutils
!pip install opencv
!pip install tqdm'''

Defaulting to user installation because normal site-packages is not writeable


In [None]:
import numpy as np
from tqdm import tqdm
import cv2
import os
import imutils

def crop_img(img):
    """
    Finds the extreme points on the image and crops the rectangular out of them.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    gray = cv2.GaussianBlur(gray, (3, 3), 0)

    thresh = cv2.threshold(gray, 45, 255, cv2.THRESH_BINARY)[1]
    thresh = cv2.erode(thresh, None, iterations=2)
    thresh = cv2.dilate(thresh, None, iterations=2)

    cnts = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = imutils.grab_contours(cnts)
    c = max(cnts, key=cv2.contourArea)

    extLeft = tuple(c[c[:, :, 0].argmin()][0])
    extRight = tuple(c[c[:, :, 0].argmax()][0])
    extTop = tuple(c[c[:, :, 1].argmin()][0])
    extBot = tuple(c[c[:, :, 1].argmax()][0])

    ADD_PIXELS = 0
    new_img = img[extTop[1]-ADD_PIXELS:extBot[1]+ADD_PIXELS, extLeft[0]-ADD_PIXELS:extRight[0]+ADD_PIXELS].copy()
    
    return new_img

if __name__ == "__main__":
    training = "/Users/wangzhuoyulucas/Documents/PythonGLMTrans/archive/Training"
    testing = "/Users/wangzhuoyulucas/Documents/PythonGLMTrans/archive/Testing"
    IMG_SIZE = 256

    for dataset in [(training, 'Training'), (testing, 'Testing')]:
        dataset_path, dataset_name = dataset
        for dir in tqdm(os.listdir(dataset_path), desc=f"Processing {dataset_name}"):
            dir_path = os.path.join(dataset_path, dir)
            # Skip if it's not a directory (e.g., .DS_Store files)
            if not os.path.isdir(dir_path):
                continue
            
            save_path = f'/Users/wangzhuoyulucas/Documents/PythonGLMTrans/cleaned/{dataset_name}/{dir}'
            if not os.path.exists(save_path):
                os.makedirs(save_path)

            for img in tqdm(os.listdir(dir_path), desc=f"Images in {dir}", leave=False):
                img_path = os.path.join(dir_path, img)
                # Ensure processing of files only
                if os.path.isfile(img_path):
                    image = cv2.imread(img_path)
                    new_img = crop_img(image)
                    new_img = cv2.resize(new_img, (IMG_SIZE, IMG_SIZE))
                    cv2.imwrite(os.path.join(save_path, img), new_img)


Vectorizations

In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
import os
import h5py
'''
# Load the EfficientNetB0 model, pretrained on ImageNet
base_model = EfficientNetB0(weights='imagenet', include_top=False, pooling='avg')
# Assume you want to use the model for feature extraction, so include_top=False and pooling='avg'

# Path to your image directory
image_dir = "/Users/wangzhuoyulucas/Documents/PythonGLMTrans/archive/Training/glioma"

# List all jpg images in the directory
image_files = [os.path.join(image_dir, f) for f in os.listdir(image_dir) if f.endswith('.jpg')]

# Initialize a list to store the feature vectors
feature_vectors = []

for img_path in image_files:
    # Load the image file, resizing it to 224x224 pixels (required input size for EfficientNet models)
    img = image.load_img(img_path, target_size=(224, 224))
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)  # Add a batch dimension
    img_array = preprocess_input(img_array)  # Preprocess the image

    # Use the model to get the feature vector
    features = base_model.predict(img_array)
    
    # Save the features
    feature_vectors.append(features)

# At this point, 'feature_vectors' contains the feature vectors for all images in your directory
    
'''


KeyboardInterrupt: 

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet_v2 import EfficientNetV2M

from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.preprocessing import image
import os
import pandas as pd

def extract_feature_vectors_with_labels(image_dir, model, y_label):
    """
    Extracts feature vectors for all images in a specified directory using a given model,
    and associates each vector with a provided label.

    Parameters:
    - image_dir: Path to the directory containing images.
    - model: Preloaded Keras model for feature extraction.
    - y_label: The label to associate with each feature vector.

    Returns:
    - A pandas DataFrame containing an identity column, vector column, and y_label column.
    """
    image_files = [os.path.join(image_dir, f) for f in os.listdir(image_dir) if f.endswith('.jpg')]
    feature_vectors = []
    ids = []

    for i, img_path in enumerate(image_files):
        img = image.load_img(img_path, target_size=(256, 256))
        img_array = image.img_to_array(img)
        img_array = np.expand_dims(img_array, axis=0)
        img_array = preprocess_input(img_array)

        features = model.predict(img_array)
        feature_vectors.append(features.flatten())  # Flatten the features to store in DataFrame
        ids.append(i+1)  # Assuming IDs are 1-indexed

    # Create a DataFrame
    df = pd.DataFrame({
        'ID': ids,
        'Vector': feature_vectors,
        'Y_Label': [y_label] * len(feature_vectors)
    })

    # After extracting feature vectors and labels, ensure the 'Vector' column is of type 'object'
    df['Vector'] = df['Vector'].apply(lambda x: np.array(x).astype(np.float32))
    return df





In [11]:
# Usage example
image_dir = "/Users/wangzhuoyulucas/Documents/PythonGLMTrans/cleaned/Training/glioma"
# Load EfficientNetV2-M model pre-trained on ImageNet
base_model = EfficientNetV2M(weights='imagenet', include_top=False, pooling='avg')
y_label = 1  # Example label

# Extract feature vectors and labels, and store in a DataFrame
df = extract_feature_vectors_with_labels(image_dir, base_model, y_label)

# Display the DataFrame
print(df.head())
# Now, you can save the DataFrame to a Parquet file
df.to_parquet('/Users/wangzhuoyulucas/Documents/PythonGLMTrans/cleaned/feature_vectors_train_labels_glioma.parquet')

   ID                                             Vector  Y_Label
0   1  [-0.05641356, -0.025539216, -0.11669761, -0.22...        1
1   2  [0.42268622, -0.21720162, 0.0442618, -0.100454...        1
2   3  [0.11961785, -0.108141474, 0.22918089, 0.09274...        1
3   4  [0.3543459, -0.2303296, 0.0899824, -0.21302179...        1
4   5  [0.07599094, 0.33484557, 0.0548122, 0.6904299,...        1


In [8]:
# Usage example
image_dir_test = "/Users/wangzhuoyulucas/Documents/PythonGLMTrans/cleaned/Testing/glioma"
# Load EfficientNetV2-M model pre-trained on ImageNet
base_model = EfficientNetV2M(weights='imagenet', include_top=False, pooling='avg')
y_label = 1  # Example label

# Extract feature vectors and labels, and store in a DataFrame
df = extract_feature_vectors_with_labels(image_dir, base_model, y_label)

# Display the DataFrame
print(df.head())
# Now, you can save the DataFrame to a Parquet file
df.to_parquet('/Users/wangzhuoyulucas/Documents/PythonGLMTrans/cleaned/feature_vectors_test_labels_glioma.parquet')

# Usage example
image_dir_test_meningioma = "/Users/wangzhuoyulucas/Documents/PythonGLMTrans/cleaned/Testing/meningioma"
  # Example label

# Extract feature vectors and labels, and store in a DataFrame
df_test_meningioma = extract_feature_vectors_with_labels(image_dir_test_meningioma, base_model, y_label)

print(df_test_meningioma.head())
# Now, you can save the DataFrame to a Parquet file
df_test_meningioma.to_parquet('/Users/wangzhuoyulucas/Documents/PythonGLMTrans/cleaned/feature_vectors_test_labels_meningioma.parquet')
# Display the DataFrame

# Usage example
image_dir_p_test= "/Users/wangzhuoyulucas/Documents/PythonGLMTrans/cleaned/Testing/pituitary"
y_label_p_test = 1  # Example label

# Extract feature vectors and labels, and store in a DataFrame
df_test_p = extract_feature_vectors_with_labels(image_dir_p_test, base_model, y_label)

print(df_test_p.head())
# Now, you can save the DataFrame to a Parquet file
df_test_p.to_parquet('/Users/wangzhuoyulucas/Documents/PythonGLMTrans/cleaned/feature_vectors_test_pip.parquet')

   ID                                             Vector  Y_Label
0   1  [-0.05641356, -0.025539216, -0.11669761, -0.22...        1
1   2  [0.42268622, -0.21720162, 0.0442618, -0.100454...        1
2   3  [0.11961785, -0.108141474, 0.22918089, 0.09274...        1
3   4  [0.3543459, -0.2303296, 0.0899824, -0.21302179...        1
4   5  [0.07599094, 0.33484557, 0.0548122, 0.6904299,...        1
   ID                                             Vector  Y_Label
0   1  [-0.18448749, -0.15615335, -0.028925786, 0.159...        1
1   2  [-0.14685443, -0.23942122, 0.20322753, -0.2014...        1
2   3  [-0.13963403, -0.12831333, 0.16567737, -0.2088...        1
3   4  [-0.068504706, -0.18519542, -0.048932426, -0.1...        1
4   5  [0.073948815, -0.20861739, 0.121509865, -0.178...        1


FileNotFoundError: [Errno 2] No such file or directory: '/Users/wangzhuoyulucas/Documents/PythonGLMTrans/cleaned/Testing/pituary'

   ID                                             Vector  Y_Label
0   1  [0.10123842, -0.23452526, 0.20761436, -0.13618...        1
1   2  [-0.067680925, -0.2385844, 0.028163437, -0.016...        1
2   3  [-0.05409915, -0.250364, 0.17720945, -0.082736...        1
3   4  [-0.11569899, -0.22712232, 0.104525335, -0.003...        1
4   5  [-0.2090607, -0.24660634, -0.15373482, 0.12894...        1


In [10]:
# Usage example
image_dir_non_glio = "/Users/wangzhuoyulucas/Documents/PythonGLMTrans/cleaned/Training/meningioma"
base_model = EfficientNetV2M(weights='imagenet', include_top=False, pooling='avg')
y_label_meningioma = 1  # Example label

# Extract feature vectors and labels, and store in a DataFrame
df_meningioma = extract_feature_vectors_with_labels(image_dir_non_glio, base_model, y_label_meningioma)

print(df_meningioma.head())
# Now, you can save the DataFrame to a Parquet file
df_meningioma.to_parquet('/Users/wangzhuoyulucas/Documents/PythonGLMTrans/cleaned/feature_vectors_train_labels_meningioma.parquet')
# Display the DataFrame


   ID                                             Vector  Y_Label
0   1  [-0.18168747, -0.12912048, 0.47092605, 0.03143...        1
1   2  [-0.14429235, -0.10313867, -0.112497166, 0.135...        1
2   3  [-0.205609, -0.23333439, 0.0042702416, -0.0567...        1
3   4  [0.13364857, -0.08731395, 0.5347898, -0.136245...        1
4   5  [-0.08687043, -0.12270126, 0.1888522, -0.23757...        1


In [4]:
# Usage example
image_dir_no_tumor = "/Users/wangzhuoyulucas/Documents/PythonGLMTrans/cleaned/Training/notumor"
y_label_no_tumor = 0  # Example label

# Extract feature vectors and labels, and store in a DataFrame
df_no_tumor = extract_feature_vectors_with_labels(image_dir_no_tumor, base_model, y_label_no_tumor)

print(df_no_tumor.head())
# Now, you can save the DataFrame to a Parquet file
df_no_tumor.to_parquet('/Users/wangzhuoyulucas/Documents/PythonGLMTrans/cleaned/feature_vectors_train_notumor.parquet')
# Display the DataFrame

   ID                                             Vector  Y_Label
0   1  [-0.19135198, -0.2522393, 0.09176754, -0.25584...        0
1   2  [-0.13153201, -0.24380103, 0.43674836, -0.2559...        0
2   3  [-0.07430623, -0.22589298, 0.1793621, -0.23937...        0
3   4  [-0.026441343, 0.076843336, 0.15179601, -0.243...        0
4   5  [-0.03311896, -0.24829683, 0.24011546, -0.2432...        0


In [6]:
# Usage example
image_dir_pit = "/Users/wangzhuoyulucas/Documents/PythonGLMTrans/cleaned/Training/pituitary"
y_label_p = 1  # Example label

# Extract feature vectors and labels, and store in a DataFrame
df_train_pit = extract_feature_vectors_with_labels(image_dir_pit, base_model, y_label_p)

print(df_no_tumor.head())
# Now, you can save the DataFrame to a Parquet file
df_train_pit.to_parquet('/Users/wangzhuoyulucas/Documents/PythonGLMTrans/cleaned/feature_vectors_train_pituitary.parquet')
# Display the DataFrame

   ID                                             Vector  Y_Label
0   1  [-0.19135198, -0.2522393, 0.09176754, -0.25584...        0
1   2  [-0.13153201, -0.24380103, 0.43674836, -0.2559...        0
2   3  [-0.07430623, -0.22589298, 0.1793621, -0.23937...        0
3   4  [-0.026441343, 0.076843336, 0.15179601, -0.243...        0
4   5  [-0.03311896, -0.24829683, 0.24011546, -0.2432...        0


In [7]:
# Usage example
image_dir_no_tumor = "/Users/wangzhuoyulucas/Documents/PythonGLMTrans/cleaned/Testing/notumor"
y_label_no_tumor = 0  # Example label

# Extract feature vectors and labels, and store in a DataFrame
df_test_no_tumor = extract_feature_vectors_with_labels(image_dir_no_tumor, base_model, y_label_no_tumor)

print(df_no_tumor.head())
# Now, you can save the DataFrame to a Parquet file
df_test_no_tumor.to_parquet('/Users/wangzhuoyulucas/Documents/PythonGLMTrans/cleaned/feature_vectors_test_notumor.parquet')
# Display the DataFrame

   ID                                             Vector  Y_Label
0   1  [-0.19135198, -0.2522393, 0.09176754, -0.25584...        0
1   2  [-0.13153201, -0.24380103, 0.43674836, -0.2559...        0
2   3  [-0.07430623, -0.22589298, 0.1793621, -0.23937...        0
3   4  [-0.026441343, 0.076843336, 0.15179601, -0.243...        0
4   5  [-0.03311896, -0.24829683, 0.24011546, -0.2432...        0
