# Feature Extraction

In [None]:
#image_stats
import os
import cv2
import numpy as np
import pandas as pd
from skimage.measure import shannon_entropy
from tqdm import tqdm

# Image folder locations for the training and test splits.
# NOTE(review): neither constant is referenced by the code visible in this
# cell, which passes "../2025_A2/train" and "../2025_A2/test" as literals —
# confirm whether these are still needed or should point at those folders.
TRAIN_DIR = "../train"  
TEST_DIR = "../test"

def extract_stats_from_image(image_path):
    """Compute per-channel and whole-image statistics for one image file.

    The image is loaded with ``cv2.imread`` (BGR channel order) and converted
    to RGB. For each of the R, G and B channels the mean, standard deviation,
    minimum, maximum and Shannon entropy are recorded. The Shannon entropy of
    the grayscale conversion and the image height and width are added as well.

    Args:
        image_path: Path of the image file to read.

    Returns:
        dict: Feature name -> value, with keys ``{R,G,B}_mean``,
        ``{R,G,B}_std``, ``{R,G,B}_min``, ``{R,G,B}_max``,
        ``{R,G,B}_entropy``, ``image_entropy``, ``image_height`` and
        ``image_width``.

    Raises:
        FileNotFoundError: If ``image_path`` is not an existing file.
        ValueError: If the file exists but OpenCV cannot decode it.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        # Without this check the cvtColor call below fails with an OpenCV
        # assertion that does not mention which file was at fault.
        if not os.path.isfile(image_path):
            raise FileNotFoundError(f"Image file not found: {image_path}")
        raise ValueError(f"Could not decode image file: {image_path}")

    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    features = {}
    for i, channel in enumerate(['R', 'G', 'B']):
        c_data = img_rgb[:, :, i]
        features[f'{channel}_mean'] = np.mean(c_data)
        features[f'{channel}_std'] = np.std(c_data)
        features[f'{channel}_min'] = np.min(c_data)
        features[f'{channel}_max'] = np.max(c_data)
        features[f'{channel}_entropy'] = shannon_entropy(c_data)

    # Whole-image entropy is measured on the grayscale version of the image.
    features['image_entropy'] = shannon_entropy(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY))
    features['image_height'] = img.shape[0]
    features['image_width'] = img.shape[1]
    return features

def build_feature_df(metadata_csv, image_dir):
    """Build a table of image statistics for every image listed in a metadata CSV.

    Args:
        metadata_csv: Path to a CSV file that has an ``image_path`` column.
        image_dir: Directory that each ``image_path`` entry is joined onto.

    Returns:
        pandas.DataFrame: One row per metadata entry, holding the features
        returned by ``extract_stats_from_image`` plus a ``Path`` column with
        the original ``image_path`` value.
    """
    metadata = pd.read_csv(metadata_csv)

    print("Extracting image stats...")
    # 'Path' is merged in last so it ends up as the final column of each row.
    rows = [
        {
            **extract_stats_from_image(os.path.join(image_dir, rel_path)),
            'Path': rel_path,
        }
        for rel_path in tqdm(metadata['image_path'])
    ]

    return pd.DataFrame(rows)

# Build the image-statistics tables for the train and test splits. Each call
# reads the split's metadata CSV and loads every image named in its
# 'image_path' column from the given directory.
train_features_df = build_feature_df("../2025_A2/train/train_metadata.csv", "../2025_A2/train")
test_features_df = build_feature_df("../2025_A2/test/test_metadata.csv", "../2025_A2/test")

# Save the tables to CSV inside each split's folder (optional);
# index=False leaves the DataFrame index out of the written files.
train_features_df.to_csv("../2025_A2/train/train_image_stats.csv", index=False)
test_features_df.to_csv("../2025_A2/test/test_image_stats.csv", index=False)


Extracting image stats...


100%|██████████| 5488/5488 [00:02<00:00, 2099.76it/s]


Extracting image stats...


100%|██████████| 2353/2353 [00:01<00:00, 2140.74it/s]
