# Image Feature Extraction

This notebook applies augmentations to images and extracts features (histograms + HOG) to save in a CSV file.

**Dependencies:** numpy, pandas, Pillow, scikit-image, matplotlib, tqdm

In [8]:
from pathlib import Path
import numpy as np
import pandas as pd
from PIL import Image, ImageOps
from skimage.feature import hog
from skimage.color import rgb2gray
import matplotlib.pyplot as plt
from tqdm import tqdm

ROOT = Path('.')
IMAGES_DIR = ROOT / 'images'
OUTPUT_CSV = ROOT / 'image_features.csv'

In [9]:
def get_image_files(images_dir):
    extensions = ['*.jpg', '*.jpeg', '*.png', '*.bmp']
    image_files = []
    for ext in extensions:
        image_files.extend(list(images_dir.glob(ext)))
    return sorted(image_files)

def apply_augmentations(image):
    augmentations = []

    if image.mode != 'RGB':
        image = image.convert('RGB')
    
    augmentations.append((image.copy(), 'original'))
    
    augmentations.append((image.rotate(90, expand=True), 'rot90'))
    
    augmentations.append((ImageOps.mirror(image), 'flip_h'))
    
    gray_img = image.convert('L').convert('RGB')
    augmentations.append((gray_img, 'grayscale'))
    
    return augmentations

def extract_color_histogram(image, bins=16):
    if image.mode != 'RGB':
        image = image.convert('RGB')
    
    img_array = np.array(image)
    hist_features = []
    
    for channel in range(3):
        hist, _ = np.histogram(img_array[:,:,channel], bins=bins, range=(0, 255), density=True)
        hist_features.extend(hist)
    
    return np.array(hist_features)

def extract_hog_features(image):
    if image.mode == 'RGB':
        gray_array = rgb2gray(np.array(image))
    else:
        gray_array = np.array(image.convert('L')) / 255.0
    
    features = hog(
        gray_array,
        orientations=9,
        pixels_per_cell=(16, 16),
        cells_per_block=(2, 2),
        block_norm='L2-Hys',
        feature_vector=True
    )
    
    return features

print("Feature extraction functions defined.")

Feature extraction functions defined.


In [10]:
def process_all_images():
    image_files = get_image_files(IMAGES_DIR)
    print(f"Found {len(image_files)} images to process")
    
    all_data = []
    
    for img_path in tqdm(image_files, desc="Processing images"):
        try:
            image = Image.open(img_path)
            filename = img_path.name
            
            augmentations = apply_augmentations(image)
            
            for aug_image, aug_type in augmentations:
                color_hist = extract_color_histogram(aug_image)
                hog_feats = extract_hog_features(aug_image)
                
                combined_features = np.concatenate([color_hist, hog_feats])
                
                row_data = {
                    'filename': filename,
                    'augmentation': aug_type,
                    'width': aug_image.width,
                    'height': aug_image.height
                }

                for i, feat_val in enumerate(combined_features):
                    row_data[f'feature_{i}'] = feat_val
                
                all_data.append(row_data)
                
        except Exception as e:
            print(f"Error processing {img_path}: {e}")
            continue
    
    df = pd.DataFrame(all_data)
    
    df.to_csv(OUTPUT_CSV, index=False)
    
    print(f"Processing complete!")
    print(f"Total rows: {len(df)}")
    print(f"Features per image: {len([col for col in df.columns if col.startswith('feature_')])}")
    print(f"CSV saved to: {OUTPUT_CSV}")
    
    return df

df_results = process_all_images()

Found 11 images to process


Processing images: 100%|██████████| 11/11 [01:07<00:00,  6.12s/it]



Processing complete!
Total rows: 44
Features per image: 224724
CSV saved to: image_features.csv


In [11]:
print("DataFrame Info:")
print(f"Shape: {df_results.shape}")
print(f"Columns: {list(df_results.columns[:10])}...")  # Show first 10 columns

print("\nFirst few rows:")
display(df_results.head())

print("\nAugmentation counts:")
print(df_results['augmentation'].value_counts())

if OUTPUT_CSV.exists():
    print(f"\nCSV file successfully created: {OUTPUT_CSV}")
    print(f"File size: {OUTPUT_CSV.stat().st_size} bytes")
else:
    print(f"\nCSV file was not created!")

print(f"\nProcessing complete! Check {OUTPUT_CSV} for the results.")

DataFrame Info:
Shape: (44, 224728)
Columns: ['filename', 'augmentation', 'width', 'height', 'feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5']...

First few rows:


Unnamed: 0,filename,augmentation,width,height,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,...,feature_224714,feature_224715,feature_224716,feature_224717,feature_224718,feature_224719,feature_224720,feature_224721,feature_224722,feature_224723
0,Best-neutral.jpg,original,960,1280,0.006019,0.006188,0.004027,0.002869,0.003118,0.003477,...,,,,,,,,,,
1,Best-neutral.jpg,rot90,1280,960,0.006019,0.006188,0.004027,0.002869,0.003118,0.003477,...,,,,,,,,,,
2,Best-neutral.jpg,flip_h,960,1280,0.006019,0.006188,0.004027,0.002869,0.003118,0.003477,...,,,,,,,,,,
3,Best-neutral.jpg,grayscale,960,1280,0.006366,0.006514,0.004475,0.00428,0.003872,0.003783,...,,,,,,,,,,
4,Best-smiling.jpg,original,960,1280,0.006713,0.006028,0.00386,0.002811,0.003572,0.003327,...,,,,,,,,,,



Augmentation counts:
augmentation
original     11
rot90        11
flip_h       11
grayscale    11
Name: count, dtype: int64

CSV file successfully created: image_features.csv
File size: 119956493 bytes

Processing complete! Check image_features.csv for the results.
