# 02. Data Preprocessing - GreenSpace CNN

This notebook handles data preprocessing for the multi-task greenspace CNN:
- Survey response cleaning and label processing  
- Image preprocessing and augmentation setup
- Train/validation/test splits
- Data pipeline creation for TensorFlow/Keras

## Steps (tentative)
1. Load and clean survey data
2. Handle multi-rater aggregation 
3. Process images (resize, normalize)
4. Create data splits
5. Set up TensorFlow data pipelines

The cells below transform the raw survey responses and images step by step:


In [None]:
import pandas as pd
from pathlib import Path
import numpy as np

### Step 1: validate CSV ↔ images

In [None]:



# Step 1: check that the set of image files on disk matches the set of
# filenames referenced by the survey CSV, and stop if it does not.
csv = Path('../data/raw/survey_responses_clean.csv')
imgs = Path('../data/raw/images')
assert csv.exists() and imgs.exists(), 'Missing CSV or images folder'

df = pd.read_csv(csv)
assert 'image_filename' in df.columns, "CSV needs 'image_filename'"

# Filenames the survey refers to (rows without a filename are ignored here).
exp = set(df['image_filename'].dropna().astype(str).unique())

# Files actually on disk. Hidden files and OS metadata (e.g. '.DS_Store',
# 'Thumbs.db') are not images; counting them as "extra" would abort the
# notebook even though every image lines up with the CSV.
os_metadata = {'Thumbs.db', 'desktop.ini'}
present = {
    p.name
    for p in imgs.iterdir()
    if p.is_file() and not p.name.startswith('.') and p.name not in os_metadata
}

missing = sorted(exp - present)   # referenced in the CSV but not on disk
extra = sorted(present - exp)     # on disk but never referenced in the CSV

print(f'expected={len(exp)} present={len(present)} missing={len(missing)} extra={len(extra)}')
if missing:
    print('missing (first 10):', *missing[:10], sep='\n')
if extra:
    print('extra (first 10):', *extra[:10], sep='\n')

# Any mismatch stops the cell so later steps never run on misaligned data.
if missing or extra:
    raise SystemExit('Fix naming/alignment before proceeding.')
print('Alignment OK')


expected=48 present=48 missing=0 extra=0
Alignment OK


### Step 2: aggregate rater labels → soft & hard per image

In [None]:

# Step 2: collapse the per-rater survey rows into one row per image, writing
# soft labels (per-class rating shares) and hard labels (threshold / argmax).
csv = Path('../data/raw/survey_responses_clean.csv')
out = Path('../data/processed')
out.mkdir(parents=True, exist_ok=True)

# Only empty cells are treated as missing. pandas' default NA list contains the
# literal string 'None', which is a valid "Shade along paths" answer: with the
# defaults every 'None' rating is parsed as NaN before shade_map is applied and
# shade_p_none can never be greater than 0.
df = pd.read_csv(csv, keep_default_na=False, na_values=[''])
key = 'image_filename'

# Raw column names (as in the CSV)
bin_cols_raw = ['Sports Field','Multipurpose Open Area',"Children's playground",'Water feature','Gardens','Walking paths','Built structures']
shade_col_raw = 'Shade along paths'
score_col_raw = 'Structured–Unstructured Rating'

# Raw binary column -> snake_case base name used in the output files
bin_names = {
    'Sports Field': 'sports_field',
    'Multipurpose Open Area': 'multipurpose_open_area',
    "Children's playground": 'childrens_playground',
    'Water feature': 'water_feature',
    'Gardens': 'gardens',
    'Walking paths': 'walking_paths',
    'Built structures': 'built_structures',
}


def _normalized(col):
    """Return the column as stripped, lower-cased text (missing values become 'nan')."""
    return col.astype(str).str.strip().str.lower()


def _class_probs(values, keys, classes, names):
    """Per-group share of each class among the non-missing ratings.

    Groups whose ratings are all missing get NaN in every column.
    """
    valid = values.notna()
    onehot = pd.DataFrame({
        name: values.eq(cls).astype(float).where(valid)
        for cls, name in zip(classes, names)
    })
    return onehot.groupby(keys, dropna=False).mean()


# Map to numeric with floats to allow NaN. Matching is case- and
# whitespace-insensitive; anything that is not a recognised answer becomes NaN.
yn = {'Yes':1, 'No':0, 'yes':1, 'no':0}
yn_lower = {k.lower(): v for k, v in yn.items()}
for c in bin_cols_raw:
    df[c] = _normalized(df[c]).map(yn_lower).astype(float)

shade_map = {'None':0,'Some':1,'Abundant':2}
shade_lower = {k.lower(): v for k, v in shade_map.items()}
df['shade_i'] = _normalized(df[shade_col_raw]).map(shade_lower).astype(float)

# Extract leading 1–5 from strings like "5 - Very Unstructured". Digits outside
# 1–5 and multi-digit numbers are treated as missing so the five class
# probabilities always sum to 1 over the valid ratings.
df['score_i'] = pd.to_numeric(
    df[score_col_raw].astype(str).str.extract(r'^\s*([1-5])(?!\d)')[0],
    errors='coerce',
).astype(float)

# Group by image
g = df.groupby(key, dropna=False)

# n_ratings counts survey rows per image, including rows with missing answers
n = g.size().rename('n_ratings')

# Binary soft probs (mean over available ratings)
bin_soft = g[bin_cols_raw].mean().rename(
    columns={raw: f'{base}_p' for raw, base in bin_names.items()}
).astype(float)

# Shade soft probs
shade_cols = ['shade_p_none','shade_p_some','shade_p_abundant']
shade_probs = _class_probs(df['shade_i'], df[key], [0, 1, 2], shade_cols)

# Score soft probs + mean
score_cols = [f'score_p_{i}' for i in [1,2,3,4,5]]
score_probs = _class_probs(df['score_i'], df[key], [1, 2, 3, 4, 5], score_cols)
score_mean = g['score_i'].mean().rename('score_mean').astype(float)

# Assemble soft labels
soft = pd.concat([n, bin_soft, shade_probs, score_probs, score_mean], axis=1).reset_index()
soft.to_csv(out / 'labels_soft.csv', index=False)

# Build hard labels from soft (fill NaNs with 0 before argmax/threshold).
# Consequences worth knowing: a binary tie (p == 0.5) counts as positive, argmax
# ties resolve to the lowest class, and an image with no valid rating for a
# question falls back to 0 / shade class 0 / score class 1.
hard = soft[[key,'n_ratings']].copy()
for col in bin_names.values():
    hard[col] = soft[f'{col}_p'].fillna(0.0).ge(0.5).astype(int)

shade_arr = soft[shade_cols].fillna(0.0).to_numpy(dtype=float)
hard['shade_class'] = shade_arr.argmax(axis=1)  # 0/1/2

score_arr = soft[score_cols].fillna(0.0).to_numpy(dtype=float)
hard['score_class'] = score_arr.argmax(axis=1) + 1  # 1..5

hard.to_csv(out / 'labels_hard.csv', index=False)

print('Wrote:', out/'labels_soft.csv')
print('Wrote:', out/'labels_hard.csv')


Wrote: ../data/processed/labels_soft.csv
Wrote: ../data/processed/labels_hard.csv


### Step 3: label prevalence (from soft labels)

In [5]:


# Step 3: dataset-level label prevalence, computed from the soft labels.
soft_path = Path('../data/processed/labels_soft.csv')
assert soft_path.exists(), f'Missing {soft_path}. Run Step 2 first.'

df = pd.read_csv(soft_path)


def _report(heading, series, value_col):
    """Print a heading, then the series as a one-column table rounded to 3 decimals."""
    print(heading)
    print(series.to_frame(value_col).round(3))


# Binary prevalence: the mean soft probability is the expected positive rate.
bin_cols = [
    'sports_field_p',
    'multipurpose_open_area_p',
    'childrens_playground_p',
    'water_feature_p',
    'gardens_p',
    'walking_paths_p',
    'built_structures_p',
]
bin_prev = df[bin_cols].mean().sort_values()
_report('Binary prevalence (expected positive rate):', bin_prev, 'prevalence')

# Shade: average of the three per-image class probabilities.
shade_cols = ['shade_p_none', 'shade_p_some', 'shade_p_abundant']
shade_prev = df[shade_cols].mean()
_report(
    '\nShade distribution:',
    shade_prev.rename(lambda c: c.replace('shade_p_', '')),
    'prob',
)

# Structured–Unstructured rating: average of the five per-image class probabilities.
score_cols = [f'score_p_{i}' for i in range(1, 6)]
score_prev = df[score_cols].mean()
_report(
    '\nStructured–Unstructured (1–5) distribution:',
    score_prev.rename(lambda c: c.replace('score_p_', 'score_')),
    'prob',
)

# Sanity check on how many ratings each image received.
if 'n_ratings' in df.columns:
    print('\nNumber of ratings per image (summary):')
    print(df['n_ratings'].describe().to_string())


Binary prevalence (expected positive rate):
                          prevalence
gardens_p                      0.104
childrens_playground_p         0.125
water_feature_p                0.163
sports_field_p                 0.222
built_structures_p             0.330
multipurpose_open_area_p       0.438
walking_paths_p                0.542

Shade distribution:
           prob
none      0.000
some      0.972
abundant  0.028

Structured–Unstructured (1–5) distribution:
          prob
score_1  0.101
score_2  0.160
score_3  0.142
score_4  0.205
score_5  0.392

Number of ratings per image (summary):
count    48.000000
mean      1.708333
std       0.742576
min       1.000000
25%       1.000000
50%       2.000000
75%       2.000000
max       3.000000


## Augmentation Ideas

- Geometric: horizontal/vertical flips, 90° rotations, random crop→resize (90–100% area), small translate (±5%).
- Photometric: brightness/contrast ±10%. Keep these mild: shade is one of the labels being predicted, so strong brightness changes could alter how shaded an image appears. Avoid heavy color/hue shifts.

### Step 4: oversample (rare vs other) + rotation/brightness augmentation

In [3]:
import tensorflow as tf

# Inputs: the label files written by Step 2 and the raw image folder.
img_dir = Path('../data/raw/images')
hard_p = Path('../data/processed/labels_hard.csv')
soft_p = Path('../data/processed/labels_soft.csv')

dh, ds = pd.read_csv(hard_p), pd.read_csv(soft_p)
# One row per image with both hard and soft labels; only the hard labels are
# needed to detect rarity, the soft ones are kept for later flexibility.
df = pd.merge(dh, ds, how='inner', on=['image_filename', 'n_ratings'])

df['image_path'] = df['image_filename'].map(lambda name: str(img_dir / name))

# An image is "rare-positive" when any of these hard labels is set.
rare_labels = ['gardens', 'childrens_playground', 'water_feature', 'sports_field']
df['is_rare_pos'] = df[rare_labels].any(axis=1)

is_rare = df['is_rare_pos']
rare_paths = df.loc[is_rare, 'image_path'].tolist()
other_paths = df.loc[~is_rare, 'image_path'].tolist()
print(f'rare={len(rare_paths)} other={len(other_paths)}')

# Target (height, width) passed to tf.image.resize in decode().
IMG_SIZE = (512, 512)

def decode(path):
    """Read the JPEG at `path` and return it as a float32 RGB image.

    The image is resized to IMG_SIZE and its pixel values are divided by 255.
    """
    raw_bytes = tf.io.read_file(path)
    pixels = tf.image.resize(tf.io.decode_jpeg(raw_bytes, channels=3), IMG_SIZE)
    return tf.cast(pixels, tf.float32) / 255.0

# Mild, label-preserving augmentation: rotate 0/90/180/270 and brightness ±10%
def augment(img):
    """Rotate `img` by a random multiple of 90° and shift its brightness.

    The brightness shift is one scalar drawn from [-0.1, 0.1) and added to every
    pixel; the result is clipped back to [0, 1].
    """
    quarter_turns = tf.random.uniform((), minval=0, maxval=4, dtype=tf.int32)
    shift = tf.random.uniform((), minval=-0.1, maxval=0.1)
    rotated = tf.image.rot90(img, quarter_turns)
    return tf.clip_by_value(rotated + shift, 0.0, 1.0)

def make_ds(paths, augment_flag=True, shuffle_seed=123):
    """Build an unbatched tf.data pipeline of decoded (optionally augmented) images.

    Args:
        paths: image file paths (str or path-like objects, or a string tensor);
            may be empty, in which case the returned dataset is empty.
        augment_flag: when True, `augment` is applied to every decoded image.
        shuffle_seed: seed for shuffling the paths; the order is reshuffled on
            every iteration over the dataset.

    Returns:
        A tf.data.Dataset yielding one image tensor per path.
    """
    import os

    # Force a string tensor. From an empty Python list TensorFlow infers float32,
    # and tracing `decode` (tf.io.read_file) on a float32 element fails.
    if not tf.is_tensor(paths):
        paths = tf.constant([os.fspath(p) for p in paths], dtype=tf.string)
    n_paths = int(paths.shape[0])

    files = tf.data.Dataset.from_tensor_slices(paths)
    if n_paths > 1:
        # Buffer covers the whole list, so this is a full shuffle.
        files = files.shuffle(n_paths, seed=shuffle_seed, reshuffle_each_iteration=True)
    images = files.map(decode, num_parallel_calls=tf.data.AUTOTUNE)
    if augment_flag:
        images = images.map(augment, num_parallel_calls=tf.data.AUTOTUNE)
    return images

rare_ds = make_ds(rare_paths, augment_flag=True, shuffle_seed=123)
other_ds = make_ds(other_paths, augment_flag=True, shuffle_seed=456)

# 50/50 oversampled mix.
# Sampling from finite, non-repeated datasets just emits every image once (the
# sampler keeps draining whichever dataset is left), so nothing is oversampled.
# Repeating each stream lets the smaller group be drawn again (with a fresh
# shuffle and augmentation); take() bounds one pass to the original number of
# images so the resulting dataset stays finite.
streams = [
    stream.repeat()
    for stream, group_paths in ((rare_ds, rare_paths), (other_ds, other_paths))
    if len(group_paths) > 0  # an empty group has nothing to sample
]
if not streams:
    raise ValueError('No images available to build the training stream.')

epoch_size = len(rare_paths) + len(other_paths)
mixed = tf.data.Dataset.sample_from_datasets(
    streams, weights=[1.0 / len(streams)] * len(streams), seed=999
).take(epoch_size)
train_preview = mixed.batch(8).prefetch(tf.data.AUTOTUNE)

print('Oversampled + augmented stream ready (preview dataset built).')


rare=24 other=24


2025-09-08 20:34:01.156328: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4
2025-09-08 20:34:01.156359: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-09-08 20:34:01.156367: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-09-08 20:34:01.156406: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-09-08 20:34:01.156421: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Oversampled + augmented stream ready (preview dataset built).
