# Finding interesting pixels

In [1]:
import collections
import json
import os
import pathlib

import numpy as np
import pandas as pd
from PIL import Image
from scipy import ndimage
from scipy import optimize
from scipy.spatial import distance
from skimage import measure
import tqdm

First we calculate the median value of each pixel's surrounding region (a 16×12 window) and save the result to disk, because this step is slow.

In [36]:
# Size of the median window along the first and second axes of the pixel array
MEDIAN_WINDOW = (16, 12)

for part in ['train', 'test']:

    # Every sequence lives in its own directory; ignoring plain files keeps stray
    # entries such as .DS_Store from getting an (empty) output directory of their own
    sequence_dirs = [p for p in pathlib.Path(f'data/spotGEO/{part}').glob('*') if p.is_dir()]

    for sequence in tqdm.tqdm(sequence_dirs, position=0):
        os.makedirs(f'data/medians/{part}/{sequence.name}', exist_ok=True)

        for frame in sequence.glob('*.png'):

            # The context manager releases the file handle even if decoding fails
            with Image.open(frame) as img:
                pixels = np.asarray(img)

            # Local background estimate: median of the window centred on each pixel
            med = ndimage.median_filter(pixels, size=MEDIAN_WINDOW)

            # Cached on disk because this pass takes hours (see the timings below)
            np.save(f'data/medians/{part}/{sequence.name}/{frame.stem}.npy', med)

100%|██████████| 1281/1281 [59:45<00:00,  2.80s/it] 
100%|██████████| 5120/5120 [4:44:29<00:00,  3.33s/it]  


Now we determine which pixels are brighter than their surroundings.

In [4]:
# A pixel is "interesting" when it exceeds its local median by more than this margin
BRIGHTNESS_MARGIN = 7

# regions[part][sequence][frame] -> list of region properties for that frame
regions = {}

for part in ['train', 'test']:
    regions[part] = {}

    # Sequences are directories; this skips .DS_Store and any other stray file
    sequence_paths = [p for p in pathlib.Path(f'data/spotGEO/{part}').glob('*') if p.is_dir()]

    for sequence_path in tqdm.tqdm(sequence_paths, position=0):
        sequence = int(sequence_path.name)
        regions[part][sequence] = {}

        for frame_path in sequence_path.glob('*.png'):
            frame = int(frame_path.stem)

            # Access the necessary data
            with Image.open(frame_path) as img:
                pixels = np.asarray(img)
            # Use the on-disk names (not their int casts) so the path matches exactly
            # what the median-caching step wrote
            med = np.load(f'data/medians/{part}/{sequence_path.name}/{frame_path.stem}.npy')

            # Determine which pixels are brighter than their surroundings.
            # The comparison is done in a wide signed type: with unsigned 8-bit data,
            # `med + 7` wraps around for medians above 248, which would flag nearly
            # every pixel of a saturated area as interesting.
            excess = pixels.astype(np.int32) - med.astype(np.int32)
            bright_mask = excess > BRIGHTNESS_MARGIN

            # Label each pixel: connected bright pixels share the same label
            labels = measure.label(bright_mask)

            # Group identically labeled pixels into regions
            regions[part][sequence][frame] = measure.regionprops(label_image=labels, intensity_image=pixels)

100%|██████████| 1281/1281 [01:19<00:00, 16.12it/s]
100%|██████████| 5120/5120 [06:15<00:00, 13.62it/s]


Let's now load the provided annotations.

In [5]:
# Read the raw annotations: one entry per frame, each listing its satellites' coordinates
with open('data/spotGEO/train_anno.json') as anno_file:
    annotations = json.load(anno_file)

# Flatten to one record per satellite. Coordinates are stored with the column first,
# and are rounded to the nearest pixel by adding .5 and truncating.
records = [
    {
        'sequence': entry['sequence_id'],
        'frame': entry['frame'],
        'satellite': rank,
        'r': int(point[1] + .5),
        'c': int(point[0] + .5),
    }
    for entry in annotations
    for rank, point in enumerate(entry['object_coords'], start=1)
]

satellites = pd.DataFrame(records).set_index(['sequence', 'frame', 'satellite'])
satellites.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,r,c
sequence,frame,satellite,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,1,237,502
1,1,2,222,490
1,1,3,129,141
1,2,1,214,530
1,2,2,199,518


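Let's find interesting pixels in the training set, match them to the annotated satellites, and label each one as a satellite if it lies within 2 pixels (Chebyshev distance) of the annotation assigned to it.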

In [17]:
# Column name -> list of values, one entry per candidate location
interesting = collections.defaultdict(list)

perf_rows = []
part = 'train'

# A candidate is labeled as a satellite when it lies within this Chebyshev distance
# (in pixels) of the annotation assigned to it
MAX_MATCH_DISTANCE = 2

def region_features(r):
    """Describe the shape of a region as a flat dict of features.

    Parameters
    ----------
    r
        One of the region-property objects returned by `measure.regionprops`.

    Returns
    -------
    dict
        The region's area, extent, perimeter and equivalent diameter, plus the ratio
        of its major axis length to its minor axis length; 1 is added to the
        denominator so that the ratio stays finite when the minor axis length is 0.
    """
    return {
        'area': r.area,
        'extent': r.extent,
        'perimeter': r.perimeter,
        'diameter': r.equivalent_diameter,
        'major_over_minor': r.major_axis_length / (r.minor_axis_length + 1)
    }

for sequence in tqdm.tqdm(regions[part], position=0):
    for frame in regions[part][sequence]:

        try:
            sats = satellites.loc[sequence, frame]
        except KeyError:
            # NOTE(review): frames without any annotated satellite are skipped entirely,
            # so their regions never become negative samples -- confirm this is intended
            continue

        rs = regions[part][sequence][frame]

        # Without candidates there is nothing to match (and cdist would reject the
        # resulting 1-D empty array); the frame still counts against the recall
        if len(rs) == 0:
            perf_rows.append({
                'sequence': sequence,
                'frame': frame,
                'n_sats': len(sats),
                'n_interesting': 0,
                'n_assigned': 0,
            })
            continue

        with Image.open(f'data/spotGEO/{part}/{sequence}/{frame}.png') as img:
            pixels = np.asarray(img)

        # Each region is located at its brightest pixel (rather than at its centroid)
        centers = np.asarray([
            r.coords[pixels[r.coords[:, 0], r.coords[:, 1]].argmax()]
            for r in rs
        ])

        # Compute the distance between each satellite and each interesting location,
        # thus forming a bipartite graph
        distances = distance.cdist(sats[['r', 'c']].to_numpy(), centers, metric='chebyshev')

        # Guess which locations correspond to which satellites
        row_ind, col_ind = optimize.linear_sum_assignment(distances)

        # Each satellite is assigned, but some of them may be too distant to be likely
        likely = distances[row_ind, col_ind] <= MAX_MATCH_DISTANCE

        labels = np.full(len(centers), False, dtype=bool)
        labels[col_ind[likely]] = True

        perf_rows.append({
            'sequence': sequence,
            'frame': frame,
            'n_sats': len(sats),
            'n_interesting': len(centers),
            'n_assigned': likely.sum(),
        })

        for center, label, region in zip(centers, labels, rs):
            interesting['part'].append(part)
            interesting['sequence'].append(sequence)
            interesting['frame'].append(frame)
            interesting['r'].append(center[0])
            interesting['c'].append(center[1])
            interesting['is_satellite'].append(label)
            for k, v in region_features(region).items():
                interesting[k].append(v)

perf = pd.DataFrame(perf_rows)
print()
print(f'- Recall: {perf.n_assigned.sum() / perf.n_sats.sum():.2%}')
print(f'- Number of samples: {perf.n_interesting.sum():,}')

100%|██████████| 1280/1280 [05:12<00:00,  4.10it/s]

- Recall: 83.88%
- Number of samples: 2,059,856


- Recall: 83.88%
- Number of samples: 2,059,856

Now let's do it on the test data.

In [18]:
part = 'test'

# Drop the rows a previous run of this cell may have appended, so that re-running it
# does not duplicate the test samples
keep = [p != part for p in interesting['part']]
for column in list(interesting):
    interesting[column] = [value for value, kept in zip(interesting[column], keep) if kept]

for sequence in tqdm.tqdm(regions[part], position=0):
    for frame, rs in regions[part][sequence].items():

        # Nothing to record for this frame, so there is no need to read the image
        if len(rs) == 0:
            continue

        with Image.open(f'data/spotGEO/{part}/{sequence}/{frame}.png') as img:
            pixels = np.asarray(img)

        for region in rs:
            # Each region is located at its brightest pixel
            brightest = pixels[region.coords[:, 0], region.coords[:, 1]].argmax()
            center = region.coords[brightest]

            interesting['part'].append(part)
            interesting['sequence'].append(sequence)
            interesting['frame'].append(frame)
            # The test set has no annotations, hence no label
            interesting['is_satellite'].append(None)
            interesting['r'].append(center[0])
            interesting['c'].append(center[1])
            for k, v in region_features(region).items():
                interesting[k].append(v)

100%|██████████| 5120/5120 [25:49<00:00,  3.30it/s]


In [19]:
# Turn the per-column lists collected above into a single table with one row per
# candidate location; `is_satellite` is None for rows coming from the test part
interesting_df = pd.DataFrame(interesting)
interesting_df.head()

In [21]:
# Persist the candidate table for later use.
# NOTE: pickle files can execute arbitrary code when loaded; only load this file from a trusted source.
interesting_df.to_pickle('data/interesting.pkl')