In [10]:
import numpy as np
import pandas as pd
from skimage import measure
from scipy import ndimage
from scipy import signal


# Clamped neighbour indices for a 480-row by 640-column image.
# `l` / `r` index the previous / next row, `b` / `u` the previous / next
# column; at the borders the index is clamped so it stays inside the image.
l = np.maximum(np.arange(480) - 1, 0)
r = np.minimum(np.arange(480) + 1, 479)
b = np.maximum(np.arange(640) - 1, 0)
u = np.minimum(np.arange(640) + 1, 639)


def expand(mask):
    """Grow a 2-D mask by one pixel up, down, left and right.

    Every pixel of the result is the OR of the corresponding input pixel and
    its four direct neighbours (no diagonals). Pixels outside the image are
    treated as absent, so border pixels are only combined with the neighbours
    that exist.

    The shifts are expressed with slices, so the function works for any 2-D
    shape rather than only for 480x640 images.

    Parameters
    ----------
    mask : numpy.ndarray
        2-D boolean (or integer) array. It is not modified.

    Returns
    -------
    numpy.ndarray
        A new array with the same shape and dtype as ``mask``.
    """
    new = mask.copy()

    # Rows: propagate each row into the row below it and the row above it.
    new[1:, :] |= mask[:-1, :]
    new[:-1, :] |= mask[1:, :]

    # Columns: propagate each column into its right and left neighbour.
    new[:, 1:] |= mask[:, :-1]
    new[:, :-1] |= mask[:, 1:]

    return new


def find_interesting_pixels(img, median_size=40, ratio=1.3, max_area=25):
    """Locate small regions that are brighter than their local background.

    The local background is estimated with a median filter. Pixels brighter
    than ``ratio`` times that background are kept, the resulting mask is grown
    by one pixel with ``expand``, and connected regions are labelled. Only
    regions of at most ``max_area`` pixels are returned.

    Parameters
    ----------
    img : numpy.ndarray
        2-D image array (presumably grayscale; verify against the caller).
    median_size : int, default 40
        Window size passed to ``ndimage.median_filter``.
    ratio : float, default 1.3
        A pixel is kept when ``img > background * ratio``.
    max_area : int, default 25
        Largest region area (in pixels of the expanded mask) that is kept.

    Returns
    -------
    pandas.DataFrame
        One row per retained region with columns ``'r'`` and ``'c'`` holding
        the region centroid; empty when no region qualifies.
    """
    background = ndimage.median_filter(img, size=median_size)
    mask = img > background * ratio

    labels = measure.label(expand(mask))

    centroids = [
        region.centroid
        for region in measure.regionprops(labels)
        if region.area <= max_area
    ]
    return pd.DataFrame(centroids, columns=['r', 'c'])

Example.

In [11]:
from PIL import Image

# Run the detector on a single training frame; the displayed shape is
# (number of interesting regions, 2).
example_frame = Image.open('data/spotGEO/train/10/1.png')
img = np.asarray(example_frame)
find_interesting_pixels(img).shape

(90, 2)

Do it for each image.

In [None]:
import pathlib
from joblib import Parallel, delayed
import tqdm

def f(part, seq, frame):
    """Detect interesting regions in one frame and tag them with their origin.

    Parameters
    ----------
    part : str
        Value stored in the ``part`` column of the result.
    seq : path-like
        Sequence directory; ``int(seq.name)`` becomes the ``sequence`` column.
    frame : path-like
        Image file; ``int(frame.stem)`` becomes the ``frame`` column.

    Returns
    -------
    pandas.DataFrame
        Output of ``find_interesting_pixels`` with the three extra columns.
    """
    pixels = np.asarray(Image.open(frame))
    regions = find_interesting_pixels(pixels)
    return regions.assign(
        part=part,
        sequence=int(seq.name),
        frame=int(frame.stem),
    )

def _frame_jobs():
    """Lazily yield one delayed detection task per frame of every sequence."""
    for part in ['train', 'test']:
        sequences = list(pathlib.Path(f'data/spotGEO/{part}').glob('*'))
        # The progress bar advances as sequences are handed to the workers.
        for seq in tqdm.tqdm(sequences, position=0):
            for frame in seq.glob('*.png'):
                yield delayed(f)(part, seq, frame)


per_frame_regions = Parallel(n_jobs=4)(_frame_jobs())

# Stack the per-frame tables and index them by (part, sequence, frame).
interesting = (
    pd.concat(per_frame_regions)
    .set_index(['part', 'sequence', 'frame'])
    .sort_index()
)
interesting.to_pickle('data/interesting.pkl')

100%|██████████| 1281/1281 [1:50:47<00:00,  5.19s/it]
 40%|███▉      | 2040/5120 [3:06:42<5:34:44,  6.52s/it]

Average number of interesting regions per image.

In [None]:
# Count the regions in each (part, sequence, frame) group, then average the counts.
regions_per_frame = interesting.groupby(level=['part', 'sequence', 'frame']).size()
regions_per_frame.mean()

Percentage of all pixels that these interesting regions represent (counting one pixel per region).

In [None]:
# 640 * 480 pixels per frame; 31996 is presumably the total number of
# frames in the data set — TODO confirm.
total_pixels = 640 * 480 * 31996
f'{len(interesting) / total_pixels:%}'

Now let's annotate each interesting region.

In [None]:
import json
import pandas as pd

# Load the training annotations. The file handle is deliberately NOT called
# `f`, so the worker function `f` defined earlier in the notebook is not
# overwritten when this cell runs.
with open('data/spotGEO/train_anno.json') as anno_file:
    annotations = json.load(anno_file)

# One record per annotated satellite. Coordinates are stored as (x, y) in the
# annotation file, so index 1 is the row and index 0 the column; adding .5
# before truncating rounds to the nearest pixel.
sats = pd.DataFrame(
    [
        {
            'sequence': ann['sequence_id'],
            'frame': ann['frame'],
            'satellite': i + 1,
            'r': int(coords[1] + .5),
            'c': int(coords[0] + .5),
        }
        for ann in annotations
        for i, coords in enumerate(ann['object_coords'])
    ],
    # Explicit columns keep the frame well-formed even with zero records.
    columns=['sequence', 'frame', 'satellite', 'r', 'c'],
)
sats = sats.set_index(['sequence', 'frame', 'satellite'])
sats.head()

In [None]:
from scipy import optimize 

def assign_labels(interesting, satellites, max_distance=3):
    """Flag the interesting locations that most likely are annotated satellites.

    Satellites and locations are matched one-to-one by minimizing the total
    distance between matched pairs. A match only counts when the pair is
    closer than ``max_distance``.

    Parameters
    ----------
    interesting : array-like, shape (n_locations, k)
        Coordinates of the candidate locations.
    satellites : array-like, shape (n_satellites, k)
        Coordinates of the annotated satellites, with the same columns in the
        same order as ``interesting``.
    max_distance : float, default 3
        A matched pair is accepted only if its distance is strictly below
        this value.

    Returns
    -------
    numpy.ndarray
        Boolean array of length ``n_locations``; ``True`` marks the locations
        matched to a satellite.
    """
    # Imported locally: the notebook only imports `optimize` from scipy, so
    # `distance` would otherwise be undefined on a fresh kernel.
    from scipy.spatial import distance

    labels = np.full(len(interesting), False, dtype=bool)

    # Nothing to match: every location stays unlabelled.
    if len(interesting) == 0 or len(satellites) == 0:
        return labels

    # Compute the distance between each satellite and each interesting location,
    # thus forming a bipartite graph
    distances = distance.cdist(satellites, interesting)

    # Guess which locations correspond to which satellites
    row_ind, col_ind = optimize.linear_sum_assignment(distances)

    # Each satellite is assigned, but some of them may be too distant to be likely
    likely = distances[row_ind, col_ind] < max_distance

    labels[col_ind[likely]] = True
    return labels

Example.

In [None]:
# Label the regions of the first frame of training sequence 1.
example_regions = interesting.loc[('train', 1, 1)]
example_satellites = sats.loc[(1, 1)]
assign_labels(example_regions, example_satellites)

Now assign labels for each frame.

In [None]:
train_locations = interesting.loc['train']

# Start from "not a satellite" everywhere, so frames that have no annotation
# (skipped in the loop below) end up labelled False rather than keeping
# whatever placeholder an empty Series would contain.
labels = pd.Series(False, index=train_locations.index, dtype=bool)

for (sequence, frame), locations in tqdm.tqdm(train_locations.groupby(['sequence', 'frame']), position=0):
    try:
        satellites = sats.loc[sequence, frame]
    except KeyError:
        # No satellite annotated in this frame: all its regions stay False.
        continue
    # Pass only the coordinates, so re-running this cell after the
    # `is_satellite` column has been added does not feed it to the matcher.
    labels.loc[sequence, frame] = assign_labels(locations[['r', 'c']], satellites)

# Test rows have no ground truth and keep None.
interesting['is_satellite'] = None
interesting.loc['train', 'is_satellite'] = labels.values
interesting.to_pickle('data/interesting.pkl')

Determine the fraction of annotated satellites that got assigned to an interesting region.

In [None]:
# Number of regions flagged as satellites, relative to the number of annotated satellites.
matched_count = interesting.loc['train']['is_satellite'].sum()
matched_count / len(sats)