# Solution

## Data loading

We've already determined which pixels are "interesting" in the [previous notebook](Interesting.ipynb).

In [1]:
import pandas as pd

interesting = pd.read_pickle('data/interesting.pkl')
interesting.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,r,c,area,eccentricity,solidity,is_satellite
part,sequence,frame,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
test,1,1,1.0,91.0,5,0.0,1.0,
test,1,1,2.380952,331.0,21,0.941337,0.724138,
test,1,1,0.25,338.0,4,0.790569,1.0,
test,1,1,0.666667,415.333333,9,0.942349,0.75,
test,1,1,7.464789,339.471831,142,0.981029,0.835294,


In [2]:
f'{len(interesting):,}'

'12,807,570'

'8,349,960'

## Feature extraction

In [3]:
import numpy as np
from scipy import stats

def region(img: np.ndarray, r: int, c: int, w: int):
    """Returns the square of length width with (x, y) being at the center."""
    return img[
        max(r - w, 0) : min(r + w + 1, img.shape[0]),
        max(c - w, 0) : min(c + w + 1, img.shape[1])
    ]

def extract_features(img, r, c):
    r3x3 = region(img, r, c, 3).ravel()
    r5x5 = region(img, r, c, 5).ravel()
    r7x7 = region(img, r, c, 7).ravel()
    val = img[r, c]
    return {
        'pixel_value': val,
        '3x3_std': r3x3.std(),
        '3x3_min': val - r3x3.min(),
        '3x3_max': val - r3x3.max(),
        '5x5_std': r5x5.std(),
        '5x5_entropy': stats.entropy(r5x5),
        '5x5_min': val - r5x5.min(),
        '5x5_max': val - r5x5.max(),
        '7x7_std': r7x7.std(),
        '7x7_entropy': stats.entropy(r7x7),
        '7x7_kurtosis': stats.kurtosis(r7x7),
        '7x7_skew': stats.skew(r7x7)
    }

Extract features for each interesting region.

In [5]:
from PIL import Image
import tqdm

samples = []

# There should be 32000 frames (5 * 1280 + 5 * 5120)
for (part, sequence, frame), locations in tqdm.tqdm(interesting.groupby(['part', 'sequence', 'frame']), position=0):

#for (sequence, frame), locations in tqdm.tqdm(interesting.loc['train'].groupby(['sequence', 'frame']), position=0): 
#    part = 'train'
    
    img = np.asarray(Image.open(f'data/spotGEO/{part}/{sequence}/{frame}.png')).astype(np.float32)
    
    for _, location in locations.iterrows():
    
        r = int(location['r'])
        c = int(location['c'])

        samples.append({
            'part': part,
            'sequence': sequence,
            'frame': frame,
            'r': r,
            'c': c,
            
            'is_satellite': location['is_satellite'],
            'area': location['area'],
            'eccentricity': location['eccentricity'],
            'solidity':  location['solidity'],
            
            **extract_features(img, r=r, c=c)
        })
        
samples = pd.DataFrame(samples)
samples = samples.set_index(['part', 'sequence', 'frame', 'r', 'c'])
samples.to_pickle('data/samples.pkl')
samples.head()

100%|██████████| 32000/32000 [2:47:08<00:00,  3.19it/s]   


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,is_satellite,area,eccentricity,solidity,pixel_value,3x3_std,3x3_min,3x3_max,5x5_std,5x5_entropy,5x5_min,5x5_max,7x7_std,7x7_entropy,7x7_kurtosis,7x7_skew
part,sequence,frame,r,c,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
test,1,1,1,91,,5,0.0,1.0,50.0,2.211288,11.0,0.0,2.067833,4.342666,12.0,0.0,1.977597,4.904221,0.794483,0.382591
test,1,1,2,331,,21,0.941337,0.724138,50.0,2.782049,11.0,-1.0,2.842636,4.475348,11.0,-5.0,6.136916,5.002543,3.388442,1.920722
test,1,1,0,338,,4,0.790569,1.0,50.0,2.564226,11.0,0.0,3.345021,4.186919,11.0,-9.0,6.92337,4.777256,1.608331,1.577891
test,1,1,0,415,,9,0.942349,0.75,42.0,2.315377,4.0,-6.0,2.140389,4.188386,4.0,-6.0,1.953611,4.786424,0.004218,0.148943
test,1,1,7,339,,142,0.981029,0.835294,66.0,8.833364,26.0,-2.0,7.947879,4.783086,27.0,-2.0,7.186401,5.405223,1.016366,1.443022


In [6]:
import pandas as pd

samples = pd.read_pickle('data/samples.pkl')

## Learning phase

Split into train and test.

In [7]:
from sklearn import utils

X_train = samples.loc['train'].copy()
y_train = X_train.pop('is_satellite').astype(bool)
X_train, y_train = utils.shuffle(X_train, y_train, random_state=42)

try:
    X_test = samples.loc['test'].drop(columns='is_satellite')
except KeyError:
    X_test = None

Do the LGBM CV dance.

In [8]:
from imblearn import pipeline
from imblearn import under_sampling
import lightgbm
from sklearn import metrics
from sklearn import model_selection
from sklearn import utils

model = lightgbm.LGBMClassifier(
    scale_pos_weight=2,
    num_leaves=2 ** 6,
    learning_rate=.01,
    metric='binary',
    random_state=42,
    min_child_samples=30,
    n_estimators=10_000
)

#sampler = under_sampling.RandomUnderSampler()

cv = model_selection.GroupKFold(n_splits=5)
groups = X_train.index.get_level_values('sequence')

oof = pd.Series(dtype=bool, index=X_train.index)
if X_test is not None:
    y_test = pd.DataFrame(index=X_test.index)

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train, y_train, groups=groups)):
    
    X_fit = X_train.iloc[fit_idx]
    y_fit = y_train.iloc[fit_idx]
    X_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]
    
    #X_fit, y_fit = sampler.fit_sample(X_fit, y_fit)
    
    model.fit(
        X_fit, y_fit,
        eval_set=[(X_fit, y_fit), (X_val, y_val)],
        eval_names=['fit', 'val'],
        early_stopping_rounds=20,
        verbose=100
    )
    oof.iloc[val_idx] = model.predict(X_val)
    
    if X_test is not None:
        y_test[i] = model.predict_proba(X_test)[:, 1]
    
    print()

print(metrics.classification_report(y_train, oof, digits=4))

Training until validation scores don't improve for 20 rounds
[100]	fit's binary_logloss: 0.00844038	val's binary_logloss: 0.00882558
[200]	fit's binary_logloss: 0.00679549	val's binary_logloss: 0.00750453
[300]	fit's binary_logloss: 0.00598325	val's binary_logloss: 0.0070257
[400]	fit's binary_logloss: 0.00546135	val's binary_logloss: 0.00680302
[500]	fit's binary_logloss: 0.00505394	val's binary_logloss: 0.0066706
[600]	fit's binary_logloss: 0.00475749	val's binary_logloss: 0.00659431
[700]	fit's binary_logloss: 0.00452712	val's binary_logloss: 0.00654414
[800]	fit's binary_logloss: 0.00432897	val's binary_logloss: 0.00650439
[900]	fit's binary_logloss: 0.00415768	val's binary_logloss: 0.00647043
[1000]	fit's binary_logloss: 0.00399074	val's binary_logloss: 0.00643956
[1100]	fit's binary_logloss: 0.00384083	val's binary_logloss: 0.00641294
[1200]	fit's binary_logloss: 0.00370009	val's binary_logloss: 0.00638777
[1300]	fit's binary_logloss: 0.00356258	val's binary_logloss: 0.00636054
[

precision    recall  f1-score   support

       False     0.9987    0.9996    0.9991   1683412
        True     0.9084    0.7537    0.8239      8899

    accuracy                         0.9983   1692311
   macro avg     0.9536    0.8766    0.9115   1692311
weighted avg     0.9982    0.9983    0.9982   1692311

Feature importances.

In [9]:
pd.Series(model.feature_importances_, index=X_train.columns).sort_values(ascending=False)

7x7_skew        19791
7x7_kurtosis    18805
3x3_std         18285
pixel_value     16102
7x7_std         15435
7x7_entropy     15351
5x5_entropy     14176
5x5_std         13755
eccentricity    13013
area            12607
solidity         9627
3x3_min          8028
5x5_min          8026
5x5_max          5939
3x3_max          5667
dtype: int32

## Out-of-fold predictions

In [10]:
%run toolbox.py

In [11]:
oof.head()

sequence  frame  r    c  
25        2      259  208    False
606       4      234  360    False
1166      2      220  556    False
1021      3      469  17     False
975       1      156  221    False
dtype: bool

In [12]:
save_predictions(oof[oof].sort_index(), path='oof.json', n_sequences=1280)

100%|██████████| 1280/1280 [00:02<00:00, 494.30it/s]


In [13]:
!python validation.py oof.json data/spotGEO/train_anno.json

Score: 0.273675, (MSE: 45254.810955)


Score: 0.277273, (MSE: 45259.145451)

## Test predictions

In [14]:
y_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,0,1,2,3,4
sequence,frame,r,c,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,1,91,0.000483,0.000607,0.000484,0.000161,0.000299
1,1,2,331,9.3e-05,0.000428,0.000263,0.000328,0.000377
1,1,0,338,0.000184,0.000134,0.000356,8.7e-05,0.00039
1,1,0,415,3e-06,1.3e-05,9e-06,3e-06,3e-06
1,1,7,339,0.000669,0.000924,0.001222,0.000375,0.000817


In [15]:
import zipfile

sightings = y_test.mean(axis='columns') > .33
save_predictions(sightings[sightings], path='submission.json', n_sequences=5120)

with zipfile.ZipFile('submission.zip', mode='w') as f:
    f.write('submission.json')

100%|██████████| 5120/5120 [00:11<00:00, 437.54it/s]


Next is [PostProcessing.ipynb](PostProcessing.ipynb).