# Solution

## Data loading

We've already determined which pixels are "interesting" in the [previous notebook](Interesting.ipynb).

In [1]:
import pandas as pd

# Candidate locations, presumably written by the previous notebook (Interesting.ipynb).
# Rows are indexed by (part, sequence, frame) and carry a position (r, c), the shape
# descriptors area / eccentricity / solidity, and the is_satellite label column.
# NOTE(review): unpickling can execute arbitrary code; only load files you produced yourself.
interesting = pd.read_pickle('data/interesting.pkl')
interesting.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,r,c,area,eccentricity,solidity,is_satellite
part,sequence,frame,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
test,1,1,2.0,334.0,5,0.0,1.0,
test,1,1,7.432836,339.350746,134,0.983181,0.853503,
test,1,1,10.727273,264.642857,154,0.978166,0.865169,
test,1,1,6.0,321.0,5,0.0,1.0,
test,1,1,18.99359,40.365385,156,0.97591,0.901734,


In [2]:
# Number of candidate rows, formatted with thousands separators.
f'{len(interesting):,}'

'8,349,960'

## Feature extraction

In [3]:
import numpy as np
from scipy import stats

def region(img: np.ndarray, r: int, c: int, w: int):
    """Return the window of `img` centred on row `r`, column `c`.

    The window reaches `w` pixels in every direction from the centre, so it
    is at most (2 * w + 1) pixels on a side. It is clipped at the image
    borders and is therefore smaller when the centre lies closer than `w`
    pixels to an edge.
    """
    n_rows, n_cols = img.shape[0], img.shape[1]
    top = max(r - w, 0)
    bottom = min(r + w + 1, n_rows)
    left = max(c - w, 0)
    right = min(c + w + 1, n_cols)
    return img[top:bottom, left:right]

def extract_features(img, r, c):
    """Compute intensity statistics around pixel (r, c) of `img`.

    Returns a dict holding the pixel value itself plus statistics of three
    neighbourhoods obtained from `region` with half-widths 3, 5 and 7. The
    `*_min` / `*_max` entries are the pixel value minus the neighbourhood
    minimum / maximum.

    NOTE(review): the '3x3' / '5x5' / '7x7' prefixes are labels only. Because
    `region` extends `w` pixels on each side of the centre, the windows are
    up to 7x7, 11x11 and 15x15 pixels. The keys are kept as they are because
    they end up as column names of the samples table.
    """
    near, mid, far = (region(img, r, c, half_width).ravel() for half_width in (3, 5, 7))
    centre = img[r, c]

    features = {
        'pixel_value': centre,
        '3x3_std': near.std(),
        '3x3_min': centre - near.min(),
        '3x3_max': centre - near.max(),
        '5x5_std': mid.std(),
        '5x5_entropy': stats.entropy(mid),
        '5x5_min': centre - mid.min(),
        '5x5_max': centre - mid.max(),
        '7x7_std': far.std(),
        '7x7_entropy': stats.entropy(far),
        '7x7_kurtosis': stats.kurtosis(far),
        '7x7_skew': stats.skew(far)
    }
    return features

Extract features for each interesting region.

In [33]:
from PIL import Image
import tqdm

# Feature rows keyed by (part, sequence, frame, r, c); turned into a DataFrame below.
records = {}

# There should be 32000 frames (5 * 1280 + 5 * 5120)
frames = interesting.groupby(['part', 'sequence', 'frame'])
for (part, sequence, frame), locations in tqdm.tqdm(frames, position=0):

    # Open the frame in a context manager so its file handle is released as soon as
    # the pixels have been copied into the float32 array.
    with Image.open(f'data/spotGEO/{part}/{sequence}/{frame}.png') as png:
        img = np.asarray(png).astype(np.float32)

    for _, location in locations.iterrows():

        # Candidate positions are fractional; int() truncates them to pixel indices.
        # NOTE(review): two candidates of one frame that truncate to the same (r, c)
        # share a key, so the later one silently replaces the earlier one.
        r = int(location['r'])
        c = int(location['c'])

        records[part, sequence, frame, r, c] = {
            'is_satellite': location['is_satellite'],
            'area': location['area'],
            'eccentricity': location['eccentricity'],
            'solidity':  location['solidity'],
            **extract_features(img, r=r, c=c)
        }

samples = pd.DataFrame.from_dict(records, orient='index')
samples.index.names = ['part', 'sequence', 'frame', 'r', 'c']
samples.to_pickle('data/samples.pkl')
samples.head()

100%|██████████| 32000/32000 [1:33:12<00:00,  5.72it/s]  


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,is_satellite,area,eccentricity,solidity,pixel_value,3x3_std,3x3_min,3x3_max,5x5_std,5x5_entropy,5x5_min,5x5_max,7x7_std,7x7_entropy,7x7_kurtosis,7x7_skew
part,sequence,frame,r,c,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
test,1,1,2,334,,5,0.0,1.0,51.0,2.742543,12.0,0.0,4.62694,4.472527,12.0,-16.0,7.047171,5.000321,1.346526,1.51755
test,1,1,7,339,,134,0.983181,0.853503,66.0,8.833364,26.0,-2.0,7.947879,4.783086,27.0,-2.0,7.186401,5.405223,1.016366,1.443022
test,1,1,10,264,,154,0.978166,0.865169,94.0,18.666742,54.0,-7.0,17.242287,4.750768,55.0,-7.0,15.642723,5.376112,1.676764,1.720753
test,1,1,6,321,,5,0.0,1.0,51.0,2.940476,11.0,0.0,2.627891,4.794033,12.0,0.0,2.687508,5.345273,0.914122,0.866791
test,1,1,18,40,,156,0.97591,0.901734,96.0,18.950119,55.0,-3.0,17.164404,4.751998,59.0,-3.0,15.657648,5.376683,1.718475,1.72562


## Learning phase

Split into train and test.

In [35]:
from sklearn import utils

# Training part: separate the target column from the features, then shuffle
# both together with a fixed seed.
train = samples.loc['train']
y_train = train['is_satellite'].astype(bool)
X_train = train.drop(columns='is_satellite')
X_train, y_train = utils.shuffle(X_train, y_train, random_state=42)

# The test part is optional: a KeyError from the lookup leaves X_test as None.
try:
    test = samples.loc['test']
    X_test = test.drop(columns='is_satellite')
except KeyError:
    X_test = None

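Train a LightGBM classifier with 5-fold cross-validation grouped by sequence, collecting out-of-fold predictions for the training set and per-fold probabilities for the test set.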

In [59]:
?lightgbm.LGBMClassifier

[0;31mInit signature:[0m
[0mlightgbm[0m[0;34m.[0m[0mLGBMClassifier[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mboosting_type[0m[0;34m=[0m[0;34m'gbdt'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnum_leaves[0m[0;34m=[0m[0;36m31[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_depth[0m[0;34m=[0m[0;34m-[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlearning_rate[0m[0;34m=[0m[0;36m0.1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_estimators[0m[0;34m=[0m[0;36m100[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msubsample_for_bin[0m[0;34m=[0m[0;36m200000[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mobjective[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mclass_weight[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_split_gain[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_child_weight[0m[0;34m=[0m[0;36m0.001[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_ch

In [61]:
import lightgbm
from sklearn import metrics
from sklearn import model_selection
from sklearn import utils

# Small learning rate with a large tree budget; training in each fold is cut short
# by the early-stopping setting passed to fit() below.
model = lightgbm.LGBMClassifier(
    n_estimators=10_000,
    learning_rate=.01,
    num_leaves=2 ** 6,
    min_child_samples=30,
    scale_pos_weight=2,
    metric='binary',
    random_state=42
)

# Folds are grouped by sequence, so rows of one sequence never sit on both the
# fitting and the validation side of the same split.
groups = X_train.index.get_level_values('sequence')
cv = model_selection.GroupKFold(n_splits=5)

# Out-of-fold class predictions for the training rows.
oof = pd.Series(dtype=bool, index=X_train.index)
# Test probabilities, one column per fold (only when a test part exists).
if X_test is not None:
    y_test = pd.DataFrame(index=X_test.index)

for fold, (fit_rows, val_rows) in enumerate(cv.split(X_train, y_train, groups=groups)):

    X_fit, y_fit = X_train.iloc[fit_rows], y_train.iloc[fit_rows]
    X_val, y_val = X_train.iloc[val_rows], y_train.iloc[val_rows]

    # The fitting data is passed again in eval_set so that its loss is logged
    # next to the validation loss under the names 'fit' and 'val'.
    model.fit(
        X_fit, y_fit,
        eval_set=[(X_fit, y_fit), (X_val, y_val)],
        eval_names=['fit', 'val'],
        early_stopping_rounds=20,
        verbose=100
    )
    oof.iloc[val_rows] = model.predict(X_val)

    if X_test is not None:
        # Probability of the positive class from this fold's model.
        y_test[fold] = model.predict_proba(X_test)[:, 1]

    print()

print(metrics.classification_report(y_train, oof, digits=4))

Training until validation scores don't improve for 20 rounds
[100]	fit's binary_logloss: 0.010353	val's binary_logloss: 0.0111353
[200]	fit's binary_logloss: 0.00797899	val's binary_logloss: 0.00920705
[300]	fit's binary_logloss: 0.00679675	val's binary_logloss: 0.00850492
[400]	fit's binary_logloss: 0.00611734	val's binary_logloss: 0.00817978
[500]	fit's binary_logloss: 0.00559571	val's binary_logloss: 0.00797129
[600]	fit's binary_logloss: 0.00522142	val's binary_logloss: 0.00783527
[700]	fit's binary_logloss: 0.0049254	val's binary_logloss: 0.00774381
[800]	fit's binary_logloss: 0.00468476	val's binary_logloss: 0.00768711
[900]	fit's binary_logloss: 0.00446799	val's binary_logloss: 0.00764279
[1000]	fit's binary_logloss: 0.00426088	val's binary_logloss: 0.00759326
[1100]	fit's binary_logloss: 0.00406992	val's binary_logloss: 0.00755357
[1200]	fit's binary_logloss: 0.0038895	val's binary_logloss: 0.00751615
[1300]	fit's binary_logloss: 0.00372246	val's binary_logloss: 0.00748116
[140

precision    recall  f1-score   support

       False     0.9987    0.9996    0.9991   1683412
        True     0.9084    0.7537    0.8239      8899

    accuracy                         0.9983   1692311
   macro avg     0.9536    0.8766    0.9115   1692311
weighted avg     0.9982    0.9983    0.9982   1692311

Feature importances of the model fitted on the last cross-validation fold.

In [62]:
# `model` is refit on every fold, so these importances belong to the last fold's fit only.
importances = pd.Series(model.feature_importances_, index=X_train.columns)
importances.sort_values(ascending=False)

7x7_skew        16226
3x3_std         15515
7x7_kurtosis    14378
pixel_value     13211
7x7_entropy     12990
7x7_std         12600
5x5_entropy     11923
5x5_std         10648
eccentricity     8375
area             8329
3x3_min          7552
5x5_min          6739
solidity         5105
3x3_max          4495
5x5_max          3051
dtype: int32

## Out-of-fold predictions

In [63]:
# NOTE(review): save_predictions is called below but never defined in this notebook;
# presumably toolbox.py provides it — confirm.
%run toolbox.py

In [64]:
# Peek at the out-of-fold class predictions (one boolean per training row).
oof.head()

sequence  frame  r    c  
17        5      152  623    False
506       4      253  440    False
866       1      162  226    False
238       3      88   432    False
516       1      401  92     False
dtype: bool

In [65]:
# oof[oof] keeps only the rows predicted as satellites; they are sorted by index before saving.
save_predictions(oof[oof].sort_index(), path='oof.json', n_sequences=1280)

100%|██████████| 1280/1280 [00:02<00:00, 510.05it/s]


In [66]:
# Run the validation script on the saved out-of-fold predictions and the training annotation file.
!python validation.py oof.json data/spotGEO/train_anno.json

Score: 0.276092, (MSE: 44956.294525)


In [None]:
# Pasted validator output (not code); kept as a comment so the cell no longer raises a SyntaxError on Run All.
# Score: 0.276092, (MSE: 44956.294525)

## Test predictions

In [67]:
# One column per CV fold, each holding that fold model's positive-class probability.
y_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,0,1,2,3,4
sequence,frame,r,c,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,2,334,0.000298,0.000424,0.000567,0.000758,0.000713
1,1,7,339,8e-05,7.2e-05,6.3e-05,0.000129,0.000108
1,1,10,264,0.000244,6.9e-05,0.000192,0.000126,0.000172
1,1,6,321,0.000535,0.000689,0.000592,0.000541,0.00103
1,1,18,40,0.000227,7e-05,0.00019,0.000145,0.000179


In [70]:
import zipfile

# Average the per-fold probabilities and keep the rows whose mean exceeds 0.5.
mean_proba = y_test.mean(axis='columns')
sightings = mean_proba > .5
save_predictions(sightings[sightings], path='submission.json', n_sequences=5120)

# Put the predictions file into a zip archive.
with zipfile.ZipFile('submission.zip', mode='w') as archive:
    archive.write('submission.json')

100%|██████████| 5120/5120 [00:10<00:00, 475.08it/s]


Next is [PostProcessing.ipynb](PostProcessing.ipynb).