In [29]:
# Useful starting lines
%matplotlib inline
import numpy as np
import os
import sys
from PIL import Image
import math
import re
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Get train data

## Get groundtruth

### Functions

In [30]:
def load_data(path_file):
    """load data."""
    data = np.genfromtxt(
        path_file, delimiter=",", skip_header=1)
    y = np.genfromtxt(
        path_file, delimiter=",", skip_header=1, usecols=[1],dtype=int)
    return y

### Load data groundtruth

In [31]:
DATA_FOLDER = 'DATA/'
DATA_TRAIN = 'train_submission.csv'

y = load_data(DATA_FOLDER+DATA_TRAIN)
print(y[:10])
print(y.shape)

[0 0 0 0 0 0 0 0 0 0]
(62500,)


## Get mean_data

In [32]:
mean_patches = []
for i in range(1, 101):
    image_filename = 'DATA/training/images/satImage_' + '%.3d' % i + '.png'
    
    img_number = int(re.search(r"\d+", image_filename).group(0))
    im = mpimg.imread(image_filename)
    patch_size = 16
    for j in range(0, im.shape[1], patch_size):
        for i in range(0, im.shape[0], patch_size):
            patch = im[i:i + patch_size, j:j + patch_size]
            mean_patch = np.mean(patch) #Get patch mean
            mean_patches.append(mean_patch)
            
            
print(len(mean_patches))
print(mean_patches[:10])

62500
[0.19116116, 0.19870813, 0.32159415, 0.36853552, 0.21805046, 0.3175756, 0.20872141, 0.20118976, 0.36087623, 0.253365]


## Normalization

#### Function

In [33]:
def standardize(x_list):
    mean = np.mean(x_list)
    std = np.std(x_list)
    for i in range(len(x_list)):
        x_list[i] = (x_list[i]-mean)/std
    return x_list

In [34]:
mean_patches = standardize(mean_patches)

print(len(mean_patches))
print(mean_patches[:10])
print(np.amax(mean_patches))

62500
[-1.0156374, -0.9559706, 0.0155737065, 0.38669503, -0.8030489, -0.016197154, -0.87680495, -0.9363507, 0.32614022, -0.52385]
5.291933


# Split train and test

## Function(s)

In [35]:
def split_data(x, y, ratio=0.7, seed=1):

    np.random.seed(seed)
    
    N = x.shape[0]
    cut = int(N*ratio)
    permutation = np.random.permutation(N)
    x,y = x[permutation],y[permutation]
    return x[:cut],x[cut:],y[:cut],y[cut:]

## Split

In [36]:
x = np.asarray(mean_patches).T

split_ratio = 0.7

x_tr, x_te, y_tr, y_te = split_data(x, y, ratio=split_ratio)

print(len(x_tr))
print(len(y_tr))
print(len(x_te))
print(len(y_te))

43750
43750
18750
18750


# Scores

## Functions

In [37]:
def classify(y, seuil):
    """projects y on {0,1}"""
    y[y<seuil] = 0
    y[y>=seuil] = 1
    return y

def check_model(x_test,y_test, seuil):
    y = classify(x_test, seuil)
    diff= (y_test == classify(y, seuil))
    accuracy = diff.sum()/len(diff)
    return y,accuracy

## Train Score

In [38]:
seuil = 0.4

check_model(x_tr, y_tr, seuil)

(array([0., 1., 0., ..., 0., 1., 0.], dtype=float32), 0.6219885714285714)

## Test Score

In [39]:
check_model(x_te, y_te, seuil)

(array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), 0.62544)

# Conclusion

Opt params:
    
    thresh = 0.4

Train score:  0.6220845714285714  +/- 0.0008428768046954921

Test  score:  0.625216  +/- 0.001966712544289519

Aicrownd score : F1 SCORE = 0.260, ACCURACY = 0.608 (Submission #205603)

# Submission

## Get the data

In [12]:
mean_patches = []
for i in range(1, 51):
    image_filename = 'DATA/test_set_images/test_' + str(i) +'/test_' + str(i) + '.png'
    
    img_number = int(re.search(r"\d+", image_filename).group(0))
    im = mpimg.imread(image_filename)
    patch_size = 16
    
    for j in range(0, im.shape[1], patch_size):
        for i in range(0, im.shape[0], patch_size):
            patch = im[i:i + patch_size, j:j + patch_size]
            mean_patch = np.mean(patch) #Get patch mean
            mean_patches.append(mean_patch)
            
            
print(len(mean_patches))
print(mean_patches[:10])

72200
[0.4114022, 0.27899304, 0.35060254, 0.23368056, 0.30746529, 0.2864941, 0.2691636, 0.29165134, 0.33160743, 0.3213797]


## Standerdize

In [13]:
mean_patches = standardize(mean_patches)

print(len(mean_patches))
print(mean_patches[:10])
print(np.amax(mean_patches))

72200
[0.7235832, -0.33005232, 0.23977467, -0.69062287, -0.10348666, -0.27036318, -0.40826926, -0.2293249, 0.08862259, 0.0072361478]
5.397431


## Apply threshold guess

In [14]:
seuil = 0.39

x = np.asarray(mean_patches).T
x = classify(x, seuil)

print(x[:10])

[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


## Write submission file

### Function(s)

In [15]:
def guess_to_submission_strings(image_filename, guess, initial_, img_number):
    #img_number = int(re.search(r"\d+", image_filename).group(0))
        
    im = mpimg.imread(image_filename)
    patch_size = 16
    k = initial_
    
    for j in range(0, im.shape[1], patch_size):
        for i in range(0, im.shape[0], patch_size):
            guess_i = guess[k]            
            patch = im[i:i + patch_size, j:j + patch_size]
            
            yield("{:03d}_{}_{}, {}".format(img_number, j, i, guess_i))
            k = k + 1

def guess_to_submission(submission_filename, *image_filenames, guess):
    with open(submission_filename, 'w') as f:
        f.write('id,prediction\n')
        
        initial_ = 0
        img_number = 1
        
        for fn in image_filenames[0:]:
            f.writelines('{}\n'.format(s) for s in guess_to_submission_strings(image_filename, guess, initial_, img_number))
            initial_ = initial_ + 1443
            img_number = img_number + 1

### Write threshold_guess_submission.csv

In [16]:
seuil = 0.4

guess = classify(x, seuil)

submission_filename = 'threshold_guess_submission.csv'
image_filenames = []
for i in range(1, 51):
    
    image_filename = 'DATA/test_set_images/test_' + str(i) +'/test_' + str(i) + '.png'
    print(image_filename, end="\r")
    image_filenames.append(image_filename)

guess_to_submission(submission_filename, *image_filenames, guess = guess)

DATA/test_set_images/test_50/test_50.png

## Get mask from submission file

### Function(s)

In [27]:
import math

label_file = 'Submissions/threshold_guess_submission.csv'

h = 16
w = h
imgwidth = int(math.ceil((600.0/w))*w)
imgheight = int(math.ceil((600.0/h))*h)
nc = 3

# Convert an array of binary labels to a uint8
def binary_to_uint8(img):
    rimg = (img * 255).round().astype(np.uint8)
    return rimg

def reconstruct_from_labels(image_id):
    im = np.zeros((imgwidth, imgheight), dtype=np.uint8)
    f = open(label_file)
    lines = f.readlines()
    image_id_str = '%.3d_' % image_id
    for i in range(1, len(lines)):
        line = lines[i]
        if not image_id_str in line:
            continue

        tokens = line.split(',')
        id = tokens[0]
        prediction = int(float(tokens[1]))
        tokens = id.split('_')
        i = int(float(tokens[1]))
        j = int(tokens[2])

        je = min(j+w, imgwidth)
        ie = min(i+h, imgheight)
        if prediction == 0:
            adata = np.zeros((w,h))
        else:
            adata = np.ones((w,h))

        im[j:je, i:ie] = binary_to_uint8(adata)

    Image.fromarray(im).save('Predictions/threshold_guess/prediction_' + '%.3d' % image_id + '.png')

    return im

### Images reconstruction

In [28]:
for i in range(1, 51):
    reconstruct_from_labels(i)

# Optimisation and cross validation

## Function(s)

In [147]:
from scipy.stats import t
from scipy import sqrt
from statistics import variance, mean 

def int_ech(values,conf=0.95) :
    n = len(values) 
    m = mean(values) 
    s = variance(values)
    proba = (1-conf)*100 ; proba = (100-proba/2)/100 
    ddl = n - 1
    intervalle = sqrt(s/n) * t.ppf(proba, ddl)
    return(intervalle)

## Scores and cross validation train

In [162]:
x = np.asarray(mean_patches).T

threshold_list = list(np.arange(-0.5, 0.5, 0.1))

seed_list = list(np.arange(0, 10, 1))

mean_scores = np.zeros((len(threshold_list), 3))

scores = []

for i in range(len(threshold_list)):
    thresh_ = threshold_list[i]
    for k in range(len(seed_list)):
        seed_ = seed_list[k]
        x_tr, x_te, y_tr, y_te = split_data(x, y, ratio=0.7, seed = seed_)
    
        scores.append(check_model(x_tr, y_tr, thresh_)[1])
    
    mean_scores[i] = [thresh_, np.mean(scores), int_ech(scores,conf=0.95)]
    scores = []

max_ = np.argmax(mean_scores[:, 1])

print("[thresh = ", mean_scores[max_][0], "], ", mean_scores[max_][1], " +/-", mean_scores[max_][2])

  intervalle = sqrt(s/n) * t.ppf(proba, ddl)


[thresh =  0.3999999999999998 ],  0.6220845714285714  +/- 0.0008428768046954921


## Cross validation test

In [163]:
thresh_ = 0.4

x = np.asarray(mean_patches).T

seed_list = list(np.arange(0, 10, 1))

scores = []
    
for k in range(len(seed_list)):
    seed_ = seed_list[k]
    
    x_tr, x_te, y_tr, y_te = split_data(x, y, ratio=0.7, seed = seed_)
        
    scores.append(check_model(x_te, y_te, thresh_)[1])

print(np.mean(scores), " +/-", int_ech(scores,conf=0.95))

0.625216  +/- 0.001966712544289519


  intervalle = sqrt(s/n) * t.ppf(proba, ddl)
