In [1]:
from __future__ import print_function, division

import torchvision

import skimage
from PIL import Image

import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

from pprint import pprint
import json
from tqdm import tqdm

plt.ion()   # interactive mode

In [2]:
class SatelliteImageDataset(Dataset):
    """Dataset that serves every entry of a directory as an RGB image."""

    def __init__(self, root_dir, transform=None, device=torch.device("cpu")):
        """
        Create a satellite image dataset
        :param root_dir: String, path of the directory where the images are stored
        :param transform: torchvision transform function, optional transform applied
                to each image after it is loaded
        :param device: PyTorch device (cpu or gpu) onto which tensors are moved
        """
        self.root_dir = root_dir
        # os.listdir returns entries in arbitrary order; sorting makes an index
        # map to the same file on every run.
        self.L_image_name = sorted(os.listdir(root_dir))
        self.transform = transform
        self.device = device

    def __len__(self):
        """Return the number of entries listed in root_dir."""
        return len(self.L_image_name)

    def __getitem__(self, idx):
        """
        Load the idx-th image, apply the optional transform and move it to the device
        :param idx: int or tensor, position of the image in the sorted listing
        :return: the transformed image; moved to self.device when it is a tensor,
                returned unchanged otherwise
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        image = pil_loader(os.path.join(self.root_dir,
                                        self.L_image_name[idx]))
        if self.transform is not None:
            image = self.transform(image)
        # Only tensors have .to(); when no tensor-producing transform is given
        # the sample is still a PIL image and is returned as-is instead of crashing.
        if torch.is_tensor(image):
            image = image.to(self.device)
        return image

def pil_loader(path):
    """
    Load an image file with PIL and convert it to RGB
    :param path: String, complete path of the image file
    :return: PIL image in RGB mode
    """
    # The context manager closes the underlying file as soon as the converted
    # copy has been built, instead of leaving the handle to the garbage collector.
    with Image.open(path) as image:
        return image.convert("RGB")
    
def show_tensor_image(tensor):
    """
    Take a tensor and show the corresponding image
    :param tensor: Pytorch Tensor, [channels, height, width]
    :return:
    """
    # numpy() only works on CPU tensors that do not require grad, so detach
    # and bring the data back to the CPU first.
    # permute reorders [channels, height, width] into [height, width, channels].
    image = tensor.detach().cpu().permute(1, 2, 0)
    io.imshow(image.numpy())

In [3]:
def create_list_label_sample(data_dir, dataset_name):
    """
    List the sample images and the label files of a dataset, keeping matched pairs only
    :param data_dir: String, directory that contains the datasets
    :param dataset_name: String, name of the dataset folder; its "samples" and
            "labels" sub-folders are walked recursively
    :return: (list_sample_path, list_label_path), two lists of absolute paths sorted
            by file id and restricted to the ids present on both sides
    """
    def file_id(path, extension):
        # File name without its directory and without the expected extension
        name = os.path.basename(path)
        return name[:-len(extension)] if name.endswith(extension) else name

    def list_sorted(name, extension):
        paths = []
        for dir_, _, filenames in os.walk(os.path.join(data_dir, dataset_name, name)):
            for f in filenames:
                # Skip hidden files
                if not f.startswith("."):
                    paths.append(os.path.abspath(os.path.join(dir_, f)))
        # Sort on the id (not on the full file name) so that both lists end up
        # in the same order whatever their extension is
        paths.sort(key=lambda p: file_id(p, extension))
        return paths

    list_sample_path = list_sorted("samples", ".jpg")
    list_label_path = list_sorted("labels", ".json")
    # Ids that have both an image and a label
    common_ids = ({file_id(p, ".jpg") for p in list_sample_path}
                  & {file_id(p, ".json") for p in list_label_path})
    # Both filtered lists are returned (the unfiltered sample list used to leak out)
    list_sample_path = [p for p in list_sample_path if file_id(p, ".jpg") in common_ids]
    list_label_path = [p for p in list_label_path if file_id(p, ".json") in common_ids]
    return list_sample_path, list_label_path
    
# Path
data_dir = "/classification_speed_boat/data/"
dataset_name = "train"

list_sample_path, list_label_path = create_list_label_sample(data_dir, dataset_name)
# Sanity check: number of (sample, label) pairs whose file names differ once the
# extension is removed (expected: 0). The whole stem is compared; slicing a fixed
# number of characters used to drop the last character of each id.
print(sum(
    os.path.splitext(os.path.basename(sample_path))[0] != os.path.splitext(os.path.basename(label_path))[0]
    for sample_path, label_path in zip(list_sample_path, list_label_path)
))


0


In [4]:
data_dir2 = "/classification_speed_boat/data/"
dataset_name2 = "test_students"

list_sample_path2, list_label_path2 = create_list_label_sample(data_dir2, dataset_name2)
# Sanity check: number of (sample, label) pairs whose file names differ once the
# extension is removed (expected: 0). The whole stem is compared; slicing a fixed
# number of characters used to drop the last character of each id.
print(sum(
    os.path.splitext(os.path.basename(sample_path))[0] != os.path.splitext(os.path.basename(label_path))[0]
    for sample_path, label_path in zip(list_sample_path2, list_label_path2)
))

0


In [5]:
# Pick one image path and one label path to inspect by hand.
# NOTE(review): the two indices differ (0 vs 80), so these presumably do not
# describe the same image — confirm this is intended.
sample = list_sample_path[0]
label = list_label_path[80]

In [6]:
# Display the selected image path.
sample

'/classification_speed_boat/data/train/samples/431c7e24-dab7-41de-b568-d8ad25b39704/ac152bf238910f22d049f9ba315686fc0ad8688e/00007ef36dd75f697905cda4d3c90dc7.jpg'

In [7]:
"""
for label in list_label2:
    with open(label, 'rb') as f:
        boat_info = json.load(f)
        for elem in boat_info["features"]:
            if "record_id" in elem["properties"]:
                pprint(elem["properties"]) 
                break
"""

'\nfor label in list_label2:\n    with open(label, \'rb\') as f:\n        boat_info = json.load(f)\n        for elem in boat_info["features"]:\n            if "record_id" in elem["properties"]:\n                pprint(elem["properties"]) \n                break\n'

In [8]:
# Parse the selected label file and pretty-print its whole content.
with open(label, 'rb') as f:
    boat_info = json.load(f)
pprint(boat_info)

{'features': [{'geometry': {'coordinates': [[[768.0, 600.4],
                                             [751.8, 605.8],
                                             [754.3, 613.6],
                                             [768.0, 609.1],
                                             [768.0, 600.4]]],
                            'type': 'Polygon'},
               'properties': {'angle': 71.7,
                              'comment': None,
                              'confidence': None,
                              'created_at': '2019-09-26T12:22:56.101046',
                              'dataset_id': '21928740-73ad-44db-8517-6f1ea076876c',
                              'id': 'record.5ea3398c-e058-11e9-a9bf-4e46ad189ba4',
                              'image_2_id': None,
                              'image_id': '37c89d0527e0edbe0b0d589e94715780731af818',
                              'job_id': None,
                              'jobtask_id': None,
                              

In [9]:
# Keep the first coordinate list of the first feature's geometry, then display it.
t = boat_info["features"][0]["geometry"]["coordinates"][0]
boat_info["features"][0]["geometry"]["coordinates"][0]

[[768.0, 600.4],
 [751.8, 605.8],
 [754.3, 613.6],
 [768.0, 609.1],
 [768.0, 600.4]]

In [10]:
# Convert the vertex list to an array and display the smallest value of its first column.
# Note: this cell rebinds `t` from a list to an array.
t = np.array(t)
np.min(t[:,0])

751.8

In [11]:
def polygone_to_min_max_coordinates(polygone):
    """
    Compute the integer bounding box of a polygon
    :param polygone: sequence of [x, y] vertices; the ring may or may not repeat
            its first vertex at the end
    :return: (xmin, xmax, ymin, ymax) as ints, where each min is the floor of the
            smallest coordinate and each max is the floor of the largest one plus 1
    """
    # A repeated closing vertex cannot change a min or a max, so every vertex is
    # kept: nothing is lost on an unclosed ring and a single point still works.
    points = np.asarray(polygone, dtype=float)
    xs, ys = points[:, 0], points[:, 1]
    # floor (rather than int truncation) stays correct for negative coordinates
    xmin = int(np.floor(xs.min()))
    xmax = int(np.floor(xs.max())) + 1
    ymin = int(np.floor(ys.min()))
    ymax = int(np.floor(ys.max())) + 1
    return xmin, xmax, ymin, ymax

In [55]:
def boat_info_from_json(label_path, image_path):
    """
    Extract one row of information per speed-tagged boat of a label file
    :param label_path: String, path of the JSON label file
    :param image_path: String, path of the matching image, copied as-is into every row
    :return: list of rows [record_id, xmin, xmax, ymin, ymax, angle, length, width,
            kept_percentage, tag, image_path]; features without a "record_id" or
            without an "idle"/"fast"/"slow" tag are skipped
    """
    # Read the json file containing the boat info
    with open(label_path, 'rb') as f:
        boat_info = json.load(f)
    # l_res contains the final info to keep
    l_res = []
    for feature in boat_info["features"]:
        properties = feature["properties"]
        # "tags" can be missing or null: both mean that there is no speed tag
        tags = properties.get("tags") or ()
        speed = None
        # When several speed tags are present, the last one of this list wins
        for speed_tag in ["idle", "fast", "slow"]:
            if speed_tag in tags:
                speed = speed_tag
        # If there is no record_id or no speed tag then skip the feature
        if "record_id" not in properties or speed is None:
            continue
        # Bounding box of the first coordinate list of the geometry
        xmin, xmax, ymin, ymax = polygone_to_min_max_coordinates(feature["geometry"]["coordinates"][0])
        l_res.append([
            properties["record_id"],
            xmin,
            xmax,
            ymin,
            ymax,
            properties["angle"],
            properties["length"],
            properties["width"],
            properties["kept_percentage"],
            speed,
            image_path,
        ])
    return l_res

def boat_into_to_csv(file_name_res, list_label_path, list_sample_path):
    """
    Gather the boat information of every label file and write it to a CSV file
    :param file_name_res: String, path of the CSV file to write
    :param list_label_path: list of String, paths of the JSON label files
    :param list_sample_path: list of String, image paths, paired by position with
            list_label_path
    :return:
    """
    col_names = ["record_id", "xmin", "xmax", "ymin", "ymax", "angle", "length", "width", "kept_percentage", "tag", "image_path"]
    l_info = []
    for label_path, image_path in zip(list_label_path, list_sample_path):
        l_info += boat_info_from_json(label_path, image_path)
    # Transform list to Pandas dataframe
    df_info = pd.DataFrame(l_info, columns=col_names)
    # Removing duplicate record_id by keeping only the best 'kept_percentage'.
    # A stable descending sort puts the best row of each record first (missing
    # values last), drop_duplicates keeps that row, and the final sort restores
    # the record_id ordering. No groupby.apply is involved, so record_id always
    # stays a regular column of the written file.
    df_info = (
        df_info
        .sort_values("kept_percentage", ascending=False, kind="stable")
        .drop_duplicates("record_id", keep="first")
        .sort_values("record_id", kind="stable")
    )
    df_info.to_csv(file_name_res, sep=",", index=False)

In [56]:
# Write the boat information of the train listing to "toto.csv" (path relative to the working directory).
boat_into_to_csv("toto.csv", list_label_path, list_sample_path)

In [57]:
# Reload the exported file as a dataframe.
df_info = pd.read_csv("toto.csv")

In [54]:
# Shape of the array of distinct "zone_id" values.
# NOTE(review): "zone_id" is not among the columns written by boat_into_to_csv,
# and this cell's execution count (54) is lower than those of the cells above
# (55-57) — the shown output presumably comes from an earlier version of the
# export; confirm the cell still runs on a fresh kernel.
df_info["zone_id"].unique().shape

(333,)

In [None]:
# Path
root_dir_train = "/home/maxence/Documents/hackathon/train"
root_dir_valid = "/home/maxence/Documents/hackathon/valid"
# Arg for transformation
# NOTE(review): 258 is an unusual image size — confirm that 256 was not intended
size = 258
transform = transforms.Compose([transforms.Resize((size, size)),
                                transforms.ToTensor()])
# Create datasets
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
train_dataset = SatelliteImageDataset(root_dir_train, transform, device)
# The validation set reads its own directory (it used to reuse the training one)
valid_dataset = SatelliteImageDataset(root_dir_valid, transform, device)
# create loader
batch_size = 64
num_worker = 0
train_loader = DataLoader(train_dataset, batch_size=batch_size,
                          num_workers=num_worker, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size,
                          num_workers=num_worker, shuffle=True)

In [None]:
# Fetch one batch from the training loader and print its shape.
batch = next(iter(train_loader))
print(batch.shape)

In [None]:
# Display the first image of the batch.
show_tensor_image(batch[0])