# Introduction

This notebook loads the MapAI data from the Hugging Face Hub. It also computes some basic information about the images: <br>
* what percentage of each image is labeled as 1 in the mask, and 
* which images contain buildings and which are empty.

# Download the data

In [1]:
from pathlib import Path
import shutil
from datasets import load_dataset
import pandas as pd
import datetime

In [2]:
import sys

# Make the local modules imported later in this notebook (constants, utils,
# plotting) importable — presumably they live in ../src; confirm repo layout.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path.
if "../src" not in sys.path:
    sys.path.append("../src")

In [2]:
# Load the MapAI training data with `datasets.load_dataset`; the bare
# `dataset` expression below displays the available splits and their features.
dataset = load_dataset("sjyhne/mapai_training_data")
dataset

Found cached dataset mapai_training_data (/home/alex/.cache/huggingface/datasets/sjyhne___mapai_training_data/building_segmentation/1.0.0/b0b52f8c47ddbeae1962ab524cabb5fbed58d91cc70f9ac4c5981c071ad5f248)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['image', 'lidar', 'mask'],
        num_rows: 7000
    })
    validation: Dataset({
        features: ['image', 'lidar', 'mask'],
        num_rows: 1500
    })
})

Move the data to a user-defined location:

In [3]:
# Update in constants.py to point to wherever you want to store the raw MapAI data
from constants import DATADIR

In [4]:
# Display the data directory imported from constants.py as a sanity check.
DATADIR

PosixPath('/home/alex/data/mapai')

In [5]:
# One-off switch: set to True to relocate the "../../data" folder to DATADIR.
# Left as False so that re-running the notebook does not attempt the move.
move = False

if move:
    shutil.move("../../data", DATADIR)

In [5]:
# Pull the two splits out of the DatasetDict under short names.
train = dataset['train']
val = dataset['validation']

In [6]:
# Display the training split (features and number of rows).
train

Dataset({
    features: ['image', 'lidar', 'mask'],
    num_rows: 7000
})

# Add information about building extent

In [6]:
from PIL import Image
import numpy as np, matplotlib.pyplot as plt

In [7]:
from utils import get_building_percentage, count_buildings

## Compute and store in dataframes

In [10]:
# Collect the three columns of the training split, tag every row as
# "not validation", and turn the result into a DataFrame.
train_data = {column: train[column] for column in ('image', 'lidar', 'mask')}
train_data['is_val'] = False
train_df = pd.DataFrame(train_data)

In [10]:
from utils import update_paths

In [12]:
# Run update_paths (from utils) over each of the three file-path columns.
for column in ('image', 'lidar', 'mask'):
    train_df[column] = train_df[column].apply(update_paths)

In [13]:
# Apply get_building_percentage (from utils) to every entry of the 'mask'
# column and store the result per row.
# NOTE(review): the displayed values look like fractions in [0, 1] rather than 0-100 — confirm in utils.
train_df['mask_percentage'] = train_df['mask'].apply(get_building_percentage)

In [14]:
# Preview the first rows of the training frame, including the new mask_percentage column.
train_df.head()

Unnamed: 0,image,lidar,mask,is_val,mask_percentage
0,/home/alex/data/mapai/data/train/images/6179_4...,/home/alex/data/mapai/data/train/lidar/6179_49...,/home/alex/data/mapai/data/train/masks/6179_49...,False,0.155224
1,/home/alex/data/mapai/data/train/images/6051_6...,/home/alex/data/mapai/data/train/lidar/6051_69...,/home/alex/data/mapai/data/train/masks/6051_69...,False,0.0
2,/home/alex/data/mapai/data/train/images/6121_8...,/home/alex/data/mapai/data/train/lidar/6121_86...,/home/alex/data/mapai/data/train/masks/6121_86...,False,0.017824
3,/home/alex/data/mapai/data/train/images/6173_6...,/home/alex/data/mapai/data/train/lidar/6173_63...,/home/alex/data/mapai/data/train/masks/6173_63...,False,0.0
4,/home/alex/data/mapai/data/train/images/6147_4...,/home/alex/data/mapai/data/train/lidar/6147_48...,/home/alex/data/mapai/data/train/masks/6147_48...,False,0.182664


In [15]:
# Same construction as for the training split, but with every row tagged
# as belonging to the validation set.
val_data = {column: val[column] for column in ('image', 'lidar', 'mask')}
val_data['is_val'] = True
val_df = pd.DataFrame(val_data)

In [16]:
# Run update_paths (from utils) over each of the three file-path columns.
for column in ('image', 'lidar', 'mask'):
    val_df[column] = val_df[column].apply(update_paths)

In [17]:
# Apply get_building_percentage (from utils) to every entry of the 'mask'
# column of the validation frame and store the result per row.
val_df['mask_percentage'] = val_df['mask'].apply(get_building_percentage)

In [18]:
# Stack training and validation rows into one frame with a fresh 0..n-1 index;
# the 'is_val' column keeps track of which split each row came from.
df = pd.concat([train_df, val_df], ignore_index=True)

In [19]:
# Preview the first rows of the combined train + validation frame.
df.head()

Unnamed: 0,image,lidar,mask,is_val,mask_percentage
0,/home/alex/data/mapai/data/train/images/6179_4...,/home/alex/data/mapai/data/train/lidar/6179_49...,/home/alex/data/mapai/data/train/masks/6179_49...,False,0.155224
1,/home/alex/data/mapai/data/train/images/6051_6...,/home/alex/data/mapai/data/train/lidar/6051_69...,/home/alex/data/mapai/data/train/masks/6051_69...,False,0.0
2,/home/alex/data/mapai/data/train/images/6121_8...,/home/alex/data/mapai/data/train/lidar/6121_86...,/home/alex/data/mapai/data/train/masks/6121_86...,False,0.017824
3,/home/alex/data/mapai/data/train/images/6173_6...,/home/alex/data/mapai/data/train/lidar/6173_63...,/home/alex/data/mapai/data/train/masks/6173_63...,False,0.0
4,/home/alex/data/mapai/data/train/images/6147_4...,/home/alex/data/mapai/data/train/lidar/6147_48...,/home/alex/data/mapai/data/train/masks/6147_48...,False,0.182664


In [20]:
# Start with every row marked as "no building"; rows with a non-zero
# mask_percentage are switched to True in the following cell.
df['is_building'] = False

In [21]:
# Flag every row whose mask_percentage is strictly positive.
df.loc[df['mask_percentage'].gt(0), 'is_building'] = True

In [22]:
# Total number of rows in the combined frame (training + validation).
len(df)

8500

In [23]:
# Number of rows flagged as containing a building (True) versus not (False).
df.is_building.value_counts()

True     6458
False    2042
Name: is_building, dtype: int64

## Save to CSV

In [25]:
# Write the combined frame to a CSV stamped with today's date, without the index column.
original_csv = f'../csv/train_val_original-{datetime.date.today()}.csv'
df.to_csv(original_csv, index=False)

# Inspect data

In [6]:
# Reload the saved frame from disk. The file name is pinned to the snapshot
# written on 2022-11-24; change the date here to inspect a different export.
# (Plain string literal: the previous f-prefix had no placeholders.)
df = pd.read_csv('../csv/train_val_original-2022-11-24.csv')

In [7]:
from utils import update_paths

In [8]:
# Run update_paths (from utils) over the three file-path columns of the
# reloaded frame — presumably so the stored paths match this machine's
# DATADIR; confirm in utils.
for column in ('image', 'lidar', 'mask'):
    df[column] = df[column].apply(update_paths)

In [9]:
# Preview the reloaded frame after the path columns have been rewritten.
df.head()

Unnamed: 0,image,lidar,mask,is_val,mask_percentage,is_building
0,/home/alex/data/mapai/data/train/images/6179_4...,/home/alex/data/mapai/data/train/lidar/6179_49...,/home/alex/data/mapai/data/train/masks/6179_49...,False,0.155224,True
1,/home/alex/data/mapai/data/train/images/6051_6...,/home/alex/data/mapai/data/train/lidar/6051_69...,/home/alex/data/mapai/data/train/masks/6051_69...,False,0.0,False
2,/home/alex/data/mapai/data/train/images/6121_8...,/home/alex/data/mapai/data/train/lidar/6121_86...,/home/alex/data/mapai/data/train/masks/6121_86...,False,0.017824,True
3,/home/alex/data/mapai/data/train/images/6173_6...,/home/alex/data/mapai/data/train/lidar/6173_63...,/home/alex/data/mapai/data/train/masks/6173_63...,False,0.0,False
4,/home/alex/data/mapai/data/train/images/6147_4...,/home/alex/data/mapai/data/train/lidar/6147_48...,/home/alex/data/mapai/data/train/masks/6147_48...,False,0.182664,True


In [10]:
from plotting import plot_image_and_masks_from_df

In [11]:
from ipywidgets import interact, interactive, IntSlider, Select, RadioButtons, fixed, BoundedIntText

In [12]:
# Controls handed to plot_image_and_masks_from_df: the frame itself (fixed,
# so no widget is created for it), the row index to show, the figure size,
# and a True/False choice labelled "With segmentation".
plot_controls = dict(
    df=fixed(df),
    imgidx=BoundedIntText(min=0, max=len(df) - 1, step=1, value=0),
    figsize=BoundedIntText(min=4, max=12, step=1, value=6),
    with_segm=RadioButtons(options=[True, False], value=True,
                           description="With segmentation"),
)
interactive_plot = interactive(plot_image_and_masks_from_df, **plot_controls)

# Keep a handle on the container's last child — presumably the output area;
# confirm against the ipywidgets documentation.
output = interactive_plot.children[-1]

In [13]:
# Render the interactive widget built in the previous cell.
interactive_plot

interactive(children=(BoundedIntText(value=0, description='imgidx', max=8499), BoundedIntText(value=6, descrip…

# Insert location information

In [14]:
def get_location(fn):
    """Return the location prefix encoded in a file name.

    The file stem (the name without directory and final extension) is cut at
    its last underscore and everything before that underscore is returned,
    e.g. ``"images/6179_495_12.tif"`` gives ``"6179_495"``. A stem without
    any underscore yields the empty string.

    Parameters
    ----------
    fn : str or path-like
        Path of the file.

    Returns
    -------
    str
        The part of the stem preceding its last underscore.
    """
    location, _, _ = Path(fn).stem.rpartition("_")
    return location

In [15]:
# Derive the location identifier of every row from its image file name.
df['location'] = df['image'].map(get_location)

In [16]:
# Preview the frame with the newly added 'location' column.
df.head()

Unnamed: 0,image,lidar,mask,is_val,mask_percentage,is_building,location
0,/home/alex/data/mapai/data/train/images/6179_4...,/home/alex/data/mapai/data/train/lidar/6179_49...,/home/alex/data/mapai/data/train/masks/6179_49...,False,0.155224,True,6179_495
1,/home/alex/data/mapai/data/train/images/6051_6...,/home/alex/data/mapai/data/train/lidar/6051_69...,/home/alex/data/mapai/data/train/masks/6051_69...,False,0.0,False,6051_690
2,/home/alex/data/mapai/data/train/images/6121_8...,/home/alex/data/mapai/data/train/lidar/6121_86...,/home/alex/data/mapai/data/train/masks/6121_86...,False,0.017824,True,6121_865
3,/home/alex/data/mapai/data/train/images/6173_6...,/home/alex/data/mapai/data/train/lidar/6173_63...,/home/alex/data/mapai/data/train/masks/6173_63...,False,0.0,False,6173_630
4,/home/alex/data/mapai/data/train/images/6147_4...,/home/alex/data/mapai/data/train/lidar/6147_48...,/home/alex/data/mapai/data/train/masks/6147_48...,False,0.182664,True,6147_481


In [37]:
# Write the frame, now including the 'location' column, to a CSV stamped with
# today's date. `index=False` is the documented boolean way to omit the index
# column, and matches the earlier train_val_original export; the previous
# `index=None` only worked because None is falsy.
df.to_csv(f'../csv/train_val_with_info-{datetime.date.today()}.csv', index=False)