In [1]:
# 1. Generating training dataset for binary classification and the weights: labels_tmi_modified_valid.csv
# 2. Generating training dataset for HL binary classification and the weights: NAIP_data_highlow_50000_valid.csv
# 3. Generating prediction dataset: all_NAIP_2018.csv


In [None]:
# This script is used to pre-process the cropped border patches
import os
import numpy as np
import pandas as pd
import rasterio
import sys
import geopandas as gpd
from pathlib import Path
from tqdm import tqdm
from rasterio.mask import mask, raster_geometry_mask
from shapely.geometry import box
from rasterio.enums import Resampling
from itertools import product
from rasterio import windows
import rioxarray
import shapely
import shutil
from osgeo import gdal

# Resolve the notebook's working directory to an absolute path and make
# modules stored in it importable.
path_cur = os.path.abspath(os.curdir)
sys.path.extend([path_cur])

from os.path import dirname as up

In [2]:
# Both CSVs live two directory levels above the working directory.
project_root = up(up(path_cur))

# Input: the raw label table.
label_file = Path(project_root, 'MarshMapping_local/data/NAIP_data_processing/labels_tmi_modified.csv')
# Output: intended destination for the table with the validity columns added.
out_file = Path(project_root, 'MarshMapping_local/data/NAIP_data_processing/labels_tmi_modified_valid.csv')

In [3]:
# Load the label table. The cells below open the paths stored in its
# 'patch_name' and 'label' columns with rasterio.
df = pd.read_csv(label_file)

In [4]:
# Sanity check: open the first patch in the table and display the shape of
# its pixel array. `.iloc[0]` picks the first row by position, and the
# context manager closes the dataset as soon as the array has been read.
testfile = df['patch_name'].iloc[0]
with rasterio.open(testfile) as src:
    test_shape = src.read().shape
test_shape

(4, 256, 256)

In [5]:
def filter_image_size(img, size):
    """Check whether a raster is exactly ``size`` x ``size`` pixels.

    Parameters
    ----------
    img : str or path-like
        Path of the raster, opened with ``rasterio.open``.
    size : int
        Required height and width in pixels.

    Returns
    -------
    int
        1 if both the height and the width equal ``size``, otherwise 0.
    """
    # The dimensions come from the dataset header, so no pixel data has to be
    # read; the context manager closes the file handle once they are fetched.
    with rasterio.open(img) as src:
        height, width = src.height, src.width

    return 1 if height == size and width == size else 0

def filter_image_zero(img):
    """Check that a raster contains no zero-valued pixel in any band.

    Parameters
    ----------
    img : str or path-like
        Path of the raster, opened with ``rasterio.open``.

    Returns
    -------
    int
        0 if at least one pixel in any band equals 0, otherwise 1.

    Notes
    -----
    NOTE(review): presumably zeros mark no-data areas at the image border;
    confirm against how the patches were cropped.
    """
    # Read all bands, then release the file handle before testing the values.
    with rasterio.open(img) as src:
        image = src.read()

    # Only the existence of a zero matters, so there is no need to collect
    # the indices of every zero pixel.
    return 0 if np.any(image == 0) else 1

def calculate_weights(img, val):
    """Count the pixels in the first band of a raster that equal ``val``.

    Despite its name, this returns a raw pixel count; the class weights are
    derived from these counts later in the notebook.

    Parameters
    ----------
    img : str or path-like
        Path of the raster, opened with ``rasterio.open``.
    val : int or int-like
        Pixel value to count; it is passed through ``int()`` before comparing.

    Returns
    -------
    int
        Number of pixels in band 1 whose value equals ``int(val)``.
    """
    # Read band 1 only, then release the file handle before counting.
    with rasterio.open(img) as src:
        band = src.read(1)

    # Count the matches directly instead of copying them into a new array.
    return int(np.count_nonzero(band == int(val)))


def get_patch_name(img):
    """Return the final component (the file name) of the path ``img``."""
    _directory, file_name = os.path.split(img)
    return file_name
        

In [6]:
# Flag every patch: 'filter_size' is 1 when the patch is exactly 256 x 256
# pixels, 'filter_zero' is 1 when it contains no zero-valued pixel.
df['filter_size'] = df['patch_name'].apply(filter_image_size, args=(256,))
df['filter_zero'] = df['patch_name'].apply(filter_image_zero)

In [7]:
# Per label raster, count the pixels of each class in band 1:
# value 0 -> 'none_marsh', value 1 -> 'marsh'.
df['none_marsh'] = df['label'].apply(calculate_weights, args=(0,))
df['marsh'] = df['label'].apply(calculate_weights, args=(1,))

In [8]:
# A row is valid (1) only when the patch passed the size check and the zero
# check and its label raster contains both class 0 and class 1 pixels;
# every other row gets 0.
is_valid = (
    df['filter_size'].eq(1)
    & df['filter_zero'].eq(1)
    & df['none_marsh'].gt(0)
    & df['marsh'].gt(0)
)
df['valid'] = is_valid.astype('int64')

In [10]:

# Total pixels per class, summed over the valid rows only.
valid_rows = df.loc[df['valid'] == 1]
none_marsh_count = valid_rows['none_marsh'].sum()
marsh_count = valid_rows['marsh'].sum()

In [11]:
# Share of valid-patch pixels per class, ordered [none_marsh, marsh].
# NOTE(review): these are class frequencies, not inverse-frequency weights;
# confirm that this is what the training code expects.
pixel_total = none_marsh_count + marsh_count
weights = [count / pixel_total for count in (none_marsh_count, marsh_count)]

In [12]:
# Display the two class shares, ordered [none_marsh, marsh].
weights

[0.8345623386613673, 0.16543766133863266]

In [13]:
# These two columns are passed to rasterio.open as file paths, so name them
# accordingly; 'patch_name' is re-created from 'patch_path' in the next cell.
path_columns = {"patch_name": "patch_path", "label": "label_path"}
df = df.rename(columns=path_columns)

In [14]:
# Keep the bare file name of every patch next to its full path.
df['patch_name'] = df['patch_path'].apply(get_patch_name)

In [16]:
# Move 'patch_name' to the front and keep every other column in its current
# order. Selecting by name rather than by position means no column can be
# dropped by accident, and re-running the cell leaves the order unchanged.
leading_columns = ['patch_name']
df = df[leading_columns + [col for col in df.columns if col not in leading_columns]]

In [17]:
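# Export is disabled so that re-running the notebook does not overwrite the file.
# Uncomment to write the table to `out_file` (labels_tmi_modified_valid.csv).
# df.to_csv(out_file, encoding='utf-8', index=False)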

In [18]:
# Display the final table (pandas truncates the middle rows) for a visual check.
df

Unnamed: 0,patch_name,patch_path,label_path,filter_size,filter_zero,none_marsh,marsh,valid
0,m_3607612_ne_18_060_20180830_wgs84_tile_2816-6...,/rapids/notebooks/sciclone/geograd/Miranda/git...,/rapids/notebooks/sciclone/geograd/Miranda/git...,1,1,65536,0,0
1,m_3607612_ne_18_060_20180830_wgs84_tile_6144-1...,/rapids/notebooks/sciclone/geograd/Miranda/git...,/rapids/notebooks/sciclone/geograd/Miranda/git...,1,1,43651,21885,1
2,m_3607612_ne_18_060_20180830_wgs84_tile_6912-5...,/rapids/notebooks/sciclone/geograd/Miranda/git...,/rapids/notebooks/sciclone/geograd/Miranda/git...,1,1,57891,7645,1
3,m_3607501_sw_18_060_20180827_wgs84_tile_256-11...,/rapids/notebooks/sciclone/geograd/Miranda/git...,/rapids/notebooks/sciclone/geograd/Miranda/git...,1,1,60878,4658,1
4,m_3607501_sw_18_060_20180827_wgs84_tile_768-11...,/rapids/notebooks/sciclone/geograd/Miranda/git...,/rapids/notebooks/sciclone/geograd/Miranda/git...,1,1,64032,1504,1
...,...,...,...,...,...,...,...,...
3870,m_3807764_nw_18_060_20180827_wgs84_tile_10240-...,/rapids/notebooks/sciclone/geograd/Miranda/git...,/rapids/notebooks/sciclone/geograd/Miranda/git...,1,1,60930,4606,1
3871,m_3807764_nw_18_060_20180827_wgs84_tile_11008-...,/rapids/notebooks/sciclone/geograd/Miranda/git...,/rapids/notebooks/sciclone/geograd/Miranda/git...,1,1,56441,9095,1
3872,m_3807764_nw_18_060_20180827_wgs84_tile_11264-...,/rapids/notebooks/sciclone/geograd/Miranda/git...,/rapids/notebooks/sciclone/geograd/Miranda/git...,1,1,53419,12117,1
3873,m_3807764_nw_18_060_20180827_wgs84_tile_11520-...,/rapids/notebooks/sciclone/geograd/Miranda/git...,/rapids/notebooks/sciclone/geograd/Miranda/git...,1,1,48535,17001,1
