In [1]:
import os
from pathlib import Path
import pickle
import geopandas as gpd
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Isolating samples with high confidence

In [2]:
# Input locations: the filtered label GeoJSON and the directory holding the image files.
path = "../data/filtered_labels.geojson"
data_path = "/n/holyscratch01/tambe_lab/kayan/karena/images/"

In [3]:
# Load the labels, then drop every row whose `unique_id` does not match the
# stem (file name without extension) of an entry in the image directory.
label_df = gpd.read_file(path)
image_files = os.listdir(data_path)
dir_ids = [Path(name).stem for name in image_files]
has_image = label_df["unique_id"].isin(dir_ids)
label_df = label_df.loc[has_image]

In [4]:
# Minimum value of the `confidence` column for a label to count as
# high-confidence (the filter below compares with >=).
confidence_threshold = 5

In [16]:
# Partition the labels by confidence: rows at or above the threshold are
# "confident", rows strictly below it are "unconfident". Rows with a missing
# confidence satisfy neither comparison and land in neither frame.
confidence_scores = label_df["confidence"]
confident_label_df = label_df[confidence_scores.ge(confidence_threshold)]
unconfident_label_df = label_df[confidence_scores.lt(confidence_threshold)]

In [6]:
# total number of labelled samples (after dropping rows without an image file)
label_df.shape[0]

14641

In [7]:
# how many samples meet the confidence threshold
confident_label_df.shape[0]

7587

# Maintaining mine proportion in confident data

In [11]:
# fraction of all samples labelled as mine (label == 1)
mine_count_total = len(label_df[label_df["label"] == 1])
mine_count_total / len(label_df)

0.24567993989481593

In [17]:
# split the confident labels by class: label 1 = mine, label 0 = no mine
confident_classes = confident_label_df["label"]
confident_mine_df = confident_label_df[confident_classes.eq(1)]
confident_nomine_df = confident_label_df[confident_classes.eq(0)]

In [18]:
# fraction of the confident samples that are labelled as mine
confident_mine_count = len(confident_mine_df)
confident_mine_count / len(confident_label_df)

0.08751812310531172

In [19]:
# Subsample the confident no-mine rows so that the confident set ends up with
# the same no-mine : mine ratio as the full dataset (this restores the original
# class proportion; it does not produce a 50/50 balance).
num_mines_confident = len(confident_mine_df)
# number of no-mine samples per mine sample in the full dataset
scale_factor = len(label_df[label_df["label"] == 0])/len(label_df[label_df["label"] == 1])
# A fixed seed makes the draw reproducible across kernel restarts; 42 matches
# the `random_state` used for the train/val/test split.
confident_nomine_df = confident_nomine_df.sample(
    int(num_mines_confident*scale_factor),
    random_state=42,
)

# recombine: all confident mines plus the subsampled confident no-mines
confident_label_df = pd.concat([confident_mine_df, confident_nomine_df])

In [20]:
# size of the confident set after subsampling the no-mine class
confident_label_df.shape[0]

2702

In [21]:
# sanity check: the mine fraction of the resampled confident set should match
# the fraction measured on the whole dataset
resampled_confident_mines = confident_label_df[confident_label_df["label"] == 1]
len(resampled_confident_mines) / len(confident_label_df)

0.2457438934122872

# Maintaining mine proportion in unconfident data

In [22]:
# Restore the full dataset's mine proportion in the unconfident data. Here the
# mine class is the one that gets subsampled; every unconfident no-mine row is kept.
unconfident_mine_df = unconfident_label_df[unconfident_label_df["label"] == 1]
unconfident_nomine_df = unconfident_label_df[unconfident_label_df["label"] == 0]

num_nomines_unconfident = len(unconfident_nomine_df)
# number of mine samples per no-mine sample in the full dataset
scale_factor = len(label_df[label_df["label"] == 1])/len(label_df[label_df["label"] == 0])
# A fixed seed makes the draw reproducible across kernel restarts; 42 matches
# the `random_state` used for the train/val/test split.
unconfident_mine_df = unconfident_mine_df.sample(
    int(num_nomines_unconfident*scale_factor),
    random_state=42,
)

# recombine: subsampled unconfident mines plus all unconfident no-mines
unconfident_label_df = pd.concat([unconfident_mine_df, unconfident_nomine_df])

In [23]:
# sanity check: the mine fraction of the resampled unconfident set should match
# the fraction measured on the whole dataset
resampled_unconfident_mines = unconfident_label_df[unconfident_label_df["label"] == 1]
len(resampled_unconfident_mines) / len(unconfident_label_df)

0.24565257184697054

# Assign train/test/val split

In [26]:
# Build the train/val/test split.
#   * train            : confident samples only
#   * val and test     : confident samples plus a subsample of unconfident ones
# Confident samples are split 80/20 into (train+val)/test and the first part is
# split 80/20 again into train/val, i.e. 64% / 16% / 20% overall.
stratify_col = "country"
random_state = 42


def _stratify_labels(df):
    """Return the column of `df` to stratify on, or None when `stratify_col` is None."""
    return df[stratify_col] if stratify_col is not None else None


train_confident, test_confident = train_test_split(
    confident_label_df,
    stratify=_stratify_labels(confident_label_df),
    test_size=0.2,
    random_state=random_state,
)
train_confident, val_confident = train_test_split(
    train_confident,
    stratify=_stratify_labels(train_confident),
    test_size=0.2,
    random_state=random_state,
)

# Number of unconfident samples to add to val+test, chosen so that the
# unconfident : confident ratio there equals the ratio in `label_df`.
scale_factor = len(label_df[label_df["confidence"] < confidence_threshold])/len(label_df[label_df["confidence"] >= confidence_threshold])
num_unconfident_eval = int((len(test_confident)+len(val_confident))*scale_factor)
# Draw into a new frame with a fixed seed instead of overwriting
# `unconfident_label_df`: the draw is reproducible, the cell can be re-run, and
# the full unconfident set stays intact.
unconfident_eval_df = unconfident_label_df.sample(num_unconfident_eval, random_state=random_state)

# The confident test : val sizes are 0.20 : 0.16 = 5 : 4, so the test split
# receives 5/9 of the unconfident subsample and val the remaining 4/9.
val_unconfident, test_unconfident = train_test_split(
    unconfident_eval_df,
    stratify=_stratify_labels(unconfident_eval_df),
    test_size=5/9,
    random_state=random_state,
)

train = train_confident
val = pd.concat([val_confident, val_unconfident])
test = pd.concat([test_confident, test_unconfident])

In [27]:
# sizes of the train, val and test splits, in that order
tuple(len(part) for part in (train, val, test))

(1728, 835, 1044)

In [None]:
# Persist only the `unique_id` values of each split, as 1-D NumPy arrays.
# The cells that reload this file combine the entries with np.concatenate and
# match them with `isin`, which requires arrays of IDs rather than DataFrames.
split_ids = {
    "train": train["unique_id"].values,
    "val": val["unique_id"].values,
    "test": test["unique_id"].values,
}
# save as pickle file
out_path = "../data/split_confidence5_balanced"
with open(out_path, 'wb') as handle:
    pickle.dump(split_ids, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Add some unconfident samples back into training data

In [6]:
# Reload the previously saved split; the cells below read its "train", "val"
# and "test" entries.
# NOTE: pickle.load can execute arbitrary code, so only load files that this
# notebook (or another trusted source) produced.
split_path = "../data/split_confidence5_balanced"
with Path(split_path).open("rb") as handle:
    split_data = pickle.load(handle)

In [15]:
# combined contents of the val and test splits (expected to be unique IDs)
val_test_ids = np.concatenate((split_data["val"], split_data["test"]))

In [20]:
# Unconfident samples that are not part of the val or test split, i.e. the
# candidates that can still be added to the training data.
# The pool is rebuilt from `label_df` directly so the result does not depend on
# whatever subsampling earlier cells applied to `unconfident_label_df`.
unconfident_pool = label_df[label_df["confidence"] < confidence_threshold]
unused_unconfident_labels = unconfident_pool[~unconfident_pool["unique_id"].isin(val_test_ids)]
len(unused_unconfident_labels)

6149

In [22]:
# number of entries currently in the train split
current_train = split_data["train"]
train_size = len(current_train)
print(train_size)

1728


In [26]:
# Double the size of the training data: draw as many unused unconfident samples
# as there are entries in the current train split.
# A fixed seed makes the draw reproducible across kernel restarts; 42 matches
# the `random_state` used for the train/val/test split.
additional_samples = unused_unconfident_labels.sample(train_size, random_state=42)
additional_sample_ids = additional_samples["unique_id"].values

In [31]:
# Append the extra IDs to the train split. Only IDs that are not already in the
# split are added, so re-running this cell does not append the same samples a
# second time.
already_in_train = np.isin(additional_sample_ids, split_data["train"])
split_data["train"] = np.concatenate([split_data["train"], additional_sample_ids[~already_in_train]])

In [33]:
# sizes of the train, val and test splits after augmenting the train split
tuple(len(split_data[split_name]) for split_name in ("train", "val", "test"))

(3456, 835, 1044)

In [35]:
# Write the split with the augmented train entry to its own pickle file.
out_path = "../data/split_confidence5_balanced_train_augmented"
with Path(out_path).open('wb') as handle:
    pickle.dump(split_data, handle, protocol=pickle.HIGHEST_PROTOCOL)