In [2]:
import pandas as pd
import os
import sys
sys.path.append('../')
import utils.csv as csv
import utils.plot as plot

In [3]:
OUTPUT_DIR = 'D:\\Deutschland\\FUB\\master_thesis\\data\\ref\\all'

REF_CSV = 'reference_pure.csv'
LABEL_CSV = 'label_8pure9.csv'
MERGE_CSV = 'merged_pure.csv'
CLASS_CSV = '8pure_classes.csv'

ref_path = os.path.join(OUTPUT_DIR, REF_CSV)
merge_path = os.path.join(OUTPUT_DIR, MERGE_CSV)
label_path = os.path.join(OUTPUT_DIR, LABEL_CSV)
class_path = os.path.join(OUTPUT_DIR, CLASS_CSV)
aoi_path = 'D:\\Deutschland\\FUB\\master_thesis\\data\\ref\\validation\\merged_aoi.csv'

categories = ['Spruce', 'Silver Fir', 'Douglas Fir', 'Pine', 'Oak', 'Red Oak', 'Beech', 'Sycamore', 'Others']

Pre-Process

In [38]:
def statistics() -> pd.DataFrame:
    """Count percentage of selected species over each polygon"""
    ref = csv.load(file_path=ref_path, index_col='id')
    # selected species
    cols = ['Spruce','Sliver Fir','Douglas Fir','Pine','Oak','Red Oak','Beech','Sycamore','Others','id']
    labels = []
    for index, row in ref.iterrows():
        label = pd.DataFrame(columns=cols, index=[0])
        label.fillna(value=0, inplace=True)
        label['id'] = index
        # if 831 in [row[0], row[1], row[2]]:
        #     continue
        for i in range(8):
            if row[i] == 110:
                label['Spruce'] += row[i + 8]
            elif row[i] == 210:
                label['Sliver Fir'] += row[i + 8]
            elif row[i] == 310:
                label['Douglas Fir'] += row[i + 8]
            elif row[i] == 410:
                label['Pine'] += row[i + 8]
            elif row[i] == 600:
                label['Oak'] += row[i + 8]
            elif row[i] == 630:
                label['Red Oak'] += row[i + 8]
            elif row[i] == 710:
                label['Beech'] += row[i + 8]
            elif row[i] == 821:
                label['Sycamore'] += row[i + 8]
            else:
                label['Others'] += row[i + 8]
        labels.append(label)
    output = pd.concat(labels, ignore_index=True)
    output.set_index(['id'], inplace=True)
    return output

In [41]:
def export_classes() -> pd.DataFrame:
    """Export percentage of selected species as csv file"""
    # load labels
    classes = statistics()
    merged = csv.load(merge_path,'id')
    # select both labels and data are available polygons
    output = pd.merge(classes, merged, on='id', how='inner')
    # delete sum of percentage is not 100%
    # **************change number of column here**************
    output['sum'] = output.iloc[:,0:9].sum(axis=1)
    # ********************************************************
    output = output[output['sum'] == 100]
    # delete useless columns
    output.drop(columns=['sum'], axis=1, inplace=True)
    output = output / 100
    # delete samples which are in the aois
    aoi = csv.load(aoi_path, 'id')
    diff_index = output.index.difference(aoi.index)
    output = output.loc[diff_index]
    # export result as csv
    csv.export(output, class_path, True)
    return output

export_classes()

export file D:\Deutschland\FUB\master_thesis\data\ref\all\8_pure_classes.csv


Unnamed: 0_level_0,Spruce,Sliver Fir,Douglas Fir,Pine,Oak,Red Oak,Beech,Sycamore,Others
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,0.00
2,0.95,0.0,0.0,0.00,0.00,0.0,0.00,0.00,0.05
22,0.00,0.0,0.0,0.00,0.90,0.0,0.10,0.00,0.00
43,0.00,0.0,0.0,0.00,0.90,0.0,0.10,0.00,0.00
75,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.05,0.95
...,...,...,...,...,...,...,...,...,...
418098,0.00,0.0,0.0,0.90,0.05,0.0,0.05,0.00,0.00
418103,0.00,0.0,0.0,0.95,0.00,0.0,0.05,0.00,0.00
418105,0.00,0.0,0.0,0.00,0.00,0.0,0.05,0.05,0.90
418109,0.00,0.0,0.0,1.00,0.00,0.0,0.00,0.00,0.00


Multi-Label

In [None]:
def multi_label() -> pd.DataFrame:
    """Build multi-hot label for 8 classes, 1 means existing 0 means not"""
    labels = csv.load(class_path, 'id')
    labels.drop('Others', axis=1, inplace=True)
    labels = labels.applymap(lambda x: (x > 0.05 and 1 or 0))
    csv.export(labels, label_path, True)
    return labels
multi_label()

Multi-Class

In [4]:
def plot_sample_count() -> list:
    """Count sample number for each class based on centain classification strategy"""
    classes = csv.load(class_path, 'id')
    counts = [0 for _ in range(9)]
    # count
    for index, row in classes.iterrows():
        # spruce
        if row[0] >= 0.9:
            counts[0] += 1
        # sliver fir
        elif row[1] >= 0.9:
            counts[1] += 1
        # douglas fir
        elif row[2] >= 0.9:
            counts[2] += 1
        # pine
        elif row[3] >= 0.9:
            counts[3] += 1
        # oak
        elif row[4] >= 0.9:
            counts[4] += 1
        # red oak
        elif row[5] >= 0.9:
            counts[5] += 1
        # beech
        elif row[6] >= 0.9:
            counts[6] += 1
        # sycamore
        elif row[7] >= 0.9:
            counts[7] += 1
        # others
        else:
            counts[8] += 1
    # plot
    plot.draw_bar_chart(categories, counts, 'Unbalanced')
    print('plot successfully')
    return counts

plot_sample_count()

plot successfully


[24099, 750, 3407, 1316, 2018, 1198, 7997, 960, 4277]

<Figure size 700x700 with 0 Axes>

In [5]:
counts = [24099//5, 750, 3407, 1316, 2018, 1198, 7997//2, 960, 4277//4]
plot.draw_bar_chart(categories, counts, 'Resampled')

<Figure size 700x700 with 0 Axes>

In [46]:
def classify() -> pd.DataFrame:
    """
    spruce: 0
    sliver fir: 1
    douglas fir: 2
    pine: 3
    oak: 4
    red oak: 5
    beech: 6
    sycamore: 7
    others: 8
    """
    labels = csv.load(class_path, 'id')
    labels['class'] = 0
    for index, row in labels.iterrows():
        if row[0] >= 0.9:
            labels.loc[index, 'class'] = 0
        elif row[1] >= 0.9:
            labels.loc[index, 'class'] = 1
        elif row[2] >= 0.9:
            labels.loc[index, 'class'] = 2
        elif row[3] >= 0.9:
            labels.loc[index, 'class'] = 3
        elif row[4] >= 0.9:
            labels.loc[index, 'class'] = 4
        elif row[5] >= 0.9:
            labels.loc[index, 'class'] = 5
        elif row[6] >= 0.9:
            labels.loc[index, 'class'] = 6
        elif row[7] >= 0.9:
            labels.loc[index, 'class'] = 7
        else:
            labels.loc[index, 'class'] = 8
    # manually split label
    # **************change number of column here**************
    labels.drop(labels.columns[[i for i in range(9)]],  axis=1, inplace=True)
    # labels = labels[labels['class'] != 7]
    # ********************************************************
    group = labels.groupby('class')
    y_train = []
    y_val = []
    y_test = []
    for key, df in group:
        # shuffle the order of the sample
        df = df.sample(frac=1, random_state=24)
        # resample
        sz = len(df)
        if sz > 24000:
            sz /= 5
        elif sz > 7500:
            sz /= 2
        elif sz > 4000:
            sz /= 4
        train_size = round(sz * 0.8)
        val_size = round(sz * 0.1)
        test_size = round(sz * 0.1)
        head = df.head(train_size)
        tail = df.tail(val_size + test_size)
        y_train.append(head)
        y_val.append(tail.head(val_size))
        y_test.append(tail.tail(test_size))
    output = pd.concat(y_train + y_val + y_test)
    csv.export(output, label_path, True)
    return output

classify()

export file D:\Deutschland\FUB\master_thesis\data\ref\all\label_8pure9.csv


Unnamed: 0_level_0,class
id,Unnamed: 1_level_1
296878,0
341722,0
377214,0
233863,0
18313,0
...,...
413011,8
165076,8
311479,8
22673,8
