In [1]:
import pandas as pd
import os
import sys
sys.path.append('../')
import utils.csv as csv

In [2]:
OUTPUT_DIR = 'D:\\Deutschland\\FUB\\master_thesis\\data\\ref\\all'

REF_CSV = 'reference_pure.csv'
LABEL_CSV = 'label_7pure.csv'
MERGE_CSV = 'merged_10.csv'
CLASS_CSV = '8_pure_classes.csv'

ref_path = os.path.join(OUTPUT_DIR, REF_CSV)
merge_path = os.path.join(OUTPUT_DIR, MERGE_CSV)
label_path = os.path.join(OUTPUT_DIR, LABEL_CSV)
class_path = os.path.join(OUTPUT_DIR, CLASS_CSV)

In [22]:
def statistics() -> pd.DataFrame:
    """Count percentage of selected species over each polygon"""
    ref = csv.load(file_path=ref_path, index_col='id')
    # selected species
    cols = ['Spruce','Sliver Fir','Douglas Fir','Pine','Oak','Red Oak','Beech','Sycamore','Others','id']
    labels = [] 
    for index, row in ref.iterrows():
        label = pd.DataFrame(columns=cols, index=[0])
        label.fillna(value=0, inplace=True)
        label['id'] = index
        if row[0] not in [110,210,310,410,600,630,710,821]:
            continue
        for i in range(8):
            if row[i] == 110:
                label['Spruce'] += row[i + 8]
            elif row[i] == 710:
                label['Beech'] += row[i + 8]
            elif row[i] == 210:
                label['Sliver Fir'] += row[i + 8]
            elif row[i] == 410:
                label['Pine'] += row[i + 8]
            elif row[i] == 310:
                label['Douglas Fir'] += row[i + 8]
            elif row[i] == 600:
                label['Oak'] += row[i + 8]
            elif row[i] == 630:
                label['Red Oak'] += row[i + 8]
            elif row[i] == 821:
                label['Sycamore'] += row[i + 8]
            else:
                label['Others'] += row[i + 8]
        labels.append(label)
    output = pd.concat(labels, ignore_index=True)
    output.set_index(['id'], inplace=True)
    return output

In [24]:
def export_classes() -> pd.DataFrame:
    """Export percentage of selected species as csv file"""
    # load labels
    classes = statistics()
    merged = csv.load(merge_path,'id')
    # select both labels and data are available polygons
    output = pd.merge(classes, merged, on='id', how='inner')
    # delete sum of percentage is not 100%
    # **************change number of column here**************
    output['sum'] = output.iloc[:,0:9].sum(axis=1)
    # ********************************************************
    output = output[output['sum'] == 100]
    # delete useless columns
    output.drop(columns=['sum'], axis=1, inplace=True)
    output = output / 100
    csv.export(output, class_path, True)
    return output

export_classes()

export file D:\Deutschland\FUB\master_thesis\data\ref\all\8_pure_classes.csv


Unnamed: 0_level_0,Spruce,Sliver Fir,Douglas Fir,Pine,Oak,Red Oak,Beech,Sycamore,Others
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1.00,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.00
2,0.95,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.05
22,0.00,0.0,0.0,0.00,0.90,0.0,0.10,0.0,0.00
43,0.00,0.0,0.0,0.00,0.90,0.0,0.10,0.0,0.00
97,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.9,0.10
...,...,...,...,...,...,...,...,...,...
418096,0.00,0.0,0.0,1.00,0.00,0.0,0.00,0.0,0.00
418098,0.00,0.0,0.0,0.90,0.05,0.0,0.05,0.0,0.00
418103,0.00,0.0,0.0,0.95,0.00,0.0,0.05,0.0,0.00
418109,0.00,0.0,0.0,1.00,0.00,0.0,0.00,0.0,0.00


In [3]:
def multi_label() -> pd.DataFrame:
    """Build multi-hot label for 9 classes, 1 means existing 0 means not"""
    labels = csv.load(class_path, 'id')
    labels.drop('Others', axis=1, inplace=True)
    labels = labels.applymap(lambda x: (x != 0 and 1 or 0))
    csv.export(labels, label_path, True)
    return labels
multi_label()

export file D:\Deutschland\FUB\master_thesis\data\gee\output\label_6multi.csv


Unnamed: 0_level_0,Spruce,Beech,Silver fir,Pine,Douglas fir,Oak
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,1,1,0,0,0
2,1,1,1,0,1,0
4,1,1,1,0,0,0
5,1,1,1,0,1,0
6,1,1,1,0,1,0
...,...,...,...,...,...,...
11051,1,1,1,0,0,0
11052,1,1,1,0,0,1
11053,1,1,1,1,0,0
11054,1,1,0,1,0,1


In [3]:
def count_sample_number() -> list:
    """Count sample number for each class based on centain classification strategy"""
    classes = csv.load(class_path, 'id')
    counts = [0 for _ in range(9)]
    for index, row in classes.iterrows():
        # spruce
        if row[0] >= 0.9:
            counts[0] += 1
        # sliver fir
        elif row[1] >= 0.9:
            counts[1] += 1
        # douglas fir
        elif row[2] >= 0.9:
            counts[2] += 1
        # pine
        elif row[3] >= 0.9:
            counts[3] += 1
        # oak
        elif row[4] >= 0.9:
            counts[4] += 1
        # red oak
        elif row[5] >= 0.9:
            counts[5] += 1
        # beech
        elif row[6] >= 0.9:
            counts[6] += 1
        # sycamore
        elif row[7] >= 0.:
            counts[7] += 1
        # others
        else:
            counts[8] += 1
    return counts
count_sample_number()

[24106, 751, 3413, 1345, 2019, 1199, 8010, 964, 0]

In [4]:
def classify() -> pd.DataFrame:
    """
    spruce: 0
    douglas fir: 1
    pine: 2
    oak: 3
    red oak: 4
    beech: 5
    sycamore: 6
    """
    labels = csv.load(class_path, 'id')
    labels['class'] = 0
    for index, row in labels.iterrows():
        if row[0] >= 0.9:
            labels.loc[index, 'class'] = 0
        elif row[2] >= 0.9:
            labels.loc[index, 'class'] = 1
        elif row[3] >= 0.9:
            labels.loc[index, 'class'] = 2
        elif row[4] >= 0.9:
            labels.loc[index, 'class'] = 3
        elif row[5] >= 0.9:
            labels.loc[index, 'class'] = 4
        elif row[6] >= 0.9:
            labels.loc[index, 'class'] = 5
        elif row[7] >= 0.9:
            labels.loc[index, 'class'] = 6
        else:
            labels.loc[index, 'class'] = 7
    # manually split label
    # **************change number of column here**************
    labels.drop(labels.columns[[i for i in range(9)]],  axis=1, inplace=True)
    labels = labels[labels['class'] != 7]
    # ********************************************************
    labels.sort_values(by='class', ascending=True, inplace=True)
    group = labels.groupby('class')
    y_train = []
    y_val = []
    y_test = []
    for key, df in group:
        sz = len(df)
        if sz > 24000:
            sz /= 5
        elif sz > 8000:
            sz /= 2
        train_size = round(sz * 0.8)
        val_size = round(sz * 0.1)
        test_size = round(sz * 0.1)
        head = df.head(train_size)
        tail = df.tail(val_size + test_size)
        y_train.append(head)
        y_val.append(tail.head(val_size))
        y_test.append(tail.tail(test_size))
    output = pd.concat(y_train + y_val + y_test)
    csv.export(output, label_path, True)
    return output

classify()

export file D:\Deutschland\FUB\master_thesis\data\ref\all\label_7pure.csv


Unnamed: 0_level_0,class
id,Unnamed: 1_level_1
1,0
267579,0
267570,0
267567,0
267558,0
...,...
75427,6
75452,6
75460,6
127826,6
