In [1]:
import pandas as pd
import os
import sys
sys.path.append('../')
import utils.csv as csv

In [2]:
# Directory that holds every CSV file this notebook reads or writes.
OUTPUT_DIR = 'D:\\Deutschland\\FUB\\master_thesis\\data\\gee\\output'

# File names inside OUTPUT_DIR.
REF_CSV, LABEL_CSV = 'reference.csv', '_label.csv'
MERGE_CSV, CLASS_CSV = 'merged.csv', '7_classes.csv'

# Full paths built from the names above:
#   ref_path   - loaded by statistics()
#   merge_path - loaded by export_classes()
#   label_path - written by classify()
#   class_path - written by export_classes(), loaded by count_sample_number() and classify()
ref_path, merge_path, label_path, class_path = (
    os.path.join(OUTPUT_DIR, file_name)
    for file_name in (REF_CSV, MERGE_CSV, LABEL_CSV, CLASS_CSV)
)

In [4]:
def statistics() -> pd.DataFrame:
    """Sum the percentage of each selected species class for every polygon.

    The reference table is loaded from ``ref_path`` (indexed by ``id``) and
    every row is read positionally: the values in columns 0-7 are compared
    against species codes and the values in columns 8-15 are added up as the
    percentages belonging to those codes.  Rows whose first code is not one
    of the five selected species (110, 710, 410, 310, 600) are skipped.

    Returns:
        DataFrame indexed by ``id`` with the columns Spruce, Beech, Pine,
        Douglas fir, Oak, Coniferous and Deciduous.  When no row qualifies
        the frame is empty but still carries those columns.
    """
    ref = csv.load(file_path=ref_path, index_col='id')
    # codes of the species that get a column of their own
    selected = {110: 'Spruce', 710: 'Beech', 410: 'Pine', 310: 'Douglas fir', 600: 'Oak'}
    cols = ['Spruce', 'Beech', 'Pine', 'Douglas fir', 'Oak', 'Coniferous', 'Deciduous', 'id']
    slots = 8  # number of (code, percentage) pairs read from each row
    records = []
    for index, row in ref.iterrows():
        # Plain Python values: avoids integer-key lookups on a label-indexed
        # Series, which pandas no longer treats as positions.
        values = row.tolist()
        if values[0] not in selected:
            continue
        record = dict.fromkeys(cols, 0)
        record['id'] = index
        for slot in range(slots):
            code, share = values[slot], values[slot + slots]
            if code in selected:
                record[selected[code]] += share
            elif 110 < code <= 590:
                # remaining codes in this range are pooled as coniferous
                record['Coniferous'] += share
            elif code > 600:
                # codes above 600 (710 is handled above) are pooled as deciduous
                record['Deciduous'] += share
        records.append(record)
    # One frame built at the end; also works when `records` is empty, where
    # concatenating per-row frames would raise.
    return pd.DataFrame(records, columns=cols).set_index('id')

In [6]:
def export_classes() -> pd.DataFrame:
    """Export the per-polygon species shares as fractions to ``class_path``.

    Takes the label table from ``statistics()``, inner-joins it on ``id``
    with the table loaded from ``merge_path`` so that only polygons present
    in both remain, keeps the polygons whose label percentages add up to
    100, drops every non-label column and divides the percentages by 100.

    Returns:
        The exported DataFrame: one row per kept polygon, one column per
        label column of ``statistics()``, values as fractions.
    """
    # load labels
    classes = statistics()
    merged = csv.load(merge_path, 'id')
    # keep polygons for which both labels and data are available
    output = pd.merge(classes, merged, on='id', how='inner')
    # The left frame's columns come first in the merge result, so the label
    # columns are selected by position; their count is taken from `classes`
    # so no assumption about the width of the merged table is needed.
    shares = output.iloc[:, :classes.shape[1]]
    # keep rows whose percentages sum to 100 (tolerance guards float rounding)
    total = shares.sum(axis=1)
    complete = total.between(100 - 1e-6, 100 + 1e-6)
    output = shares[complete] / 100
    csv.export(output, class_path, True)
    return output

export_classes()

export file D:\Deutschland\FUB\master_thesis\data\gee\output\7_classes.csv


Unnamed: 0_level_0,Spruce,Beech,Pine,Douglas fir,Oak,Coniferous,Deciduous
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.10,0.55,0.00,0.00,0.00,0.35,0.00
2,0.20,0.05,0.00,0.60,0.00,0.10,0.05
4,0.80,0.05,0.00,0.00,0.00,0.15,0.00
5,0.40,0.05,0.00,0.10,0.00,0.20,0.25
6,0.85,0.05,0.00,0.05,0.00,0.05,0.00
...,...,...,...,...,...,...,...
11049,0.55,0.15,0.00,0.05,0.05,0.20,0.00
11050,0.50,0.10,0.05,0.00,0.05,0.20,0.10
11052,0.30,0.50,0.00,0.00,0.05,0.15,0.00
11054,0.60,0.05,0.30,0.00,0.05,0.00,0.00


In [4]:
def count_sample_number() -> list:
    """Count the samples per class under the 21-class strategy of classify().

    Loads the share table from ``class_path`` and reads each row by column
    position (0 Spruce, 1 Beech, 2 Pine, 3 Douglas fir, 4 Oak, 5 Coniferous,
    6 Deciduous - the column order written by export_classes()).

    The first of the five main species (positions 0-4, in that order) whose
    share is at least 0.5 decides the class ``4 * position + offset``:

    * offset 0 - share is at least 0.9
    * offset 1 - every other deciduous column is 0 (coniferous company only)
    * offset 2 - every other coniferous column is 0 (deciduous company only)
    * offset 3 - anything else (mixed)

    Rows in which no main species reaches 0.5 are counted as class 20.

    Returns:
        List of 21 integers; element ``k`` is the number of rows in class ``k``.
    """
    classes = csv.load(class_path, 'id')
    coniferous = (0, 2, 3, 5)  # Spruce, Pine, Douglas fir, Coniferous
    deciduous = (1, 4, 6)      # Beech, Oak, Deciduous
    counts = [0] * 21
    # Plain tuples give true positional access without integer-key lookups on
    # a label-indexed Series.
    for shares in classes.itertuples(index=False, name=None):
        class_id = 20
        for species in range(5):
            if shares[species] >= 0.5:
                if shares[species] >= 0.9:
                    offset = 0
                elif all(shares[k] == 0 for k in deciduous if k != species):
                    offset = 1
                elif all(shares[k] == 0 for k in coniferous if k != species):
                    offset = 2
                else:
                    offset = 3
                class_id = 4 * species + offset
                break
        counts[class_id] += 1
    return counts
count_sample_number()

[643, 0, 0, 0, 217, 0, 0, 0, 232, 0, 0, 0, 177, 0, 0, 0, 111, 0, 0, 0, 4969]

In [5]:
def classify() -> pd.DataFrame:
    """Assign one of 21 classes to every polygon and export the labels.

    Loads the share table from ``class_path`` and reads each row by column
    position (0 Spruce, 1 Beech, 2 Pine, 3 Douglas fir, 4 Oak, 5 Coniferous,
    6 Deciduous - the column order written by export_classes()).  The first
    main species (positions 0-4, in that order) with a share of at least 0.5
    decides the class; "Con." means every other deciduous column is 0,
    "Dec." means every other coniferous column is 0.

    Spruce90: class 0
    Spruce50 Con.: class 1
    Spruce50 Dec.: class 2
    Spruce50 mixed: class 3
    Beech90: class 4
    Beech50 Con.: class 5
    Beech50 Dec.: class 6
    Beech50 mixed: class 7
    Pine90: class 8
    Pine50 Con.: class 9
    Pine50 Dec.: class 10
    Pine50 mixed: class 11
    Douglas fir90: class 12
    Douglas fir50 Con.: class 13
    Douglas fir50 Dec.: class 14
    Douglas fir50 mixed: class 15
    Oak90: class 16
    Oak50 Con.: class 17
    Oak50 Dec.: class 18
    Oak50 mixed: class 19
    other mixed: class 20

    Returns:
        The exported DataFrame: same index as the loaded table, the first
        seven (share) columns removed and an integer ``class`` column added.
        It is also written to ``label_path``.
    """
    labels = csv.load(class_path, 'id')
    coniferous = (0, 2, 3, 5)  # Spruce, Pine, Douglas fir, Coniferous
    deciduous = (1, 4, 6)      # Beech, Oak, Deciduous
    class_ids = []
    # Plain tuples give true positional access without integer-key lookups on
    # a label-indexed Series.
    for shares in labels.itertuples(index=False, name=None):
        class_id = 20
        for species in range(5):
            if shares[species] >= 0.5:
                if shares[species] >= 0.9:
                    offset = 0
                elif all(shares[k] == 0 for k in deciduous if k != species):
                    offset = 1
                elif all(shares[k] == 0 for k in coniferous if k != species):
                    offset = 2
                else:
                    offset = 3
                class_id = 4 * species + offset
                break
        class_ids.append(class_id)
    # drop the seven share columns; only the class label is exported
    output = labels.iloc[:, 7:].copy()
    # assigned in one step, row by row, so repeated ids cannot overwrite each other
    output['class'] = pd.Series(class_ids, index=labels.index, dtype='int64')
    csv.export(output, label_path, True)
    return output

classify()

export file D:\Deutschland\FUB\master_thesis\data\gee\output\_label.csv


Unnamed: 0_level_0,class
id,Unnamed: 1_level_1
3208,0
3698,0
8894,0
3705,0
3710,0
...,...
6485,4
6472,4
6447,4
6855,4
