In [2]:
import pandas as pd
import os
import sys
sys.path.append('../')
import utils.csv as csv

In [3]:
# Directory that holds every CSV this notebook reads or writes.
OUTPUT_DIR = 'D:\\Deutschland\\FUB\\master_thesis\\data\\ref\\all'

# File names, all resolved relative to OUTPUT_DIR below.
REF_CSV = 'reference_pure.csv'   # read by statistics()
LABEL_CSV = 'label_6multi.csv'   # written by multi_label() and classify()
MERGE_CSV = 'merged_10.csv'      # read by export_classes()
CLASS_CSV = '6_classes.csv'      # written by export_classes(), read by the label builders

# Full paths, built in one pass so each name is joined the same way.
ref_path, merge_path, label_path, class_path = (
    os.path.join(OUTPUT_DIR, file_name)
    for file_name in (REF_CSV, MERGE_CSV, LABEL_CSV, CLASS_CSV)
)

In [4]:
def statistics() -> pd.DataFrame:
    """Sum, per polygon, the share of each selected species.

    Every row of the reference table is read positionally: data columns 0-7
    hold species codes and data columns 8-15 hold the share recorded for the
    code in the same slot. Shares whose code is not one of the selected
    species are accumulated under ``Others``. Polygons whose first code is not
    a selected species are skipped entirely.

    Returns
    -------
    pd.DataFrame
        Indexed by ``id``, with the columns Spruce, Beech, Silver fir, Pine,
        Douglas fir, Oak, Ash and Others (in that order). Empty, with the same
        columns, when no polygon qualifies.

    Raises
    ------
    ValueError
        If the reference table has fewer than 16 data columns.
    """
    # Species code -> output column. Insertion order defines the column order.
    species_by_code = {
        110: 'Spruce',
        710: 'Beech',
        210: 'Silver fir',
        410: 'Pine',
        310: 'Douglas fir',
        600: 'Oak',
        831: 'Ash',
    }
    species_cols = list(species_by_code.values()) + ['Others']
    # cols = ['Spruce','Beech','Coniferous','Deciduous','id']
    n_slots = 8  # number of (code, share) pairs stored per polygon

    # presumably reads the CSV with 'id' as the index -- utils.csv is not visible here
    ref = csv.load(file_path=ref_path, index_col='id')
    if ref.shape[1] < 2 * n_slots:
        raise ValueError(
            'reference table needs at least %d columns, got %d'
            % (2 * n_slots, ref.shape[1])
        )

    ids = []
    records = []
    for index, row in ref.iterrows():
        # .iloc keeps the access positional; plain row[i] on a label-indexed
        # Series is deprecated in recent pandas.
        codes = row.iloc[:n_slots].tolist()
        shares = row.iloc[n_slots:2 * n_slots].tolist()
        # Only keep polygons whose first-listed species is a selected one.
        if codes[0] not in species_by_code:
            continue
        record = dict.fromkeys(species_cols, 0)
        for code, share in zip(codes, shares):
            record[species_by_code.get(code, 'Others')] += share
        ids.append(index)
        records.append(record)

    # Build the frame once instead of concatenating one tiny frame per row;
    # this also yields a well-formed empty frame when nothing qualifies.
    return pd.DataFrame(records, index=pd.Index(ids, name='id'), columns=species_cols)

In [5]:
def export_classes() -> pd.DataFrame:
    """Export the species shares of valid polygons as fractions of 1.

    Steps:
      1. compute the per-polygon species shares with ``statistics()``;
      2. inner-join them on ``id`` with the table at ``merge_path`` so only
         polygons present in both survive;
      3. keep the rows whose species shares add up to 100;
      4. divide the result by 100 and write it to ``class_path``.

    Returns
    -------
    pd.DataFrame
        The exported frame.
    """
    # load labels
    classes = statistics()
    merged = csv.load(merge_path, 'id')
    # keep only polygons for which both labels and data are available
    output = pd.merge(classes, merged, on='id', how='inner')

    # The species columns come first in the merged frame, so their count is
    # taken from `classes` instead of being hard-coded.
    n_species = classes.shape[1]
    # Keep the row total outside the frame so that a real column called 'sum'
    # in the merged data can never be overwritten and dropped.
    total = output.iloc[:, :n_species].sum(axis=1).astype(float)
    # Compare with a small tolerance: float shares such as 33.3 + 33.3 + 33.4
    # do not necessarily add up to exactly 100.
    output = output[(total - 100).abs() < 1e-6]

    # NOTE(review): this scales every column, including any that came from the
    # merged file -- confirm that only share columns are expected here.
    output = output / 100
    # third argument presumably asks for the index to be written -- TODO confirm
    csv.export(output, class_path, True)
    return output

export_classes()

export file D:\Deutschland\FUB\master_thesis\data\ref\all\6_classes.csv


Unnamed: 0_level_0,Spruce,Beech,Silver fir,Pine,Douglas fir,Oak,Ash,Others,sum
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
88525,1.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,1.0
88528,0.95,0.05,0.00,0.00,0.0,0.0,0.0,0.0,1.0
88541,0.90,0.00,0.10,0.00,0.0,0.0,0.0,0.0,1.0
88551,1.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,1.0
88552,1.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
415191,1.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,1.0
415204,1.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,1.0
416434,0.95,0.00,0.05,0.00,0.0,0.0,0.0,0.0,1.0
416802,0.95,0.00,0.00,0.05,0.0,0.0,0.0,0.0,1.0


In [3]:
def multi_label() -> pd.DataFrame:
    """Build multi-hot labels: 1 if a class is present in the polygon, 0 if not.

    Reads the share table at ``class_path``, removes the ``Others`` column and
    turns every remaining value into 1 (non-zero) or 0 (zero). The result is
    written to ``label_path`` and returned.

    Raises
    ------
    KeyError
        If the share table has no ``Others`` column.
    """
    shares = csv.load(class_path, 'id')
    shares = shares.drop(columns='Others')
    # Vectorised replacement for the deprecated DataFrame.applymap; as before,
    # any value different from 0 (NaN included) becomes 1.
    labels = (shares != 0).astype('int64')
    # third argument presumably asks for the index to be written -- TODO confirm
    csv.export(labels, label_path, True)
    return labels
multi_label()

export file D:\Deutschland\FUB\master_thesis\data\gee\output\label_6multi.csv


Unnamed: 0_level_0,Spruce,Beech,Silver fir,Pine,Douglas fir,Oak
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,1,1,0,0,0
2,1,1,1,0,1,0
4,1,1,1,0,0,0
5,1,1,1,0,1,0
6,1,1,1,0,1,0
...,...,...,...,...,...,...
11051,1,1,1,0,0,0
11052,1,1,1,0,0,1
11053,1,1,1,1,0,0
11054,1,1,0,1,0,1


In [5]:
def count_sample_number() -> list:
    """Count the polygons in each forest-type class.

    The coniferous and deciduous shares of a polygon are the sums of the
    matching columns of the table at ``class_path``. Columns are picked by
    name, so both the per-species layout (Spruce, Silver fir, Pine, ...) and
    the aggregated layout (Spruce, Beech, Coniferous, Deciduous) work; the
    ``Others`` column belongs to neither group.

    Classes are tested in this order, first match wins:
      0. pure coniferous       -- coniferous share >= 0.9
      1. coniferous dominated  -- coniferous share >= 0.7
      2. pure deciduous        -- deciduous share >= 0.9
      3. deciduous dominated   -- deciduous share >= 0.7
      4. mixed                 -- everything else

    Returns
    -------
    list
        Five integers, the number of polygons in classes 0 to 4.

    Raises
    ------
    ValueError
        If the table contains none of the known share columns.
    """
    coniferous = ('Spruce', 'Silver fir', 'Pine', 'Douglas fir', 'Coniferous')
    deciduous = ('Beech', 'Oak', 'Ash', 'Deciduous')
    # Float sums such as 0.6 + 0.3 evaluate to 0.8999999999999999, so the
    # thresholds are compared with a small tolerance.
    eps = 1e-9

    classes = csv.load(class_path, 'id')
    con_cols = [name for name in coniferous if name in classes.columns]
    dec_cols = [name for name in deciduous if name in classes.columns]
    if not con_cols and not dec_cols:
        raise ValueError('no coniferous or deciduous share columns in ' + class_path)

    con_share = classes[con_cols].sum(axis=1)
    dec_share = classes[dec_cols].sum(axis=1)

    # Boolean masks that reproduce the if/elif priority order.
    pure_con = con_share >= 0.9 - eps
    any_con = con_share >= 0.7 - eps
    dom_con = any_con & ~pure_con
    pure_dec = ~any_con & (dec_share >= 0.9 - eps)
    dom_dec = ~any_con & ~pure_dec & (dec_share >= 0.7 - eps)
    mixed = ~any_con & ~pure_dec & ~dom_dec

    return [int(mask.sum()) for mask in (pure_con, dom_con, pure_dec, dom_dec, mixed)]
count_sample_number()

[2293, 1985, 1126, 620, 1098]

In [6]:
def classify() -> pd.DataFrame:
    """
    Label every polygon with a forest-type class and export a train/validation
    selection of those labels.

    Coniferous: class 0       (coniferous share >= 0.9)
    Coniferous Dec.: class 1  (coniferous share >= 0.7)
    Deciduous: class 2        (deciduous share >= 0.9)
    Deciduous Con.: class 3   (deciduous share >= 0.7)
    Mixed: class 4            (everything else)

    The conditions are tested top to bottom and the first match wins. Share
    columns are picked by name, so both the per-species layout (Spruce,
    Silver fir, Pine, ...) and the aggregated layout (Spruce, Beech,
    Coniferous, Deciduous) work; ``Others`` belongs to neither group.

    For each class the first 80 % of its rows form the training part and the
    trailing rows (25 % of the training size) form the validation part.
    Classes with more than 1500 rows are sized as if they had half as many.

    Returns
    -------
    pd.DataFrame
        Indexed by ``id`` with a single ``class`` column: all training rows
        followed by all validation rows. Also written to ``label_path``.

    Raises
    ------
    ValueError
        If the table contains none of the known share columns.
    """
    coniferous = ('Spruce', 'Silver fir', 'Pine', 'Douglas fir', 'Coniferous')
    deciduous = ('Beech', 'Oak', 'Ash', 'Deciduous')
    # Float sums such as 0.6 + 0.3 evaluate to 0.8999999999999999, so the
    # thresholds are compared with a small tolerance.
    eps = 1e-9
    large_class = 1500     # classes above this size are halved before splitting
    train_fraction = 0.8
    val_per_train = 0.25   # validation size relative to the training size

    shares = csv.load(class_path, 'id')
    con_cols = [name for name in coniferous if name in shares.columns]
    dec_cols = [name for name in deciduous if name in shares.columns]
    if not con_cols and not dec_cols:
        raise ValueError('no coniferous or deciduous share columns in ' + class_path)
    con_share = shares[con_cols].sum(axis=1)
    dec_share = shares[dec_cols].sum(axis=1)

    # Start from "mixed" and assign in reverse priority order, so that the
    # highest-priority rule is written last and wins, like the if/elif chain.
    labels = pd.Series(4, index=shares.index, name='class').to_frame()
    labels.loc[dec_share >= 0.7 - eps, 'class'] = 3
    labels.loc[dec_share >= 0.9 - eps, 'class'] = 2
    labels.loc[con_share >= 0.7 - eps, 'class'] = 1
    labels.loc[con_share >= 0.9 - eps, 'class'] = 0

    # manually split label
    labels = labels.sort_values(by='class', ascending=True)
    y_train = []
    y_val = []
    for _, df in labels.groupby('class'):
        sz = len(df)
        if sz > large_class:
            sz = sz / 2
        train_size = round(sz * train_fraction)
        val_size = round(train_size * val_per_train)
        # Rounding can make head + tail exceed the class size (7 rows -> 6 + 2);
        # cap the tail so no row lands in both parts.
        val_size = min(val_size, len(df) - train_size)
        y_train.append(df.head(train_size))
        y_val.append(df.tail(val_size))

    # pd.concat raises on an empty list, which happens when there are no rows.
    output = pd.concat(y_train + y_val) if y_train else labels
    # NOTE(review): label_path is also the file multi_label() writes, so the
    # two exports overwrite each other -- confirm this is intended.
    csv.export(output, label_path, True)
    return output

classify()

export file D:\Deutschland\FUB\master_thesis\data\gee\output\label_con_dec.csv


Unnamed: 0_level_0,class
id,Unnamed: 1_level_1
4134,0
2854,0
2856,0
7183,0
2859,0
...,...
8013,4
8011,4
8005,4
8067,4
