In [116]:
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's default plot styling globally.
sns.set()
# Default figure size for matplotlib figures, given as [width, height] in inches.
plt.rcParams['figure.figsize'] = [16, 10]

In [117]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif, chi2

In [118]:
import sys
# caution: path[0] is reserved for script path (or '' in REPL)
# '../telperion' is a relative path, so it is resolved against the kernel's
# current working directory. Presumably this is where the SapWood module
# lives -- TODO confirm the notebook is launched from the expected folder.
sys.path.insert(1, '../telperion')
from SapWood import SapWood

from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [119]:
def read_csv_file(path, seed=42, thr=50000):
    """Load a CSV file, randomly down-sampling it when it is too long.

    Parameters
    ----------
    path : str or path-like
        CSV file whose first line is the header row.
    seed : int, default 42
        Value passed to ``random.seed`` before sampling. Note that this
        reseeds the module-level ``random`` generator as a side effect.
    thr : int, default 50000
        Line-count threshold. A file with more physical lines than this
        (header included) is sampled down to ``thr`` data rows, assuming
        every record occupies exactly one line.

    Returns
    -------
    pandas.DataFrame
        The whole file when it has at most ``thr`` lines, otherwise a
        seeded random subset of its rows (original row order preserved).
    """
    # Count lines inside a context manager so the handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(path) as handle:
        num_lines = sum(1 for _ in handle)

    if num_lines <= thr:
        return pd.read_csv(path)

    random.seed(seed)
    skip = sorted(random.sample(range(num_lines), num_lines - thr))
    # Line 0 is the header. Because `skip` is sorted, index 0 can only ever be
    # its first element, so dropping that element guarantees the header is
    # kept; either way exactly `thr` data rows survive.
    return pd.read_csv(path, skiprows=skip[1:])

In [120]:
def _add_latitude(frame):
    """Add a 'latitude' column computed from 'y' (in place) and return the frame.

    'y' is divided by the frame's own ``y.max() + 128`` and then mapped
    linearly with ``* 2 * 85 - 85``.
    NOTE(review): the scale comes from each frame's own maximum, so train and
    test are normalised independently -- confirm that is intended.
    """
    y_span = frame['y'].max() + 128
    frame['latitude'] = frame['y'] / y_span * 2 * 85 - 85
    return frame


def _split_into_bands(frame):
    """Partition the rows of ``frame`` into three latitude bands.

    'north' holds latitude < -45, 'south' holds latitude > 45 and 'center'
    holds every remaining row.
    NOTE(review): the band names are kept exactly as in the original cell;
    confirm the sign convention matches the intended orientation.
    """
    below = frame['latitude'] < -45
    above = frame['latitude'] > 45
    return {
        'north': frame[below],
        'south': frame[above],
        'center': frame[~(below | above)],
    }


df_train = _add_latitude(read_csv_file('./data/train_QT.csv'))
train = _split_into_bands(df_train)

df_test = _add_latitude(read_csv_file('./data/test_QT.csv'))
test = _split_into_bands(df_test)

In [121]:
band = 'center'

# Features are every column except 'split'; 'split' itself is the target.
train_band, test_band = train[band], test[band]
x_train, y_train = train_band.drop(columns='split'), train_band['split']
x_test, y_test = test_band.drop(columns='split'), test_band['split']

# Columns whose maximum is not 1 are min-max scaled; all other columns are
# passed through unchanged.
num_features = [feat for feat in x_train.columns if max(x_train[feat]) != 1]

scaler = ColumnTransformer([('scaler', MinMaxScaler(), num_features)], remainder='passthrough')
selector = SelectKBest(chi2, k=5)
pre = Pipeline([('scaler', scaler), ('features', selector)])

# Both the scaler and the feature selector are fitted on the training band only.
pre.fit(x_train, y_train)


def preprocess(x):
    """Apply the fitted scaling + feature-selection pipeline to ``x``."""
    return pre.transform(x)

## Max Depth 2

In [122]:
# Shared settings for the depth-2 comparison; n_estimators holds the two
# forest sizes used by the random-forest cells.
max_depth = 2
n_estimators = (5,10)

sw = SapWood(max_depth=max_depth)
sw.fit(preprocess(x_train), y_train.values, metric='gini')

# Report test accuracy in percent. Scale first and round last: rounding
# before multiplying by 100 can leave float noise in the printed value
# (e.g. 56.99999999999999).
sw_accuracy = accuracy_score(y_test.values, sw.predict(preprocess(x_test)))
print(round(sw_accuracy * 100, 2))

|| E: 50 | L: 0.100995 | A: 87.42% ||: 100%|██████████| 50/50 [00:01<00:00, 35.77it/s]
|| E: 50 | L: 0.119764 | A: 83.48% ||: 100%|██████████| 50/50 [00:00<00:00, 191.92it/s]
|| E: 50 | L: 0.096901 | A: 88.17% ||: 100%|██████████| 50/50 [00:01<00:00, 45.71it/s]


85.25


In [123]:
# Single decision tree at the same depth. random_state is pinned (matching the
# random-forest cells) because scikit-learn permutes features at every split,
# so ties between equally good splits are otherwise broken differently per run.
dt = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
dt.fit(preprocess(x_train), y_train)

# Percent accuracy: scale first, round last, to avoid float noise in the output.
dt_accuracy = accuracy_score(y_test, dt.predict(preprocess(x_test)))
print(round(dt_accuracy * 100, 2))

84.28


In [124]:
# Random forest using the first (smaller) configured forest size.
rf = RandomForestClassifier(n_estimators=n_estimators[0], max_depth=max_depth, random_state=42)
rf.fit(preprocess(x_train), y_train)

# Percent accuracy: scale first, round last, to avoid float noise in the output.
rf_accuracy = accuracy_score(y_test, rf.predict(preprocess(x_test)))
print(round(rf_accuracy * 100, 2))

84.99


In [125]:
# Random forest using the second (larger) configured forest size.
rf = RandomForestClassifier(n_estimators=n_estimators[1], max_depth=max_depth, random_state=42)
rf.fit(preprocess(x_train), y_train)

# Percent accuracy: scale first, round last, to avoid float noise in the output.
rf_accuracy = accuracy_score(y_test, rf.predict(preprocess(x_test)))
print(round(rf_accuracy * 100, 2))

84.99


## Max Depth 3

In [126]:
# Depth setting for the depth-3 comparison; the cells that follow read it.
max_depth = 3

sw = SapWood(max_depth=max_depth)
sw.fit(preprocess(x_train), y_train.values, metric='gini')

# Report test accuracy in percent. Scale first and round last: rounding
# before multiplying by 100 can leave float noise in the printed value
# (e.g. 56.99999999999999).
sw_accuracy = accuracy_score(y_test.values, sw.predict(preprocess(x_test)))
print(round(sw_accuracy * 100, 2))

|| E: 50 | L: 0.100995 | A: 87.42% ||: 100%|██████████| 50/50 [00:01<00:00, 36.69it/s]
|| E: 50 | L: 0.119764 | A: 83.48% ||: 100%|██████████| 50/50 [00:00<00:00, 188.01it/s]
|| E: 50 | L: 0.072800 | A: 91.48% ||: 100%|██████████| 50/50 [00:00<00:00, 239.03it/s]
|| E: 50 | L: 0.233487 | A: 62.04% ||: 100%|██████████| 50/50 [00:00<00:00, 470.73it/s]
|| E: 50 | L: 0.096901 | A: 88.17% ||: 100%|██████████| 50/50 [00:01<00:00, 45.55it/s]
|| E: 50 | L: 0.174774 | A: 75.23% ||: 100%|██████████| 50/50 [00:00<00:00, 176.32it/s]
|| E: 50 | L: 0.074768 | A: 91.78% ||: 100%|██████████| 50/50 [00:00<00:00, 57.25it/s]


85.25


In [127]:
# Single decision tree at the same depth. random_state is pinned (matching the
# random-forest cells) because scikit-learn permutes features at every split,
# so ties between equally good splits are otherwise broken differently per run.
dt = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
dt.fit(preprocess(x_train), y_train)

# Percent accuracy: scale first, round last, to avoid float noise in the output.
dt_accuracy = accuracy_score(y_test, dt.predict(preprocess(x_test)))
print(round(dt_accuracy * 100, 2))

84.92


In [128]:
# Random forest using the first (smaller) configured forest size.
rf = RandomForestClassifier(n_estimators=n_estimators[0], max_depth=max_depth, random_state=42)
rf.fit(preprocess(x_train), y_train)

# Percent accuracy: scale first, round last, to avoid float noise in the output.
rf_accuracy = accuracy_score(y_test, rf.predict(preprocess(x_test)))
print(round(rf_accuracy * 100, 2))

85.2


In [129]:
# Random forest using the second (larger) configured forest size.
rf = RandomForestClassifier(n_estimators=n_estimators[1], max_depth=max_depth, random_state=42)
rf.fit(preprocess(x_train), y_train)

# Percent accuracy: scale first, round last, to avoid float noise in the output.
rf_accuracy = accuracy_score(y_test, rf.predict(preprocess(x_test)))
print(round(rf_accuracy * 100, 2))

85.32
