In [1]:
# Standard library and the scientific/plotting stack used by the notebook.
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# NOTE(review): numpy and pyplot are already imported above; this repeat is redundant.
import numpy as np
import matplotlib.pyplot as plt
# Default size applied to matplotlib figures created from here on.
plt.rcParams['figure.figsize'] = [16, 10]


# NOTE(review): seaborn is already imported above; this repeat is redundant.
import seaborn as sns
# Switch on seaborn's default styling for subsequent plots.
sns.set()

# Models and helpers from the telperion package: Mallorn and Lothlorien are the
# estimators fitted further down alongside the scikit-learn baselines.
from telperion.Mallorn import Mallorn
from telperion.Lothlorien import Lothlorien
from telperion.utils import plot_decision_domains, accuracy

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif, chi2

In [3]:
def read_csv_file(path, seed=42, thr=50000):
    """Read a CSV file, randomly down-sampling it when it is large.

    Parameters
    ----------
    path : str
        Location of the CSV file. Its first line is used as the header.
    seed : int, default 42
        Seed passed to ``random.seed`` before sampling, so the same rows are
        kept on every call. Note that this reseeds the global ``random`` state.
    thr : int, default 50000
        If the file has more than ``thr`` physical lines (header included),
        only ``thr`` randomly chosen data rows are loaded.

    Returns
    -------
    pandas.DataFrame
        The whole file when it has at most ``thr`` lines, otherwise a
        reproducible random subset of ``thr`` data rows.
    """
    # Count lines inside a context manager so the handle is closed right away
    # instead of being left for the garbage collector.
    with open(path) as handle:
        num_lines = sum(1 for _ in handle)

    if num_lines <= thr:
        return pd.read_csv(path)

    random.seed(seed)
    skip = sorted(random.sample(range(num_lines), num_lines - thr))
    # Dropping the smallest sampled index does two jobs at once: if line 0 (the
    # header) was drawn it is un-skipped, and in every case exactly
    # num_lines - thr - 1 data lines are skipped, leaving header + thr rows.
    return pd.read_csv(path, skiprows=skip[1:])

In [4]:
def _split_by_latitude(df):
    """Add a 'latitude' column to ``df`` in place and split it into three bands.

    The 'latitude' value is a linear rescaling of the 'y' column:
    ``y / (max(y) + 128) * 170 - 85``. Rows below -45 are labelled 'north',
    rows above 45 'south', and every remaining row 'center'.

    NOTE(review): the band names follow the original cell; presumably 'y' grows
    southwards (tile-style coordinates), which is why small values are 'north'
    -- confirm against the data source.
    NOTE(review): the scale uses the max 'y' of the frame passed in, so train
    and test are only on the same scale if their maxima coincide -- confirm
    this is intended.

    Returns a dict mapping band name to the matching slice of ``df``.
    """
    df['latitude'] = df['y'] / (df['y'].max() + 128) * 2 * 85 - 85
    is_north = df['latitude'] < -45
    is_south = df['latitude'] > 45
    return {'north': df[is_north],
            'south': df[is_south],
            'center': df[~is_north & ~is_south]}


# Load both splits and derive the per-band views used by the cells below.
df_train = read_csv_file('./data/train_QT.csv')
train = _split_by_latitude(df_train)

df_test = read_csv_file('./data/test_QT.csv')
test = _split_by_latitude(df_test)

In [5]:
# Latitude band to model; must be one of the keys of `train` / `test`.
band = 'center'

# Features are every column except the target column 'split'.
x_train = train[band].drop(columns='split')
y_train = train[band]['split']

x_test = test[band].drop(columns='split')
y_test = test[band]['split']

# Columns whose maximum is not exactly 1 are sent through the scaler; the
# others are passed through untouched.
# NOTE(review): presumably max == 1 identifies 0/1 indicator columns -- confirm.
num_features = [feat for feat in x_train.columns if x_train[feat].max() != 1]

# Min-max scale the selected columns, then keep the 5 features with the best
# chi-squared score against the target.
pre = Pipeline([
    ('scaler', ColumnTransformer([('scaler', MinMaxScaler(), num_features)], remainder='passthrough')),
    ('features', SelectKBest(chi2, k=5))
])

# Fit on the training band only; the test band is only ever transformed.
pre.fit(x_train, y_train)


def preprocess(x):
    """Run ``x`` through the fitted scaling + feature-selection pipeline."""
    return pre.transform(x)

## Max Depth 2

In [6]:
# Settings for this section: tree depth, and the two ensemble sizes that the
# random forest cells index as n_estimators[0] / n_estimators[1].
max_depth = 2
n_estimators = (5, 10)

# Fit the Mallorn model on the preprocessed training band using the gini metric.
ml = Mallorn(max_depth=max_depth)
ml.fit(preprocess(x_train), y_train.values, metric='gini')

# Test accuracy, printed as a percentage.
ml_pred = ml.predict(preprocess(x_test))
ml_acc = accuracy_score(y_test.values, ml_pred)
print(round(ml_acc, 4) * 100)

85.25


In [7]:
# scikit-learn decision tree baseline at the current max_depth.
# random_state is fixed (same seed as the random forest cells) so that ties
# between equally good splits are broken the same way on every run.
dt = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
dt.fit(preprocess(x_train), y_train)

# Test accuracy, printed as a percentage.
dt_pred = dt.predict(preprocess(x_test))
print(round(accuracy_score(y_test, dt_pred), 4) * 100)

84.28


In [8]:
# Random forest baseline: first entry of n_estimators, current max_depth.
rf = RandomForestClassifier(
    n_estimators=n_estimators[0],
    max_depth=max_depth,
    random_state=42,
)
rf.fit(preprocess(x_train), y_train)

# Test accuracy, printed as a percentage.
rf_pred = rf.predict(preprocess(x_test))
print(round(accuracy_score(y_test, rf_pred), 4) * 100)

84.99


In [9]:
# Random forest baseline: second entry of n_estimators, current max_depth.
rf = RandomForestClassifier(
    n_estimators=n_estimators[1],
    max_depth=max_depth,
    random_state=42,
)
rf.fit(preprocess(x_train), y_train)

# Test accuracy, printed as a percentage.
rf_pred = rf.predict(preprocess(x_test))
print(round(accuracy_score(y_test, rf_pred), 4) * 100)

84.99


## Max Depth 3

In [10]:
# Tree depth for this section; the decision tree / random forest cells that
# follow read the same variable.
max_depth = 3

# Fit the Mallorn model on the preprocessed training band using the gini metric.
ml = Mallorn(max_depth=max_depth)
ml.fit(preprocess(x_train), y_train.values, metric='gini')

# Predict and score once: the original cell repeated the print line, which ran
# the prediction twice and printed the same figure twice.
ml_pred = ml.predict(preprocess(x_test))
print(round(accuracy_score(y_test.values, ml_pred), 4) * 100)

85.25
85.25


In [11]:
# scikit-learn decision tree baseline at the current max_depth.
# random_state is fixed (same seed as the random forest cells) so that ties
# between equally good splits are broken the same way on every run.
dt = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
dt.fit(preprocess(x_train), y_train)

# Test accuracy, printed as a percentage.
dt_pred = dt.predict(preprocess(x_test))
print(round(accuracy_score(y_test, dt_pred), 4) * 100)

84.92


In [12]:
# Random forest baseline: first entry of n_estimators, current max_depth.
rf = RandomForestClassifier(
    n_estimators=n_estimators[0],
    max_depth=max_depth,
    random_state=42,
)
rf.fit(preprocess(x_train), y_train)

# Test accuracy, printed as a percentage.
rf_pred = rf.predict(preprocess(x_test))
print(round(accuracy_score(y_test, rf_pred), 4) * 100)

85.2


In [13]:
# Random forest baseline: second entry of n_estimators, current max_depth.
rf = RandomForestClassifier(
    n_estimators=n_estimators[1],
    max_depth=max_depth,
    random_state=42,
)
rf.fit(preprocess(x_train), y_train)

# Test accuracy, printed as a percentage.
rf_pred = rf.predict(preprocess(x_test))
print(round(accuracy_score(y_test, rf_pred), 4) * 100)

85.32


## Max Depth 10

In [14]:
# Tree depth for this section; the cells that follow read the same variable.
max_depth = 10

# Fit the Mallorn model on the preprocessed training band using the gini metric.
ml = Mallorn(max_depth=max_depth)
ml.fit(preprocess(x_train), y_train.values, metric='gini')

# Predict and score once: the original cell repeated the print line, which ran
# the prediction twice and printed the same figure twice.
ml_pred = ml.predict(preprocess(x_test))
print(round(accuracy_score(y_test.values, ml_pred), 4) * 100)

85.32
85.32


In [15]:
# Depth is set again here, so this cell does not rely on the previous one for it.
max_depth = 10

# Fit the Lothlorien model with 10 estimators on the preprocessed training band.
ll = Lothlorien(max_depth=max_depth, n_estimators=10)
ll.fit(preprocess(x_train), y_train.values)

# Test accuracy, printed as a percentage.
ll_pred = ll.predict(preprocess(x_test))
ll_acc = accuracy_score(y_test.values, ll_pred)
print(round(ll_acc, 4) * 100)

Training Trees:  10%|███████████████▎                                                                                                                                         | 1/10 [03:23<30:34, 203.84s/it]


85.32


In [16]:
# scikit-learn decision tree baseline at the current max_depth.
# random_state is fixed (same seed as the random forest cells) so that ties
# between equally good splits are broken the same way on every run.
dt = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
dt.fit(preprocess(x_train), y_train)

# Test accuracy, printed as a percentage.
dt_pred = dt.predict(preprocess(x_test))
print(round(accuracy_score(y_test, dt_pred), 4) * 100)

85.32


In [17]:
# Random forest baseline: first entry of n_estimators, current max_depth.
rf = RandomForestClassifier(
    n_estimators=n_estimators[0],
    max_depth=max_depth,
    random_state=42,
)
rf.fit(preprocess(x_train), y_train)

# Test accuracy, printed as a percentage.
rf_pred = rf.predict(preprocess(x_test))
print(round(accuracy_score(y_test, rf_pred), 4) * 100)

85.32


In [18]:
# Random forest baseline: second entry of n_estimators, current max_depth.
rf = RandomForestClassifier(
    n_estimators=n_estimators[1],
    max_depth=max_depth,
    random_state=42,
)
rf.fit(preprocess(x_train), y_train)

# Test accuracy, printed as a percentage.
rf_pred = rf.predict(preprocess(x_test))
print(round(accuracy_score(y_test, rf_pred), 4) * 100)

85.32
