In [34]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Loading Data

In [35]:
from dataloading import get_all_data

In [36]:
# Settings forwarded to the project's data loader.
PATH_DATA = '../data/all_vectors.csv'
TEST_SIZE = 0.2  # presumably the fraction of rows held out for testing -- TODO confirm in dataloading
UNIQUE_COLUMNS = True  # passed through as `unique`; exact meaning is defined by get_all_data

# get_all_data returns four objects, unpacked as train/test features and targets.
X_train, X_test, y_train, y_test = get_all_data(
    path_all_vectors=PATH_DATA, test_size=TEST_SIZE, unique=UNIQUE_COLUMNS
)

***

# Model

In [37]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so that bootstrap sampling and the per-split feature sub-sampling
# give the same forest after every kernel restart.
RANDOM_STATE = 42

# MODEL INIT
# NOTE(review): n_estimators=1 builds a single-tree "forest"; presumably a
# quick smoke-test setting -- confirm before drawing conclusions from scores.
model = RandomForestClassifier(
    verbose=True,
    max_depth=5,
    max_features=0.1,   # float => fraction of the features considered per split
    n_estimators=1,
    bootstrap=True,
    oob_score=False,
    random_state=RANDOM_STATE,
    )

In [38]:
# TRAINING
# The trailing `;` hides the estimator repr without rebinding `_`, which
# IPython uses to hold the previous cell's output.
model.fit(X_train, y_train);

In [39]:
from collections import Counter

# Tally how many fitted trees reached each depth (depth -> number of trees).
count = dict(Counter(tree.get_depth() for tree in model.estimators_))

# Report the shallowest trees first.
for depth, n_trees in sorted(count.items()):
    print(f'{n_trees} trees have depth {depth}')

1 trees have length 5


In [40]:
# Predict a label for every row of the held-out test features.
y_pred = model.predict(X_test)

In [42]:
# Element-wise check of which test predictions equal the true labels;
# as the cell's last expression, the boolean result is displayed.
y_test == y_pred

array([False, False,  True, ...,  True, False, False])

In [None]:
import numpy as np

# Predict on the held-out test set (recomputed here so this cell does not rely
# on `y_pred` from an earlier cell).
y_pred = model.predict(X_test)

# Accuracy: percentage of test samples whose prediction equals the true label.
matches = np.count_nonzero(y_test == y_pred)
print(f'Accuracy: {100 * matches / len(y_test)} %')

# Per-class probabilities; passed to the wandb ROC / precision-recall plots.
y_probas = model.predict_proba(X_test)

# Feature positions ordered from most to least important.
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

# Weights & Biases (wandb) init

In [None]:
# Deliberate stop: this always raises AssertionError, so a top-to-bottom run
# halts here instead of continuing into the wandb cells that follow.
assert False, "WanDB not yet needed."

In [None]:
import wandb

# Start a new run; the model's constructor parameters form the base config.
run_config = model.get_params()
wandb.init(project='Halado_Adatelemzes_Labor', config=run_config)

# Record the split settings and dataset sizes next to the hyperparameters.
split_info = {
    "test_size": TEST_SIZE,
    "train_len": len(X_train),
    "test_len": len(X_test),
}
wandb.config.update(split_info)

In [None]:
from wandb.sklearn import plot_precision_recall, plot_feature_importances
from wandb.sklearn import plot_class_proportions, plot_learning_curve, plot_roc

# `labels` is not defined in any visible cell, so the calls below raised
# NameError. Derive it from the fitted model: `classes_` holds the class
# labels in the same order as the columns of `predict_proba`.
# NOTE(review): assumes integer-encoded targets run 0..n_classes-1, as the
# wandb helpers appear to index `labels` by class value -- TODO confirm.
labels = [str(cls) for cls in model.classes_]

# log additional visualisations to wandb
plot_class_proportions(y_train, y_test, labels)
plot_learning_curve(model, X_train, y_train)
plot_roc(y_test, y_probas, labels)
plot_precision_recall(y_test, y_probas, labels)
plot_feature_importances(model)

In [None]:
# Close the wandb run that was opened with `wandb.init(...)` in an earlier cell.
wandb.finish()