In [13]:
import numpy as np
import pandas as pd

import sklearn as skl
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import optuna
from optuna.integration.wandb import WeightsAndBiasesCallback

In [14]:
df = pd.read_csv('./data/data.csv')

In [15]:
df.head(2)

Unnamed: 0,name,type,C,H,N,O,S,F,Cl,Br,I,Other
0,S1,sy,1,0,1,2,0,0,0,0,0,0
1,O1,o,0,0,0,0,1,0,0,0,0,0


In [16]:
X = df.drop(columns=['type', 'name'])
y = df['type']

X_encoder = preprocessing.LabelEncoder()
y_encoder = preprocessing.LabelEncoder()

name_trans = X_encoder.fit_transform(df['name'].to_numpy().reshape(-1, 1))
X = pd.concat([X, pd.DataFrame(name_trans)], axis=1)
y = y_encoder.fit_transform(y.to_numpy().reshape(-1, 1))

  y = column_or_1d(y, warn=True)


In [17]:
X.head(2)

Unnamed: 0,C,H,N,O,S,F,Cl,Br,I,Other,0
0,1,0,1,2,0,0,0,0,0,0,293
1,0,0,0,0,1,0,0,0,0,0,241


In [18]:
y

array([81, 67, 67, ..., 22, 22, 28])

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [42]:
def objective(trial):
    dtc = DecisionTreeClassifier(
        max_depth=trial.suggest_int('max_depth', 1, 1000),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 100),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 100),
        min_weight_fraction_leaf=trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5),
        max_features=trial.suggest_float('max_features', 0.1, 1.0),
        max_leaf_nodes=trial.suggest_int('max_leaf_nodes', 1, 1000),
        class_weight='balanced',
        random_state=0
    )

    y_hat = dtc.fit(X_train, y_train).predict(X_test)

    #print(y_hat)

    return metrics.f1_score(y_test, y_hat, average='macro')

def objective_logreg(trial):
    logreg = LogisticRegression(
        C = trial.suggest_float('C', 0.0, 100.0),
        class_weight='balanced',
        multi_class='multinomial',
        solver='saga',
        random_state=0
    )

    y_hat = logreg.fit(X_train, y_train).predict(X_test)

    print(y_hat)

    return metrics.f1_score(y_test, y_hat, average='macro')

In [43]:
study = optuna.create_study(direction='maximize')

# wandb_kwargs = {"project": "bachelor"}
# wandbc = WeightsAndBiasesCallback(metric_name='F1', wandb_kwargs=wandb_kwargs)

study.optimize(objective, n_trials=100)#, callbacks=[wandbc])

[32m[I 2022-09-06 21:41:10,126][0m A new study created in memory with name: no-name-19aeceeb-ea60-4d4a-b35b-e2f954380031[0m
[32m[I 2022-09-06 21:41:11,415][0m Trial 0 finished with value: 0.009616491364214372 and parameters: {'max_depth': 14, 'min_samples_split': 47, 'min_samples_leaf': 81, 'min_weight_fraction_leaf': 0.41944792524468905, 'max_features': 0.5353431109509181, 'max_leaf_nodes': 325}. Best is trial 0 with value: 0.009616491364214372.[0m
[32m[I 2022-09-06 21:41:12,608][0m Trial 1 finished with value: 0.0197252525647777 and parameters: {'max_depth': 359, 'min_samples_split': 86, 'min_samples_leaf': 24, 'min_weight_fraction_leaf': 0.23772850680658575, 'max_features': 0.55973082550448, 'max_leaf_nodes': 399}. Best is trial 1 with value: 0.0197252525647777.[0m
[32m[I 2022-09-06 21:41:13,705][0m Trial 2 finished with value: 0.009616491364214372 and parameters: {'max_depth': 156, 'min_samples_split': 82, 'min_samples_leaf': 68, 'min_weight_fraction_leaf': 0.39447374046

KeyboardInterrupt: 