In [1]:
import numpy as np
import pandas as pd

import sklearn as skl
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import optuna
from optuna.integration.wandb import WeightsAndBiasesCallback
from sklearn.neighbors import KNeighborsClassifier

In [2]:
df = pd.read_csv('./data/data.csv')

In [3]:
df.head(2)

Unnamed: 0,name,type,C,H,N,O,S,F,Cl,Br,I,Other
0,S1,sy,1,0,1,2,0,0,0,0,0,0
1,O1,o,0,0,0,0,1,0,0,0,0,0


In [4]:
X = df.drop(columns=['type', 'name'])
y = df['type']

X_encoder = preprocessing.LabelEncoder()
y_encoder = preprocessing.LabelEncoder()

name_trans = X_encoder.fit_transform(df['name'].to_numpy().reshape(-1, 1))
X = pd.concat([X, pd.DataFrame(name_trans)], axis=1)
y = y_encoder.fit_transform(y.to_numpy().reshape(-1, 1))

  y = column_or_1d(y, warn=True)


In [5]:
X.head(2)

Unnamed: 0,C,H,N,O,S,F,Cl,Br,I,Other,0
0,1,0,1,2,0,0,0,0,0,0,293
1,0,0,0,0,1,0,0,0,0,0,241


In [6]:
y

array([81, 67, 67, ..., 22, 22, 28])

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [8]:
def objective(trial):
    dtc = DecisionTreeClassifier(
        max_depth=trial.suggest_int('max_depth', 1, 1000),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 100),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 100),
        min_weight_fraction_leaf=trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5),
        max_features=trial.suggest_float('max_features', 0.1, 1.0),
        max_leaf_nodes=trial.suggest_int('max_leaf_nodes', 1, 1000),
        class_weight='balanced',
        random_state=0
    )

    y_hat = dtc.fit(X_train, y_train).predict(X_test)

    #print(y_hat)

    return metrics.f1_score(y_test, y_hat, average='macro')


def objective_logreg(trial):
    logreg = LogisticRegression(
        C=trial.suggest_float('C', 0.0, 100.0),
        class_weight='balanced',
        multi_class='multinomial',
        solver='saga',
        random_state=0
    )

    y_hat = logreg.fit(X_train, y_train).predict(X_test)

    print(y_hat)

    return metrics.f1_score(y_test, y_hat, average='macro')


def objective_knn(trial):
    knn = KNeighborsClassifier(
        n_neighbors=trial.suggest_int('n_neighbors', 1, 200),
        leaf_size=trial.suggest_int('leaf_size', 1, 500),
        p=trial.suggest_int('p', 1, 50),
        n_jobs=-1
    ).fit(X_train, y_train)

    y_hat = knn.predict(X_test)

    return metrics.f1_score(y_test, y_hat, average='macro') 

In [None]:
study = optuna.create_study(direction='maximize')

wandb_kwargs = {"project": "bachelor"}
wandbc = WeightsAndBiasesCallback(metric_name='F1', wandb_kwargs=wandb_kwargs)

study.optimize(objective_knn, n_trials=20, callbacks=[wandbc])

[32m[I 2022-09-11 21:04:15,229][0m A new study created in memory with name: no-name-c59cc6b4-234e-4808-a60e-9efa7dffdc44[0m
  wandbc = WeightsAndBiasesCallback(metric_name='F1', wandb_kwargs=wandb_kwargs)
[34m[1mwandb[0m: Currently logged in as: [33mcaigh[0m. Use [1m`wandb login --relogin`[0m to force relogin


[32m[I 2022-09-11 21:46:29,065][0m Trial 0 finished with value: 0.3128000424247529 and parameters: {'n_neighbors': 55, 'leaf_size': 275, 'p': 29}. Best is trial 0 with value: 0.3128000424247529.[0m
[32m[I 2022-09-11 22:26:49,334][0m Trial 1 finished with value: 0.304437168234546 and parameters: {'n_neighbors': 69, 'leaf_size': 63, 'p': 10}. Best is trial 0 with value: 0.3128000424247529.[0m
[32m[I 2022-09-11 23:08:07,149][0m Trial 2 finished with value: 0.304528173387108 and parameters: {'n_neighbors': 100, 'leaf_size': 282, 'p': 30}. Best is trial 0 with value: 0.3128000424247529.[0m
[32m[I 2022-09-11 23:51:56,263][0m Trial 3 finished with value: 0.3633661287628684 and parameters: {'n_neighbors': 20, 'leaf_size': 83, 'p': 17}. Best is trial 3 with value: 0.3633661287628684.[0m
[32m[I 2022-09-12 00:35:21,960][0m Trial 4 finished with value: 0.3293460385387189 and parameters: {'n_neighbors': 43, 'leaf_size': 226, 'p': 22}. Best is trial 3 with value: 0.3633661287628684.[0