In [46]:
!pip install optuna
!pip install joblib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [54]:
import numpy as np
import os
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import joblib
from lightgbm import LGBMClassifier
import optuna
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.max_columns = 100

%matplotlib inline

In [39]:
data = fetch_20newsgroups()

X = data['data']
y = data['target']

In [43]:
model = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),   
    ('lgbc', LGBMClassifier(objective='f1_score', n_jobs=1))])

In [49]:
def objective(trial):    
    
    joblib.dump(study, 'study.pkl')
    
    tfidf__analyzer = trial.suggest_categorical('tfidf__analyzer', ['word', 'char', 'char_wb']) 
    tfidf__lowercase = trial.suggest_categorical('tfidf__lowercase', [False, True]) 
    tfidf__max_features = trial.suggest_int('tfidf__max_features', 500, 10_000) 
    lgbc__num_leaves = trial.suggest_int('lgbc__num_leaves', 2, 150) 
    lgbc__max_depth = trial.suggest_int('lgbc__max_depth', 2, 100) 
    lgbc__n_estimators = trial.suggest_int('lgbc__n_estimators', 10, 200) 
    lgbc__subsample_for_bin = trial.suggest_int('lgbc__subsample_for_bin', 2000, 300_000) 
    lgbc__min_child_samples = trial.suggest_int('lgbc__min_child_samples', 20, 500) 
    lgbc__reg_alpha = trial.suggest_uniform('lgbc__reg_alpha', 0.0, 1.0) 
    lgbc__colsample_bytree = trial.suggest_uniform('lgbc__colsample_bytree', 0.6, 1.0) 
    lgbc__learning_rate = trial.suggest_loguniform('lgbc__learning_rate', 1e-5, 1e-0)   
    

    params = {
        'tfidf__analyzer': tfidf__analyzer,
        'tfidf__lowercase': tfidf__lowercase,
        'tfidf__max_features': tfidf__max_features,
        'lgbc__num_leaves': lgbc__num_leaves,
        'lgbc__max_depth': lgbc__max_depth,
        'lgbc__n_estimators': lgbc__n_estimators,
        'lgbc__subsample_for_bin': lgbc__subsample_for_bin,
        'lgbc__min_child_samples': lgbc__min_child_samples,
        'lgbc__reg_alpha': lgbc__reg_alpha,
        'lgbc__colsample_bytree': lgbc__colsample_bytree,
        'lgbc__learning_rate': lgbc__learning_rate
    }
    
    model.set_params(**params)

    return - np.mean(cross_val_score(model, X, y, cv=8))

In [51]:
if os.path.isfile('study.pkl'):
    study = joblib.load('study.pkl')
else:
    study = optuna.create_study()
study.optimize(objective, timeout=3600)

  del sys.path[0]
  
  from ipykernel import kernelapp as app
[32m[I 2022-10-11 12:02:59,971][0m Trial 3 finished with value: -0.09881641685117526 and parameters: {'tfidf__analyzer': 'char_wb', 'tfidf__lowercase': False, 'tfidf__max_features': 2887, 'lgbc__num_leaves': 60, 'lgbc__max_depth': 100, 'lgbc__n_estimators': 181, 'lgbc__subsample_for_bin': 80852, 'lgbc__min_child_samples': 321, 'lgbc__reg_alpha': 0.5025897886267174, 'lgbc__colsample_bytree': 0.6469412942905471, 'lgbc__learning_rate': 1.215591196652765e-05}. Best is trial 1 with value: -0.2358143077053793.[0m
  del sys.path[0]
  
  from ipykernel import kernelapp as app
[32m[I 2022-10-11 12:26:37,829][0m Trial 4 finished with value: -0.48957172345200195 and parameters: {'tfidf__analyzer': 'word', 'tfidf__lowercase': True, 'tfidf__max_features': 2998, 'lgbc__num_leaves': 11, 'lgbc__max_depth': 68, 'lgbc__n_estimators': 198, 'lgbc__subsample_for_bin': 109999, 'lgbc__min_child_samples': 99, 'lgbc__reg_alpha': 0.9104101827118

KeyboardInterrupt: ignored

In [52]:
model.set_params(**study.best_params)
model.fit(X, y)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=2998, stop_words='english')),
                ('lgbc',
                 LGBMClassifier(colsample_bytree=0.8795464562976437,
                                learning_rate=4.603344273918418e-05,
                                max_depth=68, min_child_samples=99,
                                n_estimators=198, n_jobs=1, num_leaves=11,
                                objective='f1_score',
                                reg_alpha=0.9104101827118936,
                                subsample_for_bin=109999))])

# Visual 

In [None]:
data = joblib.load('study.pkl')

df = data.trials_dataframe()
df.dropna(inplace=True)
df.reset_index(inplace=True)

df['time'] = df.datetime_complete - df.datetime_start
df['time'] = df.time.astype('int') / (1_000_000_000)
df = df[df.time>0]

names = []

for col in df.columns.values:
    if col[1] == '':
        names.append(col[0])
    else:
        names.append(col[1])

df.columns = names

print('best val:', - round(df.values.min(),4))
a = sns.lineplot(x=df.index, y=-df.value.cummin())
a.set_xlabel('trial number')
sns.scatterplot(x=df.index, y=-df.value, color='red')
a.set_ylabel('f1 score')
a.legend(['best value', "trial's value"]);