In [1]:
import pickle

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.cluster import KMeans

from mymodule import except_outlier, pre_train_test, spearman_and_mse

ModuleNotFoundError: No module named 'mymodule'

In [None]:
base_df = pd.read_csv('dataset/featured//base.csv')

In [102]:
drops = ['protein_sequence', 'pH', 'B', 'J', 'O', 'U', 'X', 'Z']

df = except_outlier(base_df, 'sequence_len')
idx, x, y = pre_train_test(df, True, drops=drops)

In [103]:
cluster_df = x.to_numpy()
# clustering
n_clusters = 4
km_model = KMeans(n_clusters=n_clusters)
cluster_id = km_model.fit_predict(cluster_df)

In [104]:
cid = pd.Series(cluster_id)
c_y = pd.concat((y, cid),axis=1)
c_y.columns = ['tm', 'cluster_id']
c_x = x.copy()
c_x['cluster_id'] = cid

In [134]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression_l1', 
    'metric': 'mean_absolute_error',
    'learning_rate': 0.05,
    'num_leaves': 32,
    'subsample': 0.7,
    'subsample_freq': 1,
    'feature_fraction': 0.8,
    'min_data_in_leaf': 50,
    'min_sum_hessian_in_leaf': 50,
    'n_estimators': 10000,
    "random_state": 123,
    "importance_type": "gain",
}

file_path = 'models/test/cluster_test'

for n in range(n_clusters):
    train_x = c_x[c_x['cluster_id']!=n]
    train_y = c_y[c_y['cluster_id']!=n]
    val_x = c_x[c_x['cluster_id']==n]
    val_y = c_y[c_y['cluster_id']==n]

    train_x, train_y, val_x, val_y = [i.drop('cluster_id', axis=1).to_numpy()
                                    for i in [train_x, train_y, val_x, val_y]]
    train_y = train_y.reshape(-1)
    val_y = val_y.reshape(-1)
                                    

    # train model
    model = lgb.LGBMRegressor(**params)
    model.fit(
        train_x, train_y,
        eval_set=[(train_x, train_y), (val_x, val_y)],
        early_stopping_rounds=50,
        verbose=1000)

    # evaluate prediction with spreaman_correlation_coefficient
    y_pred = model.predict(val_x)
    corr, mse = spearman_and_mse(val_y, y_pred)
    print('correlation: {:.5}'.format(corr,),' | mse: {:.1}'.format(mse))
    
    # save model to file_path
    path = file_path + f'_fold{n}'
    with open(path, 'wb') as f:
        pickle.dump(model, f)

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[702]	training's l1: 5.93893	valid_1's l1: 7.5136
correlation: 0.99999  | mse: 1e+02
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[655]	training's l1: 6.1102	valid_1's l1: 6.64933
correlation: 0.99999  | mse: 9e+01
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[10]	training's l1: 7.46063	valid_1's l1: 12.3903
correlation: 0.99999  | mse: 3e+02
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[70]	training's l1: 7.53402	valid_1's l1: 6.43252
correlation: 0.99994  | mse: 1e+02
