In [1]:
import gc
import datetime
import numpy as np
import pandas as pd
import lightgbm as lgb
from time import time
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, accuracy_score

In [2]:
# Load data
# ============================================================================

print("Load data...")
data_path = "../raw_data/train_demo.csv"
data = pd.read_csv(data_path)

X_text = data['word_seg']
y = data['class'] - 1
num_classes = max(y) + 1
print("The number of samples is: %d" % len(X_text))
print("The number of classes is: %d" % num_classes)

del data
gc.collect()

Load data...
The number of samples is: 4999
The number of classes is: 19


7

In [3]:
# Extract feature vectors
# ============================================================================

# Extractor's hyper-parameters
vect_params = {
    'ngram_range': (1, 2),
    'max_df': 0.9,
    'min_df': 3,
    'max_features': None,
    'sublinear_tf': True
}
print("Vectorizer's hyper-parameters:")
pprint(vect_params)

# Initialize feature extractor
vectorizer = TfidfVectorizer(**vect_params)

print("Extract features...")
t0_extract = time()
X = vectorizer.fit_transform(X_text)
print("Done in %.3f seconds." % (time() - t0_extract))
print("Extract finished! ( ^ _ ^ ) V")

# The number of features
num_feats = len(vectorizer.get_feature_names())
print("The number of features is %d" % num_feats)

Vectorizer's hyper-parameters:
{'max_df': 0.9,
 'max_features': None,
 'min_df': 3,
 'ngram_range': (1, 2),
 'sublinear_tf': True}
Extract features...
Done in 16.612 seconds.
Extract finished! ( ^ _ ^ ) V
The number of features is 190239


In [4]:
# Split data into training and validation set
# ============================================================================

print("Split data into training and validation set...")
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=42)

lgb_train = lgb.Dataset(X_train, y_train)
lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

Split data into training and validation set...




In [5]:
# Tuning the hyper-parameters of LightGBM model and save the results
# ============================================================================

df_params = pd.read_csv("lgb-tfidf-params-2.csv")
num_params = df_params.shape[0]
print("The number of parameter combinations is: %d" % num_params)

for i in range(num_params):
    print()
    print("Parameter combination %d" % (i + 1))
    
    gbm_params = {
        'boosting_type': df_params['type'].values[i],
        'objective': 'multiclass',
        'num_class': num_classes,
        'metric': 'multi_logloss',

        'learning_rate': df_params['lr'].values[i],

        'num_leaves': df_params['n_leaf'].values[i],
        'max_depth': df_params['n_depth'].values[i],
        'min_data_in_leaf': df_params['min_data'].values[i],

        'feature_fraction': df_params['feat_frac'].values[i],
        'bagging_fraction': df_params['bagging_frac'].values[i],
        'bagging_freq': df_params['bagging_freq'].values[i],

        'lambda_l1': df_params['l1'].values[i],
        'lambda_l2': df_params['l2'].values[i],
        'min_gain_to_split': df_params['min_gain'].values[i],
        'min_sum_hessian_in_leaf': df_params['hessian'].values[i],

        'num_threads': 16,
        'verbose': 0,
        'is_training_metric': 'True'
    }
    print("LightGBM's hyper-parameters:")
    pprint(gbm_params)

    print("Start training...")
    t0_train = time()
    evals_result = {}
    gbm = lgb.train(params=gbm_params,
                    train_set=lgb_train,
                    num_boost_round=100,
                    valid_sets=[lgb_train, lgb_val],
                    valid_names=['train', 'val'],
                    evals_result=evals_result,
                    early_stopping_rounds=50,
                    verbose_eval=100)
    print("Done in %.3f seconds" % (time() - t0_extract))
    print("Training finished! ( ^ _ ^ ) V")
    
    best_iter = gbm.best_iteration
    loss_train = evals_result['train']['multi_logloss'][best_iter-1]
    loss_val = evals_result['val']['multi_logloss'][best_iter-1]

    probs_train = gbm.predict(X_train, num_iteration=best_iter)
    preds_train = np.argmax(probs_train, axis=1)
    acc_train = accuracy_score(y_train, preds_train)
    f1_train = f1_score(y_train, preds_train, average='weighted')

    probs_val = gbm.predict(X_val, num_iteration=best_iter)
    preds_val = np.argmax(probs_val, axis=1)
    acc_val = accuracy_score(y_val, preds_val)
    f1_val = f1_score(y_val, preds_val, average='weighted')

    print("Best iteration: %d" % best_iter)
    print("Training Loss: %.5f, Validation Loss: %.5f" % (loss_train, loss_val))
    print("Training Accuracy: %.2f, Validation Accuracy: %.2f" % (acc_train * 100, acc_val * 100))
    print("Training F1 Score: %.5f, Validation F1 Score: %.5f" % (f1_train, f1_val))
    
    res = "%s,%s,%d,%s,%f,%d,%s,%s,%s,%.4f,%d,%d,%d,%.4f,%.4f,%d,%.4e,%.4e,%.4e,%.4e,%.4e,%d,%.5f,%.5f,%.2f,%.2f,%.5f,%.5f\n" % (
        datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        "lgb-tfidf-%d" % (i + 1),  # model name
        num_feats,  # number of features
        '1|2',
        0.9,
        3,
        'None',
        'True',
        gbm_params['boosting_type'],
        gbm_params['learning_rate'],
        gbm_params['num_leaves'],
        gbm_params['max_depth'],
        gbm_params['min_data_in_leaf'],
        gbm_params['feature_fraction'],
        gbm_params['bagging_fraction'],
        gbm_params['bagging_freq'],
        gbm_params['lambda_l1'],
        gbm_params['lambda_l2'],
        gbm_params['min_gain_to_split'],
        gbm_params['min_sum_hessian_in_leaf'],
        0.8,  # train size
        best_iter,  # best iteration
        loss_train,  # multi-logloss of training set
        loss_val,  # multi-logloss of validation set
        acc_train,  # accuracy of training set
        acc_val,  # accuracy of validation set
        f1_train,  # f1 score of training set
        f1_val  # f1 score of validation set
    )

    f = open("lgb-tfidf-tuning-results.csv", 'a')
    f.write(res)
    f.close()

The number of parameter combinations is: 1

Parameter combination 1
LightGBM's hyper-parameters:
{'bagging_fraction': 0.9,
 'bagging_freq': 1,
 'boosting_type': 'gbdt',
 'feature_fraction': 0.8,
 'is_training_metric': 'True',
 'lambda_l1': 0,
 'lambda_l2': 100,
 'learning_rate': 0.05,
 'max_depth': 10,
 'metric': 'multi_logloss',
 'min_data_in_leaf': 50,
 'min_gain_to_split': 0,
 'min_sum_hessian_in_leaf': 0.1,
 'num_class': 19,
 'num_leaves': 127,
 'num_threads': 16,
 'objective': 'multiclass',
 'verbose': 0}
Start training...
Training until validation scores don't improve for 50 rounds.
[100]	train's multi_logloss: 1.45408	val's multi_logloss: 1.76493
Did not meet early stopping. Best iteration is:
[100]	train's multi_logloss: 1.45408	val's multi_logloss: 1.76493
Done in 151.519 seconds
Training finished! ( ^ _ ^ ) V
Best iteration: 100
Training Loss: 1.45408, Validation Loss: 1.76493
Training Accuracy: 72.22, Validation Accuracy: 58.20
Training F1 Score: 0.71664, Validation F1 Score