In [1]:
import os
import sys
sys.path.append("../src/")
import argparse
import pandas as pd

from sklearn.metrics import mean_absolute_error

from utils import *
from preprocess import PreProcessor
from model import Model, OptunaProcessor

# Names of the columns handled as categorical features. This list is passed
# as `categorical_feature` to PreProcessor, OptunaProcessor and Model in train().
categorical_feature = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'ID', 'SHIPMANAGER', 'FLAG']

def train(args):
    """Train a model on the training split and print its validation MAE.

    Steps, in order:
      1. Load the train/validation splits with ``load_dataset(mode='train')``.
      2. Build the encoder dictionary from the training features, pickle it,
         then preprocess both splits with it.
      3. Obtain model parameters: run an Optuna search and pickle the best
         parameters when ``args.mode == 'optuna'``; otherwise load the
         previously pickled parameters.
      4. Fit the model, pickle it, and print the validation mean absolute
         error.

    Args:
        args: Namespace-like object providing ``pickle_path``, ``version``,
            ``mode``, ``n_trials`` and ``model_name``.

    Returns:
        None. Results are written as pickles and printed to stdout.
    """
    # Directory used for the tuned parameters and the fitted model.
    artifact_dir = f"{args.pickle_path}/{args.version}"
    params_file_name = f"{args.model_name}_optuna"

    ## load data set
    X_train, y_train, X_valid, y_valid = load_dataset(mode='train')

    ## preprocess data set
    preprocessing = PreProcessor(categorical_feature=categorical_feature)
    encoder_dict = preprocessing.categorical_process_fit(X_train)

    # NOTE(review): unlike the other artifact paths in this function, this
    # path has no "/" separators between its parts. It is left unchanged
    # because other scripts may load the encoder from this exact location;
    # confirm the intended layout before changing it.
    save_pickle(file=encoder_dict, file_name="labelencoder", path=f"{args.pickle_path}{args.version}encoder")

    # The encoders come from the training split only and are reused as-is
    # for the validation split.
    X_train = preprocessing.run(X_train, encoder=encoder_dict, drop=True)
    X_valid = preprocessing.run(X_valid, encoder=encoder_dict, drop=True)

    X_train, y_train = reset_data(X_train, y_train)
    X_valid, y_valid = reset_data(X_valid, y_valid)

    ## progress optuna
    if args.mode == 'optuna':
        print("     Strat Optuna!")
        # Named `tuner` so the local does not shadow the `optuna` library name.
        tuner = OptunaProcessor(
            X_train=X_train,
            y_train=y_train,
            X_valid=X_valid,
            y_valid=y_valid,
            categorical_feature=categorical_feature,
        )
        optuna_process = tuner.run_optuna(n_trials=args.n_trials, model_name=args.model_name)
        best_optuna_params = optuna_process.params
        print("     Done Optuna!")

        save_pickle(file=best_optuna_params, file_name=params_file_name, path=artifact_dir)
    else:
        # Any other mode reuses the parameters saved by an earlier optuna run.
        best_optuna_params = load_pickle(file_name=params_file_name, path=artifact_dir)

    ## train model
    model = Model(
        model_params=best_optuna_params,
        categorical_feature=categorical_feature,
        X_train=X_train,
        y_train=y_train,
        X_valid=X_valid,
        y_valid=y_valid,
        model_name=args.model_name,
    )
    # fit() returns the object that is pickled and used for prediction below.
    model = model.fit()

    save_pickle(file=model, file_name=args.model_name, path=artifact_dir)

    # scikit-learn's signature is (y_true, y_pred): ground truth goes first.
    mae = mean_absolute_error(y_valid, model.predict(X_valid))
    print("Model Name : ", args.model_name)
    print("Validatio MAE : ", mae)

if __name__ == "__main__":
    # Command-line entry point: collect the run configuration and start training.
    parser = argparse.ArgumentParser()
    # The pickle folder is a filesystem path, so it must be parsed as a string.
    # With type=int, argparse also applied int() to the string default and
    # aborted with "invalid int value" even when the flag was not given.
    parser.add_argument("--pickle_path", "-pp", type=str, default="../pickle", help="path of pickle folder")
    parser.add_argument("--version", "-v", type=str, default="test_version", help="version number")
    # train() runs the Optuna search only when this is exactly "optuna";
    # any other value loads previously saved parameters instead.
    parser.add_argument("--mode", "-m", type=str, default="", help="progress optuna")
    parser.add_argument("--n_trials", "-n", type=int, default=2, help="set number of trials")
    parser.add_argument("--model_name", "-mn", type=str, default="lgboost", help="select model")
    args = parser.parse_args()
    train(args)

[I 2023-10-04 14:55:21,616] A new study created in memory with name: no-name-ee9b1c84-344f-417b-abe8-81e178a22406


[LightGBM] [Info] Total Bins 22694
[LightGBM] [Info] Number of data points in the train set: 257208, number of used features: 27


[I 2023-10-04 14:55:38,767] Trial 0 finished with value: 2.7867058059125154 and parameters: {'num_iteration': 641, 'learning_rate': 0.005480573348122462, 'num_leaves': 24, 'max_depth': 6, 'min_data_in_leaf': 259, 'lambda_l1': 0.016784527289964487, 'lambda_l2': 0.00010369273949998598}. Best is trial 0 with value: 2.7867058059125154.


[LightGBM] [Info] Total Bins 22694
[LightGBM] [Info] Number of data points in the train set: 257208, number of used features: 27


[I 2023-10-04 14:55:42,994] Trial 1 finished with value: 1.7187825593401596 and parameters: {'num_iteration': 412, 'learning_rate': 0.03265607036826561, 'num_leaves': 2, 'max_depth': 11, 'min_data_in_leaf': 96, 'lambda_l1': 6.607338345216472, 'lambda_l2': 0.0013966102809988687}. Best is trial 1 with value: 1.7187825593401596.


['SHIP_TYPE_CATEGORY', 'ATA', 'FLAG', 'ID', 'ARI_PO', 'SHIPMANAGER', 'ARI_CO']