In [28]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import joblib
import time
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, f1_score, make_scorer
tqdm.pandas()
from utils import read_yaml
MODELING_CONFIG_PATH = "../config/modeling_config.yaml"

def load_fed_data():
    """
    Loader for feature engineered data.
    Args:
    - params(dict): modeling params.
    Returns:
    - x_train(DataFrame): inputs of train set.
    - y_train(DataFrame): target of train set.
    - x_valid(DataFrame): inputs of valid set.
    - y_valid(DataFrame): terget of valid set.
    """

    x_train_path = "../output/x_train_preprocessed.pkl"
    y_train_path = "../output/y_train.pkl"
    x_valid_path = "../output/x_valid_preprocessed.pkl"
    y_valid_path = "../output/y_valid.pkl"
    x_train = joblib.load(x_train_path)
    y_train = np.ravel(joblib.load(y_train_path))
    x_valid = joblib.load(x_valid_path)
    y_valid = np.ravel(joblib.load(y_valid_path))
    return x_train, y_train, x_valid, y_valid

def model():
    base_model = RandomForestClassifier(random_state=42, n_jobs=-1)
    param_dist = {'n_estimators' : [25, 50, 100]}
    return base_model, param_dist

In [30]:
def validate(x_valid, y_valid):
    y_pred = result.predict(x_valid)
    report_model = classification_report(y_valid, y_pred,)
    return report_model

In [40]:
def main_training_model(param_model, x_train, y_train, x_valid, y_valid):

    # define model
    base_model, param_dist = param_model

    scoring = 'f1'

    # define search
    search = RandomizedSearchCV(base_model, param_dist, scoring=scoring, cv=3)

    # execute search
    result = search.fit(x_train, y_train)
    validation_result = result.score(x_valid, y_valid)
    
    print('validation score: %s' % validation_result)

    # Dump model name
    joblib.dump(base_model, f'../output/model_name.pkl')
    # Dump best model estimator with best param
    joblib.dump(result.get_params, '../output/best_estimator.pkl')
    # summarize result
    joblib.dump(result, '../output/result.pkl')

    return result

In [41]:
if __name__ == "__main__":
    param_model = model()
    x_train, y_train, x_valid, y_valid = load_fed_data()
    hasil = main_training_model(param_model, x_train, y_train, x_valid, y_valid)



validation score: 0.42094560244026435


In [42]:
joblib.load("../output/best_estimator.pkl")

<bound method BaseEstimator.get_params of RandomizedSearchCV(cv=3,
                   estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
                   param_distributions={'n_estimators': [25, 50, 100]},
                   scoring='f1')>