#### Santander 데이터 세트 로딩 및 전처리

In [0]:
# Load the Santander training CSV: the first row is the header and column types are inferred.
santander_csv_path = "/FileStore/tables/train_santander.csv"
santander_sdf = spark.read.csv(santander_csv_path, header=True, inferSchema=True)
# Preview the first 10 rows.
preview_sdf = santander_sdf.limit(10)
display(preview_sdf)

In [0]:
from pyspark.sql.functions import count, isnan, when, col

# printSchema() writes the schema to stdout itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
santander_sdf.printSchema()
print('#### Null column과 count')
# For every column, count the rows whose value is NaN or NULL.
null_count_exprs = [
    count(when(isnan(c) | col(c).isNull(), c)).alias(c)
    for c in santander_sdf.columns
]
display(santander_sdf.select(null_count_exprs))

In [0]:
# Report the dataset shape as (row count, column count).
row_cnt = santander_sdf.count()
column_cnt = len(santander_sdf.columns)
print((row_cnt, column_cnt))
# Show the summary statistics produced by describe().
summary_sdf = santander_sdf.describe()
display(summary_sdf)

In [0]:
from pyspark.sql.functions import col

# Frequency of each distinct var3 value, most frequent first.
var3_counts_sdf = santander_sdf.groupBy('var3').count()
var3_counts_sdf.orderBy(col('count').desc()).show()

In [0]:
from pyspark.sql.functions import when
# Replace the -999999 placeholder in var3 with 2 (all other values are kept as they are),
# then drop the ID column.
var3_col = santander_sdf["var3"]
var3_cleaned = when(var3_col == -999999, 2).otherwise(var3_col)
santander_sdf = santander_sdf.withColumn("var3", var3_cleaned).drop("ID")

In [0]:
# Show the summary statistics of the current santander_sdf.
summary_sdf = santander_sdf.describe()
display(summary_sdf)

#### 학습과 테스트 데이터 세트를 분리하되 label값의 분포를 맞춰서 분리(Stratified)

In [0]:
# Class distribution: number of rows for each TARGET value.
target_counts_sdf = santander_sdf.groupBy('TARGET').count()
target_counts_sdf.show()

In [0]:
# Split into train and test sets with 0.8 / 0.2 weights; the seed is fixed for reproducibility.
# This is a plain random split: the TARGET column plays no part in it.
split_weights = [0.8, 0.2]
train_sdf, test_sdf = santander_sdf.randomSplit(split_weights, seed=2021)

In [0]:
# Check the per-TARGET record counts and ratios of the train and test datasets.
train_cnt = train_sdf.count()
test_cnt = test_sdf.count()
print(type(train_cnt), type(test_cnt))

# The statements below raise an error because a DataFrame cannot be divided by an int with `/`.
# They are kept on purpose to demonstrate that failure.
print(train_sdf.groupBy('TARGET').count()/train_cnt)
print(test_sdf.groupBy('TARGET').count()/test_cnt)

In [0]:
from pyspark.sql.functions import col

train_cnt = train_sdf.count()
test_cnt = test_sdf.count()

# Per-TARGET record count plus its share of the split, added with withColumn.
# groupBy(...).count() already names its output column 'count'. DataFrame.alias() only
# aliases the DataFrame itself (not a column), so the former .alias('count') was a
# misleading no-op and has been dropped.
train_target_count = train_sdf.groupBy('TARGET').count()
train_target_count.withColumn('percent', col('count')/train_cnt).show()

test_target_count = test_sdf.groupBy('TARGET').count()
test_target_count.withColumn('percent', col('count')/test_cnt).show()

In [0]:
# Spark offers no API equivalent to scikit-learn's train_test_split(..., stratify=y), which
# splits train/test according to the label distribution.
# The split can be done as shown below instead.

# Separate the rows into one DataFrame per TARGET value (0 or 1).
zeros_sdf = santander_sdf.filter(santander_sdf["TARGET"]==0)
ones_sdf = santander_sdf.filter(santander_sdf["TARGET"]==1)

# Split each per-class DataFrame into train and test parts with the same weights and seed.
train_0_sdf, test_0_sdf = zeros_sdf.randomSplit([0.8,0.2], seed=2021)
train_1_sdf, test_1_sdf = ones_sdf.randomSplit([0.8,0.2], seed=2021)

# Union the per-class train parts into the full train set; do the same for the test parts.
train_sdf = train_0_sdf.union(train_1_sdf)
test_sdf = test_0_sdf.union(test_1_sdf)

# Check the per-TARGET record counts and ratios of the train and test datasets.
train_cnt = train_sdf.count()
test_cnt = test_sdf.count()

# groupBy(...).count() already names its output column 'count'. DataFrame.alias() only
# aliases the DataFrame itself, so the former .alias('count') was a misleading no-op.
train_target_count = train_sdf.groupBy('TARGET').count()
train_target_count.withColumn('percent', col('count')/train_cnt).show()

test_target_count = test_sdf.groupBy('TARGET').count()
test_target_count.withColumn('percent', col('count')/test_cnt).show()

In [0]:
def stratified_train_test_split(data_sdf, label_name, split_ratio=(0.8, 0.2), seed=0):
    """Split a 0/1-labelled DataFrame into train and test sets, class by class.

    Rows whose label equals 0 and rows whose label equals 1 are each split with
    ``randomSplit`` (same weights, same seed), and the matching parts are unioned.
    Rows with any other label value end up in neither output.

    Args:
        data_sdf: DataFrame that contains the label column.
        label_name: Name of the label column.
        split_ratio: Two weights ``(train, test)`` handed to ``randomSplit``.
        seed: Seed used for both per-class splits.

    Returns:
        A ``(train_sdf, test_sdf)`` tuple.
    """
    # The default is an immutable tuple (no shared mutable default); randomSplit is
    # always given a fresh list, whatever sequence type the caller passed.
    weights = list(split_ratio)

    label_col = data_sdf[label_name]
    zeros_sdf = data_sdf.filter(label_col == 0)
    ones_sdf = data_sdf.filter(label_col == 1)

    # Split each class separately so both outputs keep the same class proportions.
    train_0_sdf, test_0_sdf = zeros_sdf.randomSplit(weights, seed=seed)
    train_1_sdf, test_1_sdf = ones_sdf.randomSplit(weights, seed=seed)

    # Recombine the per-class parts into the final train and test sets.
    train_sdf = train_0_sdf.union(train_1_sdf)
    test_sdf = test_0_sdf.union(test_1_sdf)

    return train_sdf, test_sdf

# Class-wise split of the full dataset on TARGET with 0.8 / 0.2 weights and a fixed seed.
train_sdf, test_sdf = stratified_train_test_split(
    santander_sdf,
    'TARGET',
    split_ratio=[0.8, 0.2],
    seed=2021,
)

#### XGBoost와 LightGBM에 CrossValidator와 TrainValidationSplit을 이용한 Hyperparameter 튜닝 적용

In [0]:
# Feature column names: every column of the dataset except the TARGET label.
feature_names = santander_sdf.columns
feature_names.remove('TARGET')
santander_columns = feature_names
print(santander_columns)

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from sparkdl.xgboost import XgboostClassifier

# Pieces of the tuning pipeline: feature vectorization, the XGBoost estimator,
# the hyperparameter grid, and 3-fold cross validation scored by ROC AUC.
vector_assembler = VectorAssembler(inputCols=santander_columns, outputCol='features')
xgb_classifier = XgboostClassifier(
    featuresCol='features',
    labelCol='TARGET',
    missing=0.0,
    num_workers=1,
    colsample_bytree=0.5,
    learning_rate=0.01,
    n_estimators=50,
)
# The string literal below is a disabled, wider grid; it is never executed.
'''
param_grid = ParamGridBuilder().addGrid(xgb_classifier.max_depth, [5, 7])\
            .addGrid(xgb_classifier.min_child_weight, [1, 3])\
            .addGrid(xgb_classifier.colsample_bytree, [0.5, 0.75])\
            .addGrid(xgb_classifier.n_estimators, [50, 100])\
            .addGrid(xgb_classifier.learning_rate, [0.01, 0.1]).build()
'''
# Active grid: only learning_rate is searched.
xgb_param_grid = (
    ParamGridBuilder()
    .addGrid(xgb_classifier.learning_rate, [0.01, 0.1])
    .build()
)
# ROC AUC is computed with BinaryClassificationEvaluator.
roc_eval = BinaryClassificationEvaluator(labelCol='TARGET', metricName='areaUnderROC')

xgb_cv = CrossValidator(
    estimator=xgb_classifier,
    evaluator=roc_eval,
    estimatorParamMaps=xgb_param_grid,
    numFolds=3,
    seed=2021,
)

In [0]:
from pyspark.ml import Pipeline

# Chain vectorization and the cross-validated XGBoost search, then fit on the training data.
xgb_stages = [vector_assembler, xgb_cv]
xgb_pipeline = Pipeline(stages=xgb_stages)
xgb_pipeline_model = xgb_pipeline.fit(train_sdf)

In [0]:
# The last pipeline stage is the fitted cross-validation model.
xgb_cv_model = xgb_pipeline_model.stages[-1]
# Show every evaluated param map as a {param name: value} dict.
[
    {param.name: value for param, value in param_map.items()}
    for param_map in xgb_cv_model.getEstimatorParamMaps()
]

In [0]:
# Display the avgMetrics attribute of the fitted cross-validation model.
xgb_cv_model.avgMetrics

In [0]:
import pandas as pd

def get_cv_result_pdf(cv_model):
    """Return a pandas DataFrame pairing each param map with its ``avgMetrics`` entry.

    Args:
        cv_model: Object exposing ``getEstimatorParamMaps()`` and ``avgMetrics``.

    Returns:
        DataFrame with a ``params`` column ({param name: value} dicts) and an
        ``evaluation_result`` column.
    """
    readable_params = []
    for param_map in cv_model.getEstimatorParamMaps():
        # Key each value by the parameter's name instead of the Param object.
        readable_params.append({param.name: value for param, value in param_map.items()})

    return pd.DataFrame({'params': readable_params, 'evaluation_result': cv_model.avgMetrics})

# Take the fitted cross-validation model from the last pipeline stage and tabulate its results.
xgb_cv_model = xgb_pipeline_model.stages[-1]
xgb_cv_result_pdf = get_cv_result_pdf(cv_model=xgb_cv_model)
# Display the first rows of the result table.
xgb_cv_result_pdf.head()

In [0]:
from pyspark.ml.tuning import TrainValidationSplit

# Grid over five XGBoost hyperparameters with two candidate values each.
xgb_param_grid = (
    ParamGridBuilder()
    .addGrid(xgb_classifier.max_depth, [5, 7])
    .addGrid(xgb_classifier.min_child_weight, [1, 3])
    .addGrid(xgb_classifier.colsample_bytree, [0.5, 0.75])
    .addGrid(xgb_classifier.n_estimators, [50, 100])
    .addGrid(xgb_classifier.learning_rate, [0.01, 0.1])
    .build()
)

# Evaluate the grid with TrainValidationSplit (trainRatio=0.8), scored by roc_eval.
xgb_tvs = TrainValidationSplit(
    estimator=xgb_classifier,
    evaluator=roc_eval,
    estimatorParamMaps=xgb_param_grid,
    trainRatio=0.8,
    seed=2021,
)

In [0]:
from pyspark.ml import Pipeline

# Chain vectorization and the train/validation-split XGBoost search, then fit on the training data.
xgb_stages = [vector_assembler, xgb_tvs]
xgb_pipeline = Pipeline(stages=xgb_stages)
xgb_pipeline_model = xgb_pipeline.fit(train_sdf)

In [0]:
def get_tvs_result_pdf(tvs_model):
    """Return a pandas DataFrame pairing each param map with its ``validationMetrics`` entry.

    Args:
        tvs_model: Object exposing ``getEstimatorParamMaps()`` and ``validationMetrics``.

    Returns:
        DataFrame with a ``params`` column ({param name: value} dicts) and an
        ``evaluation_result`` column.
    """
    readable_params = []
    for param_map in tvs_model.getEstimatorParamMaps():
        # Key each value by the parameter's name instead of the Param object.
        readable_params.append({param.name: value for param, value in param_map.items()})

    return pd.DataFrame({'params': readable_params, 'evaluation_result': tvs_model.validationMetrics})

# Remove the column-width limit so long param dicts are not truncated when displayed.
# None is the supported "no limit" value; the former -1 is deprecated in pandas.
pd.set_option('display.max_colwidth', None)

# Tabulate the results of the fitted train/validation-split model (last pipeline stage),
# best score first.
xgb_tvs_model = xgb_pipeline_model.stages[-1]
xgb_tvs_result_pdf = get_tvs_result_pdf(xgb_tvs_model)
xgb_tvs_result_pdf.sort_values(by='evaluation_result', ascending=False)

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from mmlspark.lightgbm import LightGBMClassifier

# Pieces of the tuning pipeline: feature vectorization, the LightGBM estimator,
# the hyperparameter grid, and 3-fold cross validation scored by ROC AUC.
vector_assembler = VectorAssembler(inputCols=santander_columns, outputCol="features")

lgbm_classifier = LightGBMClassifier(
    featuresCol="features",
    labelCol="TARGET",
    numIterations=50,  # n_estimators
    boostFromAverage=False,  # boost_from_average
)
# The string literal below is a disabled, wider grid; it is never executed.
'''
lgbm_param_grid = ParamGridBuilder().addGrid(lgbm_classifier.maxDepth, [7, 14])\
            .addGrid(lgbm_classifier.numLeaves, [30, 60])\
            .addGrid(lgbm_classifier.featureFraction, [0.5, 0.75])\
            .addGrid(lgbm_classifier.numIterations, [50, 100])\
            .addGrid(lgbm_classifier.learningRate, [0.01, 0.1]).build()
'''
# Active grid: only learningRate is searched.
lgbm_param_grid = (
    ParamGridBuilder()
    .addGrid(lgbm_classifier.learningRate, [0.01, 0.1])
    .build()
)
# ROC AUC is computed with BinaryClassificationEvaluator.
roc_eval = BinaryClassificationEvaluator(labelCol='TARGET', metricName='areaUnderROC')

lgbm_cv = CrossValidator(
    estimator=lgbm_classifier,
    evaluator=roc_eval,
    estimatorParamMaps=lgbm_param_grid,
    numFolds=3,
    seed=2021,
)

In [0]:
from pyspark.ml import Pipeline

# Chain vectorization and the cross-validated LightGBM search, then fit on the training data.
lgbm_stages = [vector_assembler, lgbm_cv]
lgbm_pipeline = Pipeline(stages=lgbm_stages)
lgbm_pipeline_model = lgbm_pipeline.fit(train_sdf)

In [0]:
# Take the fitted cross-validation model from the last pipeline stage and tabulate its results.
lgbm_cv_model = lgbm_pipeline_model.stages[-1]
lgbm_cv_result_pdf = get_cv_result_pdf(cv_model=lgbm_cv_model)
# Display the first rows of the result table.
lgbm_cv_result_pdf.head()

In [0]:
from pyspark.ml.tuning import TrainValidationSplit

# Grid over five LightGBM hyperparameters with two candidate values each.
lgbm_param_grid = (
    ParamGridBuilder()
    .addGrid(lgbm_classifier.maxDepth, [7, 14])
    .addGrid(lgbm_classifier.numLeaves, [30, 60])
    .addGrid(lgbm_classifier.featureFraction, [0.5, 0.75])
    .addGrid(lgbm_classifier.numIterations, [50, 100])
    .addGrid(lgbm_classifier.learningRate, [0.01, 0.1])
    .build()
)

# Evaluate the grid with TrainValidationSplit (trainRatio=0.8), scored by roc_eval.
lgbm_tvs = TrainValidationSplit(
    estimator=lgbm_classifier,
    evaluator=roc_eval,
    estimatorParamMaps=lgbm_param_grid,
    trainRatio=0.8,
    seed=2021,
)

In [0]:
from pyspark.ml import Pipeline

# Chain vectorization and the train/validation-split LightGBM search, then fit on the training data.
lgbm_stages = [vector_assembler, lgbm_tvs]
lgbm_pipeline = Pipeline(stages=lgbm_stages)
lgbm_pipeline_model = lgbm_pipeline.fit(train_sdf)

In [0]:
# The last pipeline stage is the fitted train/validation-split model.
lgbm_tvs_model = lgbm_pipeline_model.stages[-1]

# Remove the column-width limit so long param dicts are not truncated when displayed.
# None is the supported "no limit" value; the former -1 is deprecated in pandas.
pd.set_option('display.max_colwidth', None)

# Build the result table once (it used to be built twice with the same argument)
# and show it best score first.
lgbm_tvs_result_pdf = get_tvs_result_pdf(lgbm_tvs_model)
lgbm_tvs_result_pdf.sort_values(by='evaluation_result', ascending=False)

### HyperOpt를 이용한 Bayesian 최적화

In [0]:
# https://github.com/hyperopt/hyperopt/wiki/FMin 에서 가져옴. 
from hyperopt import fmin, tpe, hp, STATUS_OK

def objective(x):
    """Toy objective for hyperopt: print the sampled x and report x squared as the loss."""
    print('x:', x)
    loss = x ** 2
    return {'loss': loss, 'status': STATUS_OK}

# Minimise the objective over hp.uniform('x', -10, 10) with TPE for 100 evaluations.
x_space = hp.uniform('x', -10, 10)
best = fmin(objective, space=x_space, algo=tpe.suggest, max_evals=100)

print('best:', best)

In [0]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

def objective(x):
    """Toy objective for hyperopt: report x squared as the loss."""
    loss = x ** 2
    return {'loss': loss, 'status': STATUS_OK}

# A Trials object is passed to fmin so the per-evaluation records can be inspected afterwards.
trials = Trials()
x_space = hp.uniform('x', -10, 10)

best = fmin(objective, space=x_space, algo=tpe.suggest, max_evals=100, trials=trials)

print('best:', best)
print(trials.trials)

In [0]:
# Print how many results were recorded, then the results themselves.
trial_results = trials.results
print(len(trial_results))
print(trial_results)

In [0]:
# Smallest loss recorded in trials.
all_losses = trials.losses()
print(min(all_losses))
# Leave the full loss list as the cell's displayed value.
trials.losses()

In [0]:
def objective_func(search_dict):
    """Toy two-parameter objective: loss = max_depth ** 2 + learning_rate * 20.

    The sampled values are echoed back in the result dict under 'x' and 'y'.
    """
    depth = search_dict['max_depth']
    rate = search_dict['learning_rate']
    loss = depth ** 2 + rate*20
    return {'loss': loss, 'status': STATUS_OK, 'x': depth, 'y': rate}

# Bounds are given low value first, then high value (e.g. 5, 15 for max_depth).
search_space = {
    'max_depth': hp.quniform('max_depth', 5, 15, 1),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.1),
}
algo = tpe.suggest
trials = Trials()

best = fmin(
    fn=objective_func,
    space=search_space,
    algo=algo,
    max_evals=5,
    trials=trials,
)
print('best:', best)

In [0]:
# Print the result dict recorded for each evaluation.
trial_results = trials.results
print(trial_results)

### HyperOpt를 이용하여 XGBoost와 LightGBM 하이퍼 파라미터 최적화

In [0]:
from pyspark.sql.functions import col, when

# Load the data: the first row is the header and column types are inferred.
santander_csv_path = "/FileStore/tables/train_santander.csv"
santander_sdf = spark.read.csv(santander_csv_path, header=True, inferSchema=True)

# Preprocessing: replace the -999999 placeholder in var3 with 2, then drop the ID column.
var3_col = santander_sdf["var3"]
var3_cleaned = when(var3_col == -999999, 2).otherwise(var3_col)
santander_sdf = santander_sdf.withColumn("var3", var3_cleaned).drop("ID")

# Columns to vectorize as features: every column except the TARGET label.
feature_names = santander_sdf.columns
feature_names.remove('TARGET')
santander_columns = feature_names

# Split into train and test sets so that both keep the label distribution (stratified).
def stratified_train_test_split(data_sdf, label_name, split_ratio=(0.8, 0.2), seed=0):
    """Split a 0/1-labelled DataFrame into train and test sets, class by class.

    Rows whose label equals 0 and rows whose label equals 1 are each split with
    ``randomSplit`` (same weights, same seed), and the matching parts are unioned.
    Rows with any other label value end up in neither output.

    Args:
        data_sdf: DataFrame that contains the label column.
        label_name: Name of the label column.
        split_ratio: Two weights ``(train, test)`` handed to ``randomSplit``.
        seed: Seed used for both per-class splits.

    Returns:
        A ``(train_sdf, test_sdf)`` tuple.
    """
    # The default is an immutable tuple (no shared mutable default); randomSplit is
    # always given a fresh list, whatever sequence type the caller passed.
    weights = list(split_ratio)

    label_col = data_sdf[label_name]
    zeros_sdf = data_sdf.filter(label_col == 0)
    ones_sdf = data_sdf.filter(label_col == 1)

    # Split each class separately so both outputs keep the same class proportions.
    train_0_sdf, test_0_sdf = zeros_sdf.randomSplit(weights, seed=seed)
    train_1_sdf, test_1_sdf = ones_sdf.randomSplit(weights, seed=seed)

    # Recombine the per-class parts into the final train and test sets.
    train_sdf = train_0_sdf.union(train_1_sdf)
    test_sdf = test_0_sdf.union(test_1_sdf)

    return train_sdf, test_sdf

# Class-wise split of the full dataset on TARGET with 0.8 / 0.2 weights and a fixed seed.
train_sdf, test_sdf = stratified_train_test_split(
    santander_sdf,
    'TARGET',
    split_ratio=[0.8, 0.2],
    seed=2021,
)

In [0]:
from pyspark.ml.feature import VectorAssembler

# Create the feature vectorizer over all feature columns.
vector_assembler = VectorAssembler(inputCols=santander_columns, outputCol="features")
# Vectorize the training data.
train_sdf_vectorized = vector_assembler.transform(train_sdf)
# Re-split the existing training data into a fitting part and a validation part.
tr_sdf_vectorized, val_sdf_vectorized = stratified_train_test_split(
    train_sdf_vectorized,
    'TARGET',
    split_ratio=[0.8, 0.2],
    seed=2021,
)
tr_preview_sdf = tr_sdf_vectorized.limit(10)
display(tr_preview_sdf)

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from sparkdl.xgboost import XgboostClassifier

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Hyperopt search space for the XGBoost hyperparameters.
search_space = {
    'max_depth': hp.quniform('max_depth', 5, 10, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 0.95),
    'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
    'learning_rate': hp.uniform('learning_rate', 0.05, 0.2),
}

def objective_func(space):
    """Fit XGBoost with the sampled hyperparameters and return the negated validation ROC AUC.

    The model is fitted on tr_sdf_vectorized and scored on val_sdf_vectorized.
    """
    # Hyperparameters that must be integers (max_depth, min_child_weight) have to be cast with int().
    max_depth = int(space['max_depth'])
    min_child_weight = int(space['min_child_weight'])
    candidate = XgboostClassifier(
        n_estimators=50,
        max_depth=max_depth,
        colsample_bytree=space['colsample_bytree'],
        min_child_weight=min_child_weight,
        learning_rate=space['learning_rate'],
        featuresCol='features',
        labelCol='TARGET',
        missing=0.0,
    )
    fitted = candidate.fit(tr_sdf_vectorized)
    # Evaluation is done on the validation data.
    val_predictions = fitted.transform(val_sdf_vectorized)
    auc_evaluator = BinaryClassificationEvaluator(labelCol='TARGET', metricName='areaUnderROC')
    roc_auc = auc_evaluator.evaluate(val_predictions)

    print('space:', space, 'roc_auc:', roc_auc)

    # fmin minimises the loss, so the ROC AUC is multiplied by -1 before it is returned.
    return {'loss': -1*roc_auc, 'status': STATUS_OK}

algo = tpe.suggest
xgb_trials = Trials()

xgb_best = fmin(
    fn=objective_func,
    space=search_space,
    algo=algo,
    max_evals=14,
    trials=xgb_trials,
)
print(xgb_best)
print(xgb_trials.results)


In [0]:
# Vectorize the full training data and the test data.
train_sdf_vectorized = vector_assembler.transform(train_sdf)
test_sdf_vectorized = vector_assembler.transform(test_sdf)

# Train on the training data with the hyperparameters found by hyperopt.
# The integer-valued ones are cast with int().
best_xgb_params = dict(
    n_estimators=50,
    max_depth=int(xgb_best['max_depth']),
    colsample_bytree=xgb_best['colsample_bytree'],
    min_child_weight=int(xgb_best['min_child_weight']),
    learning_rate=xgb_best['learning_rate'],
    featuresCol='features',
    labelCol='TARGET',
    missing=0.0,
)
xgb_classifier_best = XgboostClassifier(**best_xgb_params)
xgb_classifier_best_model = xgb_classifier_best.fit(train_sdf_vectorized)

# Predict on the test data and report the ROC AUC.
xgb_predictions = xgb_classifier_best_model.transform(test_sdf_vectorized)
roc_eval = BinaryClassificationEvaluator(labelCol='TARGET', metricName='areaUnderROC')
test_roc_auc = roc_eval.evaluate(xgb_predictions)
print('roc auc:', test_roc_auc)

In [0]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from mmlspark.lightgbm import LightGBMClassifier

# Hyperopt search space for the LightGBM hyperparameters.
# NOTE(review): LightGBM documents bagging_fraction as taking effect only when bagging_freq
# is non-zero; no baggingFreq is set below, so tuning baggingFraction may have no effect — confirm.
search_space = {
    'maxDepth': hp.quniform('maxDepth', 10, 20, 1),
    'numLeaves': hp.quniform('numLeaves', 32, 78, 4),
    'baggingFraction': hp.uniform('baggingFraction', 0.6, 1.0),
    'featureFraction': hp.uniform('featureFraction', 0.6, 1.0),
    'learningRate': hp.uniform('learningRate', 0.01, 0.1),
}

def objective_func(space):
    """Fit LightGBM with the sampled hyperparameters and return the negated validation ROC AUC.

    The model is fitted on tr_sdf_vectorized and scored on val_sdf_vectorized.
    """
    # maxDepth and numLeaves are cast with int() before being passed to the classifier.
    max_depth = int(space['maxDepth'])
    num_leaves = int(space['numLeaves'])
    candidate = LightGBMClassifier(
        numIterations=50,
        maxDepth=max_depth,
        numLeaves=num_leaves,
        baggingFraction=space['baggingFraction'],
        featureFraction=space['featureFraction'],
        learningRate=space['learningRate'],
        featuresCol="features",
        labelCol="TARGET",
    )

    fitted = candidate.fit(tr_sdf_vectorized)
    # Evaluation is done on the validation data.
    val_predictions = fitted.transform(val_sdf_vectorized)
    auc_evaluator = BinaryClassificationEvaluator(labelCol='TARGET', metricName='areaUnderROC')
    roc_auc = auc_evaluator.evaluate(val_predictions)

    print('space:', space, "roc_auc:", roc_auc)
    # fmin minimises the loss, so the ROC AUC is multiplied by -1 before it is returned.
    return {'loss': -1*roc_auc, 'status': STATUS_OK}

algo = tpe.suggest
lgbm_trials = Trials()

lgbm_best = fmin(
    fn=objective_func,
    space=search_space,
    algo=algo,
    max_evals=14,
    trials=lgbm_trials,
)

In [0]:
# Print the best hyperparameters returned by fmin, then the recorded per-trial results.
print(lgbm_best)
lgbm_trial_results = lgbm_trials.results
print(lgbm_trial_results)

In [0]:
# Vectorize the full training data and the test data.
train_sdf_vectorized = vector_assembler.transform(train_sdf)
test_sdf_vectorized = vector_assembler.transform(test_sdf)

# Train on the training data with the hyperparameters found by hyperopt.
# maxDepth and numLeaves are cast with int().
lgbm_classifier_best = LightGBMClassifier(numIterations=50, maxDepth=int(lgbm_best['maxDepth'])
                                       , numLeaves=int(lgbm_best['numLeaves'])
                                       , baggingFraction=lgbm_best['baggingFraction']
                                       , featureFraction=lgbm_best['featureFraction']
                                       , learningRate=lgbm_best['learningRate']
                                       , featuresCol="features", labelCol="TARGET")

lgbm_classifier_best_model = lgbm_classifier_best.fit(train_sdf_vectorized)

# Predict on the test data and report the ROC AUC.
lgbm_predictions = lgbm_classifier_best_model.transform(test_sdf_vectorized)
roc_eval = BinaryClassificationEvaluator(labelCol='TARGET', metricName='areaUnderROC')
# Use the evaluator created on the line above; the previously referenced name
# lgbm_roc_eval was never defined and raised a NameError.
print('roc auc:', roc_eval.evaluate(lgbm_predictions))