In [None]:
# from bayes_opt import BayesianOptimization
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV, train_test_split, cross_validate
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.layers import BatchNormalization, Dense, Dropout, PReLU
from tensorflow.keras.metrics import AUC
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm

import lightgbm as lgbm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import scipy.stats as stats
import seaborn as sns
import tensorflow as tf


# Load the training tables: error log, quality log and problem reports.
train_err, train_quality, train_problem = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Colab/data/LG/train_err_data.csv',
        '/content/drive/MyDrive/Colab/data/LG/train_quality_data.csv',
        '/content/drive/MyDrive/Colab/data/LG/train_problem_data.csv',
    )
)

# Load the test tables: error log and quality log.
test_err, test_quality = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Colab/data/LG/test_err_data.csv',
        '/content/drive/MyDrive/Colab/data/LG/test_quality_data.csv',
    )
)

## Preprocessing

In [None]:
# Only 1 row out of 16,554,663 is affected, so simply removing it is an option:
# train_err = train_err.dropna()

# Alternatively, because the row right after it has the same column layout,
# fill the gap with 40013 (every NaN in train_err is replaced by that value).
train_err = train_err.fillna(value=40013)

# test_err has 4 missing values -> drop every row that contains a NaN.
test_err = test_err.dropna(axis=0, how='any')

In [None]:
# Helper that converts a string-typed value into an int.
def string2num(x):
    """Keep only the digits of ``str(x)`` and return them as an int.

    Everything that is not 0-9 (commas, spaces, brackets, signs, dots, ...)
    is stripped before conversion. Returns ``None`` when no digit is left.
    """
    digits = re.sub(r"[^0-9]+", '', str(x))
    return int(digits) if digits != '' else None

In [None]:
def make_datetime(x):
    """Return characters 4..7 of ``str(x)`` as a single int.

    For the string-typed time column these positions are the two month digits
    followed by the two day digits, so the result reads as MMDD.
    """
    stamp = str(x)
    # [4:6] is the month and [6:8] the day; one slice covers both.
    return int(stamp[4:8])

In [None]:
# Create a new month_day column on both error logs from their time column.
for err_log in (train_err, test_err):
    err_log['month_day'] = err_log['time'].apply(make_datetime)

In [None]:
# The quality_0 .. quality_12 columns are read as object dtype because they
# contain stray characters such as ','. Pass every value through string2num so
# the columns become numeric (first the train table, then the test table).

for quality_frame in (train_quality, test_quality):
    for i in tqdm(range(13)):
        quality = 'quality_' + str(i)
        quality_frame[quality] = quality_frame[quality].apply(string2num)

In [None]:
# Fill missing values in the quality tables, each table from its own statistics:
#   - fwver (object dtype) is filled with its most frequent value
#   - the numeric quality columns listed below are filled with their column mean

train_mean_cols = ['quality_0', 'quality_1', 'quality_2', 'quality_5']
train_quality['fwver'] = train_quality['fwver'].fillna(train_quality['fwver'].mode()[0])
for col in train_mean_cols:
    train_quality[col] = train_quality[col].fillna(train_quality[col].mean())

# Same treatment for the test table, which additionally fills quality_6.
test_mean_cols = ['quality_0', 'quality_1', 'quality_2', 'quality_5', 'quality_6']
test_quality['fwver'] = test_quality['fwver'].fillna(test_quality['fwver'].mode()[0])
for col in test_mean_cols:
    test_quality[col] = test_quality[col].fillna(test_quality[col].mean())

In [None]:
# Reduce fwver in both error logs to its digits via string2num.
for err_log in (train_err, test_err):
    err_log['fwver'] = err_log['fwver'].apply(string2num)

## Errtype count

In [None]:
# Dacon baseline modelling.
# Features: per-user counts of every errtype.
# Label: a user counts as having a problem if their user_id appears at least
# once in train_problem.

train_user_id_max = 24999
train_user_id_min = 10000
train_user_number = 15000

id_error = train_err[['user_id','errtype']].values
error = np.zeros((train_user_number,41))

# Row    = user_id shifted so the smallest id lands on row 0.
# Column = errtype squeezed into 0..40: errtype < 30 -> errtype - 1,
#          errtype >= 30 -> errtype - 2.
# Computed for all rows at once instead of a Python loop over every log row
# (the log has ~16.5M rows); this also means no tqdm progress bar is shown.
user_rows = id_error[:, 0].astype(np.int64) - train_user_id_min
errtypes = id_error[:, 1].astype(np.int64)
err_cols = np.where(errtypes >= 30, errtypes - 2, errtypes - 1)

# np.add.at accumulates repeated (row, col) pairs one by one, which a plain
# `error[user_rows, err_cols] += 1` would not do.
np.add.at(error, (user_rows, err_cols), 1)
print(error.shape)

# Same indexing scheme for the label: 1 at (user_id - train_user_id_min) when
# the user reported a problem at least once, 0 otherwise.
problem = np.zeros(train_user_number)
problem[train_problem.user_id.unique() - train_user_id_min] = 1
print(problem.shape)

In [None]:
test_user_id_max = 44998
test_user_id_min = 30000
test_user_number = 14999

id_error = test_err[['user_id','errtype']].values
test_x = np.zeros((test_user_number,41))

# Row    = user_id - test_user_id_min.
# Column = errtype squeezed into 0..40: errtype < 30 -> errtype - 1,
#          errtype >= 30 -> errtype - 2 (same mapping as for the train set).
# Computed for all rows at once instead of a Python loop over every log row,
# so no tqdm progress bar is shown.
user_rows = id_error[:, 0].astype(np.int64) - test_user_id_min
errtypes = id_error[:, 1].astype(np.int64)
err_cols = np.where(errtypes >= 30, errtypes - 2, errtypes - 1)

# np.add.at accumulates repeated (row, col) pairs one by one, which a plain
# `test_x[user_rows, err_cols] += 1` would not do.
np.add.at(test_x, (user_rows, err_cols), 1)

print(test_x.shape)

## Quality count

In [None]:
# Aggregate train_quality per user with the same row indexing as above.
# The difference: each feature is the running SUM of a quality column,
# not a count.

id_quality = train_quality[['user_id','quality_0','quality_1','quality_2','quality_5','quality_6',
                            'quality_7','quality_8','quality_9','quality_10','quality_11','quality_12']].values

quality = np.zeros((train_user_number,11))

# Filling missing values (ffill, mean, ...) leaves float64 columns, which makes
# the whole id_quality array float64 and breaks integer indexing.
# The user id is therefore wrapped in int() before it is used as a row index.

for record in tqdm(id_quality):
    # record[0] is the user id; record[1:] are the 11 quality values, whose
    # position in the record (minus one) is also their column in `quality`.
    for feature_idx, value in enumerate(record[1:]):
        if value != 0:
            quality[int(record[0]) - train_user_id_min, feature_idx] += value

In [None]:
# Aggregate test_quality per user, exactly as was done for train_quality.

id_quality = test_quality[['user_id','quality_0','quality_1','quality_2','quality_5','quality_6','quality_7','quality_8','quality_9','quality_10','quality_11','quality_12']].values
quality_test = np.zeros((test_user_number,11))

for record in tqdm(id_quality):
    # record[0] is the user id; record[1:] are the 11 quality values, whose
    # position in the record (minus one) is also their column in `quality_test`.
    for feature_idx, value in enumerate(record[1:]):
        if value != 0:
            quality_test[int(record[0]) - test_user_id_min, feature_idx] += value

## Date count

In [None]:
# Date count for train_err: number of error-log rows per user and per day.
id_error = train_err[['user_id','month_day']].values
order = np.sort(train_err['month_day'].unique())
month_day = np.zeros((train_user_number,33))

# Column of every distinct month_day value = its position in the sorted `order`.
day_to_col = {day: col for col, day in enumerate(order)}

for person_idx, mon_day in tqdm(id_error):
    month_day[person_idx - train_user_id_min, day_to_col[mon_day]] += 1

In [None]:
# Date count for test_err: number of error-log rows per user and per day.
# test_err also contains some data from the week (7 days) after train_err ends.
id_error = test_err[['user_id','month_day']].values
order = np.sort(test_err['month_day'].unique())
month_day_test = np.zeros((test_user_number,39))

# Column of every distinct month_day value = its position in the sorted `order`.
day_to_col = {day: col for col, day in enumerate(order)}

for person_idx, mon_day in tqdm(id_error):
    month_day_test[person_idx - test_user_id_min, day_to_col[mon_day]] += 1

In [None]:
# Keep only the first 33 day columns of the test counts so the width matches
# the 33-column train matrix.
# NOTE(review): this assumes the first 33 sorted test days are the same days as
# the train columns - confirm against the data.
test_date_df = pd.DataFrame(month_day_test)
test_date = test_date_df[test_date_df.columns[:33]]
test_date

## Fwver count

In [None]:
train_err_fwv = train_err['fwver'].unique()
test_err_fwv = test_err['fwver'].unique()

# Keep only the firmware versions that occur in both train_err and test_err,
# in the order they first appear in test_err.
shared_versions = []
for version in test_err_fwv:
    if version in train_err_fwv:
        shared_versions.append(version)
intersec_err_fwv = np.array(shared_versions)
print(len(intersec_err_fwv))

In [None]:
# Firmware versions that occur in train_err but are not part of the shared
# train/test set.
remove_fwv = [x for x in train_err_fwv if x not in intersec_err_fwv]

# Remove every row carrying one of those versions with a single boolean mask.
# This also works when remove_fwv is empty (the earlier remove_fwv[0] lookup
# raised IndexError in that case) and avoids one full-frame .drop() per version.
# NOTE(review): a missing fwver (NaN) ends up in remove_fwv and isin() matches
# NaN, so such rows are dropped here as well, whereas the per-version loop
# kept them - confirm the log has no missing fwver.
train_err_drop = train_err[~train_err['fwver'].isin(remove_fwv)]

In [None]:
# Per-user counts of error-log rows for each remaining firmware version.
id_error = train_err_drop[['user_id','fwver']].values
order = np.sort(train_err_drop['fwver'].unique())
fwv_lst = np.zeros((train_user_number,31))

# Column of every firmware version = its position in the sorted `order`.
fwv_to_col = {version: col for col, version in enumerate(order)}

for person_idx, fwv in tqdm(id_error):
    fwv_lst[person_idx - train_user_id_min, fwv_to_col[fwv]] += 1

In [None]:
# Firmware versions that occur in test_err but are not part of the shared
# train/test set.
remove_fwv_test = [x for x in test_err_fwv if x not in intersec_err_fwv]

# Remove every row carrying one of those versions with a single boolean mask.
# This also works when remove_fwv_test is empty (the earlier
# remove_fwv_test[0] lookup raised IndexError in that case) and avoids one
# full-frame .drop() per version.
# NOTE(review): a missing fwver (NaN) ends up in remove_fwv_test and isin()
# matches NaN, so such rows are dropped here as well, whereas the per-version
# loop kept them - confirm the log has no missing fwver.
test_err_drop = test_err[~test_err['fwver'].isin(remove_fwv_test)]

In [None]:
# Per-user counts of error-log rows for each remaining firmware version.
id_error = test_err_drop[['user_id','fwver']].values
order = np.sort(test_err_drop['fwver'].unique())
fwv_test_lst = np.zeros((test_user_number,31))

# Column of every firmware version = its position in the sorted `order`.
fwv_to_col = {version: col for col, version in enumerate(order)}

for person_idx, fwv in tqdm(id_error):
    fwv_test_lst[person_idx - test_user_id_min, fwv_to_col[fwv]] += 1

In [None]:
# Concatenate the feature groups built above, column-wise, in the order:
# errtype counts | quality sums | date counts | fwver counts.
X = np.concatenate((error, quality, month_day, fwv_lst), axis = 1)
Y = problem
print(X.shape)
print(Y.shape)

# Same layout for the test set (date counts come from the 33-column slice).
X_test = np.concatenate((test_x, quality_test, test_date.values, fwv_test_lst), axis = 1)
print(X_test.shape)

In [None]:
# Names used by the model cells: full feature matrix and binary label vector.
train_x, train_y = X, problem

for arr in (train_x, train_y):
    print(arr.shape)

## LGBM

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_curve, auc, recall_score, precision_score

def f_pr_auc(probas_pred, y_true):
    """Custom evaluation function: area under the precision-recall curve.

    Parameters
    ----------
    probas_pred : predicted scores for the evaluated rows.
    y_true : object exposing ``get_label()``; the true labels are read from it.

    Returns
    -------
    tuple ``("pr_auc", score, True)`` - metric name, its value, and a flag
    (presumably "higher is better", as this is passed as ``feval`` to
    ``lgbm.train``).
    """
    precision, recall, _ = precision_recall_curve(y_true.get_label(), probas_pred)
    return "pr_auc", auc(recall, precision), True

#-------------------------------------------------------------------------------------
# Per-fold bookkeeping filled by the cross-validation loop.
models     = []
recalls    = []
precisions = []
auc_scores   = []
# Probability cut-off used to turn predicted scores into 0/1 labels.
threshold = 0.5
# LightGBM parameter settings.
params =      {
                'boosting_type' : 'gbdt',
                'objective'     : 'binary',
                'metric'        : 'auc',
                'seed': 1015,
               
                # NOTE(review): 'application' is documented by LightGBM as an
                # alias of 'objective'; kept because both carry the same value.
                'application': 'binary',
                # The key used to be 'max_depth ' (trailing space), which is not
                # the real parameter name, so the depth limit was not applied.
                'max_depth': 6,
                'is_unbalance': 'true',
                'num_leaves': 55,
                'feature_fraction': 0.7,
                'bagging_fraction': 0.7,
                # 'subsample ': 0.9,
                'bagging_freq': 20,
                'learning_rate': 0.008,
                'verbose': 0
                }
#-------------------------------------------------------------------------------------
# 5-fold cross validation: train one LightGBM model per fold and record its
# validation recall, precision and ROC AUC.
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in k_fold.split(train_x):

    # Split into the training and validation parts of this fold.
    # NOTE(review): this rebinds the notebook-level name X (the full feature
    # matrix built earlier) to the fold's training subset; train_x still
    # refers to the full matrix.
    X = train_x[train_idx]
    y = train_y[train_idx]
    valid_x = train_x[val_idx]
    valid_y = train_y[val_idx]

    d_train= lgbm.Dataset(X, y)
    d_val  = lgbm.Dataset(valid_x, valid_y)
    
    # Run training; f_pr_auc is evaluated on the validation set in addition to
    # the metric configured in params.
    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments
    # of lgbm.train only in some LightGBM versions - confirm the installed one.
    model = lgbm.train(
                        params,
                        train_set       = d_train,
                        num_boost_round = 10000,
                        valid_sets      = d_val,
                        feval           = f_pr_auc,
                        verbose_eval    = 20, 
                        early_stopping_rounds = 1000
                       )
    
    # Validation predictions: raw scores, then 0/1 labels via the threshold.
    valid_prob = model.predict(valid_x)
    valid_pred = np.where(valid_prob > threshold, 1, 0)
    
    # Scores: recall and precision use the thresholded labels, ROC AUC uses
    # the raw scores.
    recall    = recall_score(    valid_y, valid_pred)
    precision = precision_score( valid_y, valid_pred)
    auc_score = roc_auc_score(   valid_y, valid_prob)

    # Keep the fold's model and scores.
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    auc_scores.append(auc_score)

    print('==========================================================')

In [None]:
# Average validation ROC AUC over the folds.
print(np.asarray(auc_scores).mean())

In [None]:
# Predict the test set with every fold model and average the predictions.
pred_y_list = []
for model in models:
    pred_y = model.predict(X_test)
    # Store each prediction as a column vector.
    pred_y_list.append(pred_y.reshape((-1, 1)))

lgbm_pred_ensemble = np.stack(pred_y_list, axis=0).mean(axis=0)
lgbm_pred_ensemble