## Data load and Module mount

In [None]:
# from bayes_opt import BayesianOptimization
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV, train_test_split, cross_validate
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.layers import BatchNormalization, Dense, Dropout, PReLU
from tensorflow.keras.metrics import AUC
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm

import lightgbm as lgbm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import scipy.stats as stats
import seaborn as sns
import tensorflow as tf


# All competition CSV files live in one directory on the mounted drive.
data_dir = '/content/drive/MyDrive/Colab/data/LG/'

# Training split: error logs, quality logs and the problem reports
# (the problem reports are turned into the target vector further down).
train_err = pd.read_csv(data_dir + 'train_err_data.csv')
train_quality = pd.read_csv(data_dir + 'train_quality_data.csv')
train_problem = pd.read_csv(data_dir + 'train_problem_data.csv')

# Test split: error logs and quality logs only.
test_err = pd.read_csv(data_dir + 'test_err_data.csv')
test_quality = pd.read_csv(data_dir + 'test_quality_data.csv')

## Preprocessing

In [None]:
# Only 1 of the 16,554,663 train_err rows has a missing value, so it could simply be dropped:
# train_err = train_err.dropna()

# Instead, fill it with 40013 because the next row has the same column values.
# NOTE(review): fillna(40013) fills every NaN in every column, not only that one cell.
train_err = train_err.fillna(40013)

# test_err has 4 missing values -> drop those rows.
test_err = test_err.dropna()

In [None]:
# Helper that converts a messy value (string with separators, etc.) to an int.
def string2num(x):
    """Return ``x`` as an int, or None when it contains no usable number.

    Strings are cleaned by removing every non-digit character (commas,
    spaces, dots, letters, ...), so ``"1,234"`` becomes 1234 and
    ``"04.22.1750"`` becomes 4221750. A leading ``-`` is kept as the sign.

    Float inputs are converted directly instead of going through ``str()``:
    the textual form of 3.0 is ``"3.0"``, which would become 30 once the dot
    is stripped. NaN and infinities return None, like any other value with
    no digits.
    """
    if isinstance(x, float):
        if x != x or x in (float('inf'), float('-inf')):
            return None
        return int(x)

    text = str(x).strip()
    # Remove separators such as "," and " " (and anything else non-numeric).
    digits = re.sub(r"[^0-9]+", '', text)
    if digits == '':
        return None
    value = int(digits)
    return -value if text.startswith('-') else value

In [None]:
def make_datetime(x):
    """Return characters 4-7 of ``str(x)`` as an int.

    For a timestamp written as YYYYMMDDhhmmss this is the month and day
    combined into one integer (MMDD).
    """
    stamp = str(x)
    return int(stamp[4:8])

In [None]:
# Create a new month_day column from the time column using make_datetime.
train_err['month_day'] = train_err['time'].apply(make_datetime)
test_err['month_day'] = test_err['time'].apply(make_datetime)

In [None]:
# The quality_* columns contain stray characters such as ',' and were
# therefore read as object columns; run each one through string2num.
quality_cols = ['quality_' + str(i) for i in range(13)]

for col in tqdm(quality_cols):
    train_quality[col] = train_quality[col].apply(string2num)

for col in tqdm(quality_cols):
    test_quality[col] = test_quality[col].apply(string2num)

In [None]:
# Missing-value imputation for the quality tables:
#   * fwver (object column)  -> most frequent value
#   * numeric quality_* cols -> column mean
# Each table is imputed with its own statistics.

train_quality['fwver'] = train_quality['fwver'].fillna(train_quality['fwver'].mode()[0])
for col in ('quality_0', 'quality_1', 'quality_2', 'quality_5'):
    train_quality[col] = train_quality[col].fillna(train_quality[col].mean())

# Same treatment for the test table, which additionally imputes quality_6.
test_quality['fwver'] = test_quality['fwver'].fillna(test_quality['fwver'].mode()[0])
for col in ('quality_0', 'quality_1', 'quality_2', 'quality_5', 'quality_6'):
    test_quality[col] = test_quality[col].fillna(test_quality[col].mean())

In [None]:
# Reduce the fwver strings of the error logs to integers (digits only) with string2num.
train_err['fwver'] = train_err['fwver'].apply(string2num)
test_err['fwver'] = test_err['fwver'].apply(string2num)

## Errtype count

In [None]:
# DACON baseline features: per-user counts of every errtype.
# A user is labelled 1 as soon as their user_id appears in train_problem.

train_user_id_max = 24999
train_user_id_min = 10000
train_user_number = 15000

id_error = train_err[['user_id', 'errtype']].values
error = np.zeros((train_user_number, 41))

for user_id, errtype in tqdm(id_error):
    # errtype below 30 goes to column errtype - 1, errtype 30 and above to
    # column errtype - 2 (presumably because errtype 29 never occurs, which
    # keeps the 41 columns contiguous -- TODO confirm).
    col = errtype - 2 if errtype >= 30 else errtype - 1
    error[user_id - train_user_id_min, col] += 1
print(error.shape)

# Target vector, indexed like `error` (row = user_id - train_user_id_min):
# 1 if the user reported a problem at least once, otherwise 0.
problem = np.zeros(train_user_number)
problem[train_problem.user_id.unique() - train_user_id_min] = 1
print(problem.shape)

In [None]:
test_user_id_max = 44998
test_user_id_min = 30000
test_user_number = 14999

# Per-user errtype counts for the test error logs, using the same column
# mapping as the train matrix (errtype - 1 below 30, errtype - 2 from 30 up).
id_error = test_err[['user_id', 'errtype']].values
test_x = np.zeros((test_user_number, 41))
for user_id, errtype in tqdm(id_error):
    col = errtype - 2 if errtype >= 30 else errtype - 1
    test_x[user_id - test_user_id_min, col] += 1

print(test_x.shape)

## Quality count

In [None]:
# train_quality is aggregated per user like the error logs, except that the
# feature is the SUM of each quality value rather than a row count.

id_quality = train_quality[['user_id','quality_0','quality_1','quality_2','quality_5','quality_6',
                            'quality_7','quality_8','quality_9','quality_10','quality_11','quality_12']].values

quality = np.zeros((train_user_number, 11))

# After imputation some columns are float64, which makes the whole id_quality
# array float64; the user id is therefore cast back to int before it is used
# as a row index.
for row in tqdm(id_quality):
    user_row = int(row[0]) - train_user_id_min
    # Feature column k holds the k-th selected quality column
    # (quality_0, 1, 2, 5, 6, ..., 12 -> columns 0..10).
    for col, value in enumerate(row[1:]):
        if value != 0:
            quality[user_row, col] += value

In [None]:
# test_quality is aggregated exactly like train_quality: per-user sums of the
# selected quality columns.

id_quality = test_quality[['user_id','quality_0','quality_1','quality_2','quality_5','quality_6','quality_7','quality_8','quality_9','quality_10','quality_11','quality_12']].values
quality_test = np.zeros((test_user_number, 11))

for row in tqdm(id_quality):
    user_row = int(row[0]) - test_user_id_min
    # Feature column k holds the k-th selected quality column.
    for col, value in enumerate(row[1:]):
        if value != 0:
            quality_test[user_row, col] += value

## Date count

In [None]:
# Per-user number of train error-log rows on each distinct month_day value.
id_error = train_err[['user_id', 'month_day']].values
order = np.sort(train_err['month_day'].unique())  # sorted distinct days = column order
month_day = np.zeros((train_user_number, order.size))

for user_id, day in tqdm(id_error):
    day_col = np.flatnonzero(order == day)[0]
    month_day[user_id - train_user_id_min, day_col] += 1

In [None]:
# Per-user number of test error-log rows on each distinct month_day value.
# Part of test_err covers the week (7 days) after the train period.
# NOTE(review): the columns follow the test set's own sorted days, so column k
# is only the same day as train column k if both sets start with the same
# days -- verify before the matrices are concatenated.
id_error = test_err[['user_id', 'month_day']].values
order = np.sort(test_err['month_day'].unique())
month_day_test = np.zeros((test_user_number, order.size))

for user_id, day in tqdm(id_error):
    day_col = np.flatnonzero(order == day)[0]
    month_day_test[user_id - test_user_id_min, day_col] += 1

In [None]:
# Wrap the per-user test date counts in a DataFrame so columns can be selected by position.
test_date_df = pd.DataFrame(month_day_test)
# test_date = test_date_df.iloc[:,:33]
# test_date

In [None]:
# Disabled alternative: fold the counts of columns 33-38 into column 32.
# test_date_df[32] = test_date_df[33] + test_date_df[34] + test_date_df[35] + test_date_df[36] + test_date_df[37] + test_date_df[38]
# Keep only the first 33 date columns (presumably the number of train date
# columns, so that train and test feature matrices have equal width -- TODO confirm).
test_date = test_date_df.iloc[:,:33]
test_date

## Fwver count

In [None]:
train_err_fwv = train_err['fwver'].unique()
test_err_fwv = test_err['fwver'].unique()

# Firmware versions that occur in both the train and the test error logs
# (kept in the order in which they appear in the test set).
shared_fwv = [ver for ver in test_err_fwv if ver in train_err_fwv]
intersec_err_fwv = np.array(shared_fwv)

# Number of distinct versions: train, test, shared.
for versions in (train_err_fwv, test_err_fwv, intersec_err_fwv):
    print(len(versions))

In [None]:
# Select the rows whose fwver belongs to the shared (train and test) set.
train_err_drop = train_err[train_err['fwver'].isin(intersec_err_fwv)]
test_err_drop = test_err[test_err['fwver'].isin(intersec_err_fwv)]

In [None]:
# Rows whose fwver is NOT shared by train and test are treated as "other"
# and get the sentinel version 999999.
# .copy() turns each filtered subset into an independent frame; assigning a
# column on the bare filter result triggers pandas' SettingWithCopyWarning.
train_err_etc = train_err[~train_err['fwver'].isin(intersec_err_fwv)].copy()
test_err_etc = test_err[~test_err['fwver'].isin(intersec_err_fwv)].copy()

train_err_etc['fwver'] = 999999
test_err_etc['fwver'] = 999999

# Per-user count of "other"-fwver rows in the train error logs.
id_error = train_err_etc[['user_id','fwver']].values
order = train_err_etc['fwver'].unique()
order.sort()
fwv_etc_lst = np.zeros((train_user_number,len(order)))

for person_idx, fwv in tqdm(id_error):
    fwv_etc_lst[person_idx - train_user_id_min, np.where(order == fwv)[0][0]] += 1

# Per-user count of "other"-fwver rows in the test error logs.
id_error = test_err_etc[['user_id','fwver']].values
order = test_err_etc['fwver'].unique()
order.sort()
fwv_etc_test_lst = np.zeros((test_user_number,len(order)))

for person_idx, fwv in tqdm(id_error):
    fwv_etc_test_lst[person_idx - test_user_id_min, np.where(order == fwv)[0][0]] += 1

In [None]:
# Per-user row count for each shared firmware version (train error logs).
id_error = train_err_drop[['user_id', 'fwver']].values
order = np.sort(train_err_drop['fwver'].unique())  # sorted versions = column order
fwv_lst = np.zeros((train_user_number, order.size))

for user_id, version in tqdm(id_error):
    version_col = np.flatnonzero(order == version)[0]
    fwv_lst[user_id - train_user_id_min, version_col] += 1

In [None]:
# Per-user row count for each shared firmware version (test error logs).
id_error = test_err_drop[['user_id', 'fwver']].values
order = np.sort(test_err_drop['fwver'].unique())
fwv_test_lst = np.zeros((test_user_number, order.size))

for user_id, version in tqdm(id_error):
    version_col = np.flatnonzero(order == version)[0]
    fwv_test_lst[user_id - test_user_id_min, version_col] += 1

In [None]:
# fwver counts for the quality tables: first reduce fwver to integers with string2num.

train_quality['fwver'] = train_quality['fwver'].apply(string2num)
test_quality['fwver'] = test_quality['fwver'].apply(string2num)

In [None]:
train_quality_fwv = train_quality['fwver'].unique()
test_quality_fwv = test_quality['fwver'].unique()

# Keep only the fwver values that occur in both train_quality and test_quality.
intersec_quality_fwv = np.array([x for x in test_quality_fwv if x in train_quality_fwv])
# Distinct versions in train, in test, shared, and test-only.
print(len(train_quality_fwv))
print(len(test_quality_fwv))
print(len(intersec_quality_fwv))
print(len([x for x in test_quality_fwv if x not in intersec_quality_fwv]))

In [None]:
# Select the train_quality rows whose fwver belongs to the shared set.
# According to the original author, test_quality has no fwver that is missing
# from train_quality, so no filtering is applied on the test side.

train_quality_drop = train_quality[train_quality['fwver'].isin(intersec_quality_fwv)]

In [None]:
# Per-user row count for each shared firmware version (train quality logs).
# train_quality_drop contains only the rows whose fwver is in the shared set.
id_error = train_quality_drop[['user_id', 'fwver']].values
order = np.sort(train_quality_drop['fwver'].unique())
fwv_quality_lst = np.zeros((train_user_number, order.size))

for user_id, version in tqdm(id_error):
    version_col = np.flatnonzero(order == version)[0]
    fwv_quality_lst[user_id - train_user_id_min, version_col] += 1

In [None]:
# Per-user row count for each firmware version (test quality logs).
# According to the original author every test_quality fwver also occurs in
# train_quality, so the full test table is used without filtering.
id_error = test_quality[['user_id', 'fwver']].values
order = np.sort(test_quality['fwver'].unique())
fwv_quality_test_lst = np.zeros((test_user_number, order.size))

for user_id, version in tqdm(id_error):
    version_col = np.flatnonzero(order == version)[0]
    fwv_quality_test_lst[user_id - test_user_id_min, version_col] += 1

## Model_nm

In [None]:
# Reduce model_nm to its digits (as an int) with string2num.
train_err['model_nm'] = train_err['model_nm'].apply(string2num)
test_err['model_nm'] = test_err['model_nm'].apply(string2num)

In [None]:
# train_err and test_err contain 9 model_nm values, 0 through 8;
# count the error-log rows per user and model, using model_nm as the column.
id_error = train_err[['user_id', 'model_nm']].values
model_nm = np.zeros((train_user_number, 9))

for user_id, model_idx in tqdm(id_error):
    user_row = user_id - train_user_id_min
    model_nm[user_row, model_idx] += 1

In [None]:
# Per-user, per-model row counts for the test error logs (model_nm is the column index).
id_error = test_err[['user_id', 'model_nm']].values
model_nm_test = np.zeros((test_user_number, 9))

for user_id, model_idx in tqdm(id_error):
    user_row = user_id - test_user_id_min
    model_nm_test[user_row, model_idx] += 1

## Hour

In [None]:
def make_hour(x):
    """Return characters 8-9 of ``str(x)`` as an int.

    For a timestamp written as YYYYMMDDhhmmss this is the hour (hh).
    """
    return int(str(x)[8:10])

In [None]:
# Create a new hour column from the time column using make_hour.
train_err['hour'] = train_err['time'].apply(make_hour)
test_err['hour'] = test_err['time'].apply(make_hour)

In [None]:
# Number of distinct hour values in the train and test error logs.
print(len(train_err['hour'].unique()))
print(len(test_err['hour'].unique()))

In [None]:
# Per-user number of train error-log rows in each distinct hour.
id_error = train_err[['user_id', 'hour']].values
order = np.sort(train_err['hour'].unique())  # sorted distinct hours = column order
hour = np.zeros((train_user_number, order.size))

for user_id, hour_value in tqdm(id_error):
    hour_col = np.flatnonzero(order == hour_value)[0]
    hour[user_id - train_user_id_min, hour_col] += 1

In [None]:
# Per-user number of test error-log rows in each distinct hour.
id_error = test_err[['user_id', 'hour']].values
order = np.sort(test_err['hour'].unique())
hour_test = np.zeros((test_user_number, order.size))

for user_id, hour_value in tqdm(id_error):
    hour_col = np.flatnonzero(order == hour_value)[0]
    hour_test[user_id - test_user_id_min, hour_col] += 1

In [None]:
# Collapse the hourly count columns into 12 two-hour features and halve them.
# NOTE(review): the summed columns are shifted by one hour relative to the
# labels ('0_1' adds columns 23 and 0, '2_3' adds columns 1 and 2, ...);
# confirm whether that shift is intended. Selecting from position 24 onward
# assumes `hour` has exactly 24 columns.
hour_df = pd.DataFrame(hour)

# (label, first column, second column) for every two-hour bin.
two_hour_bins = [('0_1', 23, 0)] + [
    ('{}_{}'.format(2 * k, 2 * k + 1), 2 * k - 1, 2 * k) for k in range(1, 12)
]
for label, first, second in two_hour_bins:
    hour_df[label] = hour_df[first] + hour_df[second]

# Everything after the original columns is a bin; dividing by 2 gives the
# mean count per hour within each bin.
hour_12 = hour_df.iloc[:, 24:] / 2
hour_12.head()

# Disabled alternative: four six-hour bins, divided by 6.
# hour_df['0_6'] = hour_df[0] + hour_df[1] + hour_df[2] + hour_df[3] + hour_df[4] + hour_df[5]
# hour_df['6_12'] = hour_df[6] + hour_df[7] + hour_df[8] + hour_df[9] + hour_df[10] + hour_df[11]
# hour_df['13_18'] = hour_df[12] + hour_df[13] + hour_df[14] + hour_df[15] + hour_df[16] + hour_df[17]
# hour_df['19_24'] = hour_df[18] + hour_df[19] + hour_df[20] + hour_df[21] + hour_df[22] + hour_df[23]

# hour_4 = hour_df.iloc[:, 24:]
# hour_4 = hour_4/6
# hour_4.head()

## Minute

In [None]:
def make_minute(x):
    """Return characters 10-11 of ``str(x)`` as an int.

    For a timestamp written as YYYYMMDDhhmmss this is the minute (mm).
    """
    return int(str(x)[10:12])

In [None]:
# Create a new minute column from the time column using make_minute.
train_err['minute'] = train_err['time'].apply(make_minute)
test_err['minute'] = test_err['time'].apply(make_minute)

In [None]:
# Number of distinct minute values in the train and test error logs.
print(len(train_err['minute'].unique()))
print(len(test_err['minute'].unique()))

In [None]:
# Per-user number of train error-log rows in each distinct minute value.
id_error = train_err[['user_id', 'minute']].values
order = np.sort(train_err['minute'].unique())
minute = np.zeros((train_user_number, order.size))

for user_id, minute_value in tqdm(id_error):
    minute_col = np.flatnonzero(order == minute_value)[0]
    minute[user_id - train_user_id_min, minute_col] += 1

In [None]:
# Per-user number of test error-log rows in each distinct minute value.
id_error = test_err[['user_id', 'minute']].values
order = np.sort(test_err['minute'].unique())
minute_test = np.zeros((test_user_number, order.size))

for user_id, minute_value in tqdm(id_error):
    minute_col = np.flatnonzero(order == minute_value)[0]
    minute_test[user_id - test_user_id_min, minute_col] += 1

## Quality Date count

In [None]:
# Create a new month_day column on the quality tables using make_datetime.
train_quality['month_day'] = train_quality['time'].apply(make_datetime)
test_quality['month_day'] = test_quality['time'].apply(make_datetime)

In [None]:
# Per-user number of train quality-log rows on each distinct month_day value.
# (Original author's note: the days run from 1031 to 1130, 31 values.)
id_error = train_quality[['user_id', 'month_day']].values
order = np.sort(train_quality['month_day'].unique())
quality_month_day = np.zeros((train_user_number, order.size))

for user_id, day in tqdm(id_error):
    day_col = np.flatnonzero(order == day)[0]
    quality_month_day[user_id - train_user_id_min, day_col] += 1

# Disabled alternative: add the quality rows to the error-log date counts.
# for person_idx, mon_day in tqdm(id_error):
#     month_day[person_idx - train_user_id_min, np.where(order == mon_day)[0][0]] += 1

In [None]:
# Per-user number of test quality-log rows on each distinct month_day value.
# (Original author's note: the days run from 1031 to 1130, 31 values.)
id_error = test_quality[['user_id', 'month_day']].values
order = np.sort(test_quality['month_day'].unique())
quality_month_day_test = np.zeros((test_user_number, order.size))

for user_id, day in tqdm(id_error):
    day_col = np.flatnonzero(order == day)[0]
    quality_month_day_test[user_id - test_user_id_min, day_col] += 1

# Disabled alternative: add the quality rows to the error-log date counts.
# for person_idx, mon_day in tqdm(id_error):
#     month_day_test[person_idx - test_user_id_min, np.where(order == mon_day)[0][0]] += 1

## Quality Hour count

In [None]:
# Create a new hour column on the quality tables using make_hour.
train_quality['hour'] = train_quality['time'].apply(make_hour)
test_quality['hour'] = test_quality['time'].apply(make_hour)

In [None]:
# Per-user number of train quality-log rows in each distinct hour.
id_error = train_quality[['user_id', 'hour']].values
order = np.sort(train_quality['hour'].unique())
quality_hour = np.zeros((train_user_number, order.size))

for user_id, hour_value in tqdm(id_error):
    hour_col = np.flatnonzero(order == hour_value)[0]
    quality_hour[user_id - train_user_id_min, hour_col] += 1

# Per-user number of test quality-log rows in each distinct hour.
id_error = test_quality[['user_id', 'hour']].values
order = np.sort(test_quality['hour'].unique())
quality_hour_test = np.zeros((test_user_number, order.size))

for user_id, hour_value in tqdm(id_error):
    hour_col = np.flatnonzero(order == hour_value)[0]
    quality_hour_test[user_id - test_user_id_min, hour_col] += 1

## Errcode count

In [None]:
# Values that contain no digits at all are mapped to the arbitrary code 9999.

def string2num2(x):
    """Return the digits of ``str(x)`` as one int, or 9999 if there are none.

    Every non-digit character (commas, spaces, letters, ...) is removed
    before the conversion.
    """
    digits = re.sub(r"[^0-9]+", '', str(x))
    return int(digits) if digits != '' else 9999

In [None]:
# Numeric version of errcode: digits only, or 9999 when errcode has no digits.
train_err['code'] = train_err['errcode'].apply(string2num2)
test_err['code'] = test_err['errcode'].apply(string2num2)

In [None]:
# Number of error-log rows per code value, separately for train and test.
train_err_code = train_err.groupby(train_err['code'])['code'].count()
test_err_code = test_err.groupby(test_err['code'])['code'].count()

In [None]:
# Keep only the codes that occur more than 1000 times in the respective set.
train_err_code = train_err_code[train_err_code > 1000]
test_err_code = test_err_code[test_err_code > 1000]

# The surviving code values as plain lists.
err_code_idx = list(train_err_code.index)
err_code_test_idx = list(test_err_code.index)

In [None]:
# Frequent codes shared by train and test, followed by the three list sizes.
intersec = [x for x in err_code_test_idx if x in err_code_idx]
print(len(err_code_idx))
print(len(err_code_test_idx))
print(len(intersec))

In [None]:
# Error-log rows whose code is one of the shared frequent codes, and how many there are.
train_errcode = train_err[train_err['code'].isin(intersec)]
test_errcode = test_err[test_err['code'].isin(intersec)]
print(len(train_errcode))
print(len(test_errcode))

In [None]:
# Rows whose code is NOT one of the shared frequent codes are treated as
# "other" and get the sentinel code 999999.
# .copy() turns each filtered subset into an independent frame; assigning a
# column on the bare filter result triggers pandas' SettingWithCopyWarning.
train_errcode_etc = train_err[~train_err['code'].isin(intersec)].copy()
test_errcode_etc = test_err[~test_err['code'].isin(intersec)].copy()

train_errcode_etc['code'] = 999999
test_errcode_etc['code'] = 999999

# Per-user count of "other"-code rows in the train error logs.
id_error = train_errcode_etc[['user_id','code']].values
order = train_errcode_etc['code'].unique()
order.sort()
errcode_etc = np.zeros((train_user_number,len(order)))

for person_idx, code in tqdm(id_error):
    errcode_etc[person_idx - train_user_id_min, np.where(order == code)[0][0]] += 1

# Per-user count of "other"-code rows in the test error logs.
id_error = test_errcode_etc[['user_id','code']].values
order = test_errcode_etc['code'].unique()
order.sort()
errcode_etc_test = np.zeros((test_user_number,len(order)))

for person_idx, code in tqdm(id_error):
    errcode_etc_test[person_idx - test_user_id_min, np.where(order == code)[0][0]] += 1

In [None]:
# Per-user row count for each shared frequent error code (train).
id_error = train_errcode[['user_id','code']].values
# Use a sorted ndarray copy as the column order: `order == code` is then an
# ordinary element-wise comparison (with a plain list it only works through
# NumPy's reflected comparison), and `intersec` itself is left untouched.
order = np.array(sorted(intersec))
errcode = np.zeros((train_user_number,len(order)))

for person_idx, code in tqdm(id_error):
    errcode[person_idx - train_user_id_min, np.where(order == code)[0][0]] += 1

In [None]:
# Per-user row count for each shared frequent error code (test).
id_error = test_errcode[['user_id','code']].values
# Use a sorted ndarray copy as the column order: `order == code` is then an
# ordinary element-wise comparison (with a plain list it only works through
# NumPy's reflected comparison), and `intersec` itself is left untouched.
order = np.array(sorted(intersec))
errcode_test = np.zeros((test_user_number,len(order)))

for person_idx, code in tqdm(id_error):
    errcode_test[person_idx - test_user_id_min, np.where(order == code)[0][0]] += 1

In [None]:
# Shapes of the train and test error-code count matrices.
print(errcode.shape)
print(errcode_test.shape)

## Concatenate

In [None]:
# Assemble the feature matrices by placing the per-user blocks built above
# side by side. Commented-out entries are blocks that were tried and
# switched off; train and test must list the same blocks in the same order.
train_blocks = (
    error,
    quality,
    month_day,
    fwv_lst,
    # fwv_etc_lst,
    # fwv_quality_lst,
    model_nm,
    hour,
    # minute,
    # quality_month_day,
    # quality_hour,
    errcode,
    # errcode_etc,
)
X = np.concatenate(train_blocks, axis = 1)
Y = problem
print(X.shape)
print(Y.shape)

test_blocks = (
    test_x,
    quality_test,
    test_date.values,
    fwv_test_lst,
    # fwv_etc_test_lst,
    # fwv_quality_test_lst,
    model_nm_test,
    hour_test,
    # minute_test,
    # quality_month_day_test,
    # quality_hour_test,
    errcode_test,
    # errcode_etc_test,
)
X_test = np.concatenate(test_blocks, axis = 1)
print(X_test.shape)

In [None]:
# Scores recorded by the original author for different feature combinations:
# error, quality, month_day, fwv_lst, fwv_quality_lst, model_nm, hour -> 0.81873
# error, quality, month_day, fwv_lst, fwv_quality_lst, model_nm, hour, errcode -> 0.82052
# error, quality, month_day, fwv_lst, fwv_quality_lst, model_nm, hour, minute, errcode -> 0.81811
# error, quality, month_day, fwv_lst, model_nm, hour, errcode -> 0.82091

# Names used by the cross-validation cells below.
train_x = X
train_y = problem

print(train_x.shape)
print(train_y.shape)

## NN

In [None]:
# Normalization: fit a MinMaxScaler on X and rebind X to the scaled copy.
# NOTE(review): train_x was bound to the unscaled array before this cell, and
# the model cells index train_x and rebind X themselves, so the scaled X
# produced here is not consumed by the code shown; X_test is not scaled either.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [None]:
# 5-fold cross-validation of a dense neural network on train_x / train_y.
# Collects the fitted model and the validation ROC-AUC of every fold.
# KFold is imported here because this cell uses it before any other cell
# imports it.
from sklearn.model_selection import KFold

models     = []
auc_scores   = []
# Cut-off that turns predicted probabilities into hard 0/1 predictions
# (defined here because this cell needs it before any other cell sets it).
threshold = 0.5

k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in k_fold.split(train_x):

    # Split into train / validation parts and min-max scale them.
    # The scaler is fitted on the training part only, so no validation
    # statistics leak into training, and the network really receives scaled
    # inputs (train_x itself is unscaled).
    fold_scaler = MinMaxScaler()
    X = fold_scaler.fit_transform(train_x[train_idx])
    y = train_y[train_idx]
    valid_x = fold_scaler.transform(train_x[val_idx])
    valid_y = train_y[val_idx]

    # Stop after 30 epochs without val_loss improvement and roll back to the
    # best weights, so the fold is scored with its best model rather than
    # with the weights of the last epoch.
    earlystop = EarlyStopping(patience=30, restore_best_weights=True)
    # Multiply the learning rate by 0.8 after 2 epochs without improvement.
    learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss',
                                            patience=2,
                                            factor=0.8,
                                            min_lr=1e-7,
                                            verbose=1)
    # Save the weights whenever val_loss improves. Every fold writes to the
    # same file, so only the last fold's best weights remain on disk.
    model_check = ModelCheckpoint(
            filepath="./lg_nn.h5",
            monitor='val_loss',
            save_best_only=True)

    callbacks = [earlystop, learning_rate_reduction, model_check]

    # Three hidden blocks (Dense -> PReLU -> BatchNorm -> Dropout) and a
    # sigmoid output unit.
    model = Sequential()
    model.add(Dense(1024, kernel_initializer='he_normal',
                    input_shape=(X.shape[1],)))  # input_shape also defines the input layer
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(512, kernel_initializer='he_normal'))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(256, kernel_initializer='he_normal'))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(1, activation='sigmoid', kernel_initializer='he_normal'))

    model.compile(optimizer=Adam(learning_rate=0.005),
                loss='binary_crossentropy',
                metrics=[AUC()])

    history = model.fit(X,
                        y,
                        epochs=1000,
                        verbose=1,
                        validation_data=(valid_x,valid_y),
                        callbacks=callbacks)

    # Validation predictions. Sequential.predict_proba no longer exists in
    # current TensorFlow releases; with a sigmoid output layer predict()
    # already returns the probabilities.
    valid_prob = model.predict(valid_x).reshape(-1)
    valid_pred = np.where(valid_prob > threshold, 1, 0)
    valid_pred = valid_pred.reshape(-1)
    print(valid_prob)
    print(valid_y)
    print(valid_pred)

    # Fold score.
    auc_score = roc_auc_score(   valid_y, valid_prob)

    # Keep the model and its score.
    models.append(model)
    auc_scores.append(auc_score)

In [None]:
# Mean of the validation ROC-AUC scores collected in auc_scores.
print(np.mean(auc_scores))

## LGBM

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_curve, auc, recall_score, precision_score

def f_pr_auc(probas_pred, y_true):
    """Custom evaluation metric: area under the precision-recall curve.

    ``probas_pred`` holds the predicted scores and ``y_true`` is the dataset
    object whose ``get_label()`` returns the true labels. Returns the tuple
    ``("pr_auc", score, True)``; the cell below passes this function as
    ``feval`` to ``lgbm.train``.
    """
    precision, recall, _ = precision_recall_curve(y_true.get_label(), probas_pred)
    return "pr_auc", auc(recall, precision), True

#-------------------------------------------------------------------------------------
# 5-fold cross-validation of a LightGBM binary classifier.
# Collects the fitted booster plus recall, precision and ROC-AUC per fold.
models     = []
recalls    = []
precisions = []
auc_scores   = []
threshold = 0.5  # probability cut-off for the hard 0/1 predictions
# Parameters
params =      {
                'boosting_type' : 'gbdt',
                'objective'     : 'binary',
                'metric'        : 'auc',
                'seed': 1015,

                # The key used to be 'max_depth ' (trailing space), which is
                # not the max_depth parameter, so the limit was never set.
                # The redundant 'application' entry (an alias of 'objective')
                # has been removed.
                'max_depth': 20,
                'is_unbalance': 'true',
                # 'num_leaves': 1000,
                'feature_fraction': 0.75,
                'bagging_fraction': 0.75,
                # 'subsample': 0.9,
                'bagging_freq': 20,
                'learning_rate': 0.008,
                'verbose': 0
                }
#-------------------------------------------------------------------------------------
# 5 Kfold cross validation
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in k_fold.split(train_x):

    # split train, validation set
    X = train_x[train_idx]
    y = train_y[train_idx]
    valid_x = train_x[val_idx]
    valid_y = train_y[val_idx]

    d_train= lgbm.Dataset(X, y)
    d_val  = lgbm.Dataset(valid_x, valid_y)

    # run training; f_pr_auc is reported as an extra validation metric
    model = lgbm.train(
                        params,
                        train_set       = d_train,
                        num_boost_round = 10000,
                        valid_sets      = d_val,
                        feval           = f_pr_auc,
                        verbose_eval    = 20,
                        early_stopping_rounds = 1000
                       )

    # validation predictions: probabilities and thresholded labels
    valid_prob = model.predict(valid_x)
    valid_pred = np.where(valid_prob > threshold, 1, 0)

    # fold scores
    recall    = recall_score(    valid_y, valid_pred)
    precision = precision_score( valid_y, valid_pred)
    auc_score = roc_auc_score(   valid_y, valid_prob)

    # keep the model and its scores
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    auc_scores.append(auc_score)

    print('==========================================================')

In [None]:
# Mean of the validation ROC-AUC scores collected in auc_scores.
print(np.mean(auc_scores))

In [None]:
# Predict X_test with every fold model (one column vector each) and average
# the predictions element-wise.
pred_y_list = [fold_model.predict(X_test).reshape(-1, 1) for fold_model in models]

lgbm_pred_ensemble = np.mean(pred_y_list, axis = 0)
lgbm_pred_ensemble

In [None]:
# Fill the sample submission's problem column with the averaged LightGBM
# predictions and write it out as a CSV file.
sample_submssion = pd.read_csv('/content/drive/MyDrive/Colab/data/LG/sample_submission.csv')
sample_submssion['problem'] = lgbm_pred_ensemble.reshape(-1)
sample_submssion.to_csv("/content/drive/MyDrive/Colab/data/LG/lgbm_0.82163.csv", index = False)
sample_submssion

## Xgboost

In [None]:
import xgboost as xgb

def f_pr_auc(probas_pred, y_true):
    """Custom evaluation metric: area under the precision-recall curve.

    ``probas_pred`` holds the predicted scores and ``y_true`` is the data
    object whose ``get_label()`` returns the true labels. Returns the pair
    ``("pr_auc", score)``; the cell below passes this function as ``feval``
    to ``xgb.train``.
    """
    precision, recall, _ = precision_recall_curve(y_true.get_label(), probas_pred)
    return "pr_auc", auc(recall, precision)
#-------------------------------------------------------------------------------------
# 5-fold cross-validation of an XGBoost binary classifier.
# Collects the fitted booster plus recall, precision and ROC-AUC per fold.
models     = []
recalls    = []
precisions = []
auc_scores   = []
threshold = 0.5  # probability cut-off for the hard 0/1 predictions
# Parameters
params =      {
                'objective'     : 'binary:logistic',
                'eval_metric'   : 'auc',
                'seed': 1015,

                # The key used to be 'max_depth ' (trailing space), which is
                # not the max_depth parameter, so the default depth was used.
                'max_depth': 4,
                # NOTE: 'is_unbalance' is a LightGBM option and was dropped
                # here; XGBoost's class-weighting parameter is
                # scale_pos_weight, should weighting be wanted.
                'eta': 0.028,
                # 'reg_lambda': 1,
                'subsample' : 0.7,
                # XGBoost's logging parameter is 'verbosity', not 'verbose'.
                'verbosity': 0
                }
#-------------------------------------------------------------------------------------
# 5 Kfold cross validation
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in k_fold.split(train_x):

    # split train, validation set
    X = train_x[train_idx]
    y = train_y[train_idx]
    valid_x = train_x[val_idx]
    valid_y = train_y[val_idx]

    d_train= xgb.DMatrix(X, y)
    d_val  = xgb.DMatrix(valid_x, valid_y)

    # unlabeled copy of the validation features, used for prediction
    d_valid_x = xgb.DMatrix(valid_x)

    # run training
    # maximize=True: both monitored metrics (auc and the custom pr_auc) are
    # better when larger. Without it the direction for the custom metric
    # name is guessed, and an unrecognised name is treated as "smaller is
    # better", which makes early stopping pick the wrong iteration.
    model = xgb.train(
                        params,
                        dtrain       = d_train,
                        num_boost_round = 10000,
                        evals      = [(d_train, 'train'), (d_val, 'eval')],
                        feval           = f_pr_auc,
                        maximize        = True,
                        verbose_eval    = 20,
                        early_stopping_rounds = 500
                       )

    # validation predictions: probabilities and thresholded labels
    valid_prob = model.predict(d_valid_x)
    valid_pred = np.where(valid_prob > threshold, 1, 0)

    # fold scores
    recall    = recall_score(    valid_y, valid_pred)
    precision = precision_score( valid_y, valid_pred)
    auc_score = roc_auc_score(   valid_y, valid_prob)

    # keep the model and its scores
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    auc_scores.append(auc_score)

    print('==========================================================')

In [None]:
# Mean of the validation ROC-AUC scores collected in auc_scores.
print(np.mean(auc_scores))

In [None]:
# Predict the test matrix with every fold model (one column vector each) and
# average the predictions element-wise.
d_test_x = xgb.DMatrix(X_test)

pred_y_list = [fold_model.predict(d_test_x).reshape(-1, 1) for fold_model in models]

xgb_pred_ensemble = np.mean(pred_y_list, axis = 0)
print(xgb_pred_ensemble)

## Catboost

In [None]:
!pip install catboost

In [None]:
# Scores recorded by the original author for earlier settings:
# l2-2, depth:6 -> 82066
# l2-2, depth:8, subsample:0.9, Bernoulli -> 82163
# l2-2, depth:9, subsample:0.9, Bernoulli -> 82172
from catboost import CatBoostClassifier

# Keyword arguments passed to CatBoostClassifier in the cross-validation cell.
# NOTE(review): loss_function is 'MultiClass' although the target built in
# this notebook only takes the values 0 and 1; the 'Logloss' alternative is
# commented out -- confirm that this choice is intentional.
cat_params = {
        'n_estimators': 100000,
        'learning_rate': 0.012,
        'eval_metric': 'AUC',
        # 'loss_function': 'Logloss',
        'bootstrap_type': 'Bernoulli',
        'loss_function': 'MultiClass',
        'random_seed': 42,
        'metric_period': 500,
        'od_wait': 500,
        'task_type': 'GPU',
        'l2_leaf_reg' : 2,
        'depth': 9,
        'subsample' : 0.9,
        'use_best_model': True
    }

In [None]:
# 5-fold cross-validation of a CatBoost classifier built from cat_params.
# Collects the fitted model and the validation ROC-AUC of every fold.
models     = []
auc_scores   = []
threshold = 0.5

k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in k_fold.split(train_x):

    # Train / validation parts of this fold.
    X, valid_x = train_x[train_idx], train_x[val_idx]
    y, valid_y = train_y[train_idx], train_y[val_idx]

    # Fit with the validation part as the evaluation set.
    model = CatBoostClassifier(**cat_params)
    model.fit(X, y, eval_set = (valid_x, valid_y), verbose=True)
    # print(model.get_best_score())

    # Second column of predict_proba is used as the positive-class
    # probability; valid_pred is its thresholded 0/1 version.
    valid_prob = model.predict_proba(valid_x)[:, 1]
    valid_pred = np.where(valid_prob > threshold, 1, 0).reshape(-1)

    # Fold score.
    auc_score = roc_auc_score(valid_y, valid_prob)

    # Keep the model and its score.
    models.append(model)
    auc_scores.append(auc_score)

    print('==========================================================')

In [None]:
# Mean of the validation ROC-AUC scores collected in auc_scores.
print(np.mean(auc_scores))

In [None]:
# Predict X_test with every fold model, keep the second predict_proba column
# (as a column vector) and average the folds element-wise.
pred_y_list = [
    fold_model.predict_proba(X_test)[:, 1].reshape(-1, 1) for fold_model in models
]

catboost_pred_ensemble = np.mean(pred_y_list, axis = 0)
catboost_pred_ensemble

In [None]:
# Equal-weight average of the CatBoost, LightGBM and XGBoost test predictions
# (despite its name, the variable includes the XGBoost predictions as well).
cat_lgbm_ensemble = (catboost_pred_ensemble.ravel() + lgbm_pred_ensemble.ravel() + xgb_pred_ensemble.ravel()) / 3
cat_lgbm_ensemble

In [None]:
# Fill the sample submission's problem column with the blended predictions
# and write it out as a CSV file.
sample_submssion = pd.read_csv('/content/drive/MyDrive/Colab/data/LG/sample_submission.csv')
sample_submssion['problem'] = cat_lgbm_ensemble
sample_submssion.to_csv("/content/drive/MyDrive/Colab/data/LG/cat82197_lgbm82163_xgb81988_ensemble.csv", index = False)
sample_submssion

In [None]:
## 더미

# # train_err_fwv 교집합을 제외한 부분
# remove_fwv = [x for x in train_err_fwv if x not in intersec_err_fwv]

# first_remove_fwv = train_err[train_err['fwver'] == remove_fwv[0]].index
# train_err_drop = train_err.drop(first_remove_fwv)

# for drop_fwv in remove_fwv[1:]:
#     rmv_fwv = train_err[train_err['fwver'] == drop_fwv].index
#     train_err_drop = train_err_drop.drop(rmv_fwv)

# # test_err_fwv 교집합을 제외한 부분
# remove_fwv_test = [x for x in test_err_fwv if x not in intersec_err_fwv]

# first_remove_fwv = test_err[test_err['fwver'] == remove_fwv_test[0]].index
# test_err_drop = test_err.drop(first_remove_fwv)

# for drop_fwv in remove_fwv_test[1:]:
#     rmv_fwv = test_err[test_err['fwver'] == drop_fwv].index
#     test_err_drop = test_err_drop.drop(rmv_fwv)


# # train_quality_fwv 교집합을 제외한 부분
# remove_quality_fwv = [x for x in train_quality_fwv if x not in intersec_quality_fwv]

# first_remove_fwv = train_quality[train_quality['fwver'] == remove_quality_fwv[0]].index
# train_quality_drop = train_quality.drop(first_remove_fwv)

# for drop_fwv in remove_quality_fwv[1:]:
#     rmv_fwv = train_quality[train_quality['fwver'] == drop_fwv].index
#     train_quality_drop = train_quality_drop.drop(rmv_fwv)