In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
import seaborn as sns
warnings.filterwarnings(action='ignore')

# 필요한 함수 정의
def make_datetime(x):
    """Parse a ``YYYYMMDDhhmm[ss]`` timestamp into a ``datetime`` truncated to the minute.

    Args:
        x: Timestamp as an int or str; it is stringified and sliced by position.

    Returns:
        ``datetime.datetime`` built from the year, month, day, hour and minute
        digits. Anything after the 12th character (the seconds) is ignored.
    """
    stamp = str(x)
    # Fixed-width (start, stop) offsets of year, month, day, hour, minute.
    bounds = ((0, 4), (4, 6), (6, 8), (8, 10), (10, 12))
    fields = [int(stamp[lo:hi]) for lo, hi in bounds]
    return dt.datetime(*fields)

def string2num(x):
    """Keep only the ASCII digits of ``x`` and return them as an int (0 if none remain).

    Every other character - commas, parentheses, spaces, signs, letters - is
    removed before conversion, so a leading minus sign is dropped as well.
    """
    digits = re.sub(r"[^0-9]+", '', str(x))
    return int(digits) if digits != '' else 0


# Directory prefix that is prepended to the CSV file names when loading data.
PATH = '/kaggle/input/dacon-lg/'

### 현재까지 종합한 생성가능 파생변수들
- model_nm 변화 횟수 1회, 2회, 3회
- model_nm의 전체 더미변수
- fwver 변화 횟수 1~5회
- fwver의 전체 더미 변수
- 모든 errtype이 발생한 day의 평균, mode
- 모든 errtype이 발생한 hour의 평균, mode
- 모든 errtype이 발생한 weekday의 평균, mode
- 이하는 모두 errcode 관련 파생변수들
    - 1과 0
    - connection out 류
    - 알파벳 류
    - 다른 word codes (active, standby, http, ...) 각 word를 하나의 column으로 설정
    - 음수
    - 10번대 ~ 90번대
    - 1000번대 ~ 9000번대
    - 10000번 이상

In [2]:
'''
train_err  = pd.read_csv(PATH+'train_err_data.csv')
train_qual = pd.read_csv(PATH+'train_quality_data.csv')
train_prob = pd.read_csv(PATH+'train_problem_data.csv')

test_err = pd.read_csv(PATH+'test_err_data.csv')
test_qual = pd.read_csv(PATH+'test_quality_data.csv')
display(train_err.head())
display(train_qual.head())
'''

"\ntrain_err  = pd.read_csv(PATH+'train_err_data.csv')\ntrain_qual = pd.read_csv(PATH+'train_quality_data.csv')\ntrain_prob = pd.read_csv(PATH+'train_problem_data.csv')\n\ntest_err = pd.read_csv(PATH+'test_err_data.csv')\ntest_qual = pd.read_csv(PATH+'test_quality_data.csv')\ndisplay(train_err.head())\ndisplay(train_qual.head())\n"

In [3]:
# Load the raw error logs for the train and test users.
train_err  = pd.read_csv(PATH+'train_err_data.csv')
test_err = pd.read_csv(PATH+'test_err_data.csv')
# Render both frames for a first look at the columns and value formats.
display(train_err)
display(test_err)

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,10000,20201101025616,model_3,05.15.2138,15,1
1,10000,20201101030309,model_3,05.15.2138,12,1
2,10000,20201101030309,model_3,05.15.2138,11,1
3,10000,20201101050514,model_3,05.15.2138,16,1
4,10000,20201101050515,model_3,05.15.2138,4,0
...,...,...,...,...,...,...
16554658,24999,20201130163051,model_3,05.15.2138,15,1
16554659,24999,20201130172625,model_3,05.15.2138,16,1
16554660,24999,20201130172625,model_3,05.15.2138,4,0
16554661,24999,20201130172631,model_3,05.15.2138,4,0


Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,30000,20201101030227,model_1,04.16.3553,31,1
1,30000,20201101030227,model_1,04.16.3553,33,2
2,30000,20201101030228,model_1,04.16.3553,15,1
3,30000,20201101030256,model_1,04.16.3553,22,1
4,30000,20201101030300,model_1,04.16.3553,11,1
...,...,...,...,...,...,...
16532643,44998,20201130210050,model_1,04.16.3553,40,0
16532644,44998,20201130211831,model_1,04.16.3553,31,1
16532645,44998,20201130211832,model_1,04.16.3553,15,1
16532646,44998,20201130212259,model_1,04.16.3553,16,1


### train_err와 test_err의 공통점
- model_nm의 unique values는 같다
- errtype의 unique values는 같다

### train_err와 test_err의 차이점
- errcode에서, train과 test의 유니크값이 다르다 (각각에만 존재하는 errcode가 있다)
- fwver도 마찬가지로 train과 test 각각에만 존재하는 값이 있다


test_err에서 10월 12월 데이터는 모두 합쳐 76개 밖에 안되므로 데이터의 범위를 11월달 한달로 지정한다. 따라서 10월 12월의 데이터는 삭제

라고 생각했지만 딱히 데이터의 날짜 범위를 지정할 필요는 없다. 왜냐하면 삭제한 data가 problem을 나타냈을 수도 있고, 삭제한 data가 quality_data에 있을 수도 있기때문에 그렇게 되면 quality_data에서도 다시한번 삭제 작업을 해주어야 한다. EDA당시에 10월 12월 데이터를 삭제한 이유는 11월 한달간의 error발생 횟수를 시각화 편의를 위해서 그런것이고 실제 전처리할때는 굳이 데이터를 삭제할 필요는 없다

In [4]:
# Compare the firmware versions (fwver) seen in train vs. test:
# intersection, union, and the two one-sided differences.
# The label ends with a real newline ("\n"); the previous "/n" was printed literally.
print("train과 test의 fwver의 교집합:\n", np.intersect1d(train_err.fwver.unique(), test_err.fwver.unique()))
print("train과 test의 fwver의 합집합:\n", np.union1d(train_err.fwver.unique(), test_err.fwver.unique()))
print("train에만 있고 test에는 없는 fwver(차집합):\n", np.setdiff1d(train_err.fwver.unique(), test_err.fwver.unique()))
print("test에만 있고 train에는 없는 fwver(차집합):\n", np.setdiff1d(test_err.fwver.unique(), train_err.fwver.unique()))

train과 test의 fwver의 교집합:/n ['03.11.1141' '03.11.1149' '03.11.1167' '04.16.3439' '04.16.3553'
 '04.16.3569' '04.16.3571' '04.22.1656' '04.22.1666' '04.22.1684'
 '04.22.1750' '04.22.1778' '04.33.1125' '04.33.1149' '04.33.1171'
 '04.33.1185' '04.33.1261' '04.73.2237' '04.73.2571' '04.82.1684'
 '04.82.1730' '04.82.1778' '05.15.2092' '05.15.2114' '05.15.2120'
 '05.15.2138' '05.15.3104' '05.66.3237' '05.66.3571' '10' '8.5.3']
train과 test의 fwver의 합집합:/n ['03.11.1141' '03.11.1149' '03.11.1167' '04.16.2641' '04.16.3345'
 '04.16.3439' '04.16.3553' '04.16.3569' '04.16.3571' '04.22.1170'
 '04.22.1442' '04.22.1448' '04.22.1478' '04.22.1608' '04.22.1656'
 '04.22.1666' '04.22.1684' '04.22.1750' '04.22.1772' '04.22.1778'
 '04.33.1095' '04.33.1125' '04.33.1149' '04.33.1171' '04.33.1185'
 '04.33.1261' '04.73.2237' '04.73.2569' '04.73.2571' '04.73.2577'
 '04.82.1684' '04.82.1730' '04.82.1778' '05.15.2090' '05.15.2092'
 '05.15.2114' '05.15.2120' '05.15.2122' '05.15.2138' '05.15.3104'
 '05.66.3237' '05.66.

- 파생변수를 만들때 train과 test의 fwver의 합집합으로 생성해야한다.

errcode에는 null값이 존재하므로 0으로 대치한다. 사실 최빈값인 1로 대치해야 하지만 null값이 별로 없기때문에 그 다음 최빈값인 0으로 해도 무방하다.

In [5]:
# Replace missing errcode values with the string '0'.
# The filled column is assigned back to the frame. Calling
# fillna(inplace=True) on the attribute-accessed Series is chained assignment,
# which pandas does not guarantee to write through to the parent frame
# (and which never does under Copy-on-Write).
train_err["errcode"] = train_err["errcode"].fillna('0')
display(train_err.isnull().sum())
test_err["errcode"] = test_err["errcode"].fillna('0')
display(test_err.isnull().sum())

user_id     0
time        0
model_nm    0
fwver       0
errtype     0
errcode     0
dtype: int64

user_id     0
time        0
model_nm    0
fwver       0
errtype     0
errcode     0
dtype: int64

In [6]:
# Compare the errcode values seen in train vs. test:
# intersection, union, and the two one-sided differences.
# The label ends with a real newline ("\n"); the previous "/n" was printed literally.
print("train과 test의 errcode의 교집합:\n", np.intersect1d(train_err.errcode.unique(), test_err.errcode.unique()))
print("train과 test의 errcode의 합집합:\n", np.union1d(train_err.errcode.unique(), test_err.errcode.unique()))
print("train에만 있고 test에는 없는 errcode(차집합):\n", np.setdiff1d(train_err.errcode.unique(), test_err.errcode.unique()))
print("test에만 있고 train에는 없는 errcode(차집합):\n", np.setdiff1d(test_err.errcode.unique(), train_err.errcode.unique()))

train과 test의 errcode의 교집합:/n ['-269' '-270' '0' ... 'scanning timeout' 'standby'
 'terminate by peer user']
train과 test의 errcode의 합집합:/n ['-1010' '-269' '-270' ... 'standby' 'tVer' 'terminate by peer user']
train에만 있고 test에는 없는 errcode(차집합):/n ['10005' '10018' '10073' ... 'Y-00004' 'Y-00005' 'http']
test에만 있고 train에는 없는 errcode(차집합):/n ['-1010' '10020' '10029' ... 'eDes' 'me="' 'tVer']


- 모델링을 위해서 errcode의 파생변수는 train과 test의 교집합에서만 생성해야 한다.

# 전처리 함수

In [7]:
def preprocessing(data):
    """Replace the raw ``time`` column with ``days``, ``hour`` and ``weekday`` features.

    The frame is modified in place and is also returned.

    Args:
        data: DataFrame with a ``time`` column holding fixed-width
            ``YYYYMMDDhhmmss`` values (int or str).

    Returns:
        The same DataFrame object, without ``time`` and with three new columns:
        ``days`` (1-based count of whole days elapsed since the earliest
        timestamp in ``data``), ``hour`` (hour of day) and ``weekday``
        (Monday=0 ... Sunday=6).
    """
    # Parse the whole column in one vectorised call instead of one Python call
    # per row. Only the first 12 characters (YYYYMMDDhhmm) are used, so every
    # timestamp is truncated to the minute before the day difference is taken.
    timestamps = pd.to_datetime(
        data["time"].astype(str).str[:12], format="%Y%m%d%H%M"
    )

    # Elapsed days relative to the earliest (minute-truncated) timestamp.
    data["days"] = (timestamps - timestamps.min()).dt.days + 1
    data["hour"] = timestamps.dt.hour
    data["weekday"] = timestamps.dt.weekday

    del data["time"]
    return data

전처리 실행

In [8]:

# Wall-clock timing of the feature extraction on both frames.
start_minute = dt.datetime.now()

# preprocessing() modifies each frame in place, so the return values are not kept.
preprocessing(train_err)
preprocessing(test_err)

end_minute = dt.datetime.now()
print(f"경과시간: {(end_minute - start_minute)}")

# Show the frames after `time` has been replaced by days / hour / weekday.
display(train_err)
display(test_err)

경과시간: 0:01:13.696921


Unnamed: 0,user_id,model_nm,fwver,errtype,errcode,days,hour,weekday
0,10000,model_3,05.15.2138,15,1,1,2,6
1,10000,model_3,05.15.2138,12,1,1,3,6
2,10000,model_3,05.15.2138,11,1,1,3,6
3,10000,model_3,05.15.2138,16,1,1,5,6
4,10000,model_3,05.15.2138,4,0,1,5,6
...,...,...,...,...,...,...,...,...
16554658,24999,model_3,05.15.2138,15,1,30,16,0
16554659,24999,model_3,05.15.2138,16,1,30,17,0
16554660,24999,model_3,05.15.2138,4,0,30,17,0
16554661,24999,model_3,05.15.2138,4,0,30,17,0


Unnamed: 0,user_id,model_nm,fwver,errtype,errcode,days,hour,weekday
0,30000,model_1,04.16.3553,31,1,1,3,6
1,30000,model_1,04.16.3553,33,2,1,3,6
2,30000,model_1,04.16.3553,15,1,1,3,6
3,30000,model_1,04.16.3553,22,1,1,3,6
4,30000,model_1,04.16.3553,11,1,1,3,6
...,...,...,...,...,...,...,...,...
16532643,44998,model_1,04.16.3553,40,0,30,21,0
16532644,44998,model_1,04.16.3553,31,1,30,21,0
16532645,44998,model_1,04.16.3553,15,1,30,21,0
16532646,44998,model_1,04.16.3553,16,1,30,21,0


# 파생변수 생성

## 1) model_nm 더미변수 생성, model_nm 변화횟수 column 생성

In [9]:
# One-hot encode model_nm for every log row and place user_id alongside,
# so the indicator columns can be aggregated per user afterwards.
model_nm_dummies = pd.concat(
    [train_err.user_id, pd.get_dummies(train_err["model_nm"])],
    axis=1,
)

In [10]:
# Number of log rows per user for each model (sum of the indicator columns).
model_nm_dummies = model_nm_dummies.groupby("user_id").sum()

In [11]:
# Only the presence of a model for a user matters, not how often it occurred,
# so every non-zero count is collapsed to 1 and zeros are left as they are.
def one_zero(num):
    """Return 1 for any non-zero ``num``; return ``num`` itself when it equals 0."""
    return 1 if num != 0 else num

# Convert every per-user count column into a 0/1 presence flag.
for col in model_nm_dummies.columns:
    model_nm_dummies[str(col)] = model_nm_dummies[str(col)].apply(one_zero)
    
model_nm_dummies

Unnamed: 0_level_0,model_0,model_1,model_2,model_3,model_4,model_5,model_6,model_7,model_8
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
10001,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
10002,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
10003,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
10004,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
24995,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
24996,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
24997,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24998,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Number of distinct models per user = row-wise sum of the 0/1 presence flags.
# model_change_cnt is
#   1 -> one model
#   2 -> two models   (one change)
#   3 -> three models (two changes)
# Only the presence-flag columns are summed, so re-running this cell does not
# fold an already existing model_change_cnt column into the total.
model_cols = model_nm_dummies.columns.drop("model_change_cnt", errors="ignore")
change_cnt = model_nm_dummies[model_cols].sum(axis=1).tolist()

model_nm_dummies["model_change_cnt"] = change_cnt

display(model_nm_dummies)
# value_counts() on model_change_cnt shows how many users saw a model change.
model_nm_dummies.model_change_cnt.value_counts()

Unnamed: 0_level_0,model_0,model_1,model_2,model_3,model_4,model_5,model_6,model_7,model_8,model_change_cnt
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
10001,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10002,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
10003,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10004,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
24995,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
24996,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
24997,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
24998,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


1.0    14297
2.0      702
3.0        1
Name: model_change_cnt, dtype: int64

## 2) fwver 더미변수 생성, fwver 변화횟수 column 생성
- model_nm과 같은 방식으로 column 생성한다

In [13]:
# One-hot encode fwver for every log row, with user_id alongside for aggregation.
fwver_dummies = pd.get_dummies(train_err["fwver"])
fwver_dummies = pd.concat([train_err.user_id, fwver_dummies], axis=1)

# Number of log rows per user for each fwver.
fwver_dummies = fwver_dummies.groupby(fwver_dummies.user_id).sum()

# Only the presence of a fwver for a user matters, not its frequency,
# so every non-zero count becomes 1 and zeros stay 0.
for col in fwver_dummies.columns:
    fwver_dummies[str(col)] = fwver_dummies[str(col)].apply(one_zero)

# Number of distinct fwver values per user = row-wise sum of the 0/1 flags.
# fwver_change_cnt is
#   1 -> one fwver
#   2 -> two fwver   (one change)
#   3 -> three fwver (two changes)
#   4 -> four fwver  (three changes)
# A single vectorised row sum replaces the per-row iloc loop.
change_cnt = fwver_dummies.sum(axis=1).tolist()

fwver_dummies["fwver_change_cnt"] = change_cnt

display(fwver_dummies)
# value_counts() on fwver_change_cnt shows how many users saw a fwver change.
fwver_dummies.fwver_change_cnt.value_counts()

Unnamed: 0_level_0,03.11.1141,03.11.1149,03.11.1167,04.16.2641,04.16.3345,04.16.3439,04.16.3553,04.16.3569,04.16.3571,04.22.1442,...,05.15.2114,05.15.2120,05.15.2122,05.15.2138,05.15.3104,05.66.3237,05.66.3571,10,8.5.3,fwver_change_cnt
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
10001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
10002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
10003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
10004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
24996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
24997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
24998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


2.0    8141
1.0    6420
3.0     396
4.0      43
Name: fwver_change_cnt, dtype: int64

- 많은 사용자들이 펌웨어 버전의 변화를 겪은 것으로 확인되었다.
- 15000 - 6420 = 8580명의 사용자가 fwver 변화를 겪었다

## 3) 모든 errtype이 발생한 days의 평균

- 인덱스만 있는 DataFrame을 만들고 한 컬럼씩 붙여나가는 방식

In [14]:
'''
# user_id 15000개의 인덱스만 있는 dataframe 생성
errtype_days_mean = pd.DataFrame(index=train_err.user_id.unique())
# errtype의 유니크 값
errtypes = np.sort(train_err.errtype.unique())

for t in tqdm(errtypes):
    # 에러타입이 t인 row로 이뤄진 dataframe
    each_errtype = train_err.loc[train_err["errtype"] == t]
    # 그 dataframe(each_errtype)에서 유니크한 id를 추출
    id_uniq = set(each_errtype.user_id)
    errtype_days_ls = []
    for i in range(10000, 25000):
        # id 10000 ~ 24999에서 id_uniq에 id가 있으면 days의 평균을 구하고,
        if i in id_uniq:
            errtype_days_ls.append(each_errtype.loc[each_errtype["user_id"] == i].days.mean())
        # 없으면 0
        else:
            errtype_days_ls.append(0)
    # 마지막에 다 모아놓은 list를 한 컬럼으로 치고 이어붙이기
    errtype_days_mean["errtype"+str(t)+"days_mean"] = errtype_days_ls
'''

'\n# user_id 15000개의 인덱스만 있는 dataframe 생성\nerrtype_days_mean = pd.DataFrame(index=train_err.user_id.unique())\n# errtype의 유니크 값\nerrtypes = np.sort(train_err.errtype.unique())\n\nfor t in tqdm(errtypes):\n    # 에러타입이 t인 row로 이뤄진 dataframe\n    each_errtype = train_err.loc[train_err["errtype"] == t]\n    # 그 dataframe(each_errtype)에서 유니크한 id를 추출\n    id_uniq = set(each_errtype.user_id)\n    errtype_days_ls = []\n    for i in range(10000, 25000):\n        # id 10000 ~ 24999에서 id_uniq에 id가 있으면 days의 평균을 구하고,\n        if i in id_uniq:\n            errtype_days_ls.append(each_errtype.loc[each_errtype["user_id"] == i].days.mean())\n        # 없으면 0\n        else:\n            errtype_days_ls.append(0)\n    # 마지막에 다 모아놓은 list를 한 컬럼으로 치고 이어붙이기\n    errtype_days_mean["errtype"+str(t)+"days_mean"] = errtype_days_ls\n'

- 인덱스와 컬럼을 모두 생성하고 0행렬 DataFrame에서 loc으로 각 위치를 매핑하는 방식

In [15]:
'''
# 인덱스와 컬럼을 모두 지정한 0행렬 dataframe 생성
# 컬럼은 errtype의 유니크 값들
errtypes = np.sort(train_err.errtype.unique())
errtype_days_mean = pd.DataFrame(index=train_err.user_id.unique(),
                                columns=errtypes)
errtype_days_mean.fillna(0, inplace=True)

for t in tqdm(errtypes):
    each_errtype = train_err.loc[train_err["errtype"] == t]
    id_uniq = set(each_errtype.user_id)
    for i in id_uniq:
        # 생성한 dataframe에서 loc으로 바로 찾아서 days의 평균을 매핑한다.
        errtype_days_mean.loc[i,t] = each_errtype.loc[each_errtype["user_id"] == i].days.mean()
    
display(errtype_days_mean)

# 하지만 연산 속도는 생각보다 느렸다.
# 내 코드실력이 여기까지인 것일 수도..
'''

'\n# 인덱스와 컬럼을 모두 지정한 0행렬 dataframe 생성\n# 컬럼은 errtype의 유니크 값들\nerrtypes = np.sort(train_err.errtype.unique())\nerrtype_days_mean = pd.DataFrame(index=train_err.user_id.unique(),\n                                columns=errtypes)\nerrtype_days_mean.fillna(0, inplace=True)\n\nfor t in tqdm(errtypes):\n    each_errtype = train_err.loc[train_err["errtype"] == t]\n    id_uniq = set(each_errtype.user_id)\n    for i in id_uniq:\n        # 생성한 dataframe에서 loc으로 바로 찾아서 days의 평균을 매핑한다.\n        errtype_days_mean.loc[i,t] = each_errtype.loc[each_errtype["user_id"] == i].days.mean()\n    \ndisplay(errtype_days_mean)\n\n# 하지만 연산 속도는 생각보다 느렸다.\n# 내 코드실력이 여기까지인 것일 수도..\n'

- 파생변수 생성 방법 두가지로 해봤는데 첫번째 방법이 더 빨랐다.
- 두번째 방법(loc으로 바로 매핑하기)는 의외로 느렸음

In [16]:
# Computing this table takes too long, so it was saved to CSV once and is reloaded here.
#errtype_days_mean.to_csv("errtype_days_mean.csv", index=False)
# NOTE(review): the export above used index=False, so user_id is presumably not
# stored in the file and rows are identified by position only - confirm before joining.
errtype_days_mean = pd.read_csv("/kaggle/input/errtype-days-mean/errtype_days_mean (1).csv")
display(errtype_days_mean)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,33,34,35,36,37,38,39,40,41,42
0,0.0,0.0,11.375,15.807692,0.000000,15.0,15.0,0.0,0.0,10.857143,...,0.0000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.00
1,0.0,0.0,0.000,0.000000,19.716981,5.0,5.0,0.0,0.0,0.000000,...,16.0000,13.666667,0.0,12.0,12.0,0.0,0.0,11.283186,14.625000,21.00
2,0.0,0.0,18.500,13.742424,17.000000,16.0,16.0,0.0,0.0,21.000000,...,0.0000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.00
3,0.0,0.0,0.000,0.000000,11.500000,5.0,5.0,0.0,0.0,0.000000,...,17.7500,0.000000,0.0,10.0,10.0,8.0,0.0,12.117647,10.000000,0.00
4,0.0,0.0,0.000,10.000000,0.000000,10.0,11.0,0.0,0.0,0.000000,...,15.3125,0.000000,0.0,25.0,25.0,0.0,0.0,12.000000,0.000000,28.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,0.0,0.0,0.000,0.000000,20.000000,25.6,25.6,0.0,0.0,0.000000,...,25.0000,0.000000,0.0,0.0,0.0,0.0,0.0,24.888889,20.285714,26.25
14996,0.0,0.0,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.00
14997,0.0,0.0,0.000,17.000000,28.000000,17.0,17.0,0.0,0.0,0.000000,...,15.1250,4.000000,0.0,19.0,19.0,0.0,0.0,11.275862,20.000000,25.00
14998,0.0,0.0,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,14.4000,10.000000,0.0,3.0,3.0,0.0,0.0,17.000000,0.000000,0.00


## 4) 모든 errtype이 발생한 hour의 평균

In [17]:
'''
# user_id 15000개의 인덱스만 있는 dataframe 생성
errtype_hour_mean = pd.DataFrame(index=train_err.user_id.unique())
# errtype의 유니크 값
errtypes = np.sort(train_err.errtype.unique())

for t in tqdm(errtypes):
    # 에러타입이 t인 row로 이뤄진 dataframe
    each_errtype = train_err.loc[train_err["errtype"] == t]
    # 그 dataframe(each_errtype)에서 유니크한 id를 추출
    id_uniq = set(each_errtype.user_id)
    errtype_hour_ls = []
    for i in range(10000, 25000):
        # id 10000 ~ 24999에서 id_uniq에 id가 있으면 hour의 평균을 구하고,
        if i in id_uniq:
            errtype_hour_ls.append(each_errtype.loc[each_errtype["user_id"] == i].hour.mean())
        # 없으면 0
        else:
            errtype_hour_ls.append(0)
    # 마지막에 다 모아놓은 list를 한 컬럼으로 치고 이어붙이기
    errtype_hour_mean["errtype"+str(t)+"_hour_mean"] = errtype_hour_ls
    
display(errtype_hour_mean)
'''

'\n# user_id 15000개의 인덱스만 있는 dataframe 생성\nerrtype_hour_mean = pd.DataFrame(index=train_err.user_id.unique())\n# errtype의 유니크 값\nerrtypes = np.sort(train_err.errtype.unique())\n\nfor t in tqdm(errtypes):\n    # 에러타입이 t인 row로 이뤄진 dataframe\n    each_errtype = train_err.loc[train_err["errtype"] == t]\n    # 그 dataframe(each_errtype)에서 유니크한 id를 추출\n    id_uniq = set(each_errtype.user_id)\n    errtype_hour_ls = []\n    for i in range(10000, 25000):\n        # id 10000 ~ 24999에서 id_uniq에 id가 있으면 hour의 평균을 구하고,\n        if i in id_uniq:\n            errtype_hour_ls.append(each_errtype.loc[each_errtype["user_id"] == i].hour.mean())\n        # 없으면 0\n        else:\n            errtype_hour_ls.append(0)\n    # 마지막에 다 모아놓은 list를 한 컬럼으로 치고 이어붙이기\n    errtype_hour_mean["errtype"+str(t)+"_hour_mean"] = errtype_hour_ls\n    \ndisplay(errtype_hour_mean)\n'

In [18]:
# Reload the precomputed per-user mean hour of each errtype from its cached CSV.
#errtype_hour_mean.to_csv("errtype_hour_mean.csv", index=False)
errtype_hour_mean = pd.read_csv("/kaggle/input/errtype-days-mean/errtype_hour_mean.csv")
display(errtype_hour_mean)

Unnamed: 0,errtype1_hour_mean,errtype2_hour_mean,errtype3_hour_mean,errtype4_hour_mean,errtype5_hour_mean,errtype6_hour_mean,errtype7_hour_mean,errtype8_hour_mean,errtype9_hour_mean,errtype10_hour_mean,...,errtype33_hour_mean,errtype34_hour_mean,errtype35_hour_mean,errtype36_hour_mean,errtype37_hour_mean,errtype38_hour_mean,errtype39_hour_mean,errtype40_hour_mean,errtype41_hour_mean,errtype42_hour_mean
0,0.0,0.0,13.25,9.423077,0.000000,4.000000,4.0,0.0,0.0,14.571429,...,0.0000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
1,0.0,0.0,0.00,0.000000,14.207547,4.000000,4.0,0.0,0.0,0.000000,...,6.3000,14.611111,0.0,3.0,3.0,0.0,0.0,12.938053,13.267857,2.0
2,0.0,0.0,19.00,14.651515,4.000000,19.000000,19.0,0.0,0.0,19.000000,...,0.0000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
3,0.0,0.0,0.00,0.000000,21.000000,2.000000,2.0,0.0,0.0,0.000000,...,6.7500,0.000000,0.0,7.0,7.0,12.5,0.0,11.470588,21.000000,0.0
4,0.0,0.0,0.00,3.000000,0.000000,4.333333,8.5,0.0,0.0,0.000000,...,3.1250,0.000000,0.0,3.0,3.0,0.0,0.0,18.000000,0.000000,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,0.0,0.0,0.00,0.000000,22.000000,14.000000,14.0,0.0,0.0,0.000000,...,5.4000,0.000000,0.0,0.0,0.0,0.0,0.0,17.666667,17.000000,2.0
14996,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,...,0.0000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
14997,0.0,0.0,0.00,23.000000,12.000000,1.000000,1.0,0.0,0.0,0.000000,...,6.8125,20.000000,0.0,3.0,3.0,0.0,0.0,12.655172,21.250000,3.0
14998,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,...,6.0000,18.000000,0.0,5.0,5.0,0.0,0.0,16.500000,0.000000,0.0


## 5) 모든 errtype이 발생한 weekday의 평균

In [19]:
'''
# user_id 15000개의 인덱스만 있는 dataframe 생성
errtype_weekday_mean = pd.DataFrame(index=train_err.user_id.unique())
# errtype의 유니크 값
errtypes = np.sort(train_err.errtype.unique())

for t in tqdm(errtypes):
    # 에러타입이 t인 row로 이뤄진 dataframe
    each_errtype = train_err.loc[train_err["errtype"] == t]
    # 그 dataframe(each_errtype)에서 유니크한 id를 추출
    id_uniq = set(each_errtype.user_id)
    errtype_weekday_ls = []
    for i in range(10000, 25000):
        # id 10000 ~ 24999에서 id_uniq에 id가 있으면 weekday의 평균을 구하고,
        if i in id_uniq:
            errtype_weekday_ls.append(each_errtype.loc[each_errtype["user_id"] == i].weekday.mean())
        # 없으면 0
        else:
            errtype_weekday_ls.append(0)
    # 마지막에 다 모아놓은 list를 한 컬럼으로 치고 이어붙이기
    errtype_weekday_mean["errtype"+str(t)+"_weekday_mean"] = errtype_weekday_ls
    
display(errtype_weekday_mean)
'''

'\n# user_id 15000개의 인덱스만 있는 dataframe 생성\nerrtype_weekday_mean = pd.DataFrame(index=train_err.user_id.unique())\n# errtype의 유니크 값\nerrtypes = np.sort(train_err.errtype.unique())\n\nfor t in tqdm(errtypes):\n    # 에러타입이 t인 row로 이뤄진 dataframe\n    each_errtype = train_err.loc[train_err["errtype"] == t]\n    # 그 dataframe(each_errtype)에서 유니크한 id를 추출\n    id_uniq = set(each_errtype.user_id)\n    errtype_weekday_ls = []\n    for i in range(10000, 25000):\n        # id 10000 ~ 24999에서 id_uniq에 id가 있으면 weekday의 평균을 구하고,\n        if i in id_uniq:\n            errtype_weekday_ls.append(each_errtype.loc[each_errtype["user_id"] == i].weekday.mean())\n        # 없으면 0\n        else:\n            errtype_weekday_ls.append(0)\n    # 마지막에 다 모아놓은 list를 한 컬럼으로 치고 이어붙이기\n    errtype_weekday_mean["errtype"+str(t)+"_weekday_mean"] = errtype_weekday_ls\n    \ndisplay(errtype_weekday_mean)\n'

In [20]:
# Reload the precomputed per-user mean weekday of each errtype from its cached CSV.
#errtype_weekday_mean.to_csv("errtype_weekday_mean.csv", index=False)
errtype_weekday_mean = pd.read_csv("/kaggle/input/errtype-days-mean/errtype_weekday_mean.csv")
display(errtype_weekday_mean)

Unnamed: 0,errtype1_weekday_mean,errtype2_weekday_mean,errtype3_weekday_mean,errtype4_weekday_mean,errtype5_weekday_mean,errtype6_weekday_mean,errtype7_weekday_mean,errtype8_weekday_mean,errtype9_weekday_mean,errtype10_weekday_mean,...,errtype33_weekday_mean,errtype34_weekday_mean,errtype35_weekday_mean,errtype36_weekday_mean,errtype37_weekday_mean,errtype38_weekday_mean,errtype39_weekday_mean,errtype40_weekday_mean,errtype41_weekday_mean,errtype42_weekday_mean
0,0.0,0.0,3.25,3.038462,0.000000,6.000000,6.00,0.0,0.0,2.857143,...,0.000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
1,0.0,0.0,0.00,0.000000,3.320755,3.000000,3.00,0.0,0.0,0.000000,...,3.500,1.944444,0.0,3.0,3.0,0.0,0.0,2.345133,2.500000,5.0
2,0.0,0.0,2.50,3.363636,1.000000,0.000000,0.00,0.0,0.0,5.000000,...,0.000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
3,0.0,0.0,0.00,0.000000,2.500000,3.000000,3.00,0.0,0.0,0.000000,...,3.500,0.000000,0.0,1.0,1.0,2.5,0.0,3.117647,1.000000,0.0
4,0.0,0.0,0.00,1.000000,0.000000,3.333333,3.75,0.0,0.0,0.000000,...,3.250,0.000000,0.0,2.0,2.0,0.0,0.0,3.000000,0.000000,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,0.0,0.0,0.00,0.000000,4.000000,4.000000,4.00,0.0,0.0,0.000000,...,2.000,0.000000,0.0,0.0,0.0,0.0,0.0,4.222222,4.285714,1.5
14996,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.00,0.0,0.0,0.000000,...,0.000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
14997,0.0,0.0,0.00,1.000000,5.000000,1.000000,1.00,0.0,0.0,0.000000,...,2.625,2.000000,0.0,3.0,3.0,0.0,0.0,2.155172,4.000000,3.4
14998,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.00,0.0,0.0,0.000000,...,2.600,1.000000,0.0,1.0,1.0,0.0,0.0,1.000000,0.000000,0.0


## 6)-1 errcode (1과 0)
- 각 user_id 당, 1의 횟수와 0의 횟수를 count 한다.

In [21]:
# Per-user count of log rows whose errcode is exactly "1".
# The boolean mask is grouped and summed directly. The earlier
# get_dummies(mask).iloc[:, 1] form raised IndexError whenever the mask held
# only one distinct value (no match at all, or every row matching), because
# only one dummy column exists in that case.
errcode_1_dummies = (
    train_err["errcode"].eq("1")
    .groupby(train_err["user_id"])
    .sum()
    .to_frame("errcode_1")
)
display(errcode_1_dummies)

Unnamed: 0_level_0,errcode_1
user_id,Unnamed: 1_level_1
10000,212.0
10001,1274.0
10002,172.0
10003,229.0
10004,529.0
...,...
24995,125.0
24996,3.0
24997,598.0
24998,125.0


In [22]:
# Per-user count of log rows whose errcode is exactly "0".
# The boolean mask is grouped and summed directly. The earlier
# get_dummies(mask).iloc[:, 1] form raised IndexError whenever the mask held
# only one distinct value (no match at all, or every row matching), because
# only one dummy column exists in that case.
errcode_0_dummies = (
    train_err["errcode"].eq("0")
    .groupby(train_err["user_id"])
    .sum()
    .to_frame("errcode_0")
)
display(errcode_0_dummies)

Unnamed: 0_level_0,errcode_0
user_id,Unnamed: 1_level_1
10000,104.0
10001,183.0
10002,132.0
10003,42.0
10004,98.0
...,...
24995,22.0
24996,0.0
24997,146.0
24998,13.0


## 6)-2 errcode(connection이 포함된 errcode counting)
- 'connection timeout'
- 'connection fail to establish',
- 'connectionterminated by local host',
- 'connection fail for LMP response timout'
- 'L2CAP connection cancelled'

In [23]:
# Collect every distinct errcode that contains the substring "connect".
connection_err = [c for c in train_err.errcode.unique() if "connect" in c]

print("connection 에러에 대한 errcode:", connection_err)


# One row per user (in order of first appearance in train_err);
# one count column is added per connection errcode.
connection_vari = pd.DataFrame(index=train_err.user_id.unique())

for code in tqdm(connection_err):
    # Per-user number of rows whose errcode equals `code`. Summing the boolean
    # mask per user avoids the intermediate dummy frame and the fragile
    # iloc[:, 1] lookup, which fails when only one dummy column exists.
    # The assignment aligns the per-user counts on the user_id index.
    connection_vari[code] = (
        train_err["errcode"].eq(code).groupby(train_err["user_id"]).sum()
    )

display(connection_vari)

  0%|          | 0/5 [00:00<?, ?it/s]

connection 에러에 대한 errcode: ['connection timeout', 'connection fail to establish', 'connectionterminated by local host', 'connection fail for LMP response timout', 'L2CAP connection cancelled']


100%|██████████| 5/5 [00:09<00:00,  1.88s/it]


Unnamed: 0,connection timeout,connection fail to establish,connectionterminated by local host,connection fail for LMP response timout,L2CAP connection cancelled
10000,0.0,0.0,0,0,0
10001,0.0,0.0,0,0,0
10002,0.0,0.0,0,0,0
10003,7.0,0.0,0,0,0
10004,104.0,1.0,0,0,0
...,...,...,...,...,...
24995,0.0,0.0,0,0,0
24996,0.0,0.0,0,0,0
24997,13.0,0.0,1,0,0
24998,0.0,0.0,0,0,0


## 6)-3 알파벳이 포함된 errcode
- EDA를 통해 '알파벳-숫자' 형태의 errcode가 있다는 것을 알았고, 특정 알파벳을 직접 눈으로 확인하였다.

In [24]:
# Letter prefixes of errcodes, identified by eye during EDA.
# NOTE(review): train_err also appears to contain 'V-...' codes (e.g. 'V-21008'),
# which none of these prefixes match - confirm whether 'V-' should be added.
alphabet = ['P-', 'B-', 'Q-', 'S-', 'U-', 'C-', 'H-', 'J-', 'E-', 'En', 'Y-', 'D-', 'M-']
errcode_uniq = train_err.errcode.unique()

# Keep each errcode once if it contains any of the prefixes (substring match).
# any() stops at the first hit, so a code that matches two prefixes is not
# appended twice, which would produce a duplicated column later.
# NOTE(review): codes that differ only by trailing whitespace (e.g. 'P-41007'
# and 'P-41007 ') stay separate entries here.
alphabet_err = [
    errcode
    for errcode in tqdm(errcode_uniq)
    if any(alpha in errcode for alpha in alphabet)
]


print(alphabet_err)

100%|██████████| 2805/2805 [00:00<00:00, 716898.59it/s]

['B-A8002', 'Q-64002', 'S-61001', 'U-81009', 'S-64002', 'J-30021', 'S-65002', 'Q-64001', 'H-51042', 'C-11017', 'H-51046', 'H-51049', 'P-44010', 'P-41011', 'B-51042', 'P-41007 ', 'M-99999', 'U-82024', 'P-44010 ', 'H-51048', 'U-82026', 'P-41007', 'C-13053', 'C-14014', 'J-20029', 'J-30010', 'Y-00008', 'S-64000', 'En00409', 'E-59902', 'Q-73004', 'C-12032', 'J-40011', 'U-82023', 'Q-73006', 'D-10011', 'S-65', 'M-51007', 'S-64001', 'Y-00005', 'D-99999', 'U-82004', 'En00402', 'B-51049', 'C-11020', 'M-51020', 'En00406', 'C-11087', 'U-81000', 'Y-00004', 'C-13039', 'U-82020', 'P-41011 ', 'P-41001', 'U-81014']





In [25]:
# Sorted view of the letter-prefixed errcodes, to inspect them grouped by prefix.
np.sort(alphabet_err)

array(['B-51042', 'B-51049', 'B-A8002', 'C-11017', 'C-11020', 'C-11087',
       'C-12032', 'C-13039', 'C-13053', 'C-14014', 'D-10011', 'D-99999',
       'E-59902', 'En00402', 'En00406', 'En00409', 'H-51042', 'H-51046',
       'H-51048', 'H-51049', 'J-20029', 'J-30010', 'J-30021', 'J-40011',
       'M-51007', 'M-51020', 'M-99999', 'P-41001', 'P-41007', 'P-41007 ',
       'P-41011', 'P-41011 ', 'P-44010', 'P-44010 ', 'Q-64001', 'Q-64002',
       'Q-73004', 'Q-73006', 'S-61001', 'S-64000', 'S-64001', 'S-64002',
       'S-65', 'S-65002', 'U-81000', 'U-81009', 'U-81014', 'U-82004',
       'U-82020', 'U-82023', 'U-82024', 'U-82026', 'Y-00004', 'Y-00005',
       'Y-00008'], dtype='<U8')

In [26]:
# Number of letter-prefixed errcodes, i.e. how many count columns they would produce.
len(alphabet_err)

55

- 알파벳 errcode의 갯수는 55개이다. 이들을 알파벳별로 그룹핑하려고 했으나, 변수의 수가 지나치게 많아지지는 않는다고 판단하여 그대로 55개의 컬럼을 생성하기로 결정했다. -> 팀 프로젝트라면 팀원들과 상의해 봐야 하는 부분이다. 코딩이나 실행 시간 면에서는 큰 차이가 없지만, 변수의 유의미성은 혼자 판단하기에 섣부르다. 또는 두 방식 모두 모델링까지 진행해 보고 더 좋은 결과를 내는 쪽을 선택하면 된다.

In [27]:
print("alphabet 에러에 대한 errcode:", alphabet_err)


# One row per user (in order of first appearance in train_err);
# one count column is added per letter-prefixed errcode.
alphabet_vari = pd.DataFrame(index=train_err.user_id.unique())

for code in tqdm(alphabet_err):
    # Per-user number of rows whose errcode equals `code`. Summing the boolean
    # mask per user avoids the intermediate dummy frame and the fragile
    # iloc[:, 1] lookup, which fails when only one dummy column exists.
    # The assignment aligns the per-user counts on the user_id index.
    alphabet_vari[code] = (
        train_err["errcode"].eq(code).groupby(train_err["user_id"]).sum()
    )

display(alphabet_vari)

  0%|          | 0/55 [00:00<?, ?it/s]

alphabet 에러에 대한 errcode: ['B-A8002', 'Q-64002', 'S-61001', 'U-81009', 'S-64002', 'J-30021', 'S-65002', 'Q-64001', 'H-51042', 'C-11017', 'H-51046', 'H-51049', 'P-44010', 'P-41011', 'B-51042', 'P-41007 ', 'M-99999', 'U-82024', 'P-44010 ', 'H-51048', 'U-82026', 'P-41007', 'C-13053', 'C-14014', 'J-20029', 'J-30010', 'Y-00008', 'S-64000', 'En00409', 'E-59902', 'Q-73004', 'C-12032', 'J-40011', 'U-82023', 'Q-73006', 'D-10011', 'S-65', 'M-51007', 'S-64001', 'Y-00005', 'D-99999', 'U-82004', 'En00402', 'B-51049', 'C-11020', 'M-51020', 'En00406', 'C-11087', 'U-81000', 'Y-00004', 'C-13039', 'U-82020', 'P-41011 ', 'P-41001', 'U-81014']


100%|██████████| 55/55 [01:45<00:00,  1.91s/it]


Unnamed: 0,B-A8002,Q-64002,S-61001,U-81009,S-64002,J-30021,S-65002,Q-64001,H-51042,C-11017,...,M-51020,En00406,C-11087,U-81000,Y-00004,C-13039,U-82020,P-41011,P-41001,U-81014
10000,0.0,0,0.0,0,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10001,53.0,0,0.0,0,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10002,0.0,1,0.0,0,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,1.0,0,1.0,0,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10004,0.0,0,0.0,0,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,2.0,0,0.0,0,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24996,0.0,0,0.0,0,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24997,8.0,0,0.0,0,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24998,0.0,0,0.0,0,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


- 0의 비율이 굉장히 높은 Dataframe이 생성되었다. 애초에 알파벳이 들어간 errcode의 갯수가 많지가 않다..

## 6)-4 word로 구성된 errcode
- PHONE_ERR, PUBLIC_ERR, standby, active, UNKNOWN, scanning timeout, NFANDROID2, CM a, http

In [28]:
# Word-style errcodes identified during EDA, grouped into one list.
word_err = ['PHONE_ERR', 'PUBLIC_ERR', 'standby', 'active', 'UNKNOWN', 'scanning timeout', 'NFANDROID2', 'CM a', 'http']
print("word로 구성된 errcode:\n", word_err)

# One row per user (in order of first appearance in train_err);
# one count column is added per word errcode.
word_vari = pd.DataFrame(index=train_err.user_id.unique())

for code in tqdm(word_err):
    # Per-user number of rows whose errcode equals `code`. The list is
    # hard-coded, so a code may be absent from the data. Summing the boolean
    # mask then gives an all-zero column, where get_dummies(mask).iloc[:, 1]
    # raised IndexError.
    # The assignment aligns the per-user counts on the user_id index.
    word_vari[code] = (
        train_err["errcode"].eq(code).groupby(train_err["user_id"]).sum()
    )

display(word_vari)



  0%|          | 0/9 [00:00<?, ?it/s]

word로 구성된 errcode:
 ['PHONE_ERR', 'PUBLIC_ERR', 'standby', 'active', 'UNKNOWN', 'scanning timeout', 'NFANDROID2', 'CM a', 'http']


100%|██████████| 9/9 [00:16<00:00,  1.88s/it]


Unnamed: 0,PHONE_ERR,PUBLIC_ERR,standby,active,UNKNOWN,scanning timeout,NFANDROID2,CM a,http
10000,0,0,0.0,0.0,0,0,0.0,0,0
10001,0,0,625.0,126.0,0,0,56.0,0,0
10002,0,0,0.0,0.0,0,0,0.0,0,0
10003,0,0,0.0,12.0,0,0,1.0,0,0
10004,0,0,5.0,7.0,0,0,0.0,0,0
...,...,...,...,...,...,...,...,...,...
24995,0,0,8.0,4.0,0,0,7.0,0,0
24996,0,0,0.0,0.0,0,0,0.0,0,0
24997,0,0,11.0,4.0,0,0,8.0,0,0
24998,0,0,0.0,1.0,0,0,0.0,0,0


## 6)-5 음수(-)인 errcode

In [32]:
# Demonstration: int() cannot parse a word-style errcode. The error is caught
# and printed so that the cell shows the message without aborting a full
# "Restart & Run All".
try:
    int('time')
except ValueError as exc:
    print(f"ValueError: {exc}")

ValueError: invalid literal for int() with base 10: 'time'

In [69]:
# Sanity check of the leading-zero pattern: re.match anchors at the start of the
# string, so '0{2,}' matches the run of two or more leading zeros in '0001'.
print(re.match('0{2,}', '0001'))

<re.Match object; span=(0, 3), match='000'>


In [73]:
def int2bInt(x):
    """Convert an errcode to ``int`` when it is a plain number, else keep it.

    Parameters
    ----------
    x : object
        A single errcode value; it is inspected through ``str(x)``.

    Returns
    -------
    object
        * ``x`` unchanged when its text contains an ASCII letter
          (e.g. ``'NFANDROID2'``, ``'B-A8002'``).
        * ``x`` unchanged when its text starts with a zero followed by another
          digit (e.g. ``'0001'``, ``'01'``), so zero-padded codes are not
          merged with their unpadded counterparts.
        * ``int`` for every other value that parses as a number
          (e.g. ``'14'`` -> ``14``, ``'-269'`` -> ``-269``, ``'5.0'`` -> ``5``).
        * ``x`` unchanged when the text is not numeric at all (e.g. ``''``),
          instead of raising ``ValueError``.
    """
    text = str(x)

    # Codes containing letters are labels, not numbers.
    if re.search('[a-zA-Z]', text):
        return x

    # Zero-padded digit codes: a leading 0 followed by another digit.
    # A lone '0' does not match and is still converted to the integer 0.
    if re.match(r'0\d', text):
        return x

    # Parse as an exact integer first so long digit strings are not rounded
    # by a detour through float.
    try:
        return int(text)
    except ValueError:
        pass

    # Fall back to float parsing for decimal forms such as '5.0'.
    try:
        return int(float(text))
    except (ValueError, OverflowError):
        # Not a number at all: leave the code untouched.
        return x

# Apply int2bInt to every unique errcode: numeric codes become ints,
# everything else is kept as-is.
int_errcode = [int2bInt(raw_code) for raw_code in errcode_uniq]

print(int_errcode)

[1, 0, 2, 'NFANDROID2', 'B-A8002', 14, 4, 13, 3, 8, 'standby', 'active', 'Q-64002', 'connection timeout', 'S-61001', 6796, 5738, 'terminate by peer user', 'connection fail to establish', 80, 79, 81, 86, 84, 'connectionterminated by local host', 77, 78, 'UNKNOWN', 85, 90, 89, 88, 'U-81009', 'V-21008', 700001, 'connection fail for LMP response timout', 'S-64002', 6467, 4893, 5507, 'J-30021', 83, 95, 94, 91, 87, 82, 93, 'S-65002', 6, 5, -269, 'Q-64001', 'H-51042', 'C-11017', 39391, 3113, 3674, 3395, 4625, 6362, 21, 3569, 'H-51046', 3065, 3014, 'scanning timeout', 3633, 3092, 3332, 92, 96, 97, 100, 105, 110, 106, 101, 3758, 5007, 76, 'H-51049', 5843, 3825, 4714, 4358, 5794, 3120, 3630, 3102, 3475, 3328, 3242, 3071, 'V-21002', 5527, 5588, 5752, 5695, 3018, 5879, 3361, 108133, 5699, 3033, 174669, 'V-21003', 'V-21004', 'V-21005', 5901, 5878, 5943, 5809, 5831, 4984, 5726, 6117, 5889, 5898, 5796, 4171, 4683, 5557, 3843, 3141, 5655, 6502, 3084, 5759, 4049, 7433, 5443, 7396, 6093, 5113, 3975, 423

- 내가 원하는 함수
    - 만약 errcode에 문자가 포함되어있거나 0으로 시작하면 그대로 두고
    - 정수면 int로 바꾼다.

In [85]:
# List the errcodes that are still plain strings after the int conversion.
for code in (item for item in int_errcode if type(item) is str):
    print(code)

NFANDROID2
B-A8002
standby
active
Q-64002
connection timeout
S-61001
terminate by peer user
connection fail to establish
connectionterminated by local host
UNKNOWN
U-81009
V-21008
connection fail for LMP response timout
S-64002
J-30021
S-65002
Q-64001
H-51042
C-11017
H-51046
scanning timeout
H-51049
V-21002
V-21003
V-21004
V-21005
P-44010
P-41011
B-51042
P-41007 
M-99999
U-82024
P-44010 
PHONE_ERR
H-51048
PUBLIC_ERR
U-82026
P-41007
C-13053
C-14014
V-21010
J-20029
J-30010
Y-00008
S-64000
En00409
E-59902
Q-73004
C-12032
J-40011
U-82023
Q-73006
D-10011
S-65
M-51007
S-64001
Y-00005
P_41007
CM a
D-99999
L2CAP connection cancelled
U-82004
En00402
B-51049
C-11020
V-21007
M-51020
En00406
C-11087
U-81000
Y-00004
C-13039
0001
U-82020
P-41011 
http
P_41001
P-41001
U-81014


In [29]:
# Inspecting the unique errcode values showed that the only negative codes
# are -269 and -270.
minus_err = ['-269', '-270']
print('음수인 errcode:', minus_err)

# One row per user, one column per negative errcode. Each cell holds the number
# of train_err rows of that user whose errcode equals the column name.
minus_vari = pd.DataFrame(index=train_err.user_id.unique())

for code in tqdm(minus_err):
    # Flag matching rows as 0/1 integers. Comparing directly (instead of
    # pd.get_dummies(...).iloc[:, 1]) keeps working when a code never occurs,
    # and int64 gives every column the same integer dtype.
    is_code = (train_err["errcode"] == code).astype("int64")
    # Per-user occurrence count of this errcode.
    counts = is_code.groupby(train_err["user_id"]).sum()
    # Align to the feature frame's index; a user absent from counts gets 0.
    minus_vari[code] = counts.reindex(minus_vari.index, fill_value=0)

display(minus_vari)

  0%|          | 0/2 [00:00<?, ?it/s]

음수인 errcode: ['-269', '-270']


100%|██████████| 2/2 [00:03<00:00,  1.97s/it]


Unnamed: 0,-269,-270
10000,0,0
10001,0,0
10002,0,0
10003,0,0
10004,0,0
...,...,...
24995,0,0
24996,0,0
24997,0,0
24998,0,0


In [30]:
# Total of the per-user '-270' counts across all users.
minus_vari.loc[:, "-270"].sum()

11

In [31]:
# Keep only the errcodes made up purely of digits. They are still strings,
# so the ordering is lexicographic ('100' comes before '2').
num_errcode = sorted(digits for digits in errcode_uniq if digits.isdigit())

print(num_errcode)

['0', '0001', '1', '100', '10005', '10018', '10043', '10073', '10080', '101', '10101', '10149', '10153', '10155', '10164', '10190', '102', '10222', '10226', '10240', '10247', '10263', '10267', '10274', '102789', '10280', '103', '10305', '10309', '103400', '10350', '10377', '10385', '103924', '104', '10406', '10412', '10420', '10433', '105', '10518', '10552', '10554', '10557', '10566', '10568', '105690', '10573', '10584', '106', '10612', '10630', '10670', '10674', '10676', '10684', '10689', '10692', '107', '10715', '10720', '10721', '10725', '10732', '10743', '10749', '10752', '10760', '10768', '10777', '10782', '10792', '108', '10803', '10807', '10812', '108133', '10814', '10819', '10827', '10830', '10833', '10838', '10839', '10841', '10851', '10852', '10861', '10881', '10894', '10898', '1089908', '109', '10902', '10907', '10908', '10933', '10934', '10938', '10945', '10949', '10955', '10961', '10972', '10980', '10995', '10997', '110', '11007', '11014', '11015', '11033', '11043', '11063

- 숫자로만 구성된 errcode를 추출하였고,
    - 기준은 1초과 10미만
    - 10번대, 20번대, 30번대, ... , 100번대
    - 세자리수
    - 네자리수
    - 다섯자리 이상
- 이렇게 총 11+3 = 14개 컬럼을 생성할 예정이었으나, 일반 숫자 errcode는 이번에는 다루지 않기로 했다. 다음에 기회가 된다면 다시 하는 걸로..