# ======================

# Notes

    > 우선 시스템의 '품질적 변화'가 있는지 체크해야 함.
      (즉, quality_data를 분석해서 '상태 정보'를 확인)
    > 이를 위해서 Train Data에서  [log에 대한 변화]와 [고객의 불만 접수]의 관계를 찾아야 함
    > 이 관계성을 학습한 모델을 이용해서 Test 에서 Quality 체크하고,
      변화에 따라 '어떤' 고객이 불만을 접수할 것인지 예측하는 프로세스

[퀄리디 데이터에 대한 분석]
* Q) 퀄리티 로그는 언제, 왜 발생하는가?
    * 링크 : https://dacon.io/competitions/official/235687/talkboard/402309?page=1&dtype=recent&ptype=pub
    * 위 글을 보면, 힌트 없이 개인의 능력으로 퀄리티 로그에 대한 분석이 이 대회의 핵심이라고도 볼 수 있다
* Q) 어떤 데이터를 분석해야 하는가?
    * https://dacon.io/competitions/official/235687/talkboard/402316?page=1&dtype=recent&ptype=pub
    * err_data : 주요 변수 찾기 + err 간 연관성 찾기(선-후 관계)
    * quality_data : 주요 변수 찾기 + quality의 임계 값 찾기
    * err_data와 quality_data의 연관성 찾기
* Q) '설명 가능한' 모델을 구축하려면?
    * 모형의 복잡도를 가볍게 가져가면서도 성능지표는 높여야 함
    * 마냥 모델 구조를 깊고 복잡하게 하기보다는 핵심이 되는 변수를 넣고, 파라미터 튜닝하는 방식으로
* 어떠한 퀄리티 로그를 에러로 볼 것인가?
* 에러라고 정의하는 

# ======================

In [1]:
import pandas as pd
import numpy as np

# train_err_data

* train_err_data : 시스템에 발생한 에러 로그
    * user_id : 10,000 ~ 24,999... 총 15,000명
    * time : 2020년 11월 한 달간의 시간(최초=1일 02시 56분 16초, 최후=30일 21시 06분 25초)
    * model_nm : 모델 번호 0~8까지, 총 9개
    * fwver : 펌웨어버전 총 37개
    * errtype : 에러 타입 총 41개
    * errode : 에러 코드 총 2806개

In [2]:
PATH = './data/'

train_err = pd.read_csv(PATH+'train_err_data.csv')
train_err.head()

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,10000,20201101025616,model_3,05.15.2138,15,1
1,10000,20201101030309,model_3,05.15.2138,12,1
2,10000,20201101030309,model_3,05.15.2138,11,1
3,10000,20201101050514,model_3,05.15.2138,16,1
4,10000,20201101050515,model_3,05.15.2138,4,0


In [10]:
train_err.tail()

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
16554658,24999,20201130163051,model_3,05.15.2138,15,1
16554659,24999,20201130172625,model_3,05.15.2138,16,1
16554660,24999,20201130172625,model_3,05.15.2138,4,0
16554661,24999,20201130172631,model_3,05.15.2138,4,0
16554662,24999,20201130210625,model_3,05.15.2138,15,1


In [4]:
train_err['user_id'].unique()

array([10000, 10001, 10002, ..., 24997, 24998, 24999], dtype=int64)

In [5]:
train_err['time'].unique()

array([20201101025616, 20201101030309, 20201101050514, ...,
       20201118123022, 20201129070715, 20201129070741], dtype=int64)

In [6]:
train_err['model_nm'].unique()

array(['model_3', 'model_2', 'model_0', 'model_1', 'model_7', 'model_4',
       'model_5', 'model_8', 'model_6'], dtype=object)

In [7]:
train_err['fwver'].unique()

array(['05.15.2138', '04.33.1185', '04.33.1261', '04.22.1750',
       '04.22.1778', '04.16.3553', '04.33.1149', '04.16.3571',
       '05.66.3237', '05.66.3571', '03.11.1149', '04.22.1684',
       '03.11.1167', '04.82.1684', '04.82.1778', '04.33.1171',
       '04.73.2237', '10', '04.82.1730', '04.73.2571', '8.5.3',
       '04.22.1666', '03.11.1141', '05.15.2120', '04.33.1125',
       '04.16.3439', '04.22.1442', '04.33.1095', '04.16.3569',
       '05.15.2090', '05.15.3104', '05.15.2122', '04.22.1656',
       '04.16.2641', '05.15.2114', '04.16.3345', '05.15.2092'],
      dtype=object)

In [12]:
len(train_err['fwver'].unique())

37

In [8]:
train_err['errtype'].unique()

array([15, 12, 11, 16,  4, 26, 10, 14,  3, 13,  6,  7, 27, 28, 31, 33, 22,
       41,  5, 40, 34, 37, 36, 23, 18, 20, 19, 21, 42, 24, 25, 17, 38, 39,
       35, 32, 30,  2,  1,  9,  8], dtype=int64)

In [13]:
len(train_err['errtype'].unique())

41

In [9]:
train_err['errcode'].unique()

array(['1', '0', '2', ..., '4526', '3965', '25999'], dtype=object)

In [14]:
len(train_err['errcode'].unique())

2806

# train_problem_data

* 사용자 불만이 접수된 시간에 대한 정보
    * user_id : 사용자 ID... 5000명의 고객이 불만을 접수함
    * time : 불만 접수 시간
* train_err_data의 rows : 16,554,662
* train_problem_data rows : 15,932

In [24]:
train_prob = pd.read_csv(PATH+'train_problem_data.csv')
train_prob.head()

Unnamed: 0,user_id,time
0,19224,20201102200000
1,23664,20201116140000
2,15166,20201114130000
3,12590,20201108210000
4,15932,20201103210000


In [36]:
len(train_prob['user_id'].unique())

5000

In [25]:
train_prob.tail()

Unnamed: 0,user_id,time
5424,20167,20201125120000
5425,16270,20201110120000
5426,19114,20201106230000
5427,21505,20201104110000
5428,18822,20201102120000


# train_quality_data
* 시스템 퀄리티 로그
    * time : 로그 발생 시간
    * user_id : 사용자 ID
    * fwver	: 펌웨어 버전
    * quality_0~12 : 퀄리티 상태에 대한 로그

In [39]:
train_quality = pd.read_csv(PATH+'train_quality_data.csv')
train_quality

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,time,user_id,fwver,quality_0,quality_1,quality_2,quality_3,quality_4,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12
0,20201129090000,10000,05.15.2138,0.0,0,0.0,0,0,0,0,0,0,0,4,0,0
1,20201129090000,10000,05.15.2138,0.0,0,0.0,0,0,0,0,0,0,0,4,0,0
2,20201129090000,10000,05.15.2138,0.0,0,0.0,0,0,0,0,0,0,0,4,0,0
3,20201129090000,10000,05.15.2138,0.0,0,0.0,0,0,0,0,0,0,0,4,0,0
4,20201129090000,10000,05.15.2138,0.0,0,0.0,0,0,0,0,0,0,0,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
828619,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,1,0,0,0,0,17,0,0
828620,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,0,0,0,0,0,17,0,0
828621,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,3,0,0,0,0,17,0,0
828622,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,0,0,0,0,0,17,0,0


In [52]:
train_quality.describe()

Unnamed: 0,time,user_id,quality_0,quality_1,quality_2,quality_3,quality_4,quality_6,quality_11,quality_12
count,828624.0,828624.0,684192.0,828624.0,788511.0,828624.0,828624.0,828624.0,828624.0,828624.0
mean,20201120000000.0,17574.678315,4.148701,-0.171782,4.751094,0.0,0.0,2.043391,-0.181638,0.045878
std,8863638.0,4374.113554,479.315029,0.692386,586.252469,0.0,0.0,32.69538,0.397767,0.302452
min,20201030000000.0,10000.0,-1.0,-1.0,-1.0,0.0,0.0,-1.0,-1.0,0.0
25%,20201110000000.0,13685.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,20201120000000.0,17597.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,20201120000000.0,21423.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,20201130000000.0,24997.0,157667.0,171.0,191859.0,0.0,0.0,600.0,14.0,14.0


In [53]:
train_quality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 828624 entries, 0 to 828623
Data columns (total 16 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   time        828624 non-null  int64  
 1   user_id     828624 non-null  int64  
 2   fwver       788544 non-null  object 
 3   quality_0   684192 non-null  float64
 4   quality_1   828624 non-null  int64  
 5   quality_2   788511 non-null  float64
 6   quality_3   828624 non-null  int64  
 7   quality_4   828624 non-null  int64  
 8   quality_5   828604 non-null  object 
 9   quality_6   828624 non-null  int64  
 10  quality_7   828624 non-null  object 
 11  quality_8   828624 non-null  object 
 12  quality_9   828624 non-null  object 
 13  quality_10  828624 non-null  object 
 14  quality_11  828624 non-null  int64  
 15  quality_12  828624 non-null  int64  
dtypes: float64(2), int64(8), object(6)
memory usage: 101.2+ MB


In [40]:
train_quality.head()

Unnamed: 0,time,user_id,fwver,quality_0,quality_1,quality_2,quality_3,quality_4,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12
0,20201129090000,10000,05.15.2138,0.0,0,0.0,0,0,0,0,0,0,0,4,0,0
1,20201129090000,10000,05.15.2138,0.0,0,0.0,0,0,0,0,0,0,0,4,0,0
2,20201129090000,10000,05.15.2138,0.0,0,0.0,0,0,0,0,0,0,0,4,0,0
3,20201129090000,10000,05.15.2138,0.0,0,0.0,0,0,0,0,0,0,0,4,0,0
4,20201129090000,10000,05.15.2138,0.0,0,0.0,0,0,0,0,0,0,0,4,0,0


In [31]:
train_quality.tail()

Unnamed: 0,time,user_id,fwver,quality_0,quality_1,quality_2,quality_3,quality_4,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12
828619,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,1,0,0,0,0,17,0,0
828620,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,0,0,0,0,0,17,0,0
828621,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,3,0,0,0,0,17,0,0
828622,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,0,0,0,0,0,17,0,0
828623,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,9,0,0,0,0,17,0,0


In [38]:
len(train_quality['user_id'].unique())

8281

In [48]:
t_q_min = [0] * 13
t_q_max = [0] * 13

for i in range(13) : 
    idx = 'quality_' + str(i)
    print(min(train_quality[idx]))
#     t_q_min[i] = min(train_quality[idx])
#     t_q_max[i] = train_quality[idx].max()
    
print(t_q_min)
print(t_q_max)

-1.0
-1
-1.0
0
0


TypeError: '<' not supported between instances of 'float' and 'str'

In [51]:
t_q_sorted = train_quality.sort_values(['time'])
t_q_sorted

Unnamed: 0,time,user_id,fwver,quality_0,quality_1,quality_2,quality_3,quality_4,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12
586114,20201031235000,20747,05.15.2138,0.0,0,0.0,0,0,0,0,0,0,0,5,0,0
153487,20201031235000,12729,04.33.1185,0.0,0,0.0,0,0,0,0,0,4,0,0,0,0
153488,20201031235000,12729,04.33.1185,0.0,0,0.0,0,0,0,0,0,4,0,0,0,0
153489,20201031235000,12729,04.33.1185,0.0,0,0.0,0,0,0,0,0,4,0,0,0,0
153490,20201031235000,12729,04.33.1185,0.0,0,0.0,0,0,0,0,0,4,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242062,20201130234000,14371,04.33.1261,-1.0,-1,-1.0,0,0,-1,-1,0,0,0,3,-1,0
242061,20201130234000,14371,04.33.1261,-1.0,-1,-1.0,0,0,-1,-1,0,0,0,3,-1,0
242060,20201130234000,14371,04.33.1261,-1.0,-1,-1.0,0,0,-1,-1,0,0,0,3,-1,0
377609,20201130234000,16861,04.16.3553,-1.0,-1,-1.0,0,0,-1,-1,0,0,0,7,-1,0
