# 데이터 불러오기 및 탐색

In [249]:
import numpy as np
import pandas as pd
import string

#전처리 및 데이터 split
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [250]:
df = pd.read_excel('data/traindata_google_appstore.xlsm',usecols='B,C')

In [251]:
df.columns = ['label','C1'] #열 이름 변경
print("\n[5줄만 보기]")
print("label 1 : 불량  | 0 : 정상 | 3 : 모름,기억안남,없음")
display(df.head(5))

print("\n[데이터 속성]")
print(df.info())

print("\n[데이터 label 갯수]")
print(df['label'].value_counts())
#학습할 불량샘플이 너무 작은게 아닐까..?


[5줄만 보기]
label 1 : 불량  | 0 : 정상 | 3 : 모름,기억안남,없음


Unnamed: 0,label,C1
0,1,ㅣㅣㅣㅣ
1,1,ㅠㅡㄹ레이스토어
2,1,ㅜ글스토어
3,1,ㅓㅄ음
4,0,히어로 스카이



[데이터 속성]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47581 entries, 0 to 47580
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   47581 non-null  int64 
 1   C1      47581 non-null  object
dtypes: int64(1), object(1)
memory usage: 743.6+ KB
None

[데이터 label 갯수]
0    45908
3     1202
1      471
Name: label, dtype: int64


# 전처리
클리닝 -> 함수화

**모름, 기억나지않음 등과 같은 경우는 어떤 경우엔 정상 샘플로 넘기기 때문에 label을 3으로 표시**

In [252]:
#클리닝 함수
def cleanText(text):
    repl =''
       
    if text.isdecimal() : #숫자로만 구성되어 있을 때 공백 치환
        text = ''
    
    text = text.lower() #영어일경우 소문자로 변경
    text = text.strip().replace('\\','')# 기호 치환
    
    pattern = '([ㄱ-ㅎㅏ-ㅣ]+)' # 자음, 모음 제거
    text = re.sub(pattern= pattern, repl=repl, string=text)
          
    pattern = '[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]' # 특수기호 제거
    text = re.sub(pattern= pattern, repl=repl, string=text)
    
    if len(text)==0 :
        text = '불량'
    #text = text.replace(' ','불량')# 모든 공백 제거 이건 아닌것 같아서 삭제
    
    return text

In [253]:
# 인자로 입력받은 DataFrame을 복사 한 뒤 C1 컬럼 문자형 변환하고 복사된 DataFrame 반환
def get_preprocessed_df(df=None):
    df_copy = df.copy()
    df_copy.columns = ['label','C1'] #열 이름 변경
           
    #1. 응답에 숫자도 있어서 int 형으로 처리됌 -> 문자형으로 변환
    df_text = df_copy['C1'].apply(str)   
       
    #2. 텍스트 클리닝 (자음/모음/숫자만으로 이루어진 경우 빈칸처리, 특수기호 제거)
    df_text = df_text.map(lambda x: cleanText(x))
    
    #3.. 더 있나?
        
    #마지막에 다시 대입
    df_copy['C1'] = df_text
    
    return df_copy

In [254]:
df_copy = get_preprocessed_df(df)
# df_copy.isnull().sum() #공백은 불량으로 처리..?
df_copy

Unnamed: 0,label,C1
0,1,불량
1,1,레이스토어
2,1,글스토어
3,1,음
4,0,히어로 스카이
...,...,...
47576,1,불량
47577,1,불량
47578,1,불량
47579,1,불량


# 임베딩
-단어 수준 임베딩

    Latent Semantic Analysis
    Word2Vec
    GloVe
    FastText
    Swivel
    
-문장 수준 임베딩

    Weighted Embeddings
    Latent Semantic Analysis
    Latent Dirichlet Allocation
    Doc2Vec
    Embeddings from Language Models (ELMo)
    Bidirectional Encoder Representations from Transformer (BERT)
    
    
출처 : https://github.com/ratsgo/embedding

In [313]:
#벡터화
vectorizer = TfidfVectorizer()

X_features = vectorizer.fit_transform(df_copy['C1'].copy())
df_copy['X_features'] = X_features
y_target = df_copy['label'].copy()
df_copy

Unnamed: 0,label,C1,X_features
0,1,불량,"(0, 1185)\t1.0\n (1, 934)\t1.0\n (2, 717)\..."
1,1,레이스토어,"(0, 1185)\t1.0\n (1, 934)\t1.0\n (2, 717)\..."
2,1,글스토어,"(0, 1185)\t1.0\n (1, 934)\t1.0\n (2, 717)\..."
3,1,음,"(0, 1185)\t1.0\n (1, 934)\t1.0\n (2, 717)\..."
4,0,히어로 스카이,"(0, 1185)\t1.0\n (1, 934)\t1.0\n (2, 717)\..."
...,...,...,...
47576,1,불량,"(0, 1185)\t1.0\n (1, 934)\t1.0\n (2, 717)\..."
47577,1,불량,"(0, 1185)\t1.0\n (1, 934)\t1.0\n (2, 717)\..."
47578,1,불량,"(0, 1185)\t1.0\n (1, 934)\t1.0\n (2, 717)\..."
47579,1,불량,"(0, 1185)\t1.0\n (1, 934)\t1.0\n (2, 717)\..."


# 데이터 나누기

In [326]:
X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.2, random_state=333,stratify=y_target )

In [330]:
X_train =X_features
X_test = X_features

y_train =y_target
y_test = y_target

In [331]:
# LightGBM의 파이썬 패키지인 lightgbm에서 LGBMClassifier 임포트
from lightgbm import LGBMClassifier

# 앞서 XGBoost와 동일하게 n_estimators는 400 설정. 
lgbm_wrapper = LGBMClassifier(n_estimators=400)

# LightGBM도 XGBoost와 동일하게 조기 중단 수행 가능. 
evals = [(X_test, y_test)]
lgbm_wrapper.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="logloss", 
                 eval_set=evals, verbose=True)
preds = lgbm_wrapper.predict(X_test)
pred_proba = lgbm_wrapper.predict_proba(X_test)[:, 1]

[1]	training's multi_logloss: 0.0963368
Training until validation scores don't improve for 100 rounds
[2]	training's multi_logloss: 0.0892729
[3]	training's multi_logloss: 0.0841041
[4]	training's multi_logloss: 0.0798861
[5]	training's multi_logloss: 0.0763638
[6]	training's multi_logloss: 0.0733774
[7]	training's multi_logloss: 0.0708301
[8]	training's multi_logloss: 0.0686303
[9]	training's multi_logloss: 0.0667258
[10]	training's multi_logloss: 0.0650585
[11]	training's multi_logloss: 0.063596
[12]	training's multi_logloss: 0.0623044
[13]	training's multi_logloss: 0.0611617
[14]	training's multi_logloss: 0.0601477
[15]	training's multi_logloss: 0.0592418
[16]	training's multi_logloss: 0.0584318
[17]	training's multi_logloss: 0.0577061
[18]	training's multi_logloss: 0.0570529
[19]	training's multi_logloss: 0.0564575
[20]	training's multi_logloss: 0.0559189
[21]	training's multi_logloss: 0.0554312
[22]	training's multi_logloss: 0.0549846
[23]	training's multi_logloss: 0.0545747
[24]	

[212]	training's multi_logloss: 0.0475513
[213]	training's multi_logloss: 0.0475512
[214]	training's multi_logloss: 0.0475512
[215]	training's multi_logloss: 0.0475512
[216]	training's multi_logloss: 0.0475512
[217]	training's multi_logloss: 0.0475511
[218]	training's multi_logloss: 0.0475511
[219]	training's multi_logloss: 0.0475511
[220]	training's multi_logloss: 0.0475511
[221]	training's multi_logloss: 0.047551
[222]	training's multi_logloss: 0.047551
[223]	training's multi_logloss: 0.047551
[224]	training's multi_logloss: 0.0475509
[225]	training's multi_logloss: 0.0475509
[226]	training's multi_logloss: 0.0475509
[227]	training's multi_logloss: 0.0475509
[228]	training's multi_logloss: 0.0475508
[229]	training's multi_logloss: 0.0475508
[230]	training's multi_logloss: 0.0475508
[231]	training's multi_logloss: 0.0475508
[232]	training's multi_logloss: 0.0475507
[233]	training's multi_logloss: 0.0475507
[234]	training's multi_logloss: 0.0475507
[235]	training's multi_logloss: 0.047

In [336]:
df_copy['preds'] = preds
df_copy['pred_proba'] = pred_proba

df_copy2 = df_copy
del df_copy2['X_features']

In [337]:
df_copy2

Unnamed: 0,label,C1,preds,pred_proba
0,1,불량,1,0.997583
1,1,레이스토어,0,0.106690
2,1,글스토어,0,0.106690
3,1,음,0,0.106690
4,0,히어로 스카이,0,0.106690
...,...,...,...,...
47576,1,불량,1,0.997583
47577,1,불량,1,0.997583
47578,1,불량,1,0.997583
47579,1,불량,1,0.997583


In [338]:
#엑셀 저장
df_copy2.to_excel('google_result_20201213.xlsx',sheet_name='result')

# 평가

In [333]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix( y_test, pred)
    accuracy = accuracy_score(y_test , pred)
    precision = precision_score(y_test , pred, pos_label='positive', average='micro')
    recall = recall_score(y_test , pred, pos_label='positive', average='micro')
    f1 = f1_score(y_test,pred, average='micro')
    # ROC-AUC 추가 
#     roc_auc = roc_auc_score(y_test, pred_proba, average='micro' , multi_class = 'ovo')
    print('오차 행렬')
    print(confusion)
    
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},\
    F1: {3:.4f}'.format(accuracy, precision, recall, f1))
    
#     # ROC-AUC print 추가
#     print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},\
#     F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))

In [334]:
get_clf_eval(y_test, preds, pred_proba)

오차 행렬
[[45907     0     1]
 [  408    63     0]
 [  276     0   926]]
정확도: 0.9856, 정밀도: 0.9856, 재현율: 0.9856,    F1: 0.9856


In [284]:
# # plot_importance( )를 이용하여 feature 중요도 시각화
# from lightgbm import plot_importance
# import matplotlib.pyplot as plt
# %matplotlib inline

# fig, ax = plt.subplots(figsize=(10, 12))
# plot_importance(lgbm_wrapper, ax=ax)