# 데이터 불러오기 및 탐색

In [1]:
import numpy as np
import pandas as pd
import string

#전처리 및 데이터 split
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the training workbook first, then the test workbook, keeping only
# Excel columns B and C from each.
df, df_test = (
    pd.read_excel(workbook_path, usecols='B,C')
    for workbook_path in (
        'data/traindata_google_appstore.xlsm',
        'data/testdata_google_2011_11.xlsm',
    )
)

In [3]:
# Rename the two loaded columns: the class label and the free-text response.
df.columns = ['label','C1']
print("\n[5줄만 보기]")
print("label 1 : 불량  | 0 : 정상 | 3 : 모름,기억안남,없음")
display(df.head(5))

print("\n[데이터 속성]")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted an extra "None" line.
df.info()

print("\n[데이터 label 갯수]")
print(df['label'].value_counts())
# Open question: are there too few defective (label 1) samples to learn from?


[5줄만 보기]
label 1 : 불량  | 0 : 정상 | 3 : 모름,기억안남,없음


Unnamed: 0,label,C1
0,1,ㅣㅣㅣㅣ
1,1,ㅠㅡㄹ레이스토어
2,1,ㅜ글스토어
3,1,ㅓㅄ음
4,0,히어로 스카이



[데이터 속성]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47581 entries, 0 to 47580
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   47581 non-null  int64 
 1   C1      47581 non-null  object
dtypes: int64(1), object(1)
memory usage: 743.6+ KB
None

[데이터 label 갯수]
0    45908
3     1202
1      471
Name: label, dtype: int64


# 전처리
클리닝 -> 함수화

**모름, 기억나지않음 등과 같은 경우는 어떤 경우엔 정상 샘플로 넘기기 때문에 label을 3으로 표시**

In [4]:
#클리닝 함수
def cleanText(text):
    """Normalize one free-text response before vectorization.

    Steps, in order: digit-only input is blanked, the text is lower-cased,
    backslashes are dropped, standalone Hangul jamo (isolated consonants and
    vowels) and the listed punctuation marks are removed, and finally the
    surrounding whitespace is trimmed.

    Args:
        text (str): Raw response text.

    Returns:
        str: The cleaned text, or the placeholder '불량' when nothing but
        whitespace is left after cleaning.
    """
    # Raw strings keep the regex escapes intact without relying on Python
    # passing unknown escapes such as "\(" through (which triggers warnings).
    jamo_pattern = r'[ㄱ-ㅎㅏ-ㅣ]+'
    symbol_pattern = r"""[-=+,#/?:^$.@*"※~&%ㆍ!』‘|()\[\]<>`'…》]"""

    if text.isdecimal():  # responses made only of digits carry no app name
        text = ''

    text = text.lower().replace('\\', '')
    text = re.sub(jamo_pattern, '', text)
    text = re.sub(symbol_pattern, '', text)

    # Trim AFTER the removals: stripping first left behind whitespace that was
    # separating removed characters (e.g. "ㅋㅋ ㅎㅎ" -> " "), which then
    # slipped past the empty-text check below.
    text = text.strip()

    if not text:
        return '불량'
    return text

In [5]:
# 인자로 입력받은 DataFrame을 복사 한 뒤 C1 컬럼 문자형 변환하고 복사된 DataFrame 반환
def get_preprocessed_df(df=None):
    """Return a cleaned copy of a two-column (label, response) DataFrame.

    The input frame is copied and left untouched. On the copy, the columns are
    renamed to 'label' and 'C1', every 'C1' value is converted to a string and
    passed through cleanText.

    Args:
        df (pandas.DataFrame): Frame with exactly two columns, label first and
            response text second. Required despite the None default.

    Returns:
        pandas.DataFrame: The copy with the cleaned 'C1' column.
    """
    df_copy = df.copy()
    df_copy.columns = ['label', 'C1']

    # Some responses are purely numeric and are loaded as numbers, so every
    # value is converted to str. Missing values are mapped to '' first:
    # str(NaN) would otherwise produce the literal text 'nan', which would be
    # kept as if it were a real answer instead of hitting cleanText's
    # empty-text placeholder.
    responses = df_copy['C1'].map(
        lambda value: '' if pd.isna(value) else str(value)
    )

    # Text cleaning: digit-only / jamo-only answers are blanked and special
    # characters are removed (see cleanText).
    df_copy['C1'] = responses.map(cleanText)

    return df_copy

In [6]:
# Clean the training frame into a new variable. The test frame is cleaned and
# rebound to the same name, so the raw test frame is replaced after this cell.
df_copy = get_preprocessed_df(df)
df_test = get_preprocessed_df(df_test)
# df_copy.isnull().sum() # Open question: should blank responses be treated as defective?

# 임베딩
-단어 수준 임베딩

    Latent Semantic Analysis
    Word2Vec
    GloVe
    FastText
    Swivel
    
-문장 수준 임베딩

    Weighted Embeddings
    Latent Semantic Analysis
    Latent Dirichlet Allocation
    Doc2Vec
    Embeddings from Language Models (ELMo)
    Bidirectional Encoder Representations from Transformer (BERT)
    
    
출처 : https://github.com/ratsgo/embedding

In [7]:
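# Vectorization - earlier inline draft, kept commented out for reference (the next cell wraps the same steps in functions)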

# vectorizer = TfidfVectorizer()
# X_features = vectorizer.fit_transform(df_copy['C1'].copy())
# df_copy['X_features'] = X_features
# y_target = df_copy['label'].copy()
# df_copy

In [8]:
# Vectorization: a single module-level TF-IDF vectorizer shared by the two
# helper functions below, so whatever is fitted by one is reused by the other.
vectorizer = TfidfVectorizer()
def vectorize_tfid_fit(df_copy):
    """Fit the shared TF-IDF vectorizer on the 'C1' text and transform it.

    Side effect: the resulting feature matrix is also assigned to an
    'X_features' column on the frame that was passed in (a later cell deletes
    that column again).

    Args:
        df_copy: Frame with a 'C1' text column and a 'label' column.

    Returns:
        tuple: (feature matrix from fit_transform, copy of the 'label' column).
    """
    texts = df_copy['C1'].copy()
    tfidf_matrix = vectorizer.fit_transform(texts)
    df_copy['X_features'] = tfidf_matrix  # mutates the caller's frame
    labels = df_copy['label'].copy()
    return tfidf_matrix, labels

def vectorize_tfid_unfit(df_copy):
    """Transform the 'C1' text with the shared vectorizer without refitting.

    Uses transform() only, so the shared vectorizer must already have been
    fitted before this is called.

    Side effect: the resulting feature matrix is also assigned to an
    'X_features' column on the frame that was passed in.

    Args:
        df_copy: Frame with a 'C1' text column and a 'label' column.

    Returns:
        tuple: (feature matrix from transform, copy of the 'label' column).
    """
    texts = df_copy['C1'].copy()
    tfidf_matrix = vectorizer.transform(texts)
    df_copy['X_features'] = tfidf_matrix  # mutates the caller's frame
    labels = df_copy['label'].copy()
    return tfidf_matrix, labels

# 데이터 나누기

In [9]:
#전체 데이터를 테스트로
#X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.2, random_state=333,stratify=y_target )

In [10]:
# Fit the TF-IDF vectorizer on the training text, then encode the test text
# with that same fitted vectorizer (transform only, no refit).
X_train,y_train =  vectorize_tfid_fit(df_copy)
X_test,y_test =  vectorize_tfid_unfit(df_test)

In [11]:
# Import LGBMClassifier from lightgbm, the Python package for LightGBM.
from lightgbm import LGBMClassifier

# n_estimators is set to 400 (the original note says this mirrors an earlier
# XGBoost experiment, which is not part of this notebook).
lgbm_wrapper = LGBMClassifier(n_estimators=400)

# The evaluation set monitored during training is the test set itself.
# NOTE(review): using the test data for early stopping / iteration selection
# means the scores reported below are likely optimistic - consider holding out
# a validation split from the training data instead.
# NOTE(review): early_stopping_rounds (400) equals n_estimators (400), so the
# patience window can never elapse before all rounds have been trained.
# NOTE(review): newer LightGBM releases moved early stopping and logging to
# callbacks; verify these fit() keyword arguments against the installed version.
evals = [(X_test, y_test)]
lgbm_wrapper.fit(X_train, y_train, early_stopping_rounds=400, eval_metric="logloss", 
                 eval_set=evals, verbose=True)
preds = lgbm_wrapper.predict(X_test)
# Keeps only column index 1 of the class-probability matrix.
# NOTE(review): presumably P(label == 1) if the classes are ordered 0, 1, 3 -
# confirm with lgbm_wrapper.classes_.
pred_proba = lgbm_wrapper.predict_proba(X_test)[:, 1]

[1]	valid_0's multi_logloss: 0.0996645
Training until validation scores don't improve for 400 rounds
[2]	valid_0's multi_logloss: 0.0928569
[3]	valid_0's multi_logloss: 0.0876338
[4]	valid_0's multi_logloss: 0.083392
[5]	valid_0's multi_logloss: 0.0799235
[6]	valid_0's multi_logloss: 0.0770033
[7]	valid_0's multi_logloss: 0.0745322
[8]	valid_0's multi_logloss: 0.072363
[9]	valid_0's multi_logloss: 0.0705533
[10]	valid_0's multi_logloss: 0.0689015
[11]	valid_0's multi_logloss: 0.0675188
[12]	valid_0's multi_logloss: 0.0662681
[13]	valid_0's multi_logloss: 0.0651776
[14]	valid_0's multi_logloss: 0.0642276
[15]	valid_0's multi_logloss: 0.0633455
[16]	valid_0's multi_logloss: 0.0625992
[17]	valid_0's multi_logloss: 0.061965
[18]	valid_0's multi_logloss: 0.0613868
[19]	valid_0's multi_logloss: 0.0608495
[20]	valid_0's multi_logloss: 0.0604197
[21]	valid_0's multi_logloss: 0.0599518
[22]	valid_0's multi_logloss: 0.0596118
[23]	valid_0's multi_logloss: 0.0592889
[24]	valid_0's multi_logloss: 

[228]	valid_0's multi_logloss: 0.0585586
[229]	valid_0's multi_logloss: 0.0585608
[230]	valid_0's multi_logloss: 0.0585629
[231]	valid_0's multi_logloss: 0.0585651
[232]	valid_0's multi_logloss: 0.0585673
[233]	valid_0's multi_logloss: 0.0585694
[234]	valid_0's multi_logloss: 0.0585716
[235]	valid_0's multi_logloss: 0.0585737
[236]	valid_0's multi_logloss: 0.0585758
[237]	valid_0's multi_logloss: 0.0585779
[238]	valid_0's multi_logloss: 0.05858
[239]	valid_0's multi_logloss: 0.0585821
[240]	valid_0's multi_logloss: 0.0585842
[241]	valid_0's multi_logloss: 0.0585862
[242]	valid_0's multi_logloss: 0.0585883
[243]	valid_0's multi_logloss: 0.0585903
[244]	valid_0's multi_logloss: 0.0585924
[245]	valid_0's multi_logloss: 0.0585944
[246]	valid_0's multi_logloss: 0.0585964
[247]	valid_0's multi_logloss: 0.0585984
[248]	valid_0's multi_logloss: 0.0586004
[249]	valid_0's multi_logloss: 0.0586024
[250]	valid_0's multi_logloss: 0.0586044
[251]	valid_0's multi_logloss: 0.0586064
[252]	valid_0's mu

# 평가

In [12]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None, average='micro'):
    """Print the confusion matrix and summary classification metrics.

    Args:
        y_test: True labels.
        pred: Predicted labels, aligned with y_test.
        pred_proba: Currently unused; accepted so existing calls keep working
            (the ROC-AUC code that would consume it is commented out below).
        average: Averaging mode forwarded to precision_score, recall_score and
            f1_score. The default 'micro' reproduces the previous output.
            Note that for single-label multiclass data, micro-averaged
            precision, recall and F1 all equal accuracy; pass 'macro' or
            'weighted' to make weak minority-class performance visible.

    Returns:
        None. The results are printed.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    # pos_label is only used with average='binary'; the former
    # pos_label='positive' was ignored with 'micro', so it is no longer passed.
    precision = precision_score(y_test, pred, average=average)
    recall = recall_score(y_test, pred, average=average)
    f1 = f1_score(y_test, pred, average=average)
    # ROC-AUC (disabled)
#     roc_auc = roc_auc_score(y_test, pred_proba, average='micro' , multi_class = 'ovo')
    print('오차 행렬')
    print(confusion)

    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},    F1: {3:.4f}'.format(
        accuracy, precision, recall, f1))

#     # Print including ROC-AUC (disabled)
#     print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},\
#     F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))

In [13]:
# Report the confusion matrix and summary metrics for the test-set predictions.
get_clf_eval(y_test, preds, pred_proba)

오차 행렬
[[2142    0    2]
 [  26    0    1]
 [   7    0   28]]
정확도: 0.9837, 정밀도: 0.9837, 재현율: 0.9837,    F1: 0.9837


In [14]:
# # Visualize feature importance with plot_importance() (currently disabled)
# from lightgbm import plot_importance
# import matplotlib.pyplot as plt
# %matplotlib inline

# fig, ax = plt.subplots(figsize=(10, 12))
# plot_importance(lgbm_wrapper, ax=ax)

In [15]:
# Build the result table as a NEW frame instead of aliasing df_test and
# deleting a column from it in place. The alias meant df_test itself was
# mutated, and re-running the cell failed with KeyError because 'X_features'
# had already been removed.
df_test2 = (
    df_test
    .assign(preds=preds, pred_proba=pred_proba)
    # errors='ignore': tolerate the helper column being absent
    .drop(columns=['X_features'], errors='ignore')
)
df_test2

Unnamed: 0,label,C1,preds,pred_proba
0,0,1원 스토어,0,0.001117
1,0,2048 파스포푸트,0,0.098851
2,0,app,0,0.012952
3,0,app store,0,0.000399
4,0,app store,0,0.000399
...,...,...,...,...
2201,0,한게임,0,0.005123
2202,0,한게임,0,0.005123
2203,0,한게임,0,0.005123
2204,0,헤이데이,0,0.098851


In [17]:
# Save the prediction table to an Excel workbook, on a sheet named "result".
df_test2.to_excel('google_result_20201213.xlsx',sheet_name='result')