# 데이터 불러오기 및 탐색

In [1]:
import numpy as np
import pandas as pd
import string

#전처리 및 데이터 split
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

def get_label(df=None):
    df_copy = df.copy()
    df_copy.columns = ['label','C1'] #열 이름 변경
    return df_copy

In [2]:
df_google = pd.read_excel('data/traindata_google_appstore.xlsm',usecols='B,C')
df_brandname = pd.read_excel('data/traindata_car_brandname.xlsx',usecols='B,C')
df_comname = pd.read_excel('data/traindata_Company_name.xlsm',usecols='B,C')
df_cosme = pd.read_excel('data/traindata_cosmetics.xlsx',usecols='B,C')
df_finance = pd.read_excel('data/traindata_finance.xlsx',usecols='B,C')
df_obs = pd.read_excel('data/traindata_outback.xlsm',usecols='B,C')

In [3]:
data_list = [df_google,df_brandname,df_comname,df_cosme,df_finance,df_obs]
data_list_labeing = []
for df_cat in data_list:
    df_cat2 = get_label(df_cat)
    data_list_labeing.append(df_cat2)
    
df = pd.concat(data_list_labeing)

In [4]:

print("\n[5줄만 보기]")
print("label 1 : 불량  | 0 : 정상 | 3 : 모름,기억안남,없음")
display(df.head(5))

print("\n[데이터 속성]")
print(df.info())

print("\n[데이터 label 갯수]")
print(df['label'].value_counts())
#학습할 불량샘플이 너무 작은게 아닐까..?


[5줄만 보기]
label 1 : 불량  | 0 : 정상 | 3 : 모름,기억안남,없음


Unnamed: 0,label,C1
0,1,ㅣㅣㅣㅣ
1,1,ㅠㅡㄹ레이스토어
2,1,ㅜ글스토어
3,1,ㅓㅄ음
4,0,히어로 스카이



[데이터 속성]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 104096 entries, 0 to 6260
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   label   104096 non-null  int64 
 1   C1      104096 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.4+ MB
None

[데이터 label 갯수]
0    101387
3      1425
1      1284
Name: label, dtype: int64


# 전처리
클리닝 -> 함수화

**모름, 기억나지않음 등과 같은 경우는 어떤 경우엔 정상 샘플로 넘기기 때문에 label을 3으로 표시**

In [5]:
#클리닝 함수
def cleanText(text):
    repl =''
       
    if text.isdecimal() : #숫자로만 구성되어 있을 때 공백 치환
        text = ''
    
    text = text.lower() #영어일경우 소문자로 변경
    text = text.strip().replace('\\','')# 기호 치환
    
    pattern = '([ㄱ-ㅎㅏ-ㅣ]+)' # 자음, 모음 제거
    text = re.sub(pattern= pattern, repl=repl, string=text)
          
    pattern = '[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]' # 특수기호 제거
    text = re.sub(pattern= pattern, repl=repl, string=text)
    
    if len(text)==0 :
        text = '불량'
    #text = text.replace(' ','불량')# 모든 공백 제거 이건 아닌것 같아서 삭제
    
    return text

In [6]:
# 인자로 입력받은 DataFrame을 복사 한 뒤 C1 컬럼 문자형 변환하고 복사된 DataFrame 반환



def get_preprocessed_df(df=None):
    df_copy = df.copy()
               
    #1. 응답에 숫자도 있어서 int 형으로 처리됌 -> 문자형으로 변환
    df_text = df_copy['C1'].apply(str)   
       
    #2. 텍스트 클리닝 (자음/모음/숫자만으로 이루어진 경우 빈칸처리, 특수기호 제거)
    df_text = df_text.map(lambda x: cleanText(x))
    
    #3.. 더 있나?
        
    #마지막에 다시 대입
    df_copy['C1re'] = df_text
    
    return df_copy

In [7]:
df_copy = get_preprocessed_df(df)
# df_copy.isnull().sum() #공백은 불량으로 처리..?

# 임베딩
-단어 수준 임베딩

    Latent Semantic Analysis
    Word2Vec
    GloVe
    FastText
    Swivel
    
-문장 수준 임베딩

    Weighted Embeddings
    Latent Semantic Analysis
    Latent Dirichlet Allocation
    Doc2Vec
    Embeddings from Language Models (ELMo)
    Bidirectional Encoder Representations from Transformer (BERT)
    
    
출처 : https://github.com/ratsgo/embedding

In [8]:
#벡터화

# vectorizer = TfidfVectorizer()
# X_features = vectorizer.fit_transform(df_copy['C1'].copy())
# df_copy['X_features'] = X_features
# y_target = df_copy['label'].copy()
# df_copy

In [9]:
#벡터화
vectorizer = TfidfVectorizer(min_df=0.0, analyzer='char', sublinear_tf=True, ngram_range=(1,5))
def vectorize_tfid_fit(df_copy):    
    X_features = vectorizer.fit_transform(df_copy['C1re'].copy())
    df_copy['X_features'] = X_features
    y_target = df_copy['label'].copy()
    return X_features , y_target



# 데이터 나누기

In [16]:
#전체 데이터를 테스트로
X_features, y_target = vectorize_tfid_fit(df_copy)
X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.2, random_state=333,stratify=y_target )

In [17]:
# X_train,y_train =  vectorize_tfid_fit(df_copy)
# X_test,y_test =  vectorize_tfid_unfit(df_test)

In [18]:
# LightGBM의 파이썬 패키지인 lightgbm에서 LGBMClassifier 임포트
from lightgbm import LGBMClassifier

# 앞서 XGBoost와 동일하게 n_estimators는 400 설정. 
lgbm_wrapper = LGBMClassifier(n_estimators=400,class_weight = 'balanced', )

# LightGBM도 XGBoost와 동일하게 조기 중단 수행 가능. 
evals = [(X_features, y_target)]
lgbm_wrapper.fit(X_features, y_target, early_stopping_rounds=100, eval_metric="logloss", 
                 eval_set=evals, verbose=True)
preds = lgbm_wrapper.predict(X_test)
pred_proba = lgbm_wrapper.predict_proba(X_test)[:, 1]



[1]	training's multi_logloss: 0.930253
Training until validation scores don't improve for 100 rounds
[2]	training's multi_logloss: 0.79833
[3]	training's multi_logloss: 0.692454
[4]	training's multi_logloss: 0.605687
[5]	training's multi_logloss: 0.533576
[6]	training's multi_logloss: 0.472707
[7]	training's multi_logloss: 0.421477
[8]	training's multi_logloss: 0.377678
[9]	training's multi_logloss: 0.340122
[10]	training's multi_logloss: 0.308022
[11]	training's multi_logloss: 0.280338
[12]	training's multi_logloss: 0.256293
[13]	training's multi_logloss: 0.23544
[14]	training's multi_logloss: 0.217127
[15]	training's multi_logloss: 0.20087
[16]	training's multi_logloss: 0.186836
[17]	training's multi_logloss: 0.17426
[18]	training's multi_logloss: 0.162977
[19]	training's multi_logloss: 0.15312
[20]	training's multi_logloss: 0.144301
[21]	training's multi_logloss: 0.136391
[22]	training's multi_logloss: 0.129455
[23]	training's multi_logloss: 0.123249
[24]	training's multi_logloss: 0

[202]	training's multi_logloss: 0.0266388
[203]	training's multi_logloss: 0.0265946
[204]	training's multi_logloss: 0.0265467
[205]	training's multi_logloss: 0.0264843
[206]	training's multi_logloss: 0.026441
[207]	training's multi_logloss: 0.0264024
[208]	training's multi_logloss: 0.026366
[209]	training's multi_logloss: 0.026331
[210]	training's multi_logloss: 0.0262923
[211]	training's multi_logloss: 0.0262571
[212]	training's multi_logloss: 0.0262245
[213]	training's multi_logloss: 0.0261907
[214]	training's multi_logloss: 0.0261545
[215]	training's multi_logloss: 0.0261217
[216]	training's multi_logloss: 0.0260781
[217]	training's multi_logloss: 0.026041
[218]	training's multi_logloss: 0.0260083
[219]	training's multi_logloss: 0.02597
[220]	training's multi_logloss: 0.0259259
[221]	training's multi_logloss: 0.0258951
[222]	training's multi_logloss: 0.0258594
[223]	training's multi_logloss: 0.0258339
[224]	training's multi_logloss: 0.0258005
[225]	training's multi_logloss: 0.025773

# 평가

In [19]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix( y_test, pred)
    accuracy = accuracy_score(y_test , pred)
    precision = precision_score(y_test , pred, pos_label='positive', average='micro')
    recall = recall_score(y_test , pred, pos_label='positive', average='micro')
    f1 = f1_score(y_test,pred, average='micro')
    # ROC-AUC 추가 
#     roc_auc = roc_auc_score(y_test, pred_proba, average='micro' , multi_class = 'ovo')
    print('오차 행렬')
    print(confusion)
    
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},\
    F1: {3:.4f}'.format(accuracy, precision, recall, f1))
    
#     # ROC-AUC print 추가
#     print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},\
#     F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))

In [20]:
get_clf_eval(y_test, preds, pred_proba)

오차 행렬
[[20246    32     0]
 [    8   248     1]
 [    0     0   285]]
정확도: 0.9980, 정밀도: 0.9980, 재현율: 0.9980,    F1: 0.9980


# 모델 저장

In [21]:
import pickle
import joblib

with open('model_google.pkl', 'wb') as file:  
    pickle.dump(lgbm_wrapper, file)

#벡터라이즈도 저장해줘야함!!!!
with open('TVectorizer.pkl', 'wb') as file:  
    pickle.dump(vectorizer, file)

In [22]:
#End