# 데이터 불러오기 및 탐색

In [35]:
import numpy as np
import pandas as pd
import string

#전처리 및 데이터 split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings("ignore")

In [67]:
df = pd.read_excel('data/google_data_all_20201208.xlsm',usecols='B,C')
#df.head()


In [71]:
# 인자로 입력받은 DataFrame을 복사 한 뒤 C1 컬럼 문자형 변환하고 복사된 DataFrame 반환
def get_preprocessed_df(df=None):
    df_copy = df.copy()
    df_copy.columns = ['C1','label']
      
    #응답에 숫자도 있어서 int 형으로 생각함 -> 문자형으로 변환
    df_copy['C1'] = df_copy['C1'].apply(str)
    
    print("\n[5줄만 보기]")
    print("label 1 : 불량  | 0 : 정상")
    display(df_copy.head(8))
    
    print("\n[데이터 속성 탐색]")
    print(df.info())

    print("-"*100)
    print("[데이터 label 갯수]")
    print(df_copy['label'].value_counts())
    #학습할 불량샘플이 너무 작은게 아닐까..?
    
    
    return df_copy

In [72]:
get_preprocessed_df(df)


[5줄만 보기]
label 1 : 불량  | 0 : 정상


Unnamed: 0,C1,label
0,ㅣㅣㅣㅣ,1
1,ㅠㅡㄹ레이스토어,1
2,ㅜ글스토어,1
3,ㅓㅄ음,1
4,히어로 스카이,0
5,히든씨티,0
6,히든씨티,0
7,희비전,0



[데이터 속성 탐색]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47581 entries, 0 to 47580
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   C1[1].slice  47581 non-null  object
 1   True_False   47581 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 743.6+ KB
None
----------------------------------------------------------------------------------------------------
[데이터 label 갯수]
0    47134
1      447
Name: label, dtype: int64


Unnamed: 0,C1,label
0,ㅣㅣㅣㅣ,1
1,ㅠㅡㄹ레이스토어,1
2,ㅜ글스토어,1
3,ㅓㅄ음,1
4,히어로 스카이,0
...,...,...
47576,0,1
47577,0,1
47578,0,1
47579,0,1


**모름, 기억나지않음과 같은 경우는 어떤 경우엔 정상 샘플로 넘기기 때문에 우선은 정상샘플(0)로 표시

# 데이터 전처리

-단어 수준 임베딩

    Latent Semantic Analysis
    Word2Vec
    GloVe
    FastText
    Swivel
    
-문장 수준 임베딩

    Weighted Embeddings
    Latent Semantic Analysis
    Latent Dirichlet Allocation
    Doc2Vec
    Embeddings from Language Models (ELMo)
    Bidirectional Encoder Representations from Transformer (BERT)
    
    
출처 : https://github.com/ratsgo/embedding

In [73]:
# ### FastText 학습
# from gensim.models import FastText
# ft_model = FastText(result, size=100, window=5, min_count=5, workers=4, sg=1)

# 데이터 나누기

In [79]:
# 사전 데이터 가공 후 학습과 테스트 데이터 세트를 반환하는 함수.
def get_train_test_dataset(df=None):
    # 인자로 입력된 DataFrame의 사전 데이터 가공이 완료된 복사 DataFrame 반환
    df_copy = get_preprocessed_df(df)
    # DataFrame의 맨 마지막 컬럼이 레이블, 나머지는 피처들
    
    vectorizer = CountVectorizer(min_df=1)

    
    
    
    #벡터화
    vectorizer = TfidfVectorizer()
        
    X_features = vectorizer.fit_transform(df_copy['C1'].copy()) 
    y_target = df_copy['label'].copy()
    # X_features = df_copy.iloc[:, :-1]
    # y_target = df_copy.iloc[:, -1]
    
    
    # train_test_split( )으로 학습과 테스트 데이터 분할. stratify=y_target으로 Stratified 기반 분할
    X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.3, random_state=333,stratify=y_target )
    # 학습과 테스트 데이터 세트 반환
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = get_train_test_dataset(df)


[5줄만 보기]
label 1 : 불량  | 0 : 정상


Unnamed: 0,C1,label
0,ㅣㅣㅣㅣ,1
1,ㅠㅡㄹ레이스토어,1
2,ㅜ글스토어,1
3,ㅓㅄ음,1
4,히어로 스카이,0
5,히든씨티,0
6,히든씨티,0
7,희비전,0



[데이터 속성 탐색]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47581 entries, 0 to 47580
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   C1[1].slice  47581 non-null  object
 1   True_False   47581 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 743.6+ KB
None
----------------------------------------------------------------------------------------------------
[데이터 label 갯수]
0    47134
1      447
Name: label, dtype: int64


In [80]:
print('학습 데이터 레이블 값 비율')
print(y_train.value_counts())#/y_train.shape[0] * 100)
print('테스트 데이터 레이블 값 비율')
print(y_test.value_counts())#/y_test.shape[0] * 100)
X_train.astype

학습 데이터 레이블 값 비율
0    32993
1      313
Name: label, dtype: int64
테스트 데이터 레이블 값 비율
0    14141
1      134
Name: label, dtype: int64


<bound method _data_matrix.astype of <33306x2336 sparse matrix of type '<class 'numpy.float64'>'
	with 39750 stored elements in Compressed Sparse Row format>>

In [81]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score

In [105]:
svc = SVC(kernel='sigmoid', gamma=1.0, probability=True)
knc = KNeighborsClassifier(n_neighbors=49)
mnb = MultinomialNB(alpha=0.2)
dtc = DecisionTreeClassifier(min_samples_split=7, random_state=111)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=31, random_state=111)
abc = AdaBoostClassifier(n_estimators=62, random_state=111)
bc = BaggingClassifier(n_estimators=9, random_state=111)
etc = ExtraTreesClassifier(n_estimators=9, random_state=111)


clfs = {'SVC' : svc,'KN' : knc, 'NB': mnb, 'DT': dtc, 'LR': lrc, 'RF': rfc, 'AdaBoost': abc, 'BgC': bc, 'ETC': etc}
clfs = {'SVC' : svc}
def train_classifier(clf, X_train, y_train):    
    clf.fit(X_train, y_train)
def predict_labels(clf, X_test):
    return (clf.predict(X_test))
def predict_proba(clf, X_test):
    return (clf.predict_proba(X_test)[:, 1])

# 'X_train, X_test, y_train, y_test'
# features_train features_test labels_train labels_test
pred_scores = []
for k,v in clfs.items():
    train_classifier(v, X_train, y_train)
    pred = predict_labels(v,X_test)
    pred_scores.append((k, [accuracy_score(y_test,pred)]))
    #pred_proba = predict_proba(v,X_test)
    #pred_scores.append((k, [accuracy_score(y_test,pred),pred_proba]))


In [107]:
import matplotlib.pyplot as plt
%matplotlib inline

score=pd.DataFrame.from_dict(dict(pred_scores),orient='index',columns=['Score'])
print(score)
# score.plot(kind='bar', ylim=(0.9,1.0), figsize=(11,6), align='center', colormap="Accent")
# plt.xticks(np.arange(9), df.index)
# plt.ylabel('Accuracy Score')
# plt.title('Distribution by Classifier')
# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

        Score
SVC  0.991384


In [108]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix( y_test, pred)
    accuracy = accuracy_score(y_test , pred)
    precision = precision_score(y_test , pred)
    recall = recall_score(y_test , pred)
    f1 = f1_score(y_test,pred)
    # ROC-AUC 추가 
    roc_auc = roc_auc_score(y_test, pred_proba)
    print('오차 행렬')
    print(confusion)
    # ROC-AUC print 추가
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},\
    F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))

In [109]:
# 3장에서 사용한 get_clf_eval() 함수를 이용하여 평가 수행. 
# get_clf_eval(y_test, pred, pred_proba)

오차 행렬
[[14140     1]
 [  122    12]]
정확도: 0.9914, 정밀도: 0.9231, 재현율: 0.0896,    F1: 0.1633, AUC:0.9539


In [110]:
# 인자로 사이킷런의 Estimator객체와, 학습/테스트 데이터 세트를 입력 받아서 학습/예측/평가 수행.
def get_model_train_eval(model, ftr_train=None, ftr_test=None, tgt_train=None, tgt_test=None):
    model.fit(ftr_train, tgt_train)
    pred = model.predict(ftr_test)
    pred_proba = model.predict_proba(ftr_test)[:, 1]
    get_clf_eval(tgt_test, pred, pred_proba)


abc = AdaBoostClassifier(n_estimators=62, random_state=111)
get_model_train_eval(abc, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)

오차 행렬
[[14141     0]
 [  124    10]]
정확도: 0.9913, 정밀도: 1.0000, 재현율: 0.0746,    F1: 0.1389, AUC:0.9539


NameError: name 'result' is not defined

# 평가