<a href="https://colab.research.google.com/github/KyoungmiKwon/Bigdata_Training_at_ITwill/blob/main/Team_Project/C05_KM_1_ML_total.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 벡터화
import logging # word2vec 학습과정에서 로그 메세지를 양식에 맞는 info 수준으로 볼 수 있다.
from gensim.models import word2vec # word2vec

from sklearn.feature_extraction.text import TfidfVectorizer # TF-IDF
from gensim.models.doc2vec import TaggedDocument,Doc2Vec # Doc2vec

# 머신러닝
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score

from sklearn.svm import SVC

from sklearn.neighbors import KNeighborsClassifier 

from sklearn.linear_model import LogisticRegression

from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV,RandomizedSearchCV # 하이퍼 파라미터 튜닝
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# 데이터불러오기

In [None]:
# Preprocessed review workbook, stored under /content/drive (the Colab Google Drive mount point).
file_path = '/content/drive/MyDrive/Colab Notebooks/Project/C03_KM_Text_Preprocessing_2.xlsx'
# Load the 'all_resize' sheet, using its first row as the column header.
data = pd.read_excel(file_path,sheet_name='all_resize',header=0)

In [None]:
# Row count of the loaded sheet (not displayed: it is not the last expression of the cell).
len(data)
# Column used as the training corpus for Word2Vec and Doc2Vec below.
# NOTE(review): the Word2Vec training log reports only 452 word types over ~1M raw words,
# which suggests each entry is a plain string that gets iterated character by character
# rather than a list of tokens — confirm the column format and tokenize if needed.
text = data['seperate']

# Word2Vec

## 벡터화

In [None]:
def get_vector(words, model, num_features):
    """Return the mean word vector of a single review.

    Args:
        words: Iterable of tokens that make up one review.
        model: Trained gensim model exposing ``model.wv.index2word`` (the
            vocabulary) and ``model.wv[token]`` (the vector of a token).
        num_features: Dimensionality chosen when the embedding was trained.

    Returns:
        Array of shape ``(num_features,)`` holding the average of the vectors
        of all in-vocabulary tokens. When no token of the review is in the
        vocabulary, a zero vector is returned instead of NaN.
    """
    keyed_vectors = model.wv
    # Set lookup keeps the per-token membership test O(1).
    vocabulary = set(keyed_vectors.index2word)

    # Running sum of the vectors of every in-vocabulary token.
    vec = np.zeros((num_features), dtype=np.float32)
    num_words = 0

    for w in words:
        if w in vocabulary:
            # Count every matched token so the division below yields a true mean.
            num_words += 1
            # Index through .wv: subscripting the model object itself is deprecated.
            vec = np.add(vec, keyed_vectors[w])

    # Avoid 0/0 -> NaN, which downstream scikit-learn estimators reject.
    if num_words == 0:
        return vec

    return np.divide(vec, num_words)

In [None]:
def get_vector_dataset(news, model, num_features):
    """Vectorize a whole review data set, one averaged vector per review.

    Args:
        news: Iterable of reviews; each review is passed as-is to ``get_vector``.
        model: Trained embedding model forwarded to ``get_vector``.
        num_features: Embedding dimensionality forwarded to ``get_vector``.

    Returns:
        The per-review vectors stacked along a new first axis with ``np.stack``.
    """
    return np.stack([get_vector(review, model, num_features) for review in news])

In [None]:
# Configure logging so training progress messages are shown at INFO level.
logging.basicConfig(format='%(asctime)s: %(levelname)s : %(message)s',level=logging.INFO)
# Hyperparameters used when training the embedding
n_worker = 4        # number of workers, default = 3
n_size = 600        # dimensionality of the word vectors, default = 100
min_cnt = 10        # minimum frequency for a word to be kept, default = 5
context =  5        # size of the context window, default = 5
d_sampling = 0.001  # down-sampling threshold, default = 0.001

In [None]:
# Train the Word2Vec embedding on the corpus with the hyperparameters set above.
# NOTE(review): `size` is the pre-4.0 gensim keyword (renamed `vector_size` in gensim 4) —
# confirm the gensim version this notebook is pinned to.
md_word = word2vec.Word2Vec(text,
                       workers = n_worker,
                       size = n_size,
                       min_count = min_cnt,
                       window = context,
                       sample = d_sampling)

2021-04-30 05:12:15,413: INFO : collecting all words and their counts
2021-04-30 05:12:15,419: INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-04-30 05:12:15,574: INFO : collected 452 word types from a corpus of 1006005 raw words and 1352 sentences
2021-04-30 05:12:15,578: INFO : Loading a fresh vocabulary
2021-04-30 05:12:15,582: INFO : effective_min_count=10 retains 452 unique words (100% of original 452, drops 0)
2021-04-30 05:12:15,585: INFO : effective_min_count=10 leaves 1006005 word corpus (100% of original 1006005, drops 0)
2021-04-30 05:12:15,593: INFO : deleting the raw counts dictionary of 452 items
2021-04-30 05:12:15,595: INFO : sample=0.001 downsamples 29 most-common words
2021-04-30 05:12:15,597: INFO : downsampling leaves estimated 372884 word corpus (37.1% of prior 1006005)
2021-04-30 05:12:15,602: INFO : estimated required memory for 452 words and 600 dimensions: 2395600 bytes
2021-04-30 05:12:15,604: INFO : resetting layer weights
2021-0

In [None]:
# Save the embedding model.
# Encoding the hyperparameter settings in the file name makes it easy to look them up later.
md_name = "W2V__4_workers__600_size__10_mincnt"
md_word.save(md_name)

# The saved model can be reused via Word2Vec.load().

2021-04-30 05:12:22,496: INFO : saving Word2Vec object under W2V__4_workers__600_size__10_mincnt, separately None
2021-04-30 05:12:22,499: INFO : not storing attribute vectors_norm
2021-04-30 05:12:22,501: INFO : not storing attribute cum_table
2021-04-30 05:12:22,539: INFO : saved W2V__4_workers__600_size__10_mincnt


In [None]:
# One averaged word vector per review.
data_vec_word = get_vector_dataset(text, md_word, n_size) # (data set, model, embedding dimension)
len(data_vec_word)



1352

In [None]:
# Three-class setup: negative / neutral / positive.
# Note: `y` is reassigned by the binary-label cell that follows if that cell is also run.
X = data_vec_word
y = np.array(data['S'])

In [None]:
# Binary setup: positive vs. not positive — label 0 is folded into -1.
# (presumably 0 = neutral and -1 = negative; verify against the 'S' column encoding)
y_1 = data['S'].replace(0,-1)
y = np.array(y_1)

## 0. 셋트분리

In [None]:
# for Word2Vec
# Hold out 20% as a stratified test set. The fixed random_state makes the split,
# and therefore every score reported below, reproducible across kernel restarts.
X_tr, X_ts, y_tr, y_ts = train_test_split(X, y,
                                          test_size = 0.2,
                                          stratify = y,
                                          random_state = 42)

## 일반 머신러닝 모델 성능

In [None]:
# Create the five base learners.
# max_iter is raised from the default of 100: on these features lbfgs stopped at the
# iteration limit ("TOTAL NO. of ITERATIONS REACHED LIMIT") before converging.
log_clf = LogisticRegression(max_iter=1000)
sgd_clf = SGDClassifier(loss='log')  # logistic loss
svm_clf = SVC(probability=True)      # enable probability estimates
tree_clf = DecisionTreeClassifier()
knn_clf = KNeighborsClassifier()

In [None]:
# Fit each base learner on the Word2Vec training split and report its test accuracy.
classifiers = [log_clf, sgd_clf, svm_clf, tree_clf, knn_clf]
for clf in classifiers:
    clf.fit(X_tr, y_tr)  # train the model
    acc = clf.score(X_ts, y_ts)  # evaluate the model on the test split
    print(clf.__class__.__name__, 'accuracy:', acc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression accuracy: 0.6678966789667896
SGDClassifier accuracy: 0.6162361623616236
SVC accuracy: 0.6014760147601476
DecisionTreeClassifier accuracy: 0.6199261992619927
KNeighborsClassifier accuracy: 0.6309963099630996


## Voting Ensemble

In [None]:
# Build a soft-voting ensemble over the five base learners.
vote_clf = VotingClassifier(estimators=[('log', log_clf), 
                                        ('sgd', sgd_clf), 
                                        ('svm', svm_clf), 
                                        ('tree', tree_clf), 
                                        ('knn', knn_clf)], 
                            voting='soft', 
                            n_jobs=-1)

In [None]:
# Train the voting ensemble on the Word2Vec training split.
vote_clf.fit(X_tr, y_tr)

VotingClassifier(estimators=[('log',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('sgd',
                              SGDClassifier(alpha=0.0001, average=False,
                                            class_weight=None,
                                            early_stopp...
                                                     min_

In [None]:
# Evaluate the voting ensemble on the test split.
vote_clf.score(X_ts, y_ts)

0.6642066420664207

## Bagging Ensemble

In [None]:
# Create the Bagging classifier: 50 decision trees, each fit on a bootstrap sample
# of 100 rows, with the out-of-bag score recorded.
bagging_clf = BaggingClassifier(base_estimator=tree_clf, 
                                n_estimators=50,
                                max_samples=100,
                                bootstrap=True,
                                oob_score=True,
                                n_jobs=-1
                                )

In [None]:
# Train the Bagging classifier.
bagging_clf.fit(X_tr, y_tr)

BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort='deprecated',
                                                        random_state=None,


In [None]:
# Evaluate the Bagging classifier on the test split.
bagging_clf.score(X_ts, y_ts)

0.6605166051660517

In [None]:
# OOB score
bagging_clf.oob_score_  #> usually close to the test-set score.

0.6577243293246994

## RandomForest

In [None]:
# Create the random forest: 50 trees, each fit on 100 sampled rows, with the
# out-of-bag score recorded.
forest_clf = RandomForestClassifier(n_estimators=50,
                                    max_samples=100,
                                    oob_score=True,
                                    n_jobs=-1
                                    )

In [None]:
# Train the random forest.
forest_clf.fit(X_tr, y_tr)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=100,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
                       oob_score=True, random_state=None, verbose=0,
                       warm_start=False)

In [None]:
# Evaluate on the test split.
forest_clf.score(X_ts, y_ts)

0.6199261992619927

In [None]:
# Evaluate on the out-of-bag samples.
forest_clf.oob_score_  

0.6641998149861239

## Gradient Boosting

In [None]:
# Create the gradient boosting model with default hyperparameters.
gb_clf = GradientBoostingClassifier()

In [None]:
# Train the gradient boosting model.
gb_clf.fit(X_tr, y_tr)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
# Evaluate on the test split.
gb_clf.score(X_ts, y_ts)

0.6531365313653137

## XGBoost

In [None]:
# Create the XGBoost classifier using the histogram-based tree method.
xgb_clf = XGBClassifier(tree_method = 'hist')

In [None]:
# Train XGBoost on the Word2Vec training split.
xgb_clf.fit(X_tr, y_tr)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, tree_method='hist', verbosity=1)

In [None]:
# Evaluate XGBoost on the Word2Vec test split.
xgb_clf.score(X_ts, y_ts)

0.6789667896678967

# Doc2Vec

## 벡터화

In [None]:
# Wrap each review in a TaggedDocument tagged with its position in `text`;
# this list is the training input passed to Doc2Vec below.
doc = []
for i, d in enumerate(text):
    a = TaggedDocument(words = d, tags = [i])
    doc.append(a)

In [None]:
# Hyperparameters used for training (same values as for Word2Vec)
n_worker = 4        # number of workers, default = 3
n_size = 600        # dimensionality of the word vectors, default = 100
min_cnt = 10        # minimum frequency for a word to be kept, default = 5
context =  5        # size of the context window, default = 5
d_sampling = 0.001  # down-sampling threshold, default = 0.001

In [None]:
# Train the Doc2Vec model on the tagged documents.
md_doc = Doc2Vec(doc,
                 workers = n_worker,   
                 min_count = min_cnt,  
                 window = context,     
                 vector_size = n_size, 
                 negative = 5,  #> number of negative samples; larger values are slower
                 epochs = 40,
                 sample = d_sampling) 

In [None]:
# Save the model; the file name records the hyperparameter settings.
md_name = "D2V__4_workers__600_size__10_mincnt"
md_doc.save(md_name)

2021-04-30 05:44:40,656: INFO : saving Doc2Vec object under D2V__4_workers__600_size__10_mincnt, separately None
2021-04-30 05:44:40,713: INFO : saved D2V__4_workers__600_size__10_mincnt


In [None]:
# NOTE(review): this averages the Doc2Vec model's *word* vectors per review via
# get_vector; the document vectors learned for the TaggedDocument tags are not used.
# Confirm whether the learned document vectors were the intended features.
data_vec_doc = get_vector_dataset(text, md_doc, n_size) # (data set, model, embedding dimension)

In [None]:
# Feature matrix for the Doc2Vec experiments.
X_d = data_vec_doc

## 셋트분리

In [None]:
# for Doc2Vec
# Hold out 20% as a stratified test set. The fixed random_state makes the split,
# and therefore every score reported below, reproducible across kernel restarts.
X_tr_d, X_ts_d, y_tr_d, y_ts_d = train_test_split(X_d, y,
                                                  test_size = 0.2,
                                                  stratify = y,
                                                  random_state = 42)

## 일반 머신러닝 모델 성능

In [None]:
# Create the five base learners.
# max_iter is raised from the default of 100: on these features lbfgs stopped at the
# iteration limit ("TOTAL NO. of ITERATIONS REACHED LIMIT") before converging.
log_clf = LogisticRegression(max_iter=1000)
sgd_clf = SGDClassifier(loss='log')  # logistic loss
svm_clf = SVC(probability=True)      # enable probability estimates
tree_clf = DecisionTreeClassifier()
knn_clf = KNeighborsClassifier()

In [None]:
# Fit each base learner on the Doc2Vec training split and report its test accuracy.
classifiers = [log_clf, sgd_clf, svm_clf, tree_clf, knn_clf]
for clf in classifiers:
    clf.fit(X_tr_d, y_tr_d)  # train the model
    # Score on the Doc2Vec test split: the Word2Vec split (X_ts, y_ts) comes from a
    # different feature space and a different shuffle, so its rows and labels do not
    # correspond to what these models were trained on.
    acc = clf.score(X_ts_d, y_ts_d)  # evaluate the model
    print(clf.__class__.__name__, 'accuracy:', acc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression accuracy: 0.5018450184501845
SGDClassifier accuracy: 0.5018450184501845
SVC accuracy: 0.5018450184501845
DecisionTreeClassifier accuracy: 0.4981549815498155
KNeighborsClassifier accuracy: 0.5461254612546126


## Gradient Boosting

In [None]:
# Gradient boosting with default hyperparameters, trained on the Doc2Vec training split.
gb_clf = GradientBoostingClassifier()
gb_clf.fit(X_tr_d, y_tr_d)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
# Evaluate on the Doc2Vec test split.
gb_clf.score(X_ts_d, y_ts_d)

0.6383763837638377

## XGBoost

In [None]:
# XGBoost (histogram-based tree method), trained on the Doc2Vec training split.
xgb_clf = XGBClassifier(tree_method = 'hist')
xgb_clf.fit(X_tr_d, y_tr_d)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, tree_method='hist', verbosity=1)

In [None]:
# Evaluate on the Doc2Vec test split.
xgb_clf.score(X_ts_d, y_ts_d)

0.6494464944649446

# TF-IDF

## 벡터화

In [None]:
# Word-level TF-IDF over 1- to 3-grams with sublinear tf scaling,
# capped at 300 features.
vectorizer = TfidfVectorizer(
            min_df = 0.0,
            analyzer = "word",
            sublinear_tf = True,
            ngram_range = (1,3),
            max_features = 300
        )

In [None]:
# Fit the vectorizer on the full data set and transform it to a TF-IDF matrix.
# NOTE(review): this uses the 'Text' column, whereas the Word2Vec/Doc2Vec sections use
# 'seperate' — confirm that the different input column is intended.
X_TI = vectorizer.fit_transform(list(data['Text']))

## 셋트분리

In [None]:
# for TF-IDF
X_tr_t, X_ts_t,y_tr_t,y_ts_t = train_test_split(X_TI,y,
                                          test_size = 0.2,
                                          stratify = y)

## 일반 머신러닝 모델 성능

In [None]:
# Create the five machine learning models (learners).
log_clf = LogisticRegression()
sgd_clf = SGDClassifier(loss='log')
svm_clf = SVC(probability=True)
tree_clf = DecisionTreeClassifier()
knn_clf = KNeighborsClassifier()

In [None]:
# Fit each base learner on the TF-IDF training split and report its test accuracy.
classifiers = [log_clf, sgd_clf, svm_clf, tree_clf, knn_clf]
for clf in classifiers:
    clf.fit(X_tr_t, y_tr_t)  # train the model
    acc = clf.score(X_ts_t, y_ts_t)  # evaluate the model on the test split
    print(clf.__class__.__name__, 'accuracy:', acc)

LogisticRegression accuracy: 0.7195571955719557
SGDClassifier accuracy: 0.7047970479704797
SVC accuracy: 0.7195571955719557
DecisionTreeClassifier accuracy: 0.6826568265682657
KNeighborsClassifier accuracy: 0.6789667896678967


## Gradient Boosting

In [None]:
# Gradient boosting with default hyperparameters, trained on the TF-IDF training split.
gb_clf = GradientBoostingClassifier()
gb_clf.fit(X_tr_t, y_tr_t)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
# Evaluate on the TF-IDF test split.
gb_clf.score(X_ts_t, y_ts_t)

0.6900369003690037

## XGBoost

In [None]:
# XGBoost (histogram-based tree method), trained on the TF-IDF training split.
xgb_clf = XGBClassifier(tree_method = 'hist')
xgb_clf.fit(X_tr_t, y_tr_t)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, tree_method='hist', verbosity=1)

In [None]:
# Evaluate on the TF-IDF test split.
xgb_clf.score(X_ts_t, y_ts_t)

0.6789667896678967