In [1]:
# Basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# NLP
import nltk
from soynlp.normalizer import *
from hanspell import spell_checker
from konlpy.tag import Okt

# ML
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

In [2]:
# 파일 불러오기

review_df = pd.read_csv('./MK_review_proceed_data.csv', encoding='utf-8', index_col=0) # df_review는 전체년도 리뷰 크롤링 파일
review_df.head(1)

Unnamed: 0,date,review,tokenized_review,tokenized_removed_review,reviews_for_vectorize,noun_tokenized_review,noun_tokenized_removed_review,noun_reviews_for_vectorize,rating,label
0,2023-09-10,상세 설명에 들어가지 않아도 검색 결과에서 중량과 가격이 바로 확인되면 더 편리하겠습니다,"['상세', '설명', '에', '들어가지', '않아도', '검색', '결과', '...","['상세', '설명', '들어가지', '않아도', '검색', '결과', '중량', ...",상세 설명 들어가지 않아도 검색 결과 중량 가격 확인 되면 더 편리하겠습니다,"['상세', '설명', '검색', '결과', '중량', '가격', '바로', '확인...","['상세', '설명', '검색', '결과', '중량', '가격', '확인', '더']",상세 설명 검색 결과 중량 가격 확인 더,5,1.0


In [34]:
##### model
import lightgbm as lgbm
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

##### sampling
from imblearn.over_sampling import SMOTE

##### pipeline
from imblearn.pipeline import Pipeline


##### confusion_matrix
from sklearn.metrics import confusion_matrix

##### dataset
X = review_df['reviews_for_vectorize']; y = review_df['label']


##### split train and test data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13, stratify=y)


##### cross-validation
skfold = StratifiedKFold(n_splits=5)


##### pipeline model
pipe_LGBM = Pipeline([
                ('vec', TfidfVectorizer(decode_error='ignore')),
                ('fit', LGBMClassifier())])

pipe_Smote_LGBM = Pipeline([
                ('vec', TfidfVectorizer(decode_error='ignore')),
                ('sampling', SMOTE(random_state=12)),
                ('fit', LGBMClassifier())])


##### set grid params

grid_params_LGBM = [{
    "vec__min_df" : [7],
    'fit__objective' : ['binary'],    
    'fit__boosting_type' : ['gbdt'],
    'fit__metric' : ['binary_logloss'],
    'fit__learning_rate' : [0.1],
    'fit__n_estimators' : [100],
    'fit__max_depth' : [-1],
    'fit__num_leaves' : [32],
    'fit__min_child_samples' : [50],
    'fit__force_col_wise' : ['True']          #or 'fit__force_row_wise' 
}]

grid_params_Smote_LGBM = [{
    "vec__min_df" : [7],
    "sampling__sampling_strategy" : ['auto'],
    "sampling__k_neighbors" : [5],
    'fit__objective' : ['binary'],    
    'fit__boosting_type' : ['gbdt'],
    'fit__metric' : ['binary_logloss'],
    'fit__learning_rate' : [0.1],
    'fit__n_estimators' : [100],
    'fit__max_depth' : [-1],
    'fit__num_leaves' : [32],
    'fit__min_child_samples' : [50],
    'fit__force_col_wise' : ['True']          #or 'fit__force_row_wise' 
}]

##### fit
pipe = [pipe_LGBM, pipe_Smote_LGBM ]
params = [grid_params_LGBM, grid_params_Smote_LGBM]

jobs = 20

grid_dict = {
    0: 'LGBM',
    1: 'Smote_LGBM',
            }

model_object = {}; model_acc = {}; model_roc_auc = {}; model_f1 = {}; model_best_params = {}; model_CM ={};

#1. scoring = 'neg_mean_absolute_error' 로 설정.
# 일반적으로 scoring을 값이 클 수록 모델 성능이 좋은 것으로 사이킷런에서 인식하는데, 
# mae는 값이 클 수록 모델 성능이 저하되는 것이므로 Negative 키워드를 붙여서 사용

for idx , (param , model) in enumerate(zip(params , pipe)) :
    search = GridSearchCV(model, param, scoring  = "f1", cv=skfold,
                           n_jobs=jobs , verbose=True ) #  verbose가 있으면 함수 수행시 발생하는 상세한 정보 출력
    search.fit(X_train , y_train)
    
    y_pred_train = search.predict(X_train)
    y_pred_test = search.predict(X_test)
       
    model_object[grid_dict.get(idx)] = search        
    model_acc[grid_dict.get(idx)] = accuracy_score(y_test, y_pred_test)  
    model_roc_auc[grid_dict.get(idx)] = roc_auc_score(y_test, y_pred_test)
    model_f1[grid_dict.get(idx)] = f1_score(y_test, y_pred_test)

    #model_confM_train = confusion_matrix(y_train, y_pred_train) # (X_test -> y_pred)
    #model_confM_test = confusion_matrix(y_test, y_pred_test) # (X_test -> y_pred)
    model_CM[grid_dict.get(idx)] =  confusion_matrix(y_test, y_pred_test) # (X_test -> y_pred)

    model_best_params[grid_dict.get(idx)] = search.best_params_
    
print("finish")

fig ,ax = plt.subplots(figsize=(20, 10))
sns.set(font_scale = 2)
output = pd.DataFrame([model_f1.keys() , model_f1.values()], index = ["algo","f1"]).T # .T : 행 렬 변환
output.sort_values(["f1"], ascending= False ,inplace=True)
ax = sns.barplot(y="algo", x="f1", data=output)
plt.show()

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[LightGBM] [Info] Number of positive: 6368, number of negative: 1556
[LightGBM] [Info] Total Bins 10966
[LightGBM] [Info] Number of data points in the train set: 7924, number of used features: 229
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.803635 -> initscore=1.409167
[LightGBM] [Info] Start training from score 1.409167


AttributeError: 'LGBMClassifier' object has no attribute 'score_samples'

In [33]:
model_feat_nums['LGBM']

<bound method BaseEstimator.get_params of GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('vec',
                                        TfidfVectorizer(decode_error='ignore')),
                                       ('fit', LGBMClassifier())]),
             n_jobs=20,
             param_grid=[{'fit__boosting_type': ['gbdt'],
                          'fit__force_col_wise': ['True'],
                          'fit__learning_rate': [0.1], 'fit__max_depth': [-1],
                          'fit__metric': ['binary_logloss'],
                          'fit__min_child_samples': [50],
                          'fit__n_estimators': [100], 'fit__num_leaves': [32],
                          'fit__objective': ['binary'], 'vec__min_df': [7]}],
             scoring='f1', verbose=True)>

In [12]:
model_acc

{'LGBM': 0.9041942604856512, 'Smote_LGBM': 0.9077262693156732}

In [13]:
model_f1

{'LGBM': 0.9421178981061616, 'Smote_LGBM': 0.9430051813471502}

In [14]:
model_roc_auc

{'LGBM': 0.8020187677491047, 'Smote_LGBM': 0.8424157303370786}

In [9]:
CM_LGBM = model_CM['LGBM']
CM_Smote_LGBM = model_CM['Smote_LGBM']

In [10]:
CM_LGBM_acc_0 = (CM_LGBM[0][0]) / (CM_LGBM[0][0] + CM_LGBM[0][1]) #정확도
CM_LGBM_acc_1 = (CM_LGBM[1][1]) / (CM_LGBM[1][0] + CM_LGBM[1][1]) #정확도

CM_LGBM_acc_1, CM_LGBM_acc_0

(0.9703296703296703, 0.6337078651685393)

In [11]:
CM_Smote_LGBM_acc_0 = (CM_Smote_LGBM[0][0]) / (CM_Smote_LGBM[0][0] + CM_Smote_LGBM[0][1]) #정확도
CM_Smote_LGBM_acc_1 = (CM_Smote_LGBM[1][1]) / (CM_Smote_LGBM[1][0] + CM_Smote_LGBM[1][1]) #정확도

CM_Smote_LGBM_acc_1, CM_Smote_LGBM_acc_0

(0.95, 0.7348314606741573)