## 라이브러리 설치

In [None]:
# pip install konlpy
!git clone https://github.com/lovit/customized_konlpy.git
!pip install customized_konlpy

In [12]:
import json
import re
import pickle
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report
import ckonlpy
from ckonlpy.tag import Twitter
from ckonlpy.tag import Postprocessor
from ckonlpy.utils import load_wordset

In [None]:
twitter = ckonlpy.tag.Twitter()

In [14]:
with open('dictionary_v1.pkl', 'rb') as f:
    dict_toadd = pickle.load(f)
for i in dict_toadd:
    twitter.add_dictionary(i,'Noun')

In [None]:
# passtags = {'Noun', 'Verb', 'Adjective', 'Adverb'}
# tokenizer = Postprocessor(base_tagger=twitter, passtags=passtags) # 품사 필터링 ON

In [15]:
tokenizer = Postprocessor(base_tagger=twitter) # 품사 필터링 OFF

#### 화장품 관련 데이터 불러오기

In [6]:
#coupang = pd.read_csv('./coupang.csv')
olive = pd.read_csv('./data/olive.csv')
gp = pd.read_csv('./data/gp.csv')
pw = pd.read_csv('./data/pwroom.csv')

In [7]:
#coupang = coupang.dropna(axis=0)
olive = olive.dropna(axis=0)
gp = gp.dropna(axis=0)
pw = pw.dropna(axis=0)

In [16]:
gp = gp.rename(columns={'review_text':'review_content'})

In [17]:
#c_label = [1 for i in range(len(coupang['review_content']))]
o_label = [1 for i in range(len(olive['review_content']))]
g_label = [1 for i in range(len(gp['review_content']))]
p_label = [1 for i in range(len(pw['review_content']))]

In [18]:
#coupang['label'] = c_label
olive['label'] = o_label
gp['label'] = g_label
pw['label'] = p_label

In [19]:
#total_review = pd.concat([coupang, olive, gp, pw])
total_review = pd.concat([gp, olive, pw])

In [22]:
def only_hangul(x):
  x = x.replace('\n', ' ')
  hangul = re.compile('[^ 가-힣]+') # 한글과 띄어쓰기를 제외한 모든 글자
  res = hangul.sub('', x)
  return res

In [23]:
total_review['review_content'] = total_review['review_content'].apply(only_hangul)

In [25]:
total_train = total_review.sample(frac=1)[:40000]

In [26]:
total_train = total_train.rename(columns={'review_content':'text'})

In [28]:
train_text = total_train['text'].tolist()
train_labels = total_train['label'].tolist()

In [29]:
X = total_train['text']
y = total_train['label']

In [30]:
stopwords=['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로',
           '자','에','와','한','하다','그리고','넘','네','을','랑','예요','오','여서','이에요','데',
           '에게','에서','라서','이라서','에요','와','만','나','로','이랑','내','엔','아','부터','수',
           '때','거','다','이다','이나','에도','것','고','게','인데','제','까지','에는','엔','이라'
          '약간','오늘','점','없이','자꾸','알','있','다니','또','어','딱','걍','더','중','니','저','면','듯']

In [31]:
def tokenize_text(sample) :
    tokenized_corpus = []
    for sentence in sample:
        tmp = [i[0] for i in  tokenizer.pos(sentence)  if not i[0] in stopwords] 
        tokenized_corpus .append(' '.join(tmp))
    return tokenized_corpus

In [32]:
train_corp =tokenize_text(train_text)

In [None]:
tokenizer.pos(' 뭉치는거 없이 잘 발려요 플라스틱으로 된 브러쉬 찾고있는데')

[('뭉치', 'Noun'),
 ('는', 'Josa'),
 ('거', 'Noun'),
 ('없이', 'Adverb'),
 ('잘', 'Verb'),
 ('발려요', 'Verb'),
 ('플라스틱', 'Noun'),
 ('으로', 'Josa'),
 ('된', 'Verb'),
 ('브러쉬', 'Noun'),
 ('찾고있는데', 'Verb')]

In [33]:
drop_train = [idx for idx, sentence in enumerate(train_corp) if len(sentence) < 1]
X_train = np.delete(train_corp, drop_train, axis=0)
y_train = np.delete(train_labels, drop_train, axis=0)
print(len(X_train))
print(len(y_train))

39960
39960


In [34]:
min_count = 2
ngram_range = (1, 1)

In [35]:
vectorizer = TfidfVectorizer(min_df=min_count,  ngram_range=ngram_range)
emb = vectorizer.fit_transform(X_train)
vocab = vectorizer.vocabulary_

In [None]:
with open('vocab_final.pkl', 'wb') as f:
    pickle.dump(vocab, f )

In [37]:
def get_test_emb_with_vocabfile(corpus, min_count, ngram_range, vocab):
  new_vectorizer = TfidfVectorizer(min_df=min_count, ngram_range=ngram_range, vocabulary=vocab)
  emb = new_vectorizer.fit_transform(corpus)
  return emb

In [38]:
train_matrix = emb
#test_matrix = get_test_emb_with_vocabfile(X_test, min_count, ngram_range, vocab)

#### One-class SVM 학습 및 저장 

In [39]:
kernel = 'linear'
gamma = 'scale'

nu = 0.1395

In [77]:
clf = OneClassSVM(kernel=kernel, gamma=gamma, nu=nu)

In [78]:
clf.fit(train_matrix, y_train)
y_pred_train = clf.predict(train_matrix)

In [None]:
joblib.dump(clf, 'ocsvm_model_final.joblib')

['ocsvm_model_final.joblib']

In [None]:
y_pred_train

array([ 1,  1,  1, ...,  1, -1,  1])

## gamma, nu 변화 테스트

In [41]:
## label 1, -1 balanced
balanced_test_data = pd.read_csv('test_data_b.csv', index_col=0)
## label 1, -1 imbalanced
imbalanced_test_data = pd.read_csv('test_data_imb.csv', index_col=0)

In [42]:
# 테스트 데이터셋 한글만 남기기
balanced_test_data['text'] = balanced_test_data['text'].apply(only_hangul2)
imbalanced_test_data['text'] = imbalanced_test_data['text'].apply(only_hangul2)

In [47]:
test_text_b = balanced_test_data['text'].tolist()
test_labels_b = balanced_test_data['label'].tolist()
test_text_imb = imbalanced_test_data['text'].tolist()
test_labels_imb = imbalanced_test_data['label'].tolist()

In [62]:
def get_res(data, pred):
  results = confusion_matrix(data['label'], pred) 
  print('Confusion Matrix :')
  print(results) 
  print('Accuracy Score for test: {:.2f} %'.format(accuracy_score(data['label'], pred)*100)) 
  report = classification_report(data['label'], pred, output_dict=True)
  return pd.DataFrame(report).transpose()

In [52]:
min_count = 2
ngram_range = (1, 1)

# sample 데이터셋 리뷰들을 embedding
t_matrix_b = get_test_emb_with_vocabfile(balanced_test_data['text'], min_count, ngram_range, vocab) #balanced 데이터
t_matrix_imb = get_test_emb_with_vocabfile(imbalanced_test_data['text'], min_count, ngram_range, vocab) #imbalanced 데이터

In [40]:
# gamma =0.01, nu=0.2
clf1 = OneClassSVM(kernel=kernel, gamma=0.01, nu=0.2)
clf1.fit(train_matrix, y_train)
y_pred_train1 = clf1.predict(train_matrix)

In [66]:
y_pred_test_b1 = clf.predict(t_matrix_b)
y_pred_test_imb1 = clf.predict(t_matrix_imb)

In [63]:
get_res(balanced_test_data, y_pred_test_b1)

Confusion Matrix :
[[2242   71]
 [ 361 2326]]
Accuracy Score for test: 91.36 %


Unnamed: 0,precision,recall,f1-score,support
-1,0.861314,0.969304,0.912124,2313.0
1,0.97038,0.865649,0.915028,2687.0
accuracy,0.9136,0.9136,0.9136,0.9136
macro avg,0.915847,0.917477,0.913576,5000.0
weighted avg,0.919926,0.9136,0.913684,5000.0


In [67]:
get_res(imbalanced_test_data, y_pred_test_imb1)

Confusion Matrix :
[[ 972   12]
 [ 550 3466]]
Accuracy Score for test: 88.76 %


Unnamed: 0,precision,recall,f1-score,support
-1,0.638633,0.987805,0.775738,984.0
1,0.99655,0.863048,0.925007,4016.0
accuracy,0.8876,0.8876,0.8876,0.8876
macro avg,0.817592,0.925426,0.850372,5000.0
weighted avg,0.926112,0.8876,0.895631,5000.0


In [56]:
# gamma =0.01, nu=0.15
clf2 = OneClassSVM(kernel=kernel, gamma=0.01, nu=0.15)
clf2.fit(train_matrix, y_train)

OneClassSVM(gamma=0.01, kernel='linear', nu=0.15)

In [68]:
y_pred_test_b2 = clf2.predict(t_matrix_b)
y_pred_test_imb2 = clf2.predict(t_matrix_imb)

In [69]:
get_res(balanced_test_data, y_pred_test_b2)

Confusion Matrix :
[[2210  103]
 [ 244 2443]]
Accuracy Score for test: 93.06 %


Unnamed: 0,precision,recall,f1-score,support
-1,0.90057,0.955469,0.927208,2313.0
1,0.959544,0.909192,0.93369,2687.0
accuracy,0.9306,0.9306,0.9306,0.9306
macro avg,0.930057,0.932331,0.930449,5000.0
weighted avg,0.932263,0.9306,0.930691,5000.0


In [70]:
get_res(imbalanced_test_data, y_pred_test_imb2)

Confusion Matrix :
[[ 963   21]
 [ 367 3649]]
Accuracy Score for test: 92.24 %


Unnamed: 0,precision,recall,f1-score,support
-1,0.72406,0.978659,0.832325,984.0
1,0.994278,0.908616,0.949519,4016.0
accuracy,0.9224,0.9224,0.9224,0.9224
macro avg,0.859169,0.943637,0.890922,5000.0
weighted avg,0.941099,0.9224,0.926455,5000.0


## no stopwords

In [91]:
no_stopwords_clf = OneClassSVM(kernel=kernel, gamma=gamma, nu=nu)
no_stopwords_clf.fit(no_stop_train_matrix, y)

OneClassSVM(kernel='linear', nu=0.1395)

### 불용어 O

In [80]:
y_pred_test_b3 = clf.predict(t_matrix_b)
y_pred_test_imb3 = clf.predict(t_matrix_imb)

In [88]:
get_res(balanced_test_data, y_pred_test_b3)

Confusion Matrix :
[[2198  115]
 [ 223 2464]]
Accuracy Score for test: 93.24 %


Unnamed: 0,precision,recall,f1-score,support
-1,0.907889,0.950281,0.928602,2313.0
1,0.955409,0.917008,0.935815,2687.0
accuracy,0.9324,0.9324,0.9324,0.9324
macro avg,0.931649,0.933644,0.932208,5000.0
weighted avg,0.933426,0.9324,0.932478,5000.0


In [82]:
get_res(imbalanced_test_data, y_pred_test_imb3)

Confusion Matrix :
[[ 961   23]
 [ 332 3684]]
Accuracy Score for test: 92.90 %


Unnamed: 0,precision,recall,f1-score,support
-1,0.743233,0.976626,0.844093,984.0
1,0.993796,0.917331,0.954033,4016.0
accuracy,0.929,0.929,0.929,0.929
macro avg,0.868514,0.946978,0.899063,5000.0
weighted avg,0.944485,0.929,0.932397,5000.0


### 불용어 x

In [97]:
def tokenize_text_ns(sample) :
    tokenized_corpus = []
    for sentence in sample:
        tmp = [i[0] for i in  tokenizer.pos(sentence)] 
        tokenized_corpus .append(' '.join(tmp))
    return tokenized_corpus

In [98]:
train_corp_ns =tokenize_text_ns(train_text)

In [99]:
drop_train_ns = [idx for idx, sentence in enumerate(train_corp_ns) if len(sentence) < 1]
X_train_ns = np.delete(train_corp_ns, drop_train_ns, axis=0)
y_train_ns = np.delete(train_labels, drop_train_ns, axis=0)
print(len(X_train_ns))
print(len(y_train_ns))

39960
39960


In [100]:
emb3 = vectorizer.fit_transform(X_train_ns)
no_stop_train_matrix = emb3
vocab3 = vectorizer.vocabulary_

In [101]:
min_count = 2
ngram_range = (1, 1)

# sample 데이터셋 리뷰들을 embedding
t_matrix_b3 = get_test_emb_with_vocabfile(balanced_test_data['text'], min_count, ngram_range, vocab3) #balanced 데이터
t_matrix_imb3 = get_test_emb_with_vocabfile(imbalanced_test_data['text'], min_count, ngram_range, vocab3) #imbalanced 데이터

In [102]:
no_stopwords_clf = OneClassSVM(kernel=kernel, gamma=gamma, nu=nu)
no_stopwords_clf.fit(no_stop_train_matrix, y_train_ns)

OneClassSVM(kernel='linear', nu=0.1395)

In [103]:
y_pred_test_b5 = no_stopwords_clf.predict(t_matrix_b3)
y_pred_test_imb5 = no_stopwords_clf.predict(t_matrix_imb3)

In [104]:
get_res(balanced_test_data, y_pred_test_b5)

Confusion Matrix :
[[2218   95]
 [ 255 2432]]
Accuracy Score for test: 93.00 %


Unnamed: 0,precision,recall,f1-score,support
-1,0.896886,0.958928,0.92687,2313.0
1,0.962406,0.905099,0.932873,2687.0
accuracy,0.93,0.93,0.93,0.93
macro avg,0.929646,0.932013,0.929872,5000.0
weighted avg,0.932097,0.93,0.930096,5000.0


In [105]:
get_res(imbalanced_test_data, y_pred_test_imb5)

Confusion Matrix :
[[ 965   19]
 [ 395 3621]]
Accuracy Score for test: 91.72 %


Unnamed: 0,precision,recall,f1-score,support
-1,0.709559,0.980691,0.823379,984.0
1,0.99478,0.901643,0.945925,4016.0
accuracy,0.9172,0.9172,0.9172,0.9172
macro avg,0.85217,0.941167,0.884652,5000.0
weighted avg,0.938649,0.9172,0.921808,5000.0
