## 라이브러리 설치

In [None]:
#mount - 코랩 사용시
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd /content/drive/MyDrive/Colab Notebooks/ifamily

/content/drive/MyDrive/Colab Notebooks/ifamily


In [1]:
import json
import re
import pickle
import numpy as np
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report

In [2]:
min_count = 2
ngram_range = (1, 1)

## 테스트 데이터셋 로드
- 아래 두 개 중 하나 골라서 로드
- 첫 번째 파일은 label balanced data
- 두 번째 파일은 label imbalanced data



In [3]:
## label 1, -1 balanced
balanced_test_data = pd.read_csv('test_data_b.csv', index_col=0)

In [4]:
## label 1, -1 imbalanced
imbalanced_test_data = pd.read_csv('test_data_imb.csv', index_col=0)

In [5]:
# 자음,모음을 제외한 한글만 추출하는 함수
def only_hangul(x):
  x = x.replace('\n', ' ')
  hangul = re.compile('[^ 가-힣]+')
  res = hangul.sub('', x)
  return res

In [6]:
# 테스트 데이터셋 한글만 남기기
balanced_test_data['text'] = balanced_test_data['text'].apply(only_hangul)
imbalanced_test_data['text'] = imbalanced_test_data['text'].apply(only_hangul)

In [7]:
# 데이터셋 샘플 추출
balanced_test_data_sampled = balanced_test_data.sample(frac=0.05)
imbalanced_test_data_sampled = imbalanced_test_data.sample(frac=0.05)

#### 학습된 모델의 tfidvector 결과와 차원 동일하게 맞춰주기 위한 함수 추가

In [8]:
# corpus= 리뷰 데이터 , min_count= 특정 단어가 나타나는 '문서(리뷰)의 수' , ngram_range = 단어 묶음 범위
def get_test_emb_with_vocabfile(corpus, min_count, ngram_range):
    new_vectorizer = TfidfVectorizer(min_df=min_count, ngram_range=ngram_range, vocabulary=pickle.load(open("vocab_final.pkl", "rb")))
    emb = new_vectorizer.fit_transform(corpus)
    return emb

In [9]:
# sample 데이터셋 리뷰들을 embedding
t_matrix_sampled_b = get_test_emb_with_vocabfile(balanced_test_data_sampled['text'], min_count, ngram_range) #balanced 데이터
t_matrix_sampled_imb = get_test_emb_with_vocabfile(imbalanced_test_data_sampled['text'], min_count, ngram_range) #imbalanced 데이터

## 학습 모델 로드

In [10]:
clf_loaded = joblib.load('ocsvm_model_final.joblib')

## 모델 검증

balanced 데이터

In [12]:
# Accuracy 및 Confusion Matrix 확인
print('Data shape: ')
print(t_matrix_sampled_b.shape)
print('Settings: ')
print('min_count: ', min_count)
print('ngram_range: ', ngram_range)
y_pred_test = clf_loaded.predict(t_matrix_sampled_b) #sample데이터 예측 값
results = confusion_matrix(balanced_test_data_sampled['label'], y_pred_test) 
print('Confusion Matrix :')
print(results) 
print('Accuracy Score for test: {:.2f} %'.format(accuracy_score(balanced_test_data_sampled['label'], y_pred_test)*100)) 
report = classification_report(balanced_test_data_sampled['label'], y_pred_test, output_dict=True)
pd.DataFrame(report).transpose()

Data shape: 
(250, 22735)
Settings: 
min_count:  2
ngram_range:  (1, 1)
Confusion Matrix :
[[113  12]
 [  5 120]]
Accuracy Score for test: 93.20 %


Unnamed: 0,precision,recall,f1-score,support
-1,0.957627,0.904,0.930041,125.0
1,0.909091,0.96,0.933852,125.0
accuracy,0.932,0.932,0.932,0.932
macro avg,0.933359,0.932,0.931947,250.0
weighted avg,0.933359,0.932,0.931947,250.0


imbalanced 데이터

In [13]:
# Accuracy 및 Confusion Matrix 확인
print('Data shape: ')
print(t_matrix_sampled_imb.shape)
print('Settings: ')
print('min_count: ', min_count)
print('ngram_range: ', ngram_range)
y_pred_test = clf_loaded.predict(t_matrix_sampled_imb) #sample데이터 예측 값
results = confusion_matrix(imbalanced_test_data_sampled['label'], y_pred_test) 
print('Confusion Matrix :')
print(results) 
print('Accuracy Score for test: {:.2f} %'.format(accuracy_score(imbalanced_test_data_sampled['label'], y_pred_test)*100)) 
report = classification_report(imbalanced_test_data_sampled['label'], y_pred_test, output_dict=True)
pd.DataFrame(report).transpose()

Data shape: 
(250, 22735)
Settings: 
min_count:  2
ngram_range:  (1, 1)
Confusion Matrix :
[[ 38   0]
 [ 15 197]]
Accuracy Score for test: 94.00 %


Unnamed: 0,precision,recall,f1-score,support
-1,0.716981,1.0,0.835165,38.0
1,1.0,0.929245,0.963325,212.0
accuracy,0.94,0.94,0.94,0.94
macro avg,0.858491,0.964623,0.899245,250.0
weighted avg,0.956981,0.94,0.943845,250.0


balanced데이터와 imbalanced데이터 각각 20번 테스트

In [14]:
## balance data Test_20번
b_res = []

for i in range(20):
  balanced_test_data_sampled = balanced_test_data.sample(frac=0.05)
  t_matrix_sampled = get_test_emb_with_vocabfile(balanced_test_data_sampled['text'], min_count, ngram_range)
  y_pred_test = clf_loaded.predict(t_matrix_sampled)
  acc = accuracy_score(balanced_test_data_sampled['label'], y_pred_test)*100
  b_res.append(acc)
  print(str(i+1)+'_Accuracy Score: {:.2f} %'.format(acc))

print('===========Mean Of Accuracy===========: {:.2f} %'.format(sum(b_res)/len(b_res)))

1_Accuracy Score: 96.00 %
2_Accuracy Score: 94.40 %
3_Accuracy Score: 94.40 %
4_Accuracy Score: 93.60 %
5_Accuracy Score: 91.60 %
6_Accuracy Score: 94.40 %
7_Accuracy Score: 94.00 %
8_Accuracy Score: 93.20 %
9_Accuracy Score: 89.60 %
10_Accuracy Score: 94.40 %
11_Accuracy Score: 96.00 %
12_Accuracy Score: 95.20 %
13_Accuracy Score: 93.20 %
14_Accuracy Score: 90.80 %
15_Accuracy Score: 91.20 %
16_Accuracy Score: 92.80 %
17_Accuracy Score: 94.80 %
18_Accuracy Score: 91.60 %
19_Accuracy Score: 91.60 %
20_Accuracy Score: 92.80 %


In [15]:
## imbalance Test
imb_res = []

for i in range(20):
  imbalanced_test_data_sampled = imbalanced_test_data.sample(frac=0.05)
  t_matrix_sampled = get_test_emb_with_vocabfile(imbalanced_test_data_sampled['text'], min_count, ngram_range)
  y_pred_test = clf_loaded.predict(t_matrix_sampled)
  acc = accuracy_score(imbalanced_test_data_sampled['label'], y_pred_test)*100
  imb_res.append(acc)
  print(str(i+1)+'_Accuracy Score: {:.2f} %'.format(acc))

print('===========Mean Of Accuracy===========: {:.2f} %'.format(sum(imb_res)/len(imb_res)))

1_Accuracy Score: 94.40 %
2_Accuracy Score: 95.20 %
3_Accuracy Score: 96.80 %
4_Accuracy Score: 96.00 %
5_Accuracy Score: 96.00 %
6_Accuracy Score: 97.20 %
7_Accuracy Score: 94.00 %
8_Accuracy Score: 95.20 %
9_Accuracy Score: 92.40 %
10_Accuracy Score: 96.40 %
11_Accuracy Score: 94.80 %
12_Accuracy Score: 95.60 %
13_Accuracy Score: 93.20 %
14_Accuracy Score: 95.20 %
15_Accuracy Score: 96.40 %
16_Accuracy Score: 96.40 %
17_Accuracy Score: 90.80 %
18_Accuracy Score: 93.60 %
19_Accuracy Score: 98.00 %
20_Accuracy Score: 94.40 %


In [8]:
balanced_test_data['label'].value_counts()

 1    2687
-1    2313
Name: label, dtype: int64

In [9]:
imbalanced_test_data['label'].value_counts()

 1    4016
-1     984
Name: label, dtype: int64