## Support Vector Machine with TF-IDF(+stemming)

백그라운드 불러오기

In [1]:
import os
import re

import pandas as pd
import numpy as np
np.seterr(divide='ignore', invalid='ignore')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

데이터 경로 지정(input, output), 변수값 지정

In [2]:
data_in_path = './data_in/'
data_out_path = './data_out/'
train_data = 'train_data.csv'
test_data = 'test_data.csv'

random_seed = 42
test_split = 0.2

In [3]:
train_data = pd.read_csv(data_in_path + train_data)

In [4]:
train_clause = list(train_data['clauses'])
train_label = list(train_data['label'])

텍스트 데이터 >> 벡터화(TF-IDF활용, 단어 단위, unigram, 최대 등장단어 1000개)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df = 0.0, analyzer="word", sublinear_tf=True, ngram_range=(1,1), max_features=100) 

data_vec = vectorizer.fit_transform(train_clause)
data_lab = np.array(train_label)

In [6]:
features = vectorizer.get_feature_names()

train set 과 test set으로 분리

In [7]:
from sklearn.model_selection import train_test_split

vec_train, vec_eval, lab_train, lab_eval = train_test_split(data_vec, data_lab, test_size = test_split, random_state = random_seed)

SVM-SVC로 훈련

In [8]:
from sklearn import svm, metrics

clf = svm.SVC(kernel = 'linear', C = 3.0, random_state = random_seed, probability = True, class_weight = 'balanced', 
              cache_size = 300)

clf.fit(vec_train, lab_train)

SVC(C=3.0, cache_size=300, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=True, random_state=42,
    shrinking=True, tol=0.001, verbose=False)

In [9]:
print("Accuracy of training: %f" % clf.score(vec_train, lab_train))
print("Accuracy: %f" % clf.score(vec_eval, lab_eval))

Accuracy of training: 0.900528
Accuracy: 0.903614


In [10]:
test_data = pd.read_csv(data_in_path + test_data)

test_clause = list(test_data['clauses'])

In [11]:
test_clauses = []

for clauses in test_clause:
    test_clauses.append(clauses.split())

In [12]:
test_vecs = vectorizer.transform(test_clause)

In [13]:
# 위에서 만든 서포트 벡터 머신 분류기를 통해 예측값을 가져온다.
result = clf.predict(test_vecs)

In [14]:
import os
# 테스트 데이터 파일에 쓰기 -

test_clauses = list(test_data['clauses'])
test_label = list(test_data['label'])

if not os.path.exists(data_out_path):
    os.makedirs(data_out_path)

# 판다스 데이터 프레임을 통해 데이터를 구성해서 output에 넣는다.
output = pd.DataFrame( data={"label": test_label ,  "predict": result} )

# 이제 csv파일로 만든다.
output.to_csv(data_out_path + "svm-tfidf-predict-test-stem.csv", index=False, quoting=3 )

In [15]:
from sklearn import metrics

fpr, tpr, _ = metrics.roc_curve(lab_eval, (clf.predict_proba(vec_eval)[:, 1]))
auc = metrics.auc(fpr, tpr)

print("------------")
print("Accuracy: %f" % clf.score(test_vecs, test_data['label']))  # checking the accuracy
print("Precision: %f" % metrics.precision_score(test_data['label'], result))  # checking for the precision
print("Recall: %f" % metrics.recall_score(test_data['label'], result))  # checking for the recall
print("F1-Score: %f" % metrics.f1_score(test_data['label'], result))  # checking for the F-1 score
print("AUC: %f" % auc)

------------
Accuracy: 0.881643
Precision: 0.926606
Recall: 0.859574
F1-Score: 0.891832
AUC: 0.934362
