## Support Vector Machine with count vectorization (+stemming)

In [1]:
import os
import re
import pandas as pd
import numpy as np
import json
np.seterr(divide='ignore', invalid='ignore')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Input/output directories and input file names, relative to the working directory.
data_in_path = './data_in/'
data_out_path = './data_out/'
train_file = 'train_data.csv'
test_file = 'test_data.csv'

# Load both CSVs into DataFrames. The file names live in their own variables so
# that `train_data` / `test_data` only ever refer to DataFrames.
train_data = pd.read_csv(os.path.join(data_in_path, train_file))
test_data = pd.read_csv(os.path.join(data_in_path, test_file))

# Seed and hold-out fraction consumed by the train/eval split further down.
random_seed = 42
test_split = 0.2

In [3]:
# Text column and labels of the training frame, and the text column of the test frame.
clause_input = train_data['clauses']
label_input = train_data['label']
test_input = test_data['clauses']

# Vectorize the training set: word-level bag-of-words counts with the vocabulary
# capped at 100 features.
# NOTE(review): no stemming step is visible in this notebook; presumably the
# CSV text is already stemmed -- confirm.
vectorizer = CountVectorizer(max_features=100, analyzer="word")
train_stem = vectorizer.fit_transform(clause_input)
label_stem = np.array(label_input)

# Apply the vocabulary fitted above to the test set (transform only, no refit).
test_vecs = vectorizer.transform(test_input)

In [4]:
# Report the shapes of the vectorized training matrix and of the label array.
for caption, array_like in (
    ("training set featues: ", train_stem),
    ("label feature: ", label_stem),
):
    print(caption, array_like.shape)

training set featues:  (1659, 100)
label feature:  (1659,)


In [5]:
from sklearn.model_selection import train_test_split

# Short aliases for the feature matrix and the label vector.
vec, lab = train_stem, label_stem

# Hold out a `test_split` fraction of the rows for evaluation; the fixed seed
# keeps the partition reproducible across runs.
vec_train, vec_eval, lab_train, lab_eval = train_test_split(
    vec,
    lab,
    test_size=test_split,
    random_state=random_seed,
)

In [6]:
# Debugging aid, left disabled: inspect the first row of the training split.
#print(vec_train[0])

In [7]:
from sklearn import svm
# Linear-kernel SVM. probability=True enables clf.predict_proba, which the ROC
# computation in the metrics cell relies on. The seed comes from `random_seed`
# in the config cell so the whole notebook shares a single source of randomness.
# NOTE(review): gamma is a coefficient of the 'rbf', 'poly' and 'sigmoid'
# kernels, so it should have no effect with kernel='linear' -- confirm and drop.
clf = svm.SVC(kernel='linear', C=1.0, gamma='auto', random_state=random_seed, probability=True)

# Fit on the training split only; the bare last expression shows the estimator repr.
clf.fit(vec_train, lab_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=True, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [8]:
# Mean accuracy on the training split and on the held-out evaluation split.
train_accuracy = clf.score(vec_train, lab_train)
eval_accuracy = clf.score(vec_eval, lab_eval)

print("Accuracy of training: %f" % train_accuracy)
print("Accuracy: %f" % eval_accuracy)

Accuracy of training: 0.906556
Accuracy: 0.909639


## 여기서부터 테스트

In [9]:
# Predict test-set labels with the SVM classifier (`clf`) fitted above.
result = clf.predict(test_vecs)

In [10]:
import csv
import os

# Test clauses and gold labels as plain lists.
test_clauses = list(test_data['clauses'])  # not written to the output file below
test_label = list(test_data['label'])

# Create the output directory if it is missing; exist_ok makes this a no-op
# when the directory already exists.
os.makedirs(data_out_path, exist_ok=True)

# Put gold labels and predictions side by side in one DataFrame.
output = pd.DataFrame(data={"label": test_label, "predict": result})

# Write the frame as CSV without the index column. csv.QUOTE_NONE is the named
# constant for the value 3 that was previously passed as a bare number.
output.to_csv(
    os.path.join(data_out_path, "svm-countvec-predict-test-stem.csv"),
    index=False,
    quoting=csv.QUOTE_NONE,
)

In [11]:
from sklearn import metrics

# ROC curve and AUC on the held-out evaluation split, scored with column 1 of
# predict_proba (the second entry of clf.classes_; assumes binary labels -- confirm).
fpr, tpr, _ = metrics.roc_curve(lab_eval, clf.predict_proba(vec_eval)[:, 1])
auc = metrics.auc(fpr, tpr)

# Test-set metrics. All four reuse `result`, the test predictions computed in
# the prediction cell, instead of running the classifier a second time for accuracy.
test_label_true = test_data['label']

print("------------")
print("Accuracy: %f" % metrics.accuracy_score(test_label_true, result))
print("Precision: %f" % metrics.precision_score(test_label_true, result))
print("Recall: %f" % metrics.recall_score(test_label_true, result))
print("F1-Score: %f" % metrics.f1_score(test_label_true, result))
# The AUC was computed but never reported; it is measured on the evaluation
# split, not on the test set, so the label says so.
print("AUC (eval split): %f" % auc)

------------
Accuracy: 0.884058
Precision: 0.915556
Recall: 0.876596
F1-Score: 0.895652
