### 0. Data Loading

프로젝트 폴더에 있는 train_data.csv에서 훈련 데이터를 불러온다.

In [1]:
import re
import pandas as pd
import numpy as np 
import json


# Location of the training CSV, relative to the working directory.
stemFile = 'train_data.csv'
# Read the whole training table into a DataFrame.
stemInput = pd.read_csv(filepath_or_buffer=stemFile)

In [2]:
# Clause texts as a plain Python list (input for the vectorizer).
clauses = stemInput['clauses'].tolist()
# Labels as an independent NumPy array (target for the classifier).
train_label = stemInput['label'].to_numpy(copy=True)

In [3]:
# Bare expression: as the last statement of the cell, it makes the notebook
# display the loaded clause strings for a quick visual check.
clauses

['contractor engin record shall sign seal draw cover sheet calcul depart record set',
 'modif repair drill shaft micropil',
 'product certif materi requir product certif identifi accept subsect section',
 'propos submit mail enclos propos seal envelop mark direct',
 'submit measur test support data accept',
 'depart reserv right delet bid portion util reloc work contract',
 'upon final accept verif final pay record govern send sf public voucher purchas servic person final voucher releas claim document',
 'content written claim condit preced contractor entitl addit compens time extens contract claim contractor shall submit certifi written claim depart includ individu claim minimum follow inform',
 'submit written document inspect engin within hour inspect',
 'far claus util small busi concern far claus prompt payment construct contract term subcontract includ site site work suppli contract',
 'internet bid submitt bidder shall execut propos bidder digit id enter firm bid offic street ad

### 2. CountVectorizer를 활용한 벡터화

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer: word-level tokens, vocabulary limited to 1000 features.
cvector = CountVectorizer(
    max_features=1000,
    analyzer="word",
)

# Learn the vocabulary from the clauses and build the count matrix in one pass.
train_x = cvector.fit_transform(clauses)

# Printing the sparse matrix lists its non-zero (row, column) count entries.
print(train_x)

  (0, 796)	1
  (0, 231)	1
  (0, 113)	1
  (0, 802)	1
  (0, 195)	1
  (0, 267)	1
  (0, 787)	1
  (0, 809)	1
  (0, 800)	1
  (0, 720)	2
  (0, 288)	1
  (0, 182)	1
  (1, 746)	1
  (1, 541)	1
  (2, 789)	1
  (2, 863)	1
  (2, 4)	1
  (2, 417)	1
  (2, 751)	1
  (2, 521)	1
  (2, 128)	2
  (2, 688)	2
  (3, 246)	1
  (3, 519)	1
  (3, 295)	1
  :	:
  (1656, 55)	1
  (1656, 704)	1
  (1656, 139)	1
  (1656, 153)	1
  (1656, 181)	1
  (1656, 991)	1
  (1657, 666)	1
  (1657, 76)	1
  (1657, 305)	1
  (1657, 358)	1
  (1657, 241)	1
  (1657, 500)	1
  (1657, 497)	1
  (1657, 237)	1
  (1657, 704)	1
  (1658, 549)	1
  (1658, 973)	1
  (1658, 925)	1
  (1658, 829)	1
  (1658, 149)	1
  (1658, 522)	1
  (1658, 950)	1
  (1658, 516)	2
  (1658, 360)	1
  (1658, 406)	1


### 3. 훈련데이터와 검증데이터 나누기
전체 훈련데이터 문장 1659개를 훈련 데이터와 검증 데이터로 나눈다. t_size는 검증 데이터로 떼어 낼 비율(여기서는 0.2, 즉 훈련 80% : 검증 20%)이며, sklearn의 train_test_split 함수를 사용해서 자동으로 분할한다.

In [5]:
from sklearn.model_selection import train_test_split

# Fraction of rows held out for validation, and the seed that keeps the
# shuffle (and therefore the split) reproducible.
t_size = 0.2
r_seed = 42

# Split features and labels together so rows stay aligned.
train_cl, eval_cl, train_lb, eval_lb = train_test_split(
    train_x,
    train_label,
    test_size=t_size,
    random_state=r_seed,
)

### 4. Logistic Regression

- 로지스틱 회귀(logistic regression) 분석은 sklearn의 linear_model 모듈에서 제공한다.
- .fit() 함수로 훈련 데이터를 주입해 훈련시키고, .score() 함수로 테스트 데이터를 넣어 정확도를 측정할 수 있다.

In [8]:
from sklearn.linear_model import LogisticRegression

# Pin the solver explicitly instead of relying on the library default.
# The recorded run shows solver='warn', a deprecated placeholder that resolved
# to liblinear and emitted a FutureWarning; later scikit-learn releases default
# to lbfgs. Naming liblinear keeps the reported accuracies reproducible across
# versions and silences the warning.
log = LogisticRegression(solver="liblinear")

# Fit on the training split. As the last expression of the cell, the fitted
# estimator is also displayed by the notebook.
log.fit(train_cl, train_lb)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [9]:
# Mean accuracy on the data the model was fitted on vs. the held-out split;
# a large gap between the two hints at overfitting.
train_accuracy = log.score(train_cl, train_lb)
eval_accuracy = log.score(eval_cl, eval_lb)

print("Accuracy of training: %f" % train_accuracy)
print("Accuracy: %f" % eval_accuracy)

Accuracy of training: 0.996986
Accuracy: 0.915663


### 5. 테스트 하기

- 테스트 데이터로 테스트하기
- 테스트 데이터도 훈련에 사용한 CountVectorizer로 벡터화하기

In [7]:
# Load the separate test set.
test_data = pd.read_csv('test_data.csv')

# Ground-truth labels of the test clauses.
test_y = test_data['label']

# Reuse the already-fitted vectorizer (transform only, no re-fit) so the test
# features share the training vocabulary, then densify the sparse counts.
test_counts = cvector.transform(test_data['clauses'])
test_x = test_counts.toarray()

In [8]:
from sklearn import metrics

# Compute the predicted labels (`result`) for every test clause.
result = log.predict(test_x)

# Ground-truth labels used by all metrics below.
true_labels = test_data['label']

print("------------")

# Share of test clauses classified correctly.
accuracy = log.score(test_x, true_labels)
print("Accuracy: %f" % accuracy)

# Of the clauses predicted positive, the share that really are positive.
precision = metrics.precision_score(true_labels, result)
print("Precision: %f" % precision)

# Of the truly positive clauses, the share the model found.
recall = metrics.recall_score(true_labels, result)
print("Recall: %f" % recall)

# Harmonic mean of precision and recall.
f1 = metrics.f1_score(true_labels, result)
print("F1-Score: %f" % f1)

------------
Accuracy: 0.874396
Precision: 0.925581
Recall: 0.846809
F1-Score: 0.884444


In [9]:
# Put the true label and the model's prediction side by side, one row per test clause.
output = pd.DataFrame({"label": test_y, "predict": result})

# quoting=3 is the value of csv.QUOTE_NONE: fields are written without quotes.
# The row index is omitted from the file.
output.to_csv(
    "cvec-predict-test.csv",
    index=False,
    quoting=3,
)