In [34]:
import pandas as pd
import csv

labeled_corpus = pd.read_csv('labeled_corpus.csv', encoding = 'utf-8', index_col = [0])
labeled_corpus = labeled_corpus[labeled_corpus['token_ngram'].notna()]

In [44]:
import numpy as np
from sklearn.pipeline import Pipeline

In [35]:
labeled_corpus

Unnamed: 0,token_ngram,label
0,"들/VV,국제/NNG,유가/NNG,크/VA,오르/VV,원화/NNG,엔/NNG,화간/...",-1
1,"위안/NNG,절상/NNG,원화/NNG,절상/NNG,가장/MAG,크/VA,추가/NNG...",1
2,"금융/NNG,불안사태/NNG,발생/NNG,하/VV,가운데/NNG,국제/NNG,환투기...",0
3,"금리갭/NNG,확대/NNG,되/XSV,가운데/NNG,남/VV,fed/NNG,금리/N...",1
4,"미/NNG,달러/NNG,엔/NNG,약세/NNG,지속/NNG,되/VV,보/VV,물/V...",0
...,...,...
256264,"경기/NNG,전망/NNG,비교적/VAX,분명/MAG,인플레이션/NNG,명분/NNG,...",-1
256265,"믹스/NNG,합의/NNG,반영/NNG,예상/NNG,경제/NNG,성장률/NNG,하향/...",0
256266,"돌아온/VV,박스권/NNG,하단/NNG,상단/NNG,테스트/NNG,가능성/NNG,무...",0
256267,"돌아온/VV,박스권/NNG,하단/NNG,상단/NNG,테스트/NNG,가능성/NNG,무...",1


In [62]:
labeled_corpus = labeled_corpus[labeled_corpus['label']!= 0]
labeled_corpus

Unnamed: 0,token_ngram,label
0,"들/VV,국제/NNG,유가/NNG,크/VA,오르/VV,원화/NNG,엔/NNG,화간/...",-1
1,"위안/NNG,절상/NNG,원화/NNG,절상/NNG,가장/MAG,크/VA,추가/NNG...",1
3,"금리갭/NNG,확대/NNG,되/XSV,가운데/NNG,남/VV,fed/NNG,금리/N...",1
5,"들/VV,달러/NNG,엔/NNG,유로/NNG,대해/VV,상당히/MAG,강세/NNG,...",1
7,"자본/NNG,유입/NNG,대해서/VV,관대/NNG,반면/NNG,자본/NNG,유출/N...",1
...,...,...
256262,"가/VV,채권투자/NNG,채권시장/NNG,전망/NNG,금융시장/NNG,차/NNG,트...",-1
256263,"fed/NNG,원자재/NNG,가격/NNG,상승/NNG,둔화/NNG,여부/NNG,관심...",-1
256264,"경기/NNG,전망/NNG,비교적/VAX,분명/MAG,인플레이션/NNG,명분/NNG,...",-1
256267,"돌아온/VV,박스권/NNG,하단/NNG,상단/NNG,테스트/NNG,가능성/NNG,무...",1


In [63]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [81]:
X_train, X_test, y_train, y_test = train_test_split(labeled_corpus['token_ngram'], labeled_corpus['label'], random_state = 0, train_size = 0.9, shuffle=True)

In [82]:
def my_tokenizer(x):
    return x.split(",")

In [83]:
under_15 = pd.read_csv('under_15.csv', encoding = 'utf-8', index_col = [0])
stop_words = []
for token in under_15['token']:
    stop_words += token.split(",")
    
stop_words

['원화/NNG;절하/NNG;압력/NNG',
 '은행/NNG;환율/NNG;상승/NNG',
 '해외/NNG;증권/NNG;투자/NNG;증가/NNG',
 '해외/NNG;증권/NNG;투자/NNG;늘/VV',
 '민간/NNG;소비/NNG;부진/NNG',
 '이루어졌으므로/VV',
 '절상/NNG;원화/NNG;절상/NNG',
 '절상/NNG;위안/NNG;절상/NNG',
 '절상/NNG;아니/VCN',
 '자본/NNG;유출/NNG;가능성/NNG;낮/VA',
 '경제/NNG;금융안정/NNG',
 '금융안정/NNG;해치/VV',
 '국채/NNG;매입/NNG;줄/VV',
 '금융시장/NNG;위안/NNG;절상/NNG',
 '절상/NNG;작/VA',
 '선매입/NNG',
 '자본/NNG;수지/NNG;적자/NNG',
 '기업/NNG;자금/NNG;조달/NNG;확대/NNG',
 '기업/NNG;자금/NNG;조달/NNG;늘/VV',
 '주택담보대출/NNG;줄/VV',
 '대출태도/NNG;완화/NNG',
 '부동산/NNG;가격/NNG;상승/NNG;금리/NNG;인상/NNG',
 '금리/NNG;역전/NNG;해소/NNG',
 '금리/NNG;변동/NNG;확대/NNG',
 '공급측면/NNG;압력/NNG',
 '은행/NNG;중소기업/NNG;대출/NNG;확대/NNG',
 '중소기업/NNG;대출/NNG;증가/NNG',
 '자금/NNG;조달/NNG;증가/NNG',
 '주택담보대출/NNG;감소/NNG',
 '대출/NNG;주택담보대출/NNG;늘/VV',
 '항등식/NNG',
 '투자/NNG;매우/MAG;부진/NNG',
 'fed/NNG;단기/NNG;금리/NNG;인상/NNG',
 '전망/NNG;인플레이션/NNG;압력/NNG',
 '재정/NNG;적자/NNG;늘/VV',
 '최대/NNG;잡/VV',
 '금리/NNG;인상/NNG;장기/NNG;금리/NNG;상승/NNG',
 '달리/MAG;금리/NNG;인상/NNG',
 '경제/NNG;비관/NNG',
 '투자/NNG;줄/VV',
 '상환/NNG;압력/NNG',
 '가계/NN

In [84]:
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,1), max_df = 15)),
                      ('tfidf', TfidfTransformer(use_idf=True)),
                      ('clf', MultinomialNB(alpha=0.001)), ])


In [85]:
text_clf = text_clf.fit(X_train, y_train)

In [69]:
predicted = text_clf.predict(X_test)
predicted

array([-1,  1,  1, ...,  1,  1,  1], dtype=int64)

In [70]:
np.mean(predicted == y_test)

0.6983156206135953

In [71]:
text_clf1 = Pipeline([('vect', CountVectorizer(ngram_range=(1,1), max_df = 15)),
                      ('tfidf', TfidfTransformer(use_idf=True)),
                      ('clf', MultinomialNB(alpha=0.001)), ])


In [72]:
text_clf1 = text_clf.fit(X_train, y_train)

In [75]:
predicted = text_clf1.predict(X_test)
np.mean(predicted == y_test)

0.6983156206135953

In [94]:
text_clf1.feature_log_prob_

AttributeError: 'Pipeline' object has no attribute 'feature_log_prob_'