In [2]:
import numpy as np
import pandas as pd
import csv

labeled_corpus = pd.read_csv('labeled_corpus.csv', index_col=[0], header = 0, encoding = 'utf-8')
labeled_corpus

Unnamed: 0,token_ngram,label
0,"들/VV,국제/NNG,유가/NNG,크/VA,오르/VV,원화/NNG,엔/NNG,화간/...",-1
1,"위안/NNG,절상/NNG,원화/NNG,절상/NNG,가장/MAG,크/VA,추가/NNG...",-1
2,"금융/NNG,불안사태/NNG,발생/NNG,하/VV,가운데/NNG,국제/NNG,환투기...",-1
3,"금리갭/NNG,확대/NNG,되/XSV,가운데/NNG,남/VV,fed/NNG,금리/N...",1
4,"들/VV,달러/NNG,엔/NNG,유로/NNG,대해/VV,상당히/MAG,강세/NNG,...",1
...,...,...
207370,"경기/NNG,전망/NNG,비교적/VAX,분명/MAG,인플레이션/NNG,명분/NNG,...",-1
207371,"돌아온/VV,박스권/NNG,하단/NNG,상단/NNG,테스트/NNG,가능성/NNG,무...",1
207372,"돌아온/VV,박스권/NNG,하단/NNG,상단/NNG,테스트/NNG,가능성/NNG,무...",1
207373,"가/VV,채권투자/NNG,채권시장/NNG,전망/NNG,금융시장/NNG,차/NNG,트...",1


In [9]:
labeled_corpus = labeled_corpus[labeled_corpus['token_ngram'].notna()]

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from tqdm import tqdm

In [5]:
def my_tokenizer(x):
    return x.split(",")

In [6]:
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,1), min_df = 15, tokenizer = my_tokenizer)),
                      ('clf', MultinomialNB(alpha=0.001)), ])
vect = text_clf.named_steps['vect']
clf = text_clf.named_steps['clf']

In [24]:
#Bagging

# avg_polarity_score = np.zeros((1, 18787))
# accuracy = np.zeros(30)
posterior_list = []
for i in tqdm(range(30)):
    X_train, X_test, y_train, y_test = train_test_split(labeled_corpus['token_ngram'], labeled_corpus['label'], random_state = i, train_size = 0.9, shuffle=True)
    text_clf.fit(X_train, y_train)
    posterior_list.append(np.vstack([np.array(vect.get_feature_names()), np.exp(clf.feature_log_prob_)]))
    # posterior = np.concatenate(vect.get_feature_names(), clf.feature_log_prob_, axis=0)
    # posterior_list.append(posterior)
    # polarity_score = posterior[1]/posterior[0]
    # avg_polarity_score += polarity_score
    # pred = text_clf.predict(X_test)
    # accuracy[i] = np.sum(pred == y_test)

# avg_polarity_score = avg_polarity_score/30
# print(avg_polarity_score)



  0%|                                                   | 0/30 [00:00<?, ?it/s][A[A

  3%|█▎                                      | 1/30 [02:12<1:03:49, 132.06s/it][A[A

  7%|██▋                                     | 2/30 [04:26<1:01:57, 132.78s/it][A[A

 10%|████▏                                     | 3/30 [06:38<59:35, 132.41s/it][A[A

 13%|█████▌                                    | 4/30 [08:51<57:27, 132.61s/it][A[A

 17%|███████                                   | 5/30 [11:03<55:15, 132.64s/it][A[A

 20%|████████▍                                 | 6/30 [13:18<53:18, 133.27s/it][A[A

 23%|█████████▊                                | 7/30 [15:31<50:59, 133.02s/it][A[A

 27%|███████████▏                              | 8/30 [17:44<48:52, 133.27s/it][A[A

 30%|████████████▌                             | 9/30 [19:57<46:35, 133.14s/it][A[A

 33%|█████████████▋                           | 10/30 [22:11<44:27, 133.36s/it][A[A

 37%|███████████████                     

In [25]:
pd.DataFrame(posterior_list).to_csv('posterior_list.csv')

In [26]:
cv = CountVectorizer(ngram_range=(1,1), min_df = 15, tokenizer = my_tokenizer)

In [27]:
X = cv.fit_transform(labeled_corpus['token_ngram'])

In [28]:
ngram_list = cv.get_feature_names()
len(ngram_list)

28069

In [29]:
polarity_scores = np.zeros((30, len(ngram_list)))

for i, itr in tqdm(enumerate(posterior_list)):
    for idx, n_gram in enumerate(itr[0]):
        tmp_n = ngram_list.index(n_gram)
        p_score = float(itr[2][idx])/float(itr[1][idx])
        polarity_scores[i][tmp_n] = p_score





0it [00:00, ?it/s][A[A

1it [01:24, 84.51s/it][A[A

2it [02:48, 84.47s/it][A[A

3it [04:13, 84.42s/it][A[A

4it [05:38, 84.55s/it][A[A

5it [07:02, 84.38s/it][A[A

6it [08:25, 84.04s/it][A[A

7it [09:49, 84.03s/it][A[A

8it [11:13, 84.14s/it][A[A

9it [12:37, 83.91s/it][A[A

10it [14:01, 83.96s/it][A[A

11it [15:25, 84.05s/it][A[A

12it [16:51, 84.71s/it][A[A

13it [18:15, 84.52s/it][A[A

14it [19:39, 84.40s/it][A[A

15it [21:04, 84.47s/it][A[A

16it [22:29, 84.49s/it][A[A

17it [23:52, 84.33s/it][A[A

18it [25:17, 84.32s/it][A[A

19it [26:41, 84.42s/it][A[A

20it [28:06, 84.39s/it][A[A

21it [29:30, 84.27s/it][A[A

22it [30:53, 83.99s/it][A[A

23it [32:16, 83.64s/it][A[A

24it [33:40, 83.85s/it][A[A

25it [35:05, 84.16s/it][A[A

26it [36:30, 84.47s/it][A[A

27it [37:54, 84.30s/it][A[A

28it [39:19, 84.42s/it][A[A

29it [40:44, 84.56s/it][A[A

30it [42:08, 84.27s/it][A[A


In [30]:
polarity_scores.shape
for i, v in enumerate(polarity_scores):
    for j, w in enumerate(v):
         if polarity_scores[i][j] == 0:
             polarity_scores[i][j] = None

In [31]:
df_p_scores = pd.DataFrame(polarity_scores)
df_p_scores = df_p_scores.fillna(df_p_scores.mean())

In [32]:
df_p_scores.T.isnull().sum()

0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    1
21    1
22    1
23    1
24    1
25    1
26    1
27    1
28    1
29    1
dtype: int64

In [33]:
avg_polarity_scores = list(df_p_scores.mean())

In [34]:
ps = pd.DataFrame(avg_polarity_scores, index=ngram_list)
ps

Unnamed: 0,0
4대중증질환/nng,1.158647
6자회담/nng,1.199792
a+등급/nng,1.453386
a-등급/nng,1.240235
aa+등급/nng,0.593103
...,...
連/nng,0.731278
龍/nng,1.146432
柳/nng,0.795499
利/nng,0.619327


In [35]:
ps[ps[0].apply(lambda x: x > 1.3 or x < 0.7)]

Unnamed: 0,0
a+등급/nng,1.453386
aa+등급/nng,0.593103
abx지수/nng,1.609861
bbb+등급/nng,0.589251
bis/nng,0.693650
...,...
不/nng,1.371514
兩難/nng,0.331847
女/nng,0.588102
年/nng,1.445743


In [36]:
=========================================여기 밑에는 이전꺼(ramdom_state = i 설정 하기 전)===================

SyntaxError: invalid syntax (<ipython-input-36-115b14e607fe>, line 1)

In [58]:
clf.classes_ 

array([-1,  1], dtype=int64)

In [31]:
list(p)

[array([4, 5, 6]), array([7, 8, 9])]

In [12]:
vect = text_clf.named_steps['vect']

vect.get_feature_names()

['6자회담/nng',
 'a+등급/nng',
 'a-등급/nng',
 'aa+등급/nng',
 'aa-등급/nng',
 'aaa등급/nng',
 'aa등급/nng',
 'asp/nng',
 'a등급/nng',
 'bbb+등급/nng',
 'bbb-등급/nng',
 'bbb등급/nng',
 'bb등급/nng',
 'bei/nng',
 'bis/nng',
 'bps/nng',
 'bw/nng',
 'cac40/nng',
 'cb/nng',
 'cd/nng',
 'cd/nng;금리/nng;상승/nng',
 'cds/nng',
 'cp/nng',
 'crb/nng',
 'crs/nng',
 'db/nng',
 'dc/nng',
 'dps/nng',
 'dr/nng',
 'd의공포/nng',
 'eb/nng',
 'ebitda/nng',
 'ecb/nng',
 'ecb/nng;금리/nng;인상/nng',
 'els/nng',
 'eps/nng',
 'etf/nng',
 'etn/nng',
 'fcf/nng',
 'fed/nng',
 'fed/nng;금리/nng;인상/nng',
 'ff/nng',
 'ftse/nng',
 'ftse100/nng',
 'fx스왑/nng',
 'g7/nng',
 'g8/nng',
 'gdi/nng',
 'gdp/nng',
 'gdr/nng',
 'gni/nng',
 'icbm/nng',
 'ira/nng',
 'irs/nng',
 'ism/nng',
 'l자형/nng',
 'm&a/nng',
 'mmf/nng',
 'msci/nng',
 'ndf시장/nng',
 'oecd/nng',
 'ois스프레드/nng',
 'opec/nng',
 'p2p대출/nng',
 'pbr/nng',
 'pce/nng',
 'per/nng',
 'pf/nng',
 'pmi/nng',
 'qfii/nng',
 'rbc비율/nng',
 'rim/nng',
 'roa/nng',
 'roe/nng',
 'rp/nng',
 'rqfii/nng',
 's&p/nng',


In [40]:
avg_polarity_score.shape

(1, 18787)

In [34]:
len(vect.get_feature_names())

18787

In [13]:
polarity_scores = pd.DataFrame(list(avg_polarity_score), columns = vect.get_feature_names()).T
polarity_scores

Unnamed: 0,0
6자회담/nng,0.946671
a+등급/nng,0.959866
a-등급/nng,0.990433
aa+등급/nng,1.000245
aa-등급/nng,1.066672
aaa등급/nng,0.994153
aa등급/nng,1.018349
asp/nng,0.909473
a등급/nng,1.011986
bbb+등급/nng,0.981812


In [16]:
polarity_scores[polarity_scores[0].apply(lambda x: x > 1.3 or x < 0.7)]

Unnamed: 0,0
丙/nng,0.546383
甘/nng,1.784487
申年/nng,0.556876
鳶/nng,1.767705
걸/nng,1.806759
걸작/nng,1.784487
공유형모기지/nng,1.989147
교관/nng,1.784487
구민/nng,1.302410
구스타브/nng,0.508210
