In [1]:
import pandas as pd
import re

from konlpy.tag import Okt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegressionCV

from sklearn.metrics import accuracy_score

In [50]:
data = pd.read_csv('./Data/labeling_data.csv',encoding='cp949')

## 불용어 파일 가져오기

In [51]:
file_path = './data/stopwords.txt'
with open(file_path,'r') as op:
    stopwords = op.readlines()
    stopwords = stopwords[0].split(',')

## 감성 사전 구축을 위한 tf-idf 구축

### 단어 추출을 위한 tokenizer  
불용어 제거, 명사만 추출

In [52]:
okt = Okt()
def tokenizer(text):
    re.sub('[\W]',' ',text)
    result = []
    token_pos = okt.pos(text)
    for word, pos in token_pos:
        if (pos == 'Noun') and not(word in stopwords):
            result.append(word)
    return result

tf-idf 구축

In [53]:
x = data['내용'].astype('str')
y = data['label']

tfidf = TfidfVectorizer(max_features=1000,tokenizer=tokenizer)
x_tdm = tfidf.fit_transform(x)

In [63]:
tmp= data[data['label']=='1']
tmp2 = data[data['label']=='-1']
tmp = pd.concat([tmp,tmp2])

In [67]:
tmp.to_csv('./Data/labeling_data.csv',encoding='cp949')

## Logistic Regression

In [18]:
x_train, x_test, y_train, y_test = train_test_split(x_tdm,y,
                                                   test_size=0.3,
                                                   random_state=42)

lr_clf = LogisticRegressionCV(max_iter=1000)
lr_clf.fit(x_train,y_train)

pred =lr_clf.predict(x_test)


# print('Train data 정확도: ',lr_clf.score(x_train,y_train))
# print('Test data 정확도: ',accuracy_score(y_test,pred))

ValueError: Input contains NaN

In [2]:
st_df = pd.read_csv('./data/dict.csv',encoding='cp949')

In [5]:
st_df[st_df['회귀계수']>0][:20]

Unnamed: 0.1,Unnamed: 0,단어,회귀계수,points
0,2329,소재,3.704273,1.0
1,4169,친환경,3.243266,0.875539
2,4207,캠페인,2.931911,0.791481
3,3584,정화,2.889541,0.780042
4,3818,지속,2.817278,0.760533
5,1218,마련,2.62236,0.707909
6,2706,에코,2.587478,0.698492
7,4902,확대,2.50633,0.676584
8,3510,전환,2.409122,0.65034
9,42,가장,2.254802,0.608677


In [6]:
st_df[st_df['회귀계수']<0][:20]

Unnamed: 0.1,Unnamed: 0,단어,회귀계수,points
2571,4922,환경오염,-5.928674,-1.0
2572,2811,오염,-3.734349,-0.629861
2573,2015,사실,-3.255813,-0.549142
2574,1423,문제,-3.182966,-0.536854
2575,122,개비,-2.901641,-0.4894
2576,3444,적발,-2.859539,-0.482298
2577,4502,페놀,-2.785904,-0.469878
2578,1441,뭐,-2.635461,-0.444501
2579,763,노출,-2.612642,-0.440652
2580,4260,콜라,-2.408047,-0.406141


In [66]:
st_df = pd.DataFrame({'단어':tfidf.get_feature_names(),
                      '회귀계수':lr_clf.coef_.flat})
st_df.tail()

Unnamed: 0,단어,회귀계수
995,희망,0.676055
996,희생,-0.454693
997,흰색,0.847935
998,히어로,-0.625252
999,힐링,0.412005


## 회귀 계수를 점수화 (-1 ~ 1)로

In [71]:
st_neg = st_df[st_df['회귀계수']<0].sort_values('회귀계수')

ma = st_neg['회귀계수'].max()
mi = st_neg['회귀계수'].min()

st_neg['points']=st_neg['회귀계수'].apply(lambda x : ((x - mi)/(ma - mi) - 1))

Unnamed: 0,단어,회귀계수,points
977,환경오염,-8.457512,-1.000000
773,지구,-5.106258,-0.603551
220,리플,-4.510213,-0.533040
627,의약품,-3.697882,-0.436942
515,쓰레기,-3.685172,-0.435439
...,...,...,...
314,배달,-0.015227,-0.001289
589,요구,-0.009465,-0.000608
318,버려진,-0.005601,-0.000151
124,꼭,-0.004808,-0.000057


In [78]:
st_pos = st_df[st_df['회귀계수']>0].sort_values('회귀계수',ascending=False)

ma = st_pos['회귀계수'].max()
mi = st_pos['회귀계수'].min()

st_pos['points']=st_pos['회귀계수'].apply(lambda x : ((x - mi)/(ma - mi)))
st_pos[:20]

Unnamed: 0,단어,회귀계수,points
840,친환경,8.320163,1.0
521,아웃,4.020127,0.483013
584,옷,3.538164,0.425068
337,보전,3.162151,0.37986
958,형,3.123427,0.375204
795,차,2.977596,0.357671
9,가장,2.848457,0.342145
420,생활,2.628966,0.315756
871,텀블러,2.612707,0.313801
163,달,2.572218,0.308933


In [74]:
st_df = st_pos.append(st_neg)

In [77]:
st_df.to_csv('data/dict.csv',encoding='cp949')