In [48]:
import pandas as pd
import re

from konlpy.tag import Okt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegressionCV

from sklearn.metrics import accuracy_score

In [49]:
data = pd.read_excel('./data.xlsx')

In [50]:
data1 = data[data['label'] == -1]
data2 = data[data['label'] == 1]
df = data1.append(data2)

In [52]:
df

Unnamed: 0,내용,label
0,.년 .월 당시 한국환경기술개발원이 낸 .휴.폐광된 금속광산 지역의 오염 관리대책...,-1
1,광산 지역에서 . 떨어진 곳에서도 비소가 .배. 높았다,-1
2,안내판에 .영풍.이란 이름에 주목한 건 이 기업의 석포제련소 환경오염 문제 때문이다,-1
4,.년에야 일대 환경오염 문제가 알려졌다,-1
30,수몰 예정지 주민들이 .년 동안 반대 운동을 벌이며 영양댐 건설을 막아냈다,-1
...,...,...
1653,그냥 평범한 논리 회로임1전기차 시장 수요 및 공급 증가2전기 수요 증가3친환경 전...,1
1655,경동나비엔도 보니까 저녹스보일러 설치 지원비받아서 매출에 영향좀 있었던듯 친환경 설...,1
1658,친환경 음식물 처리기 ㅇㅇ,1
1659,에휴 바이든이 친환경 한다고 하고테슬라 전기차가 저렇게 올라온거 보면죽어도 정유주에...,1


In [53]:
file_path = './data/stopwords.txt'
with open(file_path,'r') as op:
    stopwords = op.readlines()
    stopwords = stopwords[0].split(',')

In [54]:
okt = Okt()
def tokenizer(text):
    re.sub('[\W]',' ',text)
    result = []
    token_pos = okt.pos(text)
    for word, pos in token_pos:
        if (pos == 'Noun') and not(word in stopwords):
            result.append(word)
    return result

In [56]:
x = df['내용']
y = df['label']

tfidf = TfidfVectorizer(max_features=1000,tokenizer=tokenizer)
x_tdm = tfidf.fit_transform(x)

In [58]:
x_train, x_test, y_train, y_test = train_test_split(x_tdm,y,
                                                   test_size=0.3,
                                                   random_state=42)

lr_clf = LogisticRegressionCV(max_iter=1000)
lr_clf.fit(x_train,y_train)
print(lr_clf.score(x_train,y_train))

pred =lr_clf.predict(x_test)
print(accuracy_score(y_test,pred))

0.9977477477477478
0.8324607329842932


In [59]:
len(tfidf.get_feature_names())
print(lr_clf.coef_.shape)

(1, 1000)


In [66]:
st_df = pd.DataFrame({'단어':tfidf.get_feature_names(),
                      '회귀계수':lr_clf.coef_.flat})
st_df.tail()

Unnamed: 0,단어,회귀계수
995,희망,0.676055
996,희생,-0.454693
997,흰색,0.847935
998,히어로,-0.625252
999,힐링,0.412005


In [61]:
(st_df['회귀계수'].max())

8.32016342498613

In [62]:
ma = st_df['회귀계수'].max()
mi = st_df['회귀계수'].min()

st_df['points']=st_df['회귀계수'].apply(lambda x : ((x - mi)/(ma - mi) * 2) - 1)

In [63]:
st_df.to_csv('data/dict.csv',encoding='cp949')

In [71]:
st_neg = st_df[st_df['회귀계수']<0].sort_values('회귀계수')

ma = st_neg['회귀계수'].max()
mi = st_neg['회귀계수'].min()

st_neg['points']=st_neg['회귀계수'].apply(lambda x : ((x - mi)/(ma - mi) - 1))

Unnamed: 0,단어,회귀계수,points
977,환경오염,-8.457512,-1.000000
773,지구,-5.106258,-0.603551
220,리플,-4.510213,-0.533040
627,의약품,-3.697882,-0.436942
515,쓰레기,-3.685172,-0.435439
...,...,...,...
314,배달,-0.015227,-0.001289
589,요구,-0.009465,-0.000608
318,버려진,-0.005601,-0.000151
124,꼭,-0.004808,-0.000057


In [78]:
st_pos = st_df[st_df['회귀계수']>0].sort_values('회귀계수',ascending=False)

ma = st_pos['회귀계수'].max()
mi = st_pos['회귀계수'].min()

st_pos['points']=st_pos['회귀계수'].apply(lambda x : ((x - mi)/(ma - mi)))
st_pos[:20]

Unnamed: 0,단어,회귀계수,points
840,친환경,8.320163,1.0
521,아웃,4.020127,0.483013
584,옷,3.538164,0.425068
337,보전,3.162151,0.37986
958,형,3.123427,0.375204
795,차,2.977596,0.357671
9,가장,2.848457,0.342145
420,생활,2.628966,0.315756
871,텀블러,2.612707,0.313801
163,달,2.572218,0.308933


In [74]:
st_df = st_pos.append(st_neg)

In [77]:
st_df.to_csv('data/dict.csv',encoding='cp949')