In [1]:
import pandas as pd
import re

from konlpy.tag import Okt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegressionCV

from sklearn.metrics import accuracy_score

In [2]:
# Display the installed pandas version.
pd.__version__

'1.2.4'

In [3]:
# Display the installed konlpy version.
import konlpy
konlpy.__version__

'0.5.2'

In [5]:
# Display the installed scikit-learn version.
import sklearn
sklearn.__version__

'0.24.1'

## 불용어 파일 가져오기

In [51]:
# Vocabulary size cap passed to TfidfVectorizer(max_features=...).
features = 5000
# Stopword file: the first line holds a comma-separated list of words.
file_path = './data/stopwords.txt'
# NOTE(review): no explicit encoding is given, so the platform default is
# used -- confirm the file's encoding before pinning one here.
with open(file_path,'r') as op:
    # readline() returns '' for an empty file instead of raising IndexError.
    first_line = op.readline()
# Strip each entry so the last stopword does not keep the trailing newline
# (which would stop it from ever matching a token), and drop empty entries.
stopwords = [word.strip() for word in first_line.split(',') if word.strip()]

### 단어 추출을 위한 tokenizer  
불용어 제거, 명사만 추출

In [52]:
# Shared Okt tagger, created on first use by tokenizer().
_okt = None


def tokenizer(text):
    """Return the noun tokens of ``text`` with stopwords removed.

    Every non-word character (anything the regex class ``\\W`` matches) is
    replaced by a space, the cleaned text is POS-tagged with Okt, and only
    tokens tagged ``'Noun'`` that are not in the module-level ``stopwords``
    collection are kept.

    Args:
        text: The document to tokenize, as a string.

    Returns:
        A list of noun strings in their order of appearance.
    """
    global _okt
    if _okt is None:
        # Build the tagger once and reuse it; constructing one per document
        # repeats the same setup for every row passed through the vectorizer.
        _okt = Okt()

    # re.sub returns a new string. The result must be assigned, otherwise the
    # cleanup has no effect. The raw string avoids the invalid '\W' escape.
    text = re.sub(r'[\W]', ' ', text)

    # Keep nouns only and drop stopwords.
    return [str(word) for word, pos in _okt.pos(text)
            if pos == 'Noun' and word not in stopwords]

## 감성사전 업데이트

In [None]:
def _scale_points(coefs, offset, fallback):
    """Min-max scale ``coefs`` to [0, 1], then shift the result by ``offset``.

    When the group is empty or holds a single distinct value the scaling is
    undefined (0/0), so every entry receives ``fallback`` instead.
    """
    low = coefs.min()
    span = coefs.max() - low
    # `not span > 0` is also true for NaN, which is what an empty group yields.
    if not span > 0:
        return pd.Series(fallback, index=coefs.index, dtype=float)
    return (coefs - low) / span + offset


def update_dict(data_path='./data/labeling_data.csv', dict_path='./data/dict.csv'):
    """Rebuild the sentiment dictionary from the labelled data and save it.

    Steps:
      1. Read the labelled CSV (cp949) and take column '내용' as the text and
         column 'label' as the target.
      2. Build a TF-IDF matrix using the module-level ``tokenizer`` and the
         module-level ``features`` vocabulary cap.
      3. Fit ``LogisticRegressionCV`` on a 70% training split and print the
         accuracy on the remaining 30%.
      4. Pair each vocabulary word with its regression coefficient.
      5. Scale negative coefficients to [-1, 0] and positive ones to [0, 1].
         Words whose coefficient is exactly 0 are left out.
      6. Write positives (strongest first) followed by negatives (strongest
         first) to ``dict_path`` as cp949 CSV.

    Args:
        data_path: Labelled input CSV. The default uses the lowercase
            ``./data`` directory, matching the stopword and output paths,
            so the notebook also runs on case-sensitive filesystems.
        dict_path: Destination CSV for the sentiment dictionary.

    Raises:
        ValueError: If the fitted model has more than one coefficient row,
            i.e. the labels are not binary.
    """
    # Data preprocessing
    data = pd.read_csv(data_path, encoding='cp949')
    x = data['내용'].astype('str')
    y = data['label']

    # Build the TF-IDF matrix
    tfidf = TfidfVectorizer(max_features=features, tokenizer=tokenizer)
    x_tdm = tfidf.fit_transform(x)

    # Train/test split
    x_train, x_test, y_train, y_test = train_test_split(
        x_tdm, y, test_size=0.3, random_state=42)

    # Logistic regression
    lr_clf = LogisticRegressionCV(max_iter=1000)
    lr_clf.fit(x_train, y_train)

    # Report the score on the held-out split rather than predicting and
    # discarding the result.
    print(f'hold-out accuracy: {lr_clf.score(x_test, y_test):.4f}')

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on releases that predate get_feature_names_out().
    if hasattr(tfidf, 'get_feature_names_out'):
        words = tfidf.get_feature_names_out()
    else:
        words = tfidf.get_feature_names()

    # One coefficient per word only exists when coef_ has a single row.
    coefs = lr_clf.coef_
    if coefs.shape[0] != 1:
        raise ValueError(
            'update_dict expects binary labels; got coefficient matrix of '
            'shape {}'.format(coefs.shape))

    # Regression coefficient of every vocabulary word
    st_df = pd.DataFrame({'단어': words, '회귀계수': coefs.ravel()})

    # Negative coefficients: min-max scaled, then shifted into [-1, 0]
    st_neg = st_df[st_df['회귀계수'] < 0].sort_values('회귀계수')
    st_neg = st_neg.assign(
        points=_scale_points(st_neg['회귀계수'], offset=-1.0, fallback=-1.0))

    # Positive coefficients: min-max scaled into [0, 1]
    st_pos = st_df[st_df['회귀계수'] > 0].sort_values('회귀계수', ascending=False)
    st_pos = st_pos.assign(
        points=_scale_points(st_pos['회귀계수'], offset=0.0, fallback=1.0))

    # DataFrame.append was removed in pandas 2.0; concat gives the same result.
    st_df = pd.concat([st_pos, st_neg])
    st_df.to_csv(dict_path, encoding='cp949')

tf-idf 구축