### 비지도학습 기반 감성 분석 소개
#### SentiWordNet 을 이용한 Sentiment Analysis
- WordNet Synset과 SentiWordNet SentiSynset 클래스의 이해

In [27]:
from nltk.corpus import wordnet as wn

In [28]:
term = 'function'
# Look up every WordNet synset for `term` (the word 'function' here).
# Each Synset object represents one sense of the word and carries its
# definition, lemmas and part-of-speech information.
synsets = wn.synsets(term)
# Show the return type, the number of senses found, and the synsets themselves.
print('synsets() 반환 type : ', type(synsets))
print('synsets() 반환 값 갯수 : ', len(synsets))
print('synsets() 반환 값 : ', synsets)

synsets() 반환 type :  <class 'list'>
synsets() 반환 값 갯수 :  10
synsets() 반환 값 :  [Synset('function.n.01'), Synset('function.n.02'), Synset('function.n.03'), Synset('function.n.04'), Synset('function.n.05'), Synset('affair.n.03'), Synset('routine.n.03'), Synset('function.v.01'), Synset('serve.v.01'), Synset('officiate.v.02')]


In [29]:
# For each sense in `synsets`, print its identifier, lexical category,
# definition and the list of lemma names.
for sense in synsets:
    sense_id, category = sense.name(), sense.lexname()
    print('##### Synset 이름 : ', sense_id, '#####')
    print('POS : ', category)
    gloss, lemma_list = sense.definition(), sense.lemma_names()
    print('Definition : ', gloss)
    print('Lemmas: ', lemma_list)

##### Synset 이름 :  function.n.01 #####
POS :  noun.relation
Definition :  (mathematics) a mathematical relation such that each element of a given set (the domain of the function) is associated with an element of another set (the range of the function)
Lemmas:  ['function', 'mathematical_function', 'single-valued_function', 'map', 'mapping']
##### Synset 이름 :  function.n.02 #####
POS :  noun.attribute
Definition :  what something is used for
Lemmas:  ['function', 'purpose', 'role', 'use']
##### Synset 이름 :  function.n.03 #####
POS :  noun.act
Definition :  the actions and activities assigned to or required or expected of a person or group
Lemmas:  ['function', 'office', 'part', 'role']
##### Synset 이름 :  function.n.04 #####
POS :  noun.relation
Definition :  a relation such that one thing is dependent on another
Lemmas:  ['function']
##### Synset 이름 :  function.n.05 #####
POS :  noun.group
Definition :  a formal or official social gathering or ceremony
Lemmas:  ['function']
##### Synset

In [30]:
# Same sense-by-sense dump as before, this time for the word 'tiger'.
term = 'tiger'
synsets = wn.synsets(term)
for sense in synsets:
    print('##### Synset 이름 : ', sense.name(), '#####')
    # Remaining fields share one "label value" print shape.
    detail_rows = (
        ('POS : ', sense.lexname()),
        ('Definition : ', sense.definition()),
        ('Lemmas: ', sense.lemma_names()),
    )
    for field_label, field_value in detail_rows:
        print(field_label, field_value)

##### Synset 이름 :  tiger.n.01 #####
POS :  noun.person
Definition :  a fierce or audacious person
Lemmas:  ['tiger']
##### Synset 이름 :  tiger.n.02 #####
POS :  noun.animal
Definition :  large feline of forests in most of Asia having a tawny coat with black stripes; endangered
Lemmas:  ['tiger', 'Panthera_tigris']


In [31]:
# Create one Synset per word, each pinned to an explicit sense id.
tree, lion, tiger, cat, dog = (
    wn.synset(sense_id)
    for sense_id in ('tree.n.01', 'lion.n.01', 'tiger.n.02', 'cat.n.01', 'dog.n.01')
)
# Print the tree and tiger definitions separated by a single blank line.
print(tree.definition(), '', tiger.definition(), sep='\n')

a tall perennial woody plant having a main trunk and branches forming a distinct elevated crown; includes both gymnosperms and angiosperms

large feline of forests in most of Asia having a tawny coat with black stripes; endangered


In [32]:
# Show the full synset identifier of `tree` (it prints as 'tree.n.01').
print(tree.name())

tree.n.01


In [33]:
# Keep only the text before the first '.' of the synset identifier.
print(tree.name().partition('.')[0])

tree


In [34]:
# Gather the synsets in a fixed order and derive a display name for each one
# from the part of its identifier that precedes the first '.'.
entities = [tree, lion, tiger, cat, dog]
entity_names = [syn.name().partition('.')[0] for syn in entities]
print(entity_names)

['tree', 'lion', 'tiger', 'cat', 'dog']


In [35]:
# The second positional argument of path_similarity() is not a precision
# (NLTK documents it as `verbose`), so the score was printed unrounded.
# Round explicitly to two decimals instead.
print(round(entities[0].path_similarity(entities[1]), 2))

0.07


In [36]:
# Round explicitly: path_similarity()'s second positional argument is
# `verbose` in NLTK, not a number of decimals.
print(round(entities[2].path_similarity(entities[3]), 2))

0.25


In [37]:
# Round explicitly: path_similarity()'s second positional argument is
# `verbose` in NLTK, not a number of decimals.
print(round(entities[3].path_similarity(entities[4]), 2))

0.2


In [38]:
# Pairwise path similarity between all entities.
# Row i holds entities[i] scored against every entity (itself included),
# each score rounded to two decimals, giving a square list of lists.
similarities = [
    [round(row_syn.path_similarity(col_syn), 2) for col_syn in entities]
    for row_syn in entities
]
print(similarities)

[[1.0, 0.07, 0.07, 0.08, 0.12], [0.07, 1.0, 0.33, 0.25, 0.17], [0.07, 0.33, 1.0, 0.25, 0.17], [0.08, 0.25, 0.25, 1.0, 0.2], [0.12, 0.17, 0.17, 0.2, 1.0]]


In [39]:
import pandas as pd
# Wrap the similarity matrix in a DataFrame labelled by entity name on both
# axes; the bare trailing expression makes the notebook render the table.
similarity_df = pd.DataFrame(
    data=similarities,
    index=entity_names,
    columns=entity_names,
)
similarity_df

Unnamed: 0,tree,lion,tiger,cat,dog
tree,1.0,0.07,0.07,0.08,0.12
lion,0.07,1.0,0.33,0.25,0.17
tiger,0.07,0.33,1.0,0.25,0.17
cat,0.08,0.25,0.25,1.0,0.2
dog,0.12,0.17,0.17,0.2,1.0


In [40]:
import nltk
from nltk.corpus import sentiwordnet as swn

In [41]:
# Look up the SentiWordNet entries for one noun sense and one adjective sense.
father = swn.senti_synset('father.n.01')
fabulous = swn.senti_synset('fabulous.a.01')

# Each row is (label, score getter); None marks the blank separator line
# between the two words. Getters are called only when their row is printed.
score_report = (
    ('father 긍정감성 지수: ', father.pos_score),
    ('father 부정감성 지수: ', father.neg_score),
    ('father 객관성 지수: ', father.obj_score),
    None,
    ('fabulous 긍정감성 지수: ', fabulous.pos_score),
    ('fabulous 부정감성 지수: ', fabulous.neg_score),
    ('fabulous 객관성 지수: ', fabulous.obj_score),
)
for report_row in score_report:
    if report_row is None:
        print()
        continue
    score_label, score_getter = report_row
    print(score_label, score_getter())

father 긍정감성 지수:  0.0
father 부정감성 지수:  0.0
father 객관성 지수:  1.0

fabulous 긍정감성 지수:  0.875
fabulous 부정감성 지수:  0.125
fabulous 객관성 지수:  0.0


In [42]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag
# pos_tag : 입력 단어에  세부품사를 추출하는 모듈
from nltk.corpus import wordnet as wn

In [43]:
import pandas as pd
import re
# Load the tab-separated labelled reviews. quoting=3 corresponds to
# csv.QUOTE_NONE, i.e. quote characters are treated as ordinary text.
review_df = pd.read_csv('./labeledTrainData.tsv', header=0, sep='\t', quoting=3)
# Clean the review text with vectorised string operations:
#   1. replace the literal HTML line-break tag with a space (regex=False makes
#      the literal match explicit, independent of the pandas version default);
#   2. replace every character that is not an ASCII letter with a space.
review_df['review'] = (
    review_df['review']
    .str.replace('<br />', ' ', regex=False)
    .str.replace('[^a-zA-Z]', ' ', regex=True)
)

In [44]:
from sklearn.model_selection import train_test_split
# Separate the label column from the features ('id' is dropped as well),
# then hold out 30% of the rows using a fixed random seed.
class_df = review_df['sentiment']
feature_df = review_df.drop(columns=['id', 'sentiment'])
X_train, X_test, y_train, y_test = train_test_split(
    feature_df,
    class_df,
    test_size=0.3,
    random_state=156,
)

In [45]:
import warnings
# Install an 'ignore' filter with no category or module restriction, so
# warnings are suppressed from this point on.
warnings.filterwarnings('ignore')

In [46]:
def penn_to_wn(tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Only the beginning of ``tag`` is inspected:
    ``J`` -> ``wn.ADJ`` (adjective), ``N`` -> ``wn.NOUN`` (noun),
    ``R`` -> ``wn.ADV`` (adverb), ``V`` -> ``wn.VERB`` (verb).
    Any other tag, including an empty string, yields ``None``.
    """
    # Attribute names are resolved on `wn` lazily, only for a matching tag.
    wn_attr_by_initial = {'J': 'ADJ', 'N': 'NOUN', 'R': 'ADV', 'V': 'VERB'}
    attr_name = wn_attr_by_initial.get(tag[:1])
    if attr_name is None:
        return None
    return getattr(wn, attr_name)

In [47]:
def swn_polarity(text):
    """Classify ``text`` as positive (1) or negative (0) using SentiWordNet.

    The text is split into sentences, tokenised and POS-tagged. Only tokens
    tagged as noun, adjective or adverb are scored (verbs and every other tag
    are skipped). Each kept token is lemmatised, its first WordNet synset is
    looked up in SentiWordNet, and ``pos_score - neg_score`` is accumulated.

    Returns:
        1 if at least one token was scored and the accumulated score is >= 0,
        otherwise 0 (including the case where no token could be scored).
    """
    lemmatizer = WordNetLemmatizer()
    scored_pos_tags = (wn.NOUN, wn.ADJ, wn.ADV)

    total_score = 0.0
    any_token_scored = False

    for sentence in sent_tokenize(text):
        tagged_tokens = pos_tag(word_tokenize(sentence))
        for word, tag in tagged_tokens:
            wn_tag = penn_to_wn(tag)
            if wn_tag not in scored_pos_tags:
                # Not a noun, adjective or adverb: ignore this token.
                continue

            # Reduce the word to its lemma, then fetch its senses for that POS.
            lemma = lemmatizer.lemmatize(word, pos=wn_tag)
            candidate_synsets = wn.synsets(lemma, pos=wn_tag)
            if not candidate_synsets:
                continue

            # Score with the first listed sense only: positive counts as +,
            # negative as -.
            first_sense = candidate_synsets[0]
            senti = swn.senti_synset(first_sense.name())
            total_score += (senti.pos_score() - senti.neg_score())
            any_token_scored = True

    if not any_token_scored:
        return 0
    # A total of zero or more is treated as positive.
    return 1 if total_score >= 0 else 0

- NNP: 단수 고유명사
- VB: 동사 원형
- VBP: 동사 현재형(3인칭 단수 제외)
- TO: to (전치사 또는 부정사 표지)
- NN: 명사(단수형 혹은 집합형)
- DT: 한정사
- RB: 부사

In [48]:
# Run the SentiWordNet classifier on every review and store the 0/1 result.
review_df['preds'] = review_df['review'].apply(swn_polarity)

In [49]:
y_target = review_df['sentiment'].values   # ground-truth labels
preds = review_df['preds'].values   # predicted labels (the 'preds' column)

In [50]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score 
from sklearn.metrics import recall_score, f1_score, roc_auc_score

# Print the confusion matrix, then accuracy, precision and recall of the
# SentiWordNet predictions against the true labels.
print("오차행렬:")
print(confusion_matrix(y_target, preds))
swn_metric_rows = (
    ("정확도:", accuracy_score),
    ("정밀도:", precision_score),
    ("재현율:", recall_score),
)
for metric_label, metric_fn in swn_metric_rows:
    print(metric_label, metric_fn(y_target, preds))

오차행렬:
[[7668 4832]
 [3636 8864]]
정확도: 0.66128
정밀도: 0.647196261682243
재현율: 0.70912


In [51]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [52]:
# Score one sample review (index 3) with VADER and show the score dict,
# which holds 'neg', 'neu', 'pos' and 'compound' entries.
senti_analyzer = SentimentIntensityAnalyzer()
sample_review = review_df['review'][3]
senti_scores = senti_analyzer.polarity_scores(sample_review)
print(senti_scores)

{'neg': 0.061, 'neu': 0.866, 'pos': 0.072, 'compound': 0.4893}


In [53]:
_VADER_ANALYZER = None  # created lazily, then shared by every vader_polarity() call


def _get_vader_analyzer():
    """Return the shared SentimentIntensityAnalyzer, building it on first use."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER


def vader_polarity(review, threshold=0.1):
    """Classify ``review`` as positive (1) or negative (0) with VADER.

    Args:
        review: Text handed to ``SentimentIntensityAnalyzer.polarity_scores``.
        threshold: Cut-off applied to the ``compound`` score.

    Returns:
        1 when the compound score is greater than or equal to ``threshold``,
        otherwise 0.
    """
    # Reuse one analyzer: this function is applied once per review, and
    # constructing a new analyzer on every call is repeated, avoidable work.
    compound_score = _get_vader_analyzer().polarity_scores(review)['compound']
    return 1 if compound_score >= threshold else 0

In [54]:
# Classify every review with VADER using a 0.1 threshold, then extract the
# true labels and the predictions as arrays.
review_df['vader_preds'] = review_df['review'].apply(vader_polarity, args=(0.1,))
y_target = review_df['sentiment'].values
vader_preds = review_df['vader_preds'].values

In [55]:
# Report the confusion matrix, accuracy, precision and recall of the VADER
# predictions against the true labels.
print('#### VADER 예측 성능 평가 ####')
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.metrics import recall_score, f1_score, roc_auc_score
print(confusion_matrix(y_target, vader_preds))
vader_metric_rows = (
    ("정확도 : ", accuracy_score),
    ("정밀도 : ", precision_score),
    ("재현율 : ", recall_score),
)
for metric_label, metric_fn in vader_metric_rows:
    print(metric_label, metric_fn(y_target, vader_preds))

#### VADER 예측 성능 평가 ####
[[ 6729  5771]
 [ 1858 10642]]
정확도 :  0.69484
정밀도 :  0.6483884725522452
재현율 :  0.85136
