# 비지도학습 감성분석 - `Lexicon`기반


In [1]:
import numpy as np
import pandas as pd
from google.colab import files
up = files.upload()

Saving labeledTrainData.tsv to labeledTrainData.tsv


`Wordnet Synset` 및 `Sentiwordnet SentiSynset` Class

In [3]:
import nltk
from nltk.corpus import wordnet

In [5]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [6]:
term = 'present'
synsets = wordnet.synsets(term) # 리스트 형식

In [7]:
type(synsets), len(synsets) 

(list, 0)

In [12]:
term = 'fly'
synsets = wordnet.synsets(term) # 리스트 형식

for synset in synsets:
  print(f'########## name: {synset.name()} ##########')
  print(' - POS:',synset.lexname() )
  print(' - 정의:',synset.definition() )
  print(' - 표제어:',synset.lemma_names(), '\n' )

########## name: fly.n.01 ##########
 - POS: noun.animal
 - 정의: two-winged insects characterized by active flight
 - 표제어: ['fly'] 

########## name: tent-fly.n.01 ##########
 - POS: noun.artifact
 - 정의: flap consisting of a piece of canvas that can be drawn back to provide entrance to a tent
 - 표제어: ['tent-fly', 'rainfly', 'fly_sheet', 'fly', 'tent_flap'] 

########## name: fly.n.03 ##########
 - POS: noun.artifact
 - 정의: an opening in a garment that is closed by a zipper or by buttons concealed under a fold of cloth
 - 표제어: ['fly', 'fly_front'] 

########## name: fly.n.04 ##########
 - POS: noun.act
 - 정의: (baseball) a hit that flies up in the air
 - 표제어: ['fly', 'fly_ball'] 

########## name: fly.n.05 ##########
 - POS: noun.artifact
 - 정의: fisherman's lure consisting of a fishhook decorated to look like an insect
 - 표제어: ['fly'] 

########## name: fly.v.01 ##########
 - POS: verb.motion
 - 정의: travel through the air; be airborne
 - 표제어: ['fly', 'wing'] 

########## name: fly.v.02 ##

### 어휘간 유사도

In [13]:
# 단어 품사 모를 경우 - synsets() 으로 알아냄 
for synset in wordnet.synsets('tiger'):
  print(synset.name(), synset.definition())

tiger.n.01 a fierce or audacious person
tiger.n.02 large feline of forests in most of Asia having a tawny coat with black stripes; endangered


In [15]:
# 단어 품사 아는 경우 - synset()
tiger = wordnet.synset('tiger.n.02 ') # 품사: synset.name()의 결과값..?
tree = wordnet.synset('tree.n.02 ')
lion = wordnet.synset('lion.n.02 ')
cat = wordnet.synset('cat.n.02 ')
dog = wordnet.synset('dog.n.02 ')

In [17]:
# 단어간 유사도 - path_similarity(비교대상)
tiger.path_similarity(lion), tiger.path_similarity(dog), tiger.path_similarity(tree) # 호랑이-사자 / 호랑이-개 / 호랑이-나무

(0.06666666666666667, 0.06666666666666667, 0.047619047619047616)

In [19]:
# 5개 단어간의 유사도
similarities = []
entities = [tree, lion, tiger, cat, dog]
for entity in entities:
  similarity = [entity.path_similarity(another) for another in entities]
  similarities.append(similarity)

# DataFrame 구조로 만들기
df = pd.DataFrame(similarities, columns=['tree', 'lion', 'tiger', 'cat', 'dog'],
                  index=['tree', 'lion', 'tiger', 'cat', 'dog'])
df

Unnamed: 0,tree,lion,tiger,cat,dog
tree,1.0,0.071429,0.047619,0.076923,0.071429
lion,0.071429,1.0,0.066667,0.166667,0.111111
tiger,0.047619,0.066667,1.0,0.071429,0.066667
cat,0.076923,0.166667,0.071429,1.0,0.125
dog,0.071429,0.111111,0.066667,0.125,1.0


### SentiSynset Class

In [23]:
nltk.download('sentiwordnet')

[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.


True

In [25]:
from nltk.corpus import sentiwordnet
senti_synsets = list(sentiwordnet.senti_synsets('slow'))
senti_synsets

[SentiSynset('decelerate.v.01'),
 SentiSynset('slow.v.02'),
 SentiSynset('slow.v.03'),
 SentiSynset('slow.a.01'),
 SentiSynset('slow.a.02'),
 SentiSynset('dense.s.04'),
 SentiSynset('slow.a.04'),
 SentiSynset('boring.s.01'),
 SentiSynset('dull.s.08'),
 SentiSynset('slowly.r.01'),
 SentiSynset('behind.r.03')]

In [28]:
# 'father' 단어의 긍정/부정/객관성 지수
# 단어의 품사를 알고있느 경우 사용 가능 
father = sentiwordnet.senti_synset('father.n.01')
father.pos_score(), father.neg_score(), father.obj_score() # 긍정/부정/객관적

(0.0, 0.0, 1.0)

In [32]:
# 'mother' 단어의 긍정/부정/객관성 지수
mother = sentiwordnet.senti_synset('mother.n.01')
mother.pos_score(), father.neg_score(), father.obj_score() # 긍정/부정/객관적

(0.0, 0.0, 1.0)

In [31]:
# 'fabulous' 
fabulous = sentiwordnet.senti_synset('fabulous.a.01')  # 'fabulous' - (adjective) 
fabulous.pos_score(), fabulous.neg_score(), fabulous.obj_score() # 긍정/부정/객관적

(0.875, 0.125, 0.0)

In [33]:
# 'love'
love = sentiwordnet.senti_synset('love.v.01')  # 'love' - (verb) 
love.pos_score(), love.neg_score(), love.obj_score() # 긍정/부정/객관적

(0.5, 0.0, 0.5)

In [34]:
# 명사 / 형용사 / 부사 / 동사
wordnet.NOUN, wordnet.ADJ, wordnet.ADV, wordnet.VERB

('n', 'a', 'r', 'v')

### 감정지수 계산

In [38]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [39]:
# tokenize
from nltk import word_tokenize, pos_tag
sentsnce = "It's good to see you again."
word_list = word_tokenize(sentsnce)
word_list

['It', "'s", 'good', 'to', 'see', 'you', 'again', '.']

In [41]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [42]:
pos_tag(word_list)

[('It', 'PRP'),
 ("'s", 'VBZ'),
 ('good', 'JJ'),
 ('to', 'TO'),
 ('see', 'VB'),
 ('you', 'PRP'),
 ('again', 'RB'),
 ('.', '.')]

In [45]:
def penn_to_wordnet(tag):
  if tag.startswith('N'): # tag.startwith('대상'): tag의 '대상'으로 시작하는 데이터
    return wordnet.NOUN
  if tag.startswith('J'): 
    return wordnet.ADJ
  if tag.startswith('R'): 
    return wordnet.ADV
  if tag.startswith('V'): 
    return wordnet.VERB

In [46]:
for word, pos in pos_tag(word_list):
  print(word, penn_to_wordnet(pos))

It None
's v
good a
to None
see v
you None
again r
. None


### sentence로 부터 senti_synset 객체를 만드는 과정

In [54]:
sentnece = "It's good to see you again."
word_list = [word for word in word_tokenize(sentnece) if len(word) >2]
word_list

['good', 'see', 'you', 'again']

In [55]:
for word, pos in pos_tag(word_list):
  wn_tag = penn_to_wordnet(pos)
  if wn_tag: # None이 아닌 'n', 'a', 'r', 'v' 이면
    synsets = list(sentiwordnet.senti_synsets(word, wn_tag))
    synset = synsets[0]
    print(synset)

<good.a.01: PosScore=0.75 NegScore=0.0>
<see.n.01: PosScore=0.0 NegScore=0.0>
<again.r.01: PosScore=0.0 NegScore=0.0>


In [58]:
# 긍정 점수 - 부정 점수 값
sentiment = 0 
for word, pos in pos_tag(word_list):
  wn_tag = penn_to_wordnet(pos)
  if wn_tag: # None이 아닌 'n', 'a', 'r', 'v' 이면
    synsets = list(sentiwordnet.senti_synsets(word, wn_tag))
    synset = synsets[0]
    sentiment+= synset.pos_score() - synset.neg_score() # 긍정 점수 - 부정 점수 값 더해줌 

print(sentiment)

0.75


### sentnece로부터 감성지수

In [66]:
from nltk import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [68]:
sentiment = 0 
for word, pos in pos_tag(word_list):
  wn_tag = penn_to_wordnet(pos)
  if wn_tag: # None이 아닌 'n', 'a', 'r', 'v' 이면
    lemma = lemmatizer.lemmatize(word, wn_tag)
    synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
    synset = synsets[0]
    sentiment+= synset.pos_score() - synset.neg_score() # 긍정 점수 - 부정 점수 값 더해줌 

print(sentiment)

0.75


### document 에서 감성지수 계산하는 과정 및 함수

In [70]:
from nltk import sent_tokenize

In [71]:
document="""
This is a movie made purely to satisfy the fans and there should be no doubt about that.
No Way Home, in my opinion, is even better than Homecoming and Far From Home, and pretty much one of the best MCU movies of all time. 
It's a simple story, but the execution is fantastic. 
Even the smallest of surprises have a huge impact, and I could feel that in the theatre as I joined several other Spider-Man fans cheer out for both heroes and villains.
The action sequences were brilliant; seeing them in 3D is totally worth the price of admission. 
Every actor delivered a believable, realistic performance, and especially our lead actor Tom Holland. 
The visual effects too were top notch and the editing was stupendous. 
Two and a half hours flew by real quick while watching this popcorn action entertainer. 
It won't be fair to reveal anything, so here I conclude my review, and recommend you to check out this new world of Spidey-ness on the big screen and in 3D. 
And once you've seen it, please don't spoil it for others, just like you won't want it spoiled for yourself.
"""

In [74]:
sentiment = 0.0 
for sentence  in sent_tokenize(document):
  word_list = [word for word in word_tokenize(sentence) if len(word) > 2]

  for word, pos in pos_tag(word_list):
    wn_tag = penn_to_wordnet(pos)
    if wn_tag: # None이 아닌 'n', 'a', 'r', 'v' 이면
      lemma = lemmatizer.lemmatize(word, wn_tag)
      synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
      if not synsets:
        print(word)
        continue

      synset = synsets[0]
      sentiment+= synset.pos_score() - synset.neg_score() # 긍정 점수 - 부정 점수 값 더해줌 

print('긍정' if sentiment >=0 else '부정' )

Homecoming
From
MCU
Spider-Man
lead
popcorn
n't
anything
Spidey-ness
've
n't
others
n't
긍정


In [83]:
# 함수 만들기 ^^
def swn_polarity(text):
  lemmatizer = WordNetLemmatizer()
  sentiment = 0.0 
  for sentence  in sent_tokenize(document):
    word_list = [word for word in word_tokenize(sentence) if len(word) > 2]

    for word, pos in pos_tag(word_list):
      wn_tag = penn_to_wordnet(pos)
      if wn_tag: # None이 아닌 'n', 'a', 'r', 'v' 이면
        lemma = lemmatizer.lemmatize(word, wn_tag)
        synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
        if not synsets:
          #print(word)
          continue

        synset = synsets[0]
        sentiment+= synset.pos_score() - synset.neg_score() # 긍정 점수 - 부정 점수 값 더해줌 

  return 1 if  sentiment >=0 else 0 

### IMDB 영화평 감성분석 

In [77]:
# 맨 처음에 데이터 불러왔어!
# DataFrame 구조로 만들기
df =pd.read_csv('labeledTrainData.tsv', sep='\t', quoting=3) # quoting=3 따옴표 붙여서 
df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [80]:
# <br /> 태그 공백으로 변환하기
df.review = df.review.str.replace('<br />', '') # <br /> : HTML 코드에서 줄바꿈 코드
# 구둣점, 숫자 제거 -> 영문자가 아닌 글자는 공백으로 변환 
df.review = df.review.str.replace('[^A-Za-z]', ' ').str.strip()

  after removing the cwd from sys.path.


In [81]:
df.shape

(25000, 3)

In [82]:
# 데이터 10000개만
df = df.head(10000)

In [85]:
# 'pred' 필드 추가 - 'review' 데이터 감정분석 함수로 1, 0으로 나타냄 
%time df['pred'] = df.review.apply(lambda x: swn_polarity(x)) # swn_polarity(): 위에서 정의한 사용자 지정 함수

CPU times: user 3min 18s, sys: 1.85 s, total: 3min 20s
Wall time: 3min 26s


In [86]:
# 정확도 계산 
from sklearn.metrics import accuracy_score
accuracy_score(df.sentiment, df.pred)

0.5053

In [87]:
## 여기까지는 에러처리 안한 상태!!
# 밑의 함수는 에러처리한 함수

In [None]:
# 강사님 깃허브에 있음 ㅎㅎ

### `VADER Lexicon`을 이용한 감성분석

In [89]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [92]:
# 훨씬 간편한 방법 ~~~!!  
from nltk.sentiment.vader import SentimentIntensityAnalyzer

senti_analyzer = SentimentIntensityAnalyzer()
senti_analyzer.polarity_scores(df.review[0])

# {'compound': -0.7943, 'neg': 0.13, 'neu': 0.743, 'pos': 0.127}

{'compound': -0.7943, 'neg': 0.13, 'neu': 0.743, 'pos': 0.127}

In [93]:
def vader_polarity(document, threshhold=0.1):
  score = senti_analyzer.polarity_scores(document)
  return 1 if score['compound'] >= threshhold else 0

In [94]:
%time df['vader'] = df.review.apply(lambda x: vader_polarity(x, 0.1))

CPU times: user 31.6 s, sys: 200 ms, total: 31.8 s
Wall time: 32.2 s


In [97]:
accuracy_score(df.sentiment, df.vader)

0.6998

In [None]:
# 결론 ) Wordnet Synset 및 Sentiwordnet SentiSynset Class 보다 VADER Lexicon 이게 더 편해 