In [115]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import re
import pandas as pd
# header=0  : the first line of the file holds the column names
# quoting=3 : csv.QUOTE_NONE, i.e. double quotes are not treated as quoting characters
review_df = pd.read_csv('./labeledTrainData.tsv', sep='\t', header=0, quoting=3)
review_df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [116]:
# Print the review stored under index label 0.
print(review_df['review'][0])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

#### str 클래스의 replace를 이용하여 `<br />` 태그를 빈 문자열('')로 변환하세요

#### 정규표현식을 이용하여 영어 문자열이 아닌 문자는 모두 공백으로 변환하세요(apply와 lambda이용)

In [117]:
# Remove the '<br />' markup first, then replace every character that is not
# an ASCII letter with a single space.
review_df['review'] = review_df['review'].apply(
    lambda text: re.sub('[^a-zA-Z]', ' ', text.replace('<br />', ''))
)
review_df['review'][0]

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him The actual feature film bit when it finally starts is only on for    m

#### 학습/테스트 데이터 분리하세요(train_test_split함수 이용)

In [118]:
# Target is the sentiment label; features are everything except the label and the id.
Y_target = review_df['sentiment']
X_data = review_df.drop(columns=['sentiment', 'id'])

# 70/30 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, Y_target, test_size=0.3, random_state=156
)

In [119]:
# Display the training feature frame.
X_train

Unnamed: 0,review
3724,This version moved a little slow for my taste...
23599,I really enjoyed this film because I have a t...
11331,Saw this in the theater in and fell out o...
15745,Recently I was looking for the newly issued W...
845,Escaping the life of being pimped by her fath...
...,...
6955,This is a generally nice film with good stor...
7653,The real shame of The Gathering is not in...
9634,In what could have been an otherwise run of t...
6860,Excellent P O W adventure adapted by Eric W...


In [120]:
# Display the training labels.
y_train

3724     0
23599    1
11331    1
15745    1
845      1
        ..
6955     1
7653     0
9634     0
6860     1
24108    0
Name: sentiment, Length: 17500, dtype: int64

#### 파이프라인에 LogisticRegression 객체를 넣어 학습하고 예측하세요
- CountVectorizer 이용
    - stop_words='english', ngram_range=(1, 2)
- 성능평가 출력은 정확도를 사용하세요.(C=10)

In [121]:
# Count-vectorize the text (English stop words removed, unigrams and bigrams),
# then classify with logistic regression (C=10).
pipeline = Pipeline(steps=[
    ('count_vect', CountVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('lr_clf', LogisticRegression(C=10)),
])

# fit() returns the fitted pipeline, so prediction can be chained onto it.
pred = pipeline.fit(X_train['review'], y_train).predict(X_test['review'])

print('Pipeline & Count Vectorized - Logistic Regression의 예측 정확도는 : {0:.3f}'.format(
    accuracy_score(y_test, pred)
))

Pipeline & Count Vectorized - Logistic Regression의 예측 정확도는 : 0.886


In [122]:
import nltk
from nltk.corpus import sentiwordnet as swn

In [123]:
father = swn.senti_synset('father.n.01')
fabulous = swn.senti_synset('fabulous.a.01')

# Print the positive, negative and objectivity scores of each synset,
# separated by one blank line.
for position, (label, senti) in enumerate((('father', father), ('fabulous', fabulous))):
    if position:
        print()
    print(f'{label} 긍정감성 지수 : ', senti.pos_score())
    print(f'{label} 부정감성 지수 : ', senti.neg_score())
    print(f'{label} 객관성 지수 : ', senti.obj_score())

father 긍정감성 지수 :  0.0
father 부정감성 지수 :  0.0
father 객관성 지수 :  1.0

fabulous 긍정감성 지수 :  0.875
fabulous 부정감성 지수 :  0.125
fabulous 객관성 지수 :  0.0


In [124]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag
# pos_tag : 입력 단어에 세부품사를 추출하는 모듈
from nltk.corpus import wordnet as wn

### 비지도학습 기반 감성 분석 실습 - IMDB 영화평

In [125]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import re
# Reload the labelled reviews and apply the same cleaning as before:
# remove '<br />' markup, then replace every non-letter character with a space.
review_df = pd.read_csv('./labeledTrainData.tsv', sep='\t', header=0, quoting=3)
review_df['review'] = review_df['review'].apply(
    lambda text: re.sub('[^a-zA-Z]', ' ', text.replace('<br />', ''))
)

from sklearn.model_selection import train_test_split
class_df = review_df['sentiment']
feature_df = review_df.drop(columns=['id', 'sentiment'])
X_train, X_test, y_train, y_test = train_test_split(
    feature_df, class_df, test_size=0.3, random_state=156
)

In [126]:
# 품사태그를 전달 받아서 해당 품사객체를 리턴하는 함수 제작
def penn_to_wn(tag):
    """Map a POS tag string to the matching WordNet POS constant.

    Tags starting with 'J' map to ``wn.ADJ``, 'N' to ``wn.NOUN``,
    'R' to ``wn.ADV`` and 'V' to ``wn.VERB``.

    Returns:
        The WordNet constant, or None when the tag matches none of the prefixes.
    """
    # (tag prefix, attribute name on ``wn``), checked in this order.
    prefix_to_attr = (
        ('J', 'ADJ'),   # adjective
        ('N', 'NOUN'),  # noun
        ('R', 'ADV'),   # adverb
        ('V', 'VERB'),  # verb
    )
    for prefix, attr_name in prefix_to_attr:
        if tag.startswith(prefix):
            return getattr(wn, attr_name)
    return None

In [127]:
# 문장을 전달인수로 받고, 부정감성지수와 긍정감성지수의 연산으로 결정된
# 0 또는 1을 리턴하는 함수 제작
def swn_polarity(text):
    """Classify ``text`` as positive (1) or negative (0) using SentiWordNet scores.

    The text is split into sentences; each sentence is tokenized and POS-tagged.
    For every noun, adjective or adverb that has at least one WordNet synset,
    ``pos_score - neg_score`` of the first synset is added to a running total
    that spans the whole text.

    Args:
        text: The text to score, e.g. ``review_df['review'][0]``.

    Returns:
        1 if at least one token was scored and the accumulated score is >= 0,
        otherwise 0 (including when no token could be scored).
    """
    sentiment = 0.0   # accumulated (positive - negative) score over all scored tokens
    tokens_count = 0  # number of tokens that contributed to ``sentiment``

    # Reduces each word to its base form before the WordNet lookup.
    lemmatizer = WordNetLemmatizer()

    for raw_sentence in sent_tokenize(text):
        for word, tag in pos_tag(word_tokenize(raw_sentence)):
            wn_tag = penn_to_wn(tag)
            # Only nouns, adjectives and adverbs are scored; skip everything else.
            if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
                continue

            lemma = lemmatizer.lemmatize(word, pos=wn_tag)
            synsets = wn.synsets(lemma, pos=wn_tag)
            if not synsets:  # word/POS pair unknown to WordNet
                continue

            # Score the first synset returned by WordNet: positive adds, negative subtracts.
            swn_synset = swn.senti_synset(synsets[0].name())
            sentiment += swn_synset.pos_score() - swn_synset.neg_score()
            tokens_count += 1

    # Decide only after every sentence has been processed. Keeping this outside
    # the loops also guarantees an int result when the text yields no sentences.
    if not tokens_count:
        return 0

    # A total score of 0 or more is positive (1); anything below is negative (0).
    return 1 if sentiment >= 0 else 0

In [128]:
# Spot check: classify the review stored under index label 0.
swn_polarity(review_df['review'][0])

0

- NNP : 단순 고유명사
- VB : 동사
- VBP : 동사 현재형
- TO : to 전치사
- NN : 명사(단수형 또는 집합형)
- DT : 관형사
- RB : 부사

In [None]:
# Score every review with the SentiWordNet classifier and store the 0/1 result.
review_df['preds'] = review_df['review'].apply(swn_polarity)

In [None]:
y_target = review_df['sentiment'].values # ground-truth labels
preds = review_df['preds'].values # predicted labels

In [None]:
# Evaluation metrics from scikit-learn for the SentiWordNet predictions.
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import recall_score, f1_score, roc_auc_score

# Confusion matrix first, then accuracy, precision and recall.
print(confusion_matrix(y_target, preds))
print("정확도 : ", accuracy_score(y_target, preds))
print("정밀도 : ", precision_score(y_target, preds))
print("재현율 : ", recall_score(y_target, preds))

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Run VADER on the review stored under index label 3 and print the result
# returned by polarity_scores().
senti_analyzer = SentimentIntensityAnalyzer()
senti_scores = senti_analyzer.polarity_scores(review_df['review'][3])
print(senti_scores)

In [None]:
def vader_polarity(review, threshold=0.1) :
    analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores(review)
    
    