<a href="https://colab.research.google.com/github/joyfulspace/ADP/blob/master/10_%EB%82%98%EC%9D%B4%EB%B8%8C%EB%B2%A0%EC%9D%B4%EC%A6%88.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 나이브 베이즈
* A와 B가 일어날 확률
  - 두 사건이 독립이라면: $P(A) \times P(B)$
  - A가 일어났다는 조건 하에서 B가 일어날 확률 $P(B|A)$
  - A와 B가 연달아서 일어날 확률: $P(A)*P(B|A)=P(A\cap B)$ (A가 사전, B가 사후)
  - B가 일어나고 A가 일어날 확률: $P(B)*P(A|B)=P(A\cap B)$
  - 따라서 $P(A)*P(B|A)=P(B)*P(A|B)$ : 베이즈 정리
* 나이브 베이즈: 베이즈 정리를 이용한 분류 기법. A가 일어났을 때 B가 일어날 확률과 C가 일어났을 때 B가 일어날 확률이 서로 다르다는 점을 이용하며, 각 특성(단어)이 서로 독립이라고 단순하게(naive) 가정한다
  - 긍정, 부정을 예측하거나 주제 분류, 텍스트 마이닝에서 많이 사용
  - ex. 스팸메일 찾기
    - $P(스팸메일|'광고')=P('광고'|스팸메일)*P(스팸메일)/P('광고')$
    - 광고 단어가 있을 때 스팸메일일 확률 = 스팸메일인데 광고 단어가 있을 확률 * 스팸메일일 확률 / '광고' 단어가 있을 확률
    - 여러 개의 단어가 나타났을 때 스팸일지 정상일지?
    - 긴 텍스트라도 쪼개서 확률 계산 가능

In [None]:
import pandas as pd
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [None]:
!git clone https://github.com/joyfulspace/ADP.git

Cloning into 'ADP'...
remote: Enumerating objects: 2488, done.[K
remote: Counting objects: 100% (2181/2181), done.[K
remote: Compressing objects: 100% (2175/2175), done.[K
remote: Total 2488 (delta 23), reused 2135 (delta 4), pack-reused 307[K
Receiving objects: 100% (2488/2488), 55.99 MiB | 24.73 MiB/s, done.
Resolving deltas: 100% (145/145), done.


In [None]:
import chardet

# Movie-review data (note from the original author: the file is missing from the cloned repo)
with open('ADP/data/imdb_master.csv', mode='rb') as f:
  # Let chardet guess the encoding from the raw bytes of the whole file
  result = chardet.detect(f.read())

detected_encoding = result['encoding']
train = pd.read_csv('ADP/data/imdb_master.csv', encoding=detected_encoding)
train.head()

FileNotFoundError: ignored

In [None]:
# Drop the columns at positions 0, 1 and 4.
# `train` is reassigned, so running this cell again drops further columns.
train = train.drop(columns=train.columns[[0, 1, 4]])

In [None]:
# Encode the label as an integer: 0 for 'neg', 1 for every other value
train.label = [int(sentiment != 'neg') for sentiment in train.label]

In [None]:
# Preview the first rows of `train`
train.head()

In [None]:
# Work on a copy: a plain `d = train` aliases the same DataFrame, so renaming the
# columns here would also rename them on `train` and break a re-run of the
# earlier cell that reads `train.label`.
d = train.copy()
d.columns = ['user_review', 'positive']
d.head()

NameError: ignored

In [None]:
# Split into train and test sets: the first `split` fraction of rows is used for
# training and the remaining rows for testing.
# Both slices share one cut index so they cannot overlap. The previous test slice
# started at (1 - split), so it shared rows with the training slice.
split = 0.7
split_idx = int(split*len(d))
d_train = d[:split_idx]
d_test = d[split_idx:]

### 1. 나이브베이즈

In [None]:
import nltk

def get_filtering(text): # tokenizer handed to the vectorizers to build the features
  """Remove every character that is not an ASCII letter or a space, then tokenize.

  Args:
    text: the raw review string.

  Returns:
    The list of tokens produced by ``nltk.word_tokenize`` on the cleaned text.
  """
  # Local import: `re` is not imported anywhere above this cell.
  import re
  # The character class must be closed with `]`; the unterminated pattern raised re.error.
  text = re.sub('[^A-Za-z ]+', '', text)
  texts = nltk.word_tokenize(text)
  return texts

from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts over the training reviews, tokenized with get_filtering
vectorizer = CountVectorizer(tokenizer=get_filtering)
features = vectorizer.fit_transform(d_train['user_review'])

In [None]:
# Inspect a small window of the document-term matrix:
# 30 vocabulary columns starting at column i, for 10 documents starting at row j.
i = 30000
j = 100
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it and fall back to the old name otherwise.
get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
words = list(get_names()[i:i+30])
pd.DataFrame(features[j:j+10, i:i+30].todense(), columns=words)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit multinomial naive Bayes on the current `features`, then get class
# probabilities for the test reviews (fit() returns the fitted estimator).
model1 = MultinomialNB().fit(features, d_train.positive)

pred1 = model1.predict_proba(vectorizer.transform(d_test.user_review))
(pred1, pred1.shape)

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, roc_curve
from matplotlib.pyplot import plot, xlabel, ylabel, annotate, xlim, ylim, title
def performance(y_true, pred, color='g', ann=True):
  """Draw the ROC curve for binary predictions and optionally annotate accuracy and AUC.

  Args:
    y_true: true binary labels.
    pred: 2-D array of class scores; column 1 (``pred[:, 1]``) is used as the
      positive-class score.
    color: format/colour argument forwarded to ``plot``.
    ann: when True, write the accuracy (score thresholded at 0.5) and the AUC
      onto the current axes.
  """
  scores = pred[:,1]
  acc = accuracy_score(y_true, scores>0.5)
  auc = roc_auc_score(y_true, scores)
  fpr, tpr, _ = roc_curve(y_true, scores)
  plot(fpr, tpr, color, linewidth=3)
  xlabel('False positive rate')
  ylabel('True positive rate')
  if ann:
    annotate('Acc: %0.2f' % acc, (0.1,0.8), size=14)
    # Report the AUC itself here; this line used to print the accuracy a second time.
    annotate('AUC: %0.2f' % auc, (0.1,0.7), size=14)

In [None]:
# ROC curve plus accuracy/AUC annotations for the pred1 predictions
performance(d_test.positive, pred1)

### 2. 나이브베이즈 + Tf-idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weights over the training reviews (rebinds `vectorizer` and `features`)
vectorizer = TfidfVectorizer(tokenizer=get_filtering)
features = vectorizer.fit_transform(d_train['user_review'])

In [None]:
# Display the sparse matrix produced by transforming the test reviews
vectorizer.transform(d_test.user_review)

In [None]:
# Model comparison: count features (pred1) vs. TF-IDF features (pred2).
# A model has to be scored on the same feature space it was fitted on, so fit a
# separate naive Bayes model on the TF-IDF `features` rather than reusing
# `model1`, which was fitted on the CountVectorizer features.
model1_tfidf = MultinomialNB().fit(features, d_train.positive)
pred2 = model1_tfidf.predict_proba(vectorizer.transform(d_test.user_review))
performance(d_test.positive, pred1, ann=False)
performance(d_test.positive, pred2, color='b') # blue = TF-IDF (the original run noted it scored better; re-check)
xlim(0, 0.5)
ylim(0.5, 1)

In [None]:
# Hyper-parameter search
# Candidate values for each option to explore
param_ranges = dict(
    max_features=[10000, 30000, 50000, None],
    min_df=[1, 2, 3],
    nb_alpha=[0.01, 0.1, 1.0],
)

In [1]:
# Build and score one TF-IDF + naive Bayes model for the given options
def build_model(max_features=None, min_df=1, nb_alpha=1.0, return_preds=False):
  """Fit TF-IDF + MultinomialNB on ``d_train`` and score it on ``d_test``.

  Reads the module-level ``d_train`` / ``d_test`` frames and ``get_filtering``.

  Args:
    max_features: forwarded to ``TfidfVectorizer``.
    min_df: forwarded to ``TfidfVectorizer``.
    nb_alpha: forwarded to ``MultinomialNB`` as ``alpha``.
    return_preds: when True, include the test probabilities under ``'preds'``.

  Returns:
    A dict with the three option values and the test ``'auc'``
    (plus ``'preds'`` when requested).
  """
  tfidf = TfidfVectorizer(tokenizer=get_filtering, max_features=max_features, min_df=min_df)
  train_matrix = tfidf.fit_transform(d_train.user_review)
  nb = MultinomialNB(alpha=nb_alpha).fit(train_matrix, d_train.positive)
  pred = nb.predict_proba(tfidf.transform(d_test.user_review))
  res = dict(
      max_features=max_features,
      min_df=min_df,
      nb_alpha=nb_alpha,
      auc=roc_auc_score(d_test.positive, pred[:,1]),
  )
  if return_preds:
    res['preds'] = pred
  return res

In [None]:
# Build a model for every combination of options, storing and printing its metrics
from itertools import product
results = []
option_names = list(param_ranges.keys())
for p in product(*param_ranges.values()):
  # Pair each option name with this combination's value and pass them as keywords
  res = build_model(**dict(zip(option_names, p)))
  results.append(res)
  print(res)

In [None]:
# Collect the per-combination results into a DataFrame
opt = pd.DataFrame(results)

In [None]:
# With the grid ordered as max_features x min_df x nb_alpha (4 x 3 x 3), rows
# 0, 9, 18, 27 differ only in max_features (min_df and nb_alpha at their first value).
mf_idx = [0, 9, 18, 27]
plot(opt.loc[mf_idx, 'max_features'], opt.loc[mf_idx, 'auc'], linewidth=2)
title('AUC vs max_features')

In [None]:
# The grid is ordered max_features x min_df x nb_alpha (4 x 3 x 3), so rows 0, 3, 6
# keep max_features and nb_alpha at their first value and vary only min_df.
# Plot min_df on the x-axis to match the title.
mdf_idx = [0, 3, 6]
plot(opt.min_df[mdf_idx], opt.auc[mdf_idx], linewidth=2)
title('AUC vs min_df')

In [None]:
# nb_alpha is the innermost option of the grid, so rows 0, 1, 2 keep max_features
# and min_df at their first value and vary only nb_alpha.
nba_idx = [0, 1, 2]
plot(opt.nb_alpha[nba_idx], opt.auc[nba_idx], linewidth=2)
title('AUC vs alpha')

In [None]:
# Compare three models on one ROC plot; only the last (red) curve is annotated
pred3 = build_model(return_preds=True, nb_alpha=0.01)['preds']
for _preds, _color, _ann in ((pred1, 'g', False), (pred2, 'b', False), (pred3, 'r', True)):
  performance(d_test.positive, _preds, color=_color, ann=_ann)
xlim(0, 0.5)
ylim(0.5, 1)

### 3. 랜덤 포레스트+Tf-idf

In [None]:
# TF-IDF features: accents stripped, min_df=3, at most 30000 terms, L2-normalised
vectorizer = TfidfVectorizer(max_features=30000, min_df=3, strip_accents='unicode', norm='l2')
features = vectorizer.fit_transform(d_train['user_review'])

In [None]:
# Naive Bayes on the TF-IDF features (this rebinds pred3)
model3 = MultinomialNB().fit(features, d_train.positive)
pred3 = model3.predict_proba(vectorizer.transform(d_test['user_review']))
performance(d_test.positive, pred3)

In [None]:
# 랜덤포레스트
from sklearn.ensemble import RandomForestClassifier
# random_state is fixed so that repeated runs build the same forest and
# reproduce the same scores.
model2 = RandomForestClassifier(n_estimators=100, random_state=42)
model2.fit(features, d_train.positive)

In [None]:
# Evaluate the random forest + TF-IDF model on the test reviews
pred2 = model2.predict_proba(vectorizer.transform(d_test['user_review']))
performance(d_test['positive'], pred2)

In [4]:
# Apply the random forest + TF-IDF model to a few hand-written example reviews
examples = ["First of all I hate those moronic rappers",
            "She was approached with her own show... she jumped ship on Drake and Josh. They then decided that maybe they would do a ",
            "I found a couple of topics unusually explicitly addressed, and until the end,",
            "One Great movie, I've watched it several times",
            "It's a grisly movie if you are intereseted in that, and there's often a morbid focus on food to accompany events, like a ",
            "I think many people are annoyed with this film because it's being pushed as a horror film--which it isn't. So, if you "
]

# Predicted class for each example
model2.predict(vectorizer.transform(examples))

### 4. word2vec 사용
* Doc2vec: Word2vec을 이용하여 텍스트를 단어처럼 하나의 벡터로 나타낸 것.

In [9]:
import nltk
# Download the NLTK data packages used below: the 'punkt' tokenizer data
# and the averaged-perceptron POS tagger.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
import re

# Actor names treated as stop words because they carry no positive/negative
# sentiment (a subjective choice). Entries are mixed-case and some contain a space.
stop_words = set(['Tom Cruise', 'Johansson', 'Reynolds', 'Nicole Kidman', 'DiCaprio'])

# Tokenize on spaces, keeping only letters and digits
def tokenize(docs):
  """Lower-case each document, split it on single spaces and strip non-alphanumerics.

  Words that belong to an entry of the module-level ``stop_words`` are removed.
  Empty tokens (from trailing or repeated spaces, or punctuation-only words)
  are kept, as before.

  Args:
    docs: iterable of strings.

  Returns:
    A list with one list of tokens per document.
  """
  pattern = re.compile(r'[\W_]+', re.UNICODE)
  # Tokens are lower-cased single words, whereas stop_words holds mixed-case and
  # multi-word names. Normalise the names to the same form, otherwise nothing
  # is ever filtered out.
  blocked = {part for name in stop_words for part in pattern.sub(' ', name.lower()).split()}
  sentences = []

  for doc in docs:
    words = [pattern.sub('', w) for w in doc.lower().split(" ")]
    sentences.append([w for w in words if w not in blocked])

  return sentences

# Extract the non-empty words tagged as noun, pronoun, verb, adverb or adjective
def get_noun(text):
  """Return the words of ``text`` whose POS tag is NNP, NN, PRP, VBP, RB or JJ.

  Characters other than ASCII letters and spaces are removed before tokenizing.
  """
  kept_tags = ('NNP', 'NN', 'PRP', 'VBP', 'RB', 'JJ')
  letters_only = re.sub('[^A-Za-z ]+', '', text)
  tagged = nltk.pos_tag(nltk.word_tokenize(letters_only))
  return [word for (word, tag) in tagged if tag in kept_tags and len(word)>0]

# Apply get_noun to every document
def tokenize2(docs):
  """Return one ``get_noun`` token list per document in ``docs``."""
  return [get_noun(doc) for doc in docs]

In [5]:
# Show the tokenize() output for the example reviews
tokenize(examples)

[['first', 'of', 'all', 'i', 'hate', 'those', 'moronic', 'rappers'],
 ['she',
  'was',
  'approached',
  'with',
  'her',
  'own',
  'show',
  'she',
  'jumped',
  'ship',
  'on',
  'drake',
  'and',
  'josh',
  'they',
  'then',
  'decided',
  'that',
  'maybe',
  'they',
  'would',
  'do',
  'a',
  ''],
 ['i',
  'found',
  'a',
  'couple',
  'of',
  'topics',
  'unusually',
  'explicitly',
  'addressed',
  'and',
  'until',
  'the',
  'end'],
 ['one', 'great', 'movie', 'ive', 'watched', 'it', 'several', 'times'],
 ['its',
  'a',
  'grisly',
  'movie',
  'if',
  'you',
  'are',
  'intereseted',
  'in',
  'that',
  'and',
  'theres',
  'often',
  'a',
  'morbid',
  'focus',
  'on',
  'food',
  'to',
  'accompany',
  'events',
  'like',
  'a',
  ''],
 ['i',
  'think',
  'many',
  'people',
  'are',
  'annoyed',
  'with',
  'this',
  'film',
  'because',
  'its',
  'being',
  'pushed',
  'as',
  'a',
  'horror',
  'filmwhich',
  'it',
  'isnt',
  'so',
  'if',
  'you',
  '']]

In [12]:
tokenize2(examples) # a more structured form

[['First', 'I', 'hate', 'moronic'],
 ['She',
  'own',
  'show',
  'she',
  'ship',
  'Drake',
  'Josh',
  'They',
  'then',
  'maybe',
  'they'],
 ['I', 'couple', 'unusually', 'explicitly', 'end'],
 ['Great', 'movie', 'Ive', 'it', 'several'],
 ['grisly', 'movie', 'you', 'are', 'often', 'morbid', 'focus', 'food'],
 ['I',
  'think',
  'many',
  'are',
  'film',
  'horror',
  'filmwhich',
  'it',
  'So',
  'you']]

In [None]:
# Tokenize the training reviews with both tokenizers and compare the counts
sentences = tokenize(d_train['user_review'])
sentences2 = tokenize2(d_train['user_review'])
(len(sentences), len(sentences2))

In [None]:
from gensim.models.word2vec import Word2Vec
# size: length of each word vector; window: how many neighbouring words count as context; workers: number of threads
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 renamed it to `vector_size`) — confirm the installed version.
model = Word2Vec(sentences2, size=300, window=10, min_count=1, sample=1e-3, workers=2)

In [None]:
# Once training is finished, init_sims(replace=True) releases memory that is no
# longer needed. (The method name was misspelled `inti_sims`, which raises AttributeError.)
model.init_sims(replace=True)

In [13]:
# word2vec을 이용하여 텍스트를 수치화. doc2vec
from numpy import zeros
def featurize_w2v(model, sentences):
  """Turn each tokenized sentence into one vector by averaging its word vectors.

  Args:
    model: object exposing ``vector_size`` and word lookup, either through a
      ``wv`` attribute (gensim ``Word2Vec``) or directly via ``model[word]``.
    sentences: sequence of token lists.

  Returns:
    A ``(len(sentences), model.vector_size)`` array. Each row is the sum of the
    vectors of the known words divided by the total number of tokens in the
    sentence; unknown words add nothing. An empty sentence yields a zero row.
  """
  # gensim 4 only supports lookup through `.wv`; fall back to the model itself
  # for objects that are directly subscriptable.
  vectors = getattr(model, 'wv', model)
  f = zeros((len(sentences), model.vector_size))
  for i, s in enumerate(sentences):
    if len(s) == 0:
      # Avoid 0/0, which would fill the row with NaN.
      continue
    for w in s:
      try:
        vec = vectors[w]
      except KeyError:
        # Word not in the vocabulary: skip it.
        continue
      f[i,:] = f[i,:] + vec
    f[i,:] = f[i,:]/len(s)
  return f

In [None]:
# NOTE(review): the Word2Vec model was trained on `sentences2` (tokenize2 output) but is
# applied here to `sentences` (tokenize output) — confirm this is intended.
features_w2v = featurize_w2v(model, sentences)
features_w2v.shape, type(features_w2v) # the original author noted (70000, 300), numpy.ndarray

In [None]:
# doc2vec + random forest model
# random_state is fixed so that repeated runs build the same forest.
model4 = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
model4.fit(features_w2v, d_train.positive)

In [None]:
# Tokenize the test reviews (this calls tokenize(), not tokenize2())
test_sentences = tokenize(d_test.user_review)

In [None]:
test_features_w2v = featurize_w2v(model, test_sentences) # turn the test sentences into vectors

In [None]:
# Score the doc2vec + random forest model on the test sentences (70,000 in the original run)
pred4 = model4.predict_proba(test_features_w2v)
pred4.shape # (number of test sentences, 2): one probability column per class (negative / positive)

In [None]:
# Predicted label per test review: '부정' (negative) when column 0 of pred4 is larger
# than column 1, otherwise '긍정' (positive).
# Display only the first rows rather than printing one line for every test review.
pred4_labels = pd.Series(['부정' if neg > pos else '긍정' for neg, pos in pred4], name='predicted')
pred4_labels.head(20)

In [None]:
# ROC curve for the doc2vec + random forest model, shown over the full [0, 1] range
performance(list(d_test.positive), pred4, color='b')
xlim(0, 1)
ylim(0, 1)

In [None]:
# Try the doc2vec + random forest model on the hand-written examples.
# NOTE(review): model4 was fitted on features built from tokenize() output, while these
# examples go through tokenize2() — confirm which tokenizer is intended.
example_feat4 = featurize_w2v(model, tokenize2(examples))
# Show the predictions together with the shape; as a bare middle statement the
# predictions were computed but never displayed.
model4.predict(example_feat4), example_feat4.shape # shape noted by the author: (6, 300) = 6 sentences x 300 dimensions