# PV-DM

#### doc2vec 만들기

In [3]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument # 문장마다 paragraph ID, tag ID

samples = ['너 오늘 이뻐 보인다.',
          '나는 오늘 기분이 더러워',
          '나 좋은 일이 생겼어',
          '아 오늘 진짜 짜증나',
          '환상적인데, 정말 좋은거 같아']

sentences = [s.split() for s in samples]

In [4]:
sentences

[['너', '오늘', '이뻐', '보인다.'],
 ['나는', '오늘', '기분이', '더러워'],
 ['나', '좋은', '일이', '생겼어'],
 ['아', '오늘', '진짜', '짜증나'],
 ['환상적인데,', '정말', '좋은거', '같아']]

In [7]:
documents = [TaggedDocument(doc, [f'd{i}']) for i, doc in enumerate(sentences)]
documents 

[TaggedDocument(words=['너', '오늘', '이뻐', '보인다.'], tags=['d0']),
 TaggedDocument(words=['나는', '오늘', '기분이', '더러워'], tags=['d1']),
 TaggedDocument(words=['나', '좋은', '일이', '생겼어'], tags=['d2']),
 TaggedDocument(words=['아', '오늘', '진짜', '짜증나'], tags=['d3']),
 TaggedDocument(words=['환상적인데,', '정말', '좋은거', '같아'], tags=['d4'])]

In [8]:
# PV-DM 모델을 생성한다.
model = Doc2Vec(size = 5, alpha = 0.025, min_alpha = 0.00025, min_count = 1, dm = 1)
# size: 벡터 크기(하나의 워드에 대해 5개의 벡터로 출력할 것
# alpha: 학습율, min_count: 단어들이 최소 한번 쓰인 것을 가지고 모델을 만들어라
# dm: 1, pd-dm으로 학습 0, pv-dbow로 학습
# PV-DM 모델을 학습한다.
model.build_vocab(documents)
model.train(documents, total_examples = len(samples), epochs = 100)
# 태그 붙은 문장을 넣어주고, 전체 문장 개수(total_examples) 6

In [9]:
# word vector를 확인해본다.
model.wv['보인다.']

array([-0.09755615, -0.03389874, -0.09121843, -0.07583982, -0.05491541],
      dtype=float32)

In [10]:
# paragraph vector를 확인해 본다.
model.docvecs[0] # "너 오늘 이뻐 보인다"의 벡터

array([-0.04046106,  0.07178956, -0.07444929, -0.02096957,  0.01386234],
      dtype=float32)

In [11]:
# doc2vec을 쓰면 문장마다 하나의 벡터로 만들 수 있다.
model.docvecs.vectors_docs

array([[-0.04046106,  0.07178956, -0.07444929, -0.02096957,  0.01386234],
       [-0.02511991,  0.0449729 ,  0.00684274, -0.09887459,  0.02967102],
       [-0.0140421 , -0.09616086,  0.00980749,  0.05497954,  0.09260675],
       [-0.00475911, -0.08969692, -0.08260957, -0.05395378, -0.09687751],
       [ 0.01370306, -0.09600275,  0.09940554,  0.0561846 , -0.09013303]],
      dtype=float32)

새로운 문장에 대한 pv를 inference한다.

In [12]:
#inference stage
model.infer_vector('오늘 좋은 일이 있을 것 같아.'.split())
# 5개의 벡터로 추론해준다.

array([-0.04696345, -0.10669488, -0.06712131, -0.03154682, -0.07034492],
      dtype=float32)

### 예제 살펴보기
Doc2Vec과 Logistic Regression을 이용한 영화리뷰 데이터 분류

In [80]:
import pandas as pd
import numpy as np

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [81]:
TRAIN_CLEAN_DATA = '4-1.train_clean.csv'
DATA_IN_PATH = 'C:/inkyun/실습파일과 교재/5.자연어처리(실습파일)/dataset/'

In [82]:
train_data = pd.read_csv(DATA_IN_PATH + TRAIN_CLEAN_DATA)
reviews = list(train_data['review'])
sentiments = list(train_data['sentiment'])

In [83]:
sentences = []
for review in reviews:
    sentences.append(review.split())

In [84]:
model_name = '4-1.300features.doc2vec'
model_saved = True

In [85]:
if model_saved:
    model = Doc2Vec.load(DATA_IN_PATH + model_name)
else:
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(sentences)]

    model = Doc2Vec(vector_size=300, alpha=0.025, min_alpha = 0.00025,
                   min_count=10, workers = 4, dm = 1)
    model.build_vocab(documents)
    model.train(documents, total_examples=model.corpus_count, epochs=10)
    model.save(DATA_IN_PATH + model_name)

In [86]:
keys = list(model.wv.vocab.keys())[:20]
print(keys)

['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'moonwalker', 'maybe', 'want', 'get', 'certain', 'insight', 'guy', 'thought']


In [88]:
print(model.wv.similarity('dog','cat'), model.wv.similarity('dog','cake'), sep='\n')

0.5822584
0.05612491


In [89]:
print(np.dot(model.wv['dog'], model.wv['cat']), np.dot(model.wv['dog'], model.wv['cake']),sep = '\n')

49.470947
2.869419


In [90]:
model.wv.most_similar('dog')

[('chicken', 0.592168390750885),
 ('cat', 0.5822584629058838),
 ('puppy', 0.5540207624435425),
 ('cats', 0.5450679063796997),
 ('eat', 0.5350972414016724),
 ('dogs', 0.5316440463066101),
 ('bunny', 0.5241804122924805),
 ('worm', 0.5236088633537292),
 ('bite', 0.5121653079986572),
 ('bike', 0.5107408761978149)]

In [92]:
new_sentence = model.infer_vector(['system','response','cpu','compute'])

In [101]:
RANDOM_SEED = 42
TEST_SPLIT = 0.2
X = np.array([model.docvecs[i] for i in range(len(sentences))])
y = np.array(sentiments)

In [102]:
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=TEST_SPLIT,
 random_state=RANDOM_SEED)

In [103]:
lgs = LogisticRegression(class_weight = 'balanced', solver = 'newton-cg')
lgs.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [104]:
predicted = lgs.predict(X_eval)
print(predicted[:20])
print('Accuracy: %f' % lgs.score(X_eval, y_eval))

[0 1 0 1 0 1 1 1 0 1 0 0 0 1 0 1 0 1 1 1]
Accuracy: 0.845200


### FNN

In [105]:
from tensorflow.keras.layers import LSTM, Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [123]:
xInput = Input(batch_shape = (None,300))
xHidden1 = Dense(512)(xInput)
xHidden2 = Dense(256)(xHidden1)
xHidden3 = Dense(128)(xHidden2)
xOutput = Dense(1, activation = 'sigmoid')(xHidden3)
model2 = Model(xInput, xOutput)
model2.compile(loss='binary_crossentropy',optimizer=Adam(lr=0.05) ,metrics=['accuracy'])
model2.fit(X_train,y_train, epochs=100, batch_size = 200)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x2022949af08>

In [116]:
from sklearn.metrics import accuracy_score

In [125]:
accuracy_score(pred,y_eval)

0.8146

### word2vec - 8/5 gensim 복습하고 돌아오기

In [135]:
from gensim.models.word2vec import Word2Vec

In [136]:
num_features = 300 # 문자 벡터 차원 수
min_word_count = 40 # 최소 문자 수
num_workers = 4 # 병렬 처리 스레드 수
context = 10 # 문자열 창 크기
downsampling = 1e-3 # 문자 빈도수 Downsample

# 초기화 및 모델 학습
from gensim.models import word2vec

# 모델 학습
model = word2vec.Word2Vec(sentences, 
                          workers=num_workers, 
                          size=num_features, 
                          min_count=min_word_count,
                          window=context,
                          sample=downsampling)
model

<gensim.models.word2vec.Word2Vec at 0x202296bf948>

In [142]:
model.wv(sentences)

TypeError: 'Word2VecKeyedVectors' object is not callable