### 임베딩
##### 1. 희소 표현 기반 임베딩
##### 2. 횟수 기반 임베딩
##### 3. 예측 기반 임베딩
##### 4. 횟수/예측 기반 임베딩


In [14]:
# 1. 희소 표현 기반 임베딩
import numpy as np
import pandas as pd
class2 = pd.read_csv("080263/chap10/data/class2.csv")

from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
onehot_encoder = preprocessing.OneHotEncoder(sparse_output=False)

var = np.array(class2['class2']).reshape(-1, 1)
train_x = onehot_encoder.fit_transform(var)
print(train_x)

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]


In [15]:
# 2. 횟수 기반 임베딩
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'This is last chance',
    'and if you do not have this chance',
    'you will never get any chance',
    'will you do get this one?',
    'please, get this chance'
]
vect = CountVectorizer()
vect.fit(corpus)
vect.vocabulary_

{'this': 13,
 'is': 7,
 'last': 8,
 'chance': 2,
 'and': 0,
 'if': 6,
 'you': 15,
 'do': 3,
 'not': 10,
 'have': 5,
 'will': 14,
 'never': 9,
 'get': 4,
 'any': 1,
 'one': 11,
 'please': 12}

In [16]:
vect.transform(['you will never get any chance']).toarray()

array([[0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1]])

In [17]:
vect = CountVectorizer(stop_words=['and', 'is', 'please', 'this']).fit(corpus)
vect.vocabulary_

{'last': 6,
 'chance': 1,
 'if': 5,
 'you': 11,
 'do': 2,
 'not': 8,
 'have': 4,
 'will': 10,
 'never': 7,
 'get': 3,
 'any': 0,
 'one': 9}

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
doc = ['I like machine learning', 'I love deep learning', 'I run everyday']
tfidf_vectorizer = TfidfVectorizer(min_df=1)
tfidf_matrix = tfidf_vectorizer.fit_transform(doc)
doc_distance = (tfidf_matrix * tfidf_matrix.T)
print(doc_distance)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5 stored elements and shape (3, 3)>
  Coords	Values
  (0, 1)	0.224324998974933
  (0, 0)	1.0000000000000002
  (1, 1)	1.0000000000000002
  (1, 0)	0.224324998974933
  (2, 2)	1.0000000000000002


In [17]:
# 3. 예측 기반 임베딩
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import Word2Vec

sample = open("080263/chap10/data/peter.txt", "r", encoding="UTF8")
s = sample.read()
f = s.replace("\n", " ")

data = []
for i in sent_tokenize(f):
    temp = []
    for j in word_tokenize(i):
        temp.append(j.lower())
    data.append(temp)

data

[['once',
  'upon',
  'a',
  'time',
  'in',
  'london',
  ',',
  'the',
  'darlings',
  'went',
  'out',
  'to',
  'a',
  'dinner',
  'party',
  'leaving',
  'their',
  'three',
  'children',
  'wendy',
  ',',
  'jhon',
  ',',
  'and',
  'michael',
  'at',
  'home',
  '.'],
 ['after',
  'wendy',
  'had',
  'tucked',
  'her',
  'younger',
  'brothers',
  'jhon',
  'and',
  'michael',
  'to',
  'bed',
  ',',
  'she',
  'went',
  'to',
  'read',
  'a',
  'book',
  '.'],
 ['she', 'heard', 'a', 'boy', 'sobbing', 'outside', 'her', 'window', '.'],
 ['he', 'was', 'flying', '.'],
 ['there', 'was', 'little', 'fairy', 'fluttering', 'around', 'him', '.'],
 ['wendy', 'opened', 'the', 'window', 'to', 'talk', 'to', 'him', '.'],
 ['“', 'hello', '!'],
 ['who', 'are', 'you', '?'],
 ['why', 'are', 'you', 'crying', '”', ',', 'wendy', 'asked', 'him', '.'],
 ['“', 'my', 'name', 'is', 'peter', 'pan', '.'],
 ['my',
  'shadow',
  'wouldn',
  '’',
  't',
  'stock',
  'to',
  'me.',
  '”',
  ',',
  'he',
  'rep

In [28]:
model1 = Word2Vec(data, min_count=1, vector_size=100, window=5, sg=0)
print("Cosine similarity between 'peter' 'wendy' - CBOW :", model1.wv.similarity('peter', 'wendy'))
print("Cosine similarity between 'peter' 'hook' - CBOW :", model1.wv.similarity('peter', 'hook'))

Cosine similarity between 'peter' 'wendy' - CBOW : 0.07439384
Cosine similarity between 'peter' 'hook' - CBOW : 0.027709857


In [29]:
model2 = Word2Vec(data, min_count=1, vector_size=100, window=5, sg=1)
print("Cosine similarity between 'peter' 'wendy' - Skip Gram :", model2.wv.similarity('peter', 'wendy'))
print("Cosine similarity between 'peter' 'hook' - Skip Gram :", model2.wv.similarity('peter', 'hook'))

Cosine similarity between 'peter' 'wendy' - Skip Gram : 0.40088683
Cosine similarity between 'peter' 'hook' - Skip Gram : 0.52016735


In [36]:
from gensim.test.utils import common_texts
from gensim.models import FastText

model = FastText("080263/chap10/data/peter.txt", vector_size=4, window=3, min_count=1, epochs=10)
sim_score = model.wv.similarity('peter', 'wendy')
print(sim_score)
sim_score = model.wv.similarity('peter', 'hook')
print(sim_score)

0.4592452
0.043825686


In [10]:
from __future__ import print_function
from gensim.models import KeyedVectors

model_kr = KeyedVectors.load_word2vec_format("080263/chap10/data/wiki.ko.vec")

In [11]:
for similar_word in model_kr.similar_by_word('노력'):
    print(similar_word[0], similar_word[1])

노력함 0.796721339225769
노력중 0.7502310872077942
노력만 0.7195297479629517
노력과 0.7137250900268555
노력의 0.6944872140884399
노력가 0.6931817531585693
노력이나 0.6855085492134094
노력없이 0.6761217713356018
노력맨 0.6756712198257446
노력보다는 0.6753138303756714


In [14]:
similarities = model_kr.most_similar(positive=['동물', '육식동물'], negative=['사람'])
print(similarities[0])

('초식동물', 0.7804122567176819)


In [3]:
# 4. 횟수/예측 기반 임베딩
import numpy as np
%matplotlib notebook
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn.decomposition import PCA
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = datapath("/Users/minwoo/dev/Awesome-DL-Study/Deep_Learning_With_Tensorflow/080263/chap10/data/glove.6B.100d.txt")
word2vec_glove_file = get_tmpfile("glove.6B.100d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)

  glove2word2vec(glove_file, word2vec_glove_file)


(400000, 100)

In [4]:
model = KeyedVectors.load_word2vec_format(word2vec_glove_file)
model.most_similar('bill')

[('legislation', 0.8072139620780945),
 ('proposal', 0.7306863069534302),
 ('senate', 0.7142540812492371),
 ('bills', 0.704440176486969),
 ('measure', 0.6958035230636597),
 ('passed', 0.6906244158744812),
 ('amendment', 0.6846879720687866),
 ('provision', 0.6845566630363464),
 ('plan', 0.6816462874412537),
 ('clinton', 0.6663140058517456)]

In [5]:
model.most_similar(negative=['cherry'])

[('kazushige', 0.4834350347518921),
 ('askerov', 0.4778185784816742),
 ('lakpa', 0.46915262937545776),
 ('ex-gay', 0.45713332295417786),
 ('tadayoshi', 0.4522107243537903),
 ('turani', 0.44810065627098083),
 ('saglam', 0.4469599425792694),
 ('aijun', 0.4435270130634308),
 ('adjustors', 0.44235295057296753),
 ('nyum', 0.4423117935657501)]

In [6]:
model.most_similar(positive=['woman', 'king'], negative=['man'])

[('queen', 0.7698541283607483),
 ('monarch', 0.6843380331993103),
 ('throne', 0.6755736470222473),
 ('daughter', 0.6594556570053101),
 ('princess', 0.6520534157752991),
 ('prince', 0.6517034769058228),
 ('elizabeth', 0.6464518308639526),
 ('mother', 0.631171703338623),
 ('emperor', 0.6106470823287964),
 ('wife', 0.6098655462265015)]

In [7]:
def analogy(x1, x2, y1):
    result = model.most_similar(positive=[x1, x2], negative=[y1])
    return result[0][0]

print(analogy('beer', 'france', 'australia'))
print(analogy('tallest', 'long', 'tall'))

champagne
longest


In [9]:
print(model.doesnt_match("breakfast cereal dinner lunch".split()))

cereal
