# Word2Vec

In [1]:
corpus = ["and the cute kitten purred and then",
          "the cute furry cat purred and miaowed",
          "that the small kitten miaowed and she",
          "the loud furry dog bowwowed and bit"]

In [2]:
cor = {'cat':['cute', 'furry', 'purred', 'miaowed'],
       'kitten':['cute', 'purred', 'small', 'miaowed'],
       'dog':['bowwowed', 'furry', 'loud', 'ran', 'bit']}

## word-count Word2vec

In [3]:
words = ['cute', 'furry', 'purred', 'miaowed', 'cute', 'purred', 'small', 'miaowed', 'bowwowed', 'furry', 'loud', 'ran', 'bit']
dic = {word:i for i, word in enumerate(words)}

In [4]:
import pandas as pd
data = pd.DataFrame([], index=cor.keys(), columns=words)
data = data.fillna(0)
data

Unnamed: 0,cute,furry,purred,miaowed,cute.1,purred.1,small,miaowed.1,bowwowed,furry.1,loud,ran,bit
cat,0,0,0,0,0,0,0,0,0,0,0,0,0
kitten,0,0,0,0,0,0,0,0,0,0,0,0,0
dog,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
for i in cor.keys():
    data.loc[i, cor[i]] = 1

In [6]:
data

Unnamed: 0,cute,furry,purred,miaowed,cute.1,purred.1,small,miaowed.1,bowwowed,furry.1,loud,ran,bit
cat,1,1,1,1,1,1,0,1,0,1,0,0,0
kitten,1,0,1,1,1,1,1,1,0,0,0,0,0
dog,0,1,0,0,0,0,0,0,1,1,1,1,1


In [7]:
import numpy as np
def similar(x, y):
    x = np.array(x)
    y = np.array(y)
    return np.sum(x*y)/ (np.sqrt(sum((x**2))) * np.sqrt(sum((y**2))))

In [8]:
print(f'{data.index[0]}和{data.index[1]}的相似度为：{similar(data.iloc[0,:], data.iloc[1, :])}')
print(f'{data.index[0]}和{data.index[2]}的相似度为：{similar(data.iloc[0,:], data.iloc[2, :])}')
print(f'{data.index[2]}和{data.index[1]}的相似度为：{similar(data.iloc[2,:], data.iloc[1, :])}')

cat和kitten的相似度为：0.8017837257372731
cat和dog的相似度为：0.2886751345948129
dog和kitten的相似度为：0.0


## word2vec: CBOW

In [9]:
words = ['cute', 'furry', 'purred', 'miaowed', 'cute', 'purred', 'small', 'bowwowed', 'furry', 'loud', 'ran', 'bit', 'cat',
         'kitten', 'dog']
dic = {word:i for i, word in enumerate(words)}

In [10]:
onehot = np.diag([1]*len(words))
mat = pd.DataFrame(onehot, columns=words, index=words)
mat

Unnamed: 0,cute,furry,purred,miaowed,cute.1,purred.1,small,bowwowed,furry.1,loud,ran,bit,cat,kitten,dog
cute,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
furry,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
purred,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
miaowed,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
cute,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
purred,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
small,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
bowwowed,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
furry,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
loud,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [11]:
x = mat.loc[['dog', 'cat', 'kitten'], :]
x

Unnamed: 0,cute,furry,purred,miaowed,cute.1,purred.1,small,bowwowed,furry.1,loud,ran,bit,cat,kitten,dog
dog,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
cat,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
kitten,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [12]:
y = pd.DataFrame(index=mat.index)
for w in ['dog', 'cat', 'kitten']:
    a = mat.loc[cor[w], :].sum(axis=0)
    y = pd.concat([y, a], axis=1)
y = y.T
print(y)

   cute  furry  purred  miaowed  cute  purred  small  bowwowed  furry  loud  \
0     0      1       0        0     0       0      0         1      1     1   
0     1      1       1        1     1       1      0         0      1     0   
0     1      0       1        1     1       1      1         0      0     0   

   ran  bit  cat  kitten  dog  
0    1    1    0       0    0  
0    0    0    0       0    0  
0    0    0    0       0    0  


In [13]:
import numpy as np
x_data = np.array(x, dtype=np.float32)
y_data = np.array(y, dtype=np.float32)
print(x_data, '\n', y_data)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]] 
 [[0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [14]:
import tensorflow as tf

input_size = len(mat)
output_size = len(mat)
hidden_size = 5
learning_rate = 0.001

x = tf.placeholder(tf.float32, [None, input_size])
y = tf.placeholder(tf.float32, [None, output_size])

w = tf.Variable(tf.zeros([input_size, hidden_size]))
v = tf.Variable(tf.zeros([hidden_size, output_size]))

b1 = tf.Variable(tf.zeros([hidden_size]))
b2 = tf.Variable(tf.zeros([output_size]))

hidden_output = tf.sigmoid(tf.matmul(x, w) + b1)
out = tf.nn.softmax(tf.matmul(hidden_output, v) + b2)

cost = tf.reduce_mean(-tf.reduce_sum(y * tf.log(out), axis=1))

train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    
    for i in range(2000):
        sess.run(train_op, feed_dict={x:x_data, y:y_data})
    wc = sess.run(w)

  from ._conv import register_converters as _register_converters


In [15]:
wc

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.05189953,  0.05189953,  0.05189953,  0.05189953,  0.05189953],
       [ 0.0290082 ,  0.0290082 ,  0.0

In [16]:
wc2 = {}
wc2['cat'] = wc[12]
wc2['kitten'] = wc[13]
wc2['dog'] = wc[14]
wc2

{'cat': array([0.05189953, 0.05189953, 0.05189953, 0.05189953, 0.05189953],
       dtype=float32),
 'dog': array([-0.01879204, -0.01879204, -0.01879204, -0.01879204, -0.01879204],
       dtype=float32),
 'kitten': array([0.0290082, 0.0290082, 0.0290082, 0.0290082, 0.0290082],
       dtype=float32)}

In [17]:
similar(wc2['cat'], wc2['kitten'])

0.9999999574315318

In [18]:
print(f"cat和kitten的相似度为：{similar(wc2['cat'], wc2['kitten'])}")
print(f"cat和dog的相似度为：{similar(wc2['cat'], wc2['dog'])}")
print(f"dog和kitten的相似度为：{similar(wc2['dog'], wc2['kitten'])}")

cat和kitten的相似度为：0.9999999574315318
cat和dog的相似度为：-0.9999999355466211
dog和kitten的相似度为：-1.0000000072316493


相比上面用word-count的方法，CBOW方法得到的结果能够挖掘出dog和kitten之间的关系

## FastText

In [19]:
import fastText
classifier = fastText.train_supervised(r'..\data\train.txt')

In [20]:
classifier.get_words()

['like',
 'it',
 '</s>',
 'i',
 'a',
 'it,',
 'make',
 'me',
 'feel',
 'hate',
 'shit.',
 'baby.',
 'is',
 'smell',
 'its',
 'smell.',
 'oh,',
 'how',
 'could',
 'look',
 'sooo',
 'beautiful.']

In [21]:
classifier.get_labels()
classifier.predict(['i hate it', 'i like the baby smell', 'oh, how could it look sooo beautiful.'])

(['__label__bad', '__label__bad', '__label__good'],
 array([0.50006127, 0.50004983, 0.50001967]))

In [22]:
help(fastText.train_supervised)

Help on function train_supervised in module fastText.FastText:

train_supervised(input, lr=0.1, dim=100, ws=5, epoch=5, minCount=1, minCountLabel=0, minn=0, maxn=0, neg=5, wordNgrams=1, loss='softmax', bucket=2000000, thread=12, lrUpdateRate=100, t=0.0001, label='__label__', verbose=2, pretrainedVectors='')
    Train a supervised model and return a model object.
    
    input must be a filepath. The input text does not need to be tokenized
    as per the tokenize function, but it must be preprocessed and encoded
    as UTF-8. You might want to consult standard preprocessing scripts such
    as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html
    
    The input file must must contain at least one label per line. For an
    example consult the example datasets which are part of the fastText
    repository such as the dataset pulled by classification-example.sh.

