In [None]:
import numpy as np
from emo_utils import *
import emoji
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Load the training and test splits; each call yields an (inputs, labels) pair.
# NOTE(review): read_csv is not defined in this notebook — presumably it is
# provided by the `from emo_utils import *` star import; confirm.
X_train, Y_train = read_csv('data/train_emoji.csv')
X_test, Y_test = read_csv('data/tesss.csv')

In [None]:
# Length, in words, of the longest training sentence.
# Every sentence is measured by its word count: the sentence with the most
# characters is not necessarily the one with the most words, so picking the
# longest string first and splitting it afterwards can underestimate the value.
maxLen = max(len(sentence.split()) for sentence in X_train)

### 第一个模型
<center>
<img src="images/image_1.png" style="width:900px;height:300px;">
<caption><center> <b>Figure 1</b>: Baseline model (Emojifier-V1).</center></caption>
</center>

In [None]:
def convert_to_one_hot(Y, C):
    """
    One-hot encode an array of integer class labels.

    Arguments:
    Y -- numpy array of integer labels; it is flattened before encoding
    C -- number of classes (width of each one-hot row)

    Returns:
    numpy array of shape (Y.size, C) whose row j is the one-hot vector of label j
    """
    flat_labels = Y.reshape(-1)
    identity = np.eye(C)
    # Row k of the identity matrix is exactly the one-hot vector for class k.
    return identity[flat_labels]

In [None]:
# One-hot encode the labels of both splits using 5 classes.
Y_oh_train = convert_to_one_hot(Y_train, C = 5)
Y_oh_test = convert_to_one_hot(Y_test, C = 5)

In [None]:
# Load the pretrained embedding model (GloVe file, 50-dimensional per its name).
# NOTE(review): read_glove_vecs is not defined in this notebook — presumably it
# comes from the `emo_utils` star import; confirm.
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')

In [None]:
# Averaging layer
def sentence_to_avg(sentence, word_to_vec_map):
    """
    Represent a sentence as the mean of its word vectors.

    Arguments:
    sentence -- string; it is lower-cased and split on whitespace
    word_to_vec_map -- mapping from word to its 1-D embedding vector
                       (assumed to be a dict of numpy arrays — confirm against read_glove_vecs)

    Returns:
    avg -- 1-D numpy array with the same length as the word vectors; all zeros
           for a sentence that contains no words

    Raises:
    KeyError -- if a word of the sentence is missing from word_to_vec_map
    """
    words = sentence.lower().split()  # lower-case the sentence and tokenize it

    if not words:
        # Nothing to average: return a zero vector instead of dividing by zero
        # (which would silently produce NaNs).
        sample = next(iter(word_to_vec_map.values()), None)
        dim = 50 if sample is None else np.shape(sample)[0]
        return np.zeros((dim,))

    # Take the dimension from the data rather than hard-coding 50, so that
    # embeddings of any size work.
    avg = np.zeros(np.shape(word_to_vec_map[words[0]]))
    for w in words:
        avg += word_to_vec_map[w]
    avg /= len(words)
    return avg

#### 映射，Softmax，loss：
$$ z^{(i)} = W \cdot avg^{(i)} + b$$
$$ a^{(i)} = \mathrm{softmax}(z^{(i)})$$
$$ \mathcal{L}^{(i)} = - \sum_{k = 0}^{n_y - 1} Y_{oh,k}^{(i)} \log(a^{(i)}_k)$$

In [None]:
# GRADED FUNCTION: model

def model(X, Y, word_to_vec_map, learning_rate = 0.01, num_iterations = 400):
    """
    Train the baseline classifier: averaged word vectors -> linear layer -> softmax.

    Parameters are updated once per training example (stochastic gradient
    descent). Every 100 epochs the cost of the last processed example and the
    accuracy on (X, Y) are printed.

    Arguments:
    X -- indexable collection of sentences (strings), one per training example
    Y -- numpy array of integer labels; Y.shape[0] is used as the example count
    word_to_vec_map -- mapping from word to its embedding vector (50-dimensional here)
    learning_rate -- step size of the gradient updates
    num_iterations -- number of passes over the training set

    Returns:
    pred -- predictions on X made with the final parameters, as returned by predict
    W -- weight matrix of shape (n_y, n_h)
    b -- bias vector of shape (n_y,)

    Note: relies on sentence_to_avg, convert_to_one_hot, softmax and predict
    being defined in the kernel by the time this function is called.
    """
    np.random.seed(1)
    m = Y.shape[0]                          # number of training examples
    n_y = 5                                 # number of classes
    n_h = 50                                # dimensions of the GloVe vectors

    # Initialize parameters using Xavier initialization
    W = np.random.randn(n_y, n_h) / np.sqrt(n_h)
    b = np.zeros((n_y,))

    Y_oh = convert_to_one_hot(Y, C = n_y)

    # Optimization loop
    for t in range(num_iterations):                       # Loop over the number of iterations
        for i in range(m):                                # Loop over the training examples
            # Averaging layer
            avg = sentence_to_avg(X[i], word_to_vec_map)
            # Softmax layer
            z = np.dot(W, avg) + b
            a = softmax(z)
            # Cross-entropy loss of THIS example: use its own one-hot row and
            # sum over the classes so that the cost is a scalar.
            cost = -np.sum(Y_oh[i] * np.log(a))
            # Gradients
            dz = a - Y_oh[i]
            dW = np.dot(dz.reshape(n_y,1), avg.reshape(1, n_h))
            db = dz

            # Update parameters with Stochastic Gradient Descent
            W = W - learning_rate * dW
            b = b - learning_rate * db

        if t % 100 == 0:
            print("Epoch: " + str(t) + " --- cost = " + str(cost))
            predict(X, Y, W, b, word_to_vec_map)  # called for its accuracy printout

    # Compute the returned predictions from the final parameters, so they are
    # consistent with the returned W and b (and defined even when the loop
    # body never ran). This prints one more accuracy line.
    pred = predict(X, Y, W, b, word_to_vec_map)

    return pred, W, b

In [None]:
# Evaluation helper for the baseline model above; it also reports accuracy,
# which is why the labels Y are required.
def predict(X, Y, W, b, word_to_vec_map):
    """
    Predict a class index for every sentence in X and print the accuracy against Y.

    Arguments:
    X -- numpy array of sentences (strings); X.shape[0] is the example count
    Y -- numpy array of labels, reshaped to a column for the comparison
    W -- weight matrix of the softmax layer
    b -- bias vector of the softmax layer
    word_to_vec_map -- mapping from word to its embedding vector

    Returns:
    pred -- numpy array of shape (X.shape[0], 1) holding the predicted class indices
    """
    num_examples = X.shape[0]
    pred = np.zeros((num_examples, 1))

    for idx in range(num_examples):
        sentence_vec = sentence_to_avg(X[idx], word_to_vec_map)
        # Forward pass: linear scores followed by softmax; keep the arg-max class.
        class_probs = softmax(np.dot(W, sentence_vec) + b)
        pred[idx] = np.argmax(class_probs)

    targets = Y.reshape(Y.shape[0], 1)
    accuracy = np.mean(pred == targets)
    print("Accuracy: "  + str(accuracy))

    return pred

In [None]:
# Train the baseline model first: W and b are not defined anywhere else in the
# notebook, so without this call the cell fails on a fresh kernel.
pred, W, b = model(X_train, Y_train, word_to_vec_map)

# Accuracy on the training set and on the test set
print("Training set:")
pred_train = predict(X_train, Y_train, W, b, word_to_vec_map)
print('Test set:')
pred_test = predict(X_test, Y_test, W, b, word_to_vec_map)

### 第二个模型
<img src="images/emojifier-v2.png" style="width:700px;height:400px;"> <br>
<caption><center> <b>Figure 3</b>: Emojifier-V2. A 2-layer LSTM sequence classifier. </center></caption>

In [None]:
import numpy as np
np.random.seed(0)
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
np.random.seed(1)

#### embedding层

- 输入句子的长度不同，这里对短句直接padding

In [None]:
# Turn a batch of sentences into the index tensor fed to the embedding layer:
# every word is replaced by its vocabulary index.
def sentences_to_indices(X, word_to_index, max_len):
    """
    Convert sentences into fixed-width rows of word indices.

    Arguments:
    X -- numpy array of sentences (strings); X.shape[0] is the batch size
    word_to_index -- mapping from word to its integer index
    max_len -- number of columns of the result

    Returns:
    X_indices -- numpy array of shape (X.shape[0], max_len); shorter sentences
                 are right-padded with zeros, longer ones are truncated to
                 their first max_len words

    Raises:
    KeyError -- if a word of a sentence is missing from word_to_index
    """
    X_indices = np.zeros((X.shape[0], max_len))
    for i in range(X.shape[0]):
        words_index = [word_to_index[word] for word in X[i].lower().split()]
        # Truncate sentences longer than max_len; assigning them unmodified
        # would raise a broadcasting error (e.g. a test sentence longer than
        # every training sentence).
        words_index = words_index[:max_len]
        X_indices[i, :len(words_index)] = words_index
    return X_indices

- embedding层
- [官方提供的例子](https://github.com/MoyanZitto/keras-cn/blob/master/docs/legacy/blog/word_embedding.md)

In [None]:
# pretrained_embedding_layer

def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Build a frozen Keras Embedding layer initialized with the pretrained word vectors.

    Arguments:
    word_to_vec_map -- mapping from word to its embedding vector (numpy array)
    word_to_index -- mapping from word to its integer row in the embedding matrix

    Returns:
    embedding_layer -- Embedding layer with trainable=False whose weights are
                       the assembled embedding matrix

    Raises:
    StopIteration -- if word_to_vec_map is empty
    """
    # One extra row beyond the vocabulary size.
    # NOTE(review): presumably word indices start at 1 and row 0 stays all-zero
    # for the 0 padding value produced by sentences_to_indices — confirm.
    vocab_len = len(word_to_index) + 1
    # Read the embedding dimension from any vector instead of relying on one
    # specific word ("cucumber") being present in the vocabulary.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]
    emb_matrix = np.zeros((vocab_len, emb_dim))

    # Fill the matrix: convert the dictionary representation into rows.
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Create the layer, build it so that its weights exist, then load the matrix.
    embedding_layer = Embedding(input_dim=vocab_len, output_dim=emb_dim, trainable=False)
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

#### 模型
- 这里注意LSTM的用法--return_sequences返回序列还是最后一个元素

In [None]:
def Emojify_V2(input_shape, word_to_vec_map, word_to_index):
    """
    Build the Emojifier-V2 Keras model: embedding -> LSTM -> LSTM -> softmax.

    Arguments:
    input_shape -- shape of one input example, passed to keras Input (e.g. (max_len,))
    word_to_vec_map -- mapping from word to its embedding vector
    word_to_index -- mapping from word to its integer index

    Returns:
    model -- uncompiled keras Model mapping index sequences to 5 class probabilities
    """
    sentence_indices = Input(shape=input_shape, dtype=np.int32)

    # Frozen embedding layer initialized from the pretrained vectors.
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    embeddings = embedding_layer(sentence_indices)

    # First LSTM returns the full sequence so that the second LSTM can consume it.
    X = LSTM(128, return_sequences=True)(embeddings)
    X = Dropout(0.5)(X)
    # Second LSTM returns only its last output, summarizing the sentence.
    X = LSTM(128, return_sequences=False)(X)
    X = Dropout(0.5)(X)
    # Linear projection to the 5 classes; softmax is applied exactly once by the
    # Activation layer below. Applying it inside Dense as well would feed
    # probabilities into a second softmax and flatten the output distribution.
    X = Dense(5)(X)
    X = Activation('softmax')(X)
    model = Model(inputs=sentence_indices, outputs=X)

    return model

In [None]:
# Build the LSTM model for inputs of length maxLen.
# NOTE: this rebinds the name `model`, which an earlier cell defines as the
# baseline training function; after this cell that function is no longer
# reachable under this name.
model = Emojify_V2((maxLen,), word_to_vec_map, word_to_index)
model.summary()    # shows the number of parameters and the configuration of each layer

In [None]:
# Loss function and optimizer: categorical cross-entropy with Adam, tracking accuracy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Training set: convert sentences to padded index rows and labels to one-hot vectors.
X_train_indices = sentences_to_indices(X_train, word_to_index, maxLen)
Y_train_oh = convert_to_one_hot(Y_train, C = 5)

In [None]:
# Train for 50 epochs with mini-batches of 32, shuffling the data.
model.fit(X_train_indices, Y_train_oh, epochs = 50, batch_size = 32, shuffle=True)

In [None]:
# Test set: prepare inputs/labels the same way as for training, then evaluate.
# The test sentences are padded to maxLen, which was derived from the training set.
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len = maxLen)
Y_test_oh = convert_to_one_hot(Y_test, C = 5)
loss, acc = model.evaluate(X_test_indices, Y_test_oh)
print()
print("Test accuracy = ", acc)

In [None]:
# Inference example: run the trained model on a single hand-written sentence.
# NOTE: this overwrites X_test_indices (previously the index matrix of the whole
# test set), so re-running the evaluation cell afterwards requires recomputing it.
x_test = np.array(['not feeling happy'])
X_test_indices = sentences_to_indices(x_test, word_to_index, maxLen)
model.predict(X_test_indices)