<a href="https://colab.research.google.com/github/MeldaChen/practice/blob/main/%E6%B7%B1%E5%BA%A6%E5%AD%B8%E7%BF%92_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf

# Fetch the IMDB review archive via Keras and ask it to extract the tarball.
# `dataset` holds the value returned by get_file; later cells treat it as a
# path and look for the extracted "aclImdb" folder next to it.
dataset = tf.keras.utils.get_file(
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    fname="aclImdb.tar.gz",
    extract=True,
)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [None]:
import os
import glob
import pandas as pd
def getdata(mid, base_dir=None):
    """Load one split of the extracted review corpus into a DataFrame.

    Args:
        mid: Name of the split sub-directory under ``aclImdb``
            (this notebook uses ``"train"`` and ``"test"``).
        base_dir: Directory that contains the ``aclImdb`` folder. When
            omitted, the directory of the downloaded archive (``dataset``)
            is used, which is the original behaviour.

    Returns:
        A DataFrame with a ``content`` column (raw review text) and a
        ``sentiment`` column (1 for files under ``pos``, 0 for files under
        ``neg``). All positive rows come first, followed by all negative rows.

    Raises:
        FileNotFoundError: If no review files are found for the split.
    """
    if base_dir is None:
        base_dir = os.path.dirname(dataset)
    split_dir = os.path.join(base_dir, "aclImdb", mid)

    # glob returns files in arbitrary (filesystem-dependent) order; sort so
    # that the row order is reproducible from run to run.
    posfn = sorted(glob.glob(os.path.join(split_dir, "pos", "*")))
    negfn = sorted(glob.glob(os.path.join(split_dir, "neg", "*")))
    if not posfn and not negfn:
        # Fail loudly here instead of handing an empty frame to later cells.
        raise FileNotFoundError(
            "No review files found under {!r}".format(split_dir)
        )

    contents = []
    for fn in posfn + negfn:
        with open(fn, encoding="utf-8") as f:
            contents.append(f.read())

    return pd.DataFrame({
        "content": contents,
        "sentiment": [1] * len(posfn) + [0] * len(negfn),
    })
# Load both corpus splits: "train" for fitting, "test" for the final evaluation.
train_df, test_df = getdata("train"), getdata("test")

In [None]:
# Preview the test split (the notebook shows an abbreviated view of the frame).
test_df

Unnamed: 0,content,sentiment
0,Love Jones cleverly portrays young African-Ame...,1
1,"Wow, here is another great golf movie. That's ...",1
2,"Giorgino is a long, excruciating journey from ...",1
3,Had it with the one who raised you since when ...,1
4,This wonderfully witty comedy-drama wowed the ...,1
...,...,...
24995,All the pro comments about this movie claim th...,0
24996,Nothing will ever top KOMODO with the lovely J...,0
24997,"When I went to see this film, let's not say th...",0
24998,I supposed I was actually expecting a Bollywoo...,0


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense  #不用攤平的

layers = [
    # 3001: vocabulary size of the embedding - intended as the 3000 most
    #       frequent words plus index 0, which is reserved for padding.
    # 100: number of latent "sentiment" features per word, i.e. the embedding
    #      dimension (typical choices: 100-500).
    # input_length: how many tokens of each review are looked at
    #      (typical choices: 128-512).
    # Each review contributes 512 tokens drawn from the most frequent words,
    # and every token is turned into a 100-dimensional vector.
    # Parameter count = 3001 (vocabulary entries) * 100 (features).
    # The embedding features have no pass/fail threshold (they can be negative
    # or positive), so no activation is applied - an activation would leave
    # only values >= 0.
    # For NLP problems the first layer is an Embedding.
    Embedding(3001, 100, mask_zero=True, input_length=512),
    GlobalAveragePooling1D(),
    Dense(2, activation="softmax")
]
model = Sequential(layers)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 512, 100)          300100    
                                                                 
 global_average_pooling1d (G  (None, 100)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 2)                 202       
                                                                 
Total params: 300,302
Trainable params: 300,302
Non-trainable params: 0
_________________________________________________________________


In [None]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
# Labels are integer class ids (0 = neg, 1 = pos) and the model ends in a
# 2-way softmax, hence the sparse categorical cross-entropy loss.
model.compile(
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)

In [None]:
# Tokenize: build the word index from the training reviews only.
from tensorflow.keras.preprocessing.text import Tokenizer
# Tokenizer only emits word indices strictly below num_words, i.e. it keeps
# the (num_words - 1) most frequent words. Using 3001 therefore keeps the top
# 3000 words as indices 1..3000, which matches the 3001-row Embedding layer
# (index 0 is reserved for padding).
tok = Tokenizer(num_words=3001)
tok.fit_on_texts(train_df["content"])
# To inspect the fitted vocabulary, look at:
# tok.word_index
# tok.index_word

In [None]:
# Convert every review into a sequence of integer token ids, using the
# tokenizer fitted on the training reviews for both splits.
x_train_seq, x_test_seq = (
    tok.texts_to_sequences(split["content"]) for split in (train_df, test_df)
)
pd.DataFrame(x_train_seq)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1806,1807,1808,1809,1810,1811,1812,1813,1814,1815
0,1,433,155,1114,783,2974,6,221,8.0,3.0,...,,,,,,,,,,
1,10,293,1,120,155,150,593,2,444.0,9.0,...,,,,,,,,,,
2,21,3,75,678,5,132,41,11,19.0,63.0,...,,,,,,,,,,
3,44,22,178,5,374,11,6,3,49.0,270.0,...,,,,,,,,,,
4,29,137,5,1822,6,32,1121,17,37.0,54.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,10,207,76,31,105,12,68,537,395.0,31.0,...,,,,,,,,,,
24996,1,83,17,6,181,49,11,28,6.0,181.0,...,,,,,,,,,,
24997,14,3,247,1677,242,36,38,389,2056.0,5.0,...,,,,,,,,,,
24998,1,344,6,2800,8,1828,118,242,36.0,2175.0,...,,,,,,,,,,


In [None]:
# Pad / truncate every review to exactly 512 token ids.
# Padding: the side does not matter here because the Embedding layer is built
#          with mask_zero=True, so the zeros are masked out.
# Truncating: "pre" (the default) cuts tokens from the beginning of a review
#          that is too long.
from tensorflow.keras.preprocessing.sequence import pad_sequences
x_train_pad = pad_sequences(
    x_train_seq, maxlen=512, padding="pre", truncating="pre"
)
x_test_pad = pad_sequences(
    x_test_seq, maxlen=512, padding="pre", truncating="pre"
)
pd.DataFrame(x_train_pad)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,0,0,0,0,0,0,0,0,0,0,...,410,251,5,166,14,919,343,872,941,80
1,0,0,0,0,0,0,0,0,0,0,...,348,5,1,179,1079,68,14,70,67,291
2,0,0,0,0,0,0,0,0,0,0,...,3,899,91,3,63,49,17,81,261,69
3,0,0,0,0,0,0,0,0,0,0,...,485,9,6,3,615,248,15,29,4,175
4,0,0,0,0,0,0,0,0,0,0,...,2932,49,718,7,7,58,672,43,4,1328
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,...,407,35,612,2,9,283,75,18,213,20
24996,0,0,0,0,0,0,0,0,0,0,...,1756,96,75,9,149,1468,1,338,99,148
24997,0,0,0,0,0,0,0,0,0,0,...,1,2,38,106,6,345,114,5,154,16
24998,0,0,0,0,0,0,0,0,0,0,...,15,256,2,1518,6,32,809,4,1056,424


In [None]:
import numpy as np
# Pull the integer sentiment labels (1 = pos, 0 = neg) out of each split as
# NumPy arrays for Keras.
y_train, y_test = (
    np.array(split["sentiment"]) for split in (train_df, test_df)
)
y_train

array([1, 1, 1, ..., 0, 0, 0])

In [None]:
# batch_size: how many samples are looked at per step; their gradients are
#             averaged before the weights are adjusted.
# epochs: how many passes over the whole data set (stop once the model looks
#         trained).
# validation_split: fraction of the data held out to check the model.
# epochs + validation: together they decide when to stop (stop once val_loss
#         flattens out).
# verbose: how much log is printed - 1 (default): progress bar,
#          2: one line per epoch without a progress bar, 0: nothing.
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
callbacks = [
    ModelCheckpoint("sentiment.h5", save_best_only=True),
    EarlyStopping(patience=5, restore_best_weights=True)
]
# Keras takes the validation split from the LAST rows of x/y *before* any
# shuffling. The training frame is ordered "all positives, then all
# negatives", so without this permutation the validation set would contain
# only negative reviews. Shuffle once with a fixed seed for reproducibility.
shuffle_idx = np.random.default_rng(42).permutation(len(x_train_pad))
model.fit(
    x_train_pad[shuffle_idx],
    y_train[shuffle_idx],
    batch_size=200,
    epochs=50,
    validation_split=0.1,
    callbacks=callbacks,
    verbose=2,
)

Epoch 1/50
113/113 - 7s - loss: 0.6411 - accuracy: 0.6238 - val_loss: 0.6530 - val_accuracy: 0.6224 - 7s/epoch - 65ms/step
Epoch 2/50
113/113 - 7s - loss: 0.4900 - accuracy: 0.8157 - val_loss: 0.4979 - val_accuracy: 0.7844 - 7s/epoch - 58ms/step
Epoch 3/50
113/113 - 6s - loss: 0.3824 - accuracy: 0.8586 - val_loss: 0.4329 - val_accuracy: 0.8176 - 6s/epoch - 54ms/step
Epoch 4/50
113/113 - 6s - loss: 0.3277 - accuracy: 0.8759 - val_loss: 0.3712 - val_accuracy: 0.8496 - 6s/epoch - 55ms/step
Epoch 5/50
113/113 - 7s - loss: 0.2959 - accuracy: 0.8873 - val_loss: 0.3526 - val_accuracy: 0.8568 - 7s/epoch - 59ms/step
Epoch 6/50
113/113 - 7s - loss: 0.2745 - accuracy: 0.8944 - val_loss: 0.3348 - val_accuracy: 0.8632 - 7s/epoch - 62ms/step
Epoch 7/50
113/113 - 7s - loss: 0.2593 - accuracy: 0.9001 - val_loss: 0.3166 - val_accuracy: 0.8740 - 7s/epoch - 65ms/step
Epoch 8/50
113/113 - 7s - loss: 0.2476 - accuracy: 0.9049 - val_loss: 0.3386 - val_accuracy: 0.8620 - 7s/epoch - 60ms/step
Epoch 9/50
113/1

<keras.callbacks.History at 0x7f336c0fdfd0>

In [None]:
# Score the trained model on the test split; with the loss and metric
# configured in compile() the result is [loss, accuracy].
model.evaluate(x_test_pad, y_test)



[0.29084011912345886, 0.8816400170326233]

In [None]:
# Inference copy of the classifier: same three layers as the training model,
# but the Embedding is declared without input_length so the summary shows a
# variable sequence dimension.
layers = [
    Embedding(3001, 100, mask_zero=True),
    GlobalAveragePooling1D(),
    Dense(2, activation="softmax"),
]
infer = Sequential(layers)
# The pooling layer has no weights, so copying the full weight list transfers
# exactly the Embedding and Dense parameters of the trained model.
infer.set_weights(model.get_weights())
infer.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 100)         300100    
                                                                 
 global_average_pooling1d_1   (None, 100)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_1 (Dense)             (None, 2)                 202       
                                                                 
Total params: 300,302
Trainable params: 300,302
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Classify a single free-text review with the inference model.
comment = "This film just proves how important was Stan Lee and his ingenious story telling was critical to Marvel.  Marvel just spent millions of dollers to show a dodgeball match between Wanda and the rest of the cast with a Halloween backdrop.  It is no wonder the makers had to rely on over the top VFX as there is absolutely no coherent story to weave.  Money down the drain...dont even watch if someone shows it to you for free...." #@param {type:"string"}
seq = tok.texts_to_sequences([comment])
if not seq[0]:
    # None of the words are in the tokenizer vocabulary: there is nothing to
    # embed or average, so report it instead of predicting on an empty input.
    print("No in-vocabulary words found in the comment; cannot predict.")
else:
    # Apply the same padding / truncation (512 tokens) that was used for the
    # training data; the zeros are masked out by the Embedding layer.
    padded = pad_sequences(seq, maxlen=512)
    prob = infer.predict(padded)[0]
    # Index 0 is the negative class and index 1 the positive class, matching
    # the 0/1 sentiment labels used for training.
    trans = ["neg", "pos"]
    for t, p in zip(trans, prob):
        print(t, "的機率:", p)

neg 的機率: 0.990607
pos 的機率: 0.009392967


In [None]:
# Embedding-only model: Embedding + masked average pooling, without the Dense
# classifier head, so predict() returns one 100-dimensional vector per text.
# NOTE: this rebinds `infer`, replacing the classifier previously stored
# under the same name.
layers = [
    Embedding(3001, 100, mask_zero=True),
    GlobalAveragePooling1D(),
]
infer = Sequential(layers)
# Only the Embedding layer has weights here; reuse the trained embedding.
infer.set_weights(model.layers[0].get_weights())
infer.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 100)         300100    
                                                                 
 global_average_pooling1d_2   (None, 100)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
Total params: 300,100
Trainable params: 300,100
Non-trainable params: 0
_________________________________________________________________


In [None]:
from scipy.spatial.distance import cosine
comment1 = "horror" #@param {type:"string"}
comment2 = "scary" #@param {type:"string"}
# Tokenize both inputs first, then embed each with the embedding-only model.
seq1 = tok.texts_to_sequences([comment1])
seq2 = tok.texts_to_sequences([comment2])
v1 = infer.predict(seq1)[0]
v2 = infer.predict(seq2)[0]
# scipy's `cosine` is a distance, so 1 - distance is the cosine similarity:
# the closer to 1, the more similar the two inputs.
print("相似度:", 1 - cosine(v1, v2))

相似度: 0.8994685411453247
