In [1]:
import urllib.request
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# keras指定運行時顯卡及限制GPU用量
import os
import keras.backend.tensorflow_backend as KTF
from tensorflow.python.client import device_lib
import tensorflow as tf
# 指定第一块GPU可用 
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

config = tf.ConfigProto()  
config.gpu_options.allow_growth=True   #不全部占满显存, 按需分配
sess = tf.Session(config=config)


# # 设置session
KTF.set_session(tf.Session(config=tf.ConfigProto(device_count={'gpu':0})))

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos]

print(get_available_gpus())

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


['/cpu:0', '/gpu:0']


In [2]:
import urllib.request
import tarfile
url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
filePath = "E:/WorkSpace/TensorFlowWorkSpace/download_data/aclImdb_v1.tar.gz"

download_progress = 0

def report(block_no, block_size, file_size): #列出下載進度function
    global download_progress
    download_progress += block_size
    if (block_no % 500 == 0) or (download_progress == file_size):
        print("Downloaded block %i, %i/%i bytes recieved."% (block_no, download_progress, file_size))

if not os.path.isfile(filePath): 
    result, headers = urllib.request.urlretrieve(url,filePath,reporthook=report)
    print("Download complete, saved as %s" % (result))


In [3]:
if not os.path.exists('E:/WorkSpace/TensorFlowWorkSpace/download_data/aclImdb'):
    tfile = tarfile.open(filePath,'r:gz')  # tfile 壓縮檔 
    result = tfile.extractall('E:/WorkSpace/TensorFlowWorkSpace/download_data') #解壓縮至指定目錄



In [4]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import re
def rm_html_tags(text):
    re_tag = re.compile(r'<[^>]+>') #re_tag : 正則表達式變數
    return re_tag.sub('',text) #使用正則表達式，將html tag轉為空字串
    

In [5]:
def read_files(filetype):
    path = "E:/WorkSpace/TensorFlowWorkSpace/download_data/aclImdb/" #影評資料目錄
    file_list=[]
    
    positive_path = path +filetype +"/pos/"  #設定正評資料目錄路徑
    for f in os.listdir(positive_path) : #依序取出正評資料名稱
        file_list +=[ positive_path + f] #將正評資料路徑依序加入file_list
        
    neg_path = path + filetype +"/neg/"  #設定負評資料目錄路徑
    for f in os.listdir(neg_path) : #依序取出負評資料名稱
        file_list +=[ neg_path + f] #將負評資料路徑依序加入file_list
        
    print('read : ',filetype,", files : ",len(file_list))
    
    all_labels = ([1]*12500 + [0] *12500) #建立一個Label List，前半段為1(代表正評),後半段為0(代表負評)
    all_texts = []
    for fi in file_list:
        with open (fi,encoding="utf8") as file_input: #with語句自動幫我們調用close()關閉檔案
            all_texts +=  [rm_html_tags("".join(file_input.readlines()))]
            # file_input.readlines() : 讀取檔案內容
            # "".join(檔案內容) : 以空字串 作為分隔符，將檔案內容中所有字串合併
            # all_texts += [rm_html_tags(content)] : 移除content裡的html tags並加入到all_texts
    return all_labels,all_texts 

In [6]:
y_train,train_text = read_files("train")
y_test,test_text = read_files("test")

read :  train , files :  25000
read :  test , files :  25000


In [7]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" #設定Jupyter對獨佔一行的所有變數或者語句都自動顯示
y_train[0]
train_text[0]

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [8]:
token = Tokenizer(num_words=3800) #使用Tokenizer建立一個3800字的字典
token.fit_on_texts(train_text) #將排序前3800名的英文字加入到字典


In [9]:
token.document_count #token讀取的影評筆數
token.word_index #文字字典

25000

{'cunty': 70721,
 'sports': 2249,
 'catastrophically': 82744,
 'forlornly': 87153,
 'eacb': 52277,
 "'hear'": 78268,
 'franjo': 61624,
 'blocks': 13150,
 'videographer': 40527,
 "went'": 87113,
 'naala': 71421,
 'sheffielders': 87492,
 "'son'": 58447,
 'featured': 2561,
 'baiscally': 58329,
 'opened': 3063,
 'erotica': 16061,
 "'pet": 60276,
 'gracefully': 12561,
 'eighth': 11617,
 'chetnik': 81062,
 'raining': 15221,
 'frighten': 14192,
 'rober': 46753,
 'reverses': 22636,
 'bal¨': 59846,
 'incensed': 28678,
 'psychoanalytical': 34459,
 'slow': 547,
 'inners': 64009,
 'thrillers': 3093,
 "tomas'": 51738,
 'haven’t': 51010,
 'ond': 26777,
 'keye': 36709,
 'undertakers': 49987,
 'alligator': 9392,
 'location': 1621,
 "lungren's": 51656,
 'sheez': 76963,
 'lyrical': 9329,
 'gridiron': 45040,
 'shack': 9409,
 "dreyfuss'": 72967,
 'burstyn': 12478,
 'banjoes': 48227,
 'ally': 6331,
 'propagandized': 44730,
 'ipecac': 52483,
 "vincenzo's": 52897,
 'literally': 1221,
 'portends': 39954,
 'bl

In [10]:
#影評中的文字 => 轉為數字list (有出現在字典的字才會轉換)
x_train_seq = token.texts_to_sequences(train_text) 
x_test_seq = token.texts_to_sequences(test_text)
train_text[0]
x_train_seq[0]

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

[308,
 6,
 3,
 1068,
 208,
 8,
 2160,
 29,
 1,
 168,
 54,
 13,
 45,
 81,
 40,
 391,
 109,
 137,
 13,
 57,
 149,
 7,
 1,
 481,
 68,
 5,
 260,
 11,
 2000,
 6,
 72,
 2422,
 5,
 631,
 70,
 6,
 1,
 5,
 2001,
 1,
 1530,
 33,
 66,
 63,
 204,
 139,
 64,
 1229,
 1,
 4,
 1,
 222,
 899,
 28,
 3021,
 68,
 4,
 1,
 9,
 693,
 2,
 64,
 1530,
 50,
 9,
 215,
 1,
 386,
 7,
 59,
 3,
 1470,
 3710,
 798,
 5,
 3509,
 176,
 1,
 391,
 9,
 1235,
 29,
 308,
 3,
 352,
 343,
 2970,
 142,
 129,
 5,
 27,
 4,
 125,
 1470,
 2372,
 5,
 308,
 9,
 532,
 11,
 107,
 1466,
 4,
 57,
 554,
 100,
 11,
 308,
 6,
 226,
 47,
 3,
 2231,
 11,
 8,
 214]

In [11]:
print("單字數 : ",len(train_text[0].split( )))
print('數字list : ',len(x_train_seq[0]))

單字數 :  140
數字list :  116


In [12]:
#截長補短至長度100
x_train = sequence.pad_sequences(x_train_seq,maxlen=380)  #x_train截長補短後的測試資料
x_test = sequence.pad_sequences(x_test_seq,maxlen=380)    #x_test截長補短後的測試資料

In [13]:
print('數字list : ',len(x_train_seq[1]),'\n',x_train_seq[1],'\n')
print('前截長後數字list : ',len(x_train[1]),'\n',x_train[1],'\n')

print('數字list : ',len(x_train_seq[6]),'\n',x_train_seq[6],'\n')
print('前補短後數字list : ',len(x_train[6]),'\n',x_train[6],'\n')

數字list :  365 
 [38, 13, 739, 3413, 43, 73, 31, 1828, 14, 149, 17, 111, 3, 1338, 5, 335, 144, 19, 1, 886, 11, 67, 276, 1190, 402, 33, 118, 282, 35, 166, 5, 391, 153, 38, 2304, 14, 1, 546, 87, 80, 100, 4, 1, 3263, 13, 39, 3, 412, 1199, 133, 40, 179, 137, 13, 3080, 1, 321, 19, 358, 5, 3107, 2126, 1, 38, 44, 3656, 25, 371, 5, 126, 52, 19, 1, 1980, 17, 47, 44, 21, 67, 344, 3, 2128, 5, 408, 19, 1, 1980, 14, 3, 3229, 205, 1, 21, 276, 65, 35, 3, 340, 1, 719, 725, 3, 1264, 19, 1, 1506, 3, 1220, 2, 282, 21, 276, 2523, 5, 63, 47, 41, 36, 5, 25, 3263, 11, 6, 2030, 3763, 3208, 33, 33, 379, 13, 294, 3, 1022, 128, 33, 43, 282, 7, 1, 178, 362, 5, 93, 3, 2128, 15, 3, 2995, 5, 63, 44, 26, 66, 408, 7, 1, 1980, 14, 3247, 499, 205, 1, 44, 2875, 26, 66, 78, 47, 26, 490, 15, 3, 701, 1181, 4, 227, 49, 1, 19, 117, 6, 1367, 19, 1, 886, 15, 3, 19, 23, 5, 23, 171, 843, 117, 26, 187, 1483, 121, 1, 236, 344, 1, 30, 3, 99, 41, 394, 19, 23, 117, 888, 81, 101, 582, 3, 251, 30, 1, 399, 4, 1956, 31, 1230, 3183, 33, 184

In [14]:
from keras.models import Sequential
from keras.layers.core import Dense,Dropout,Activation,Flatten
from keras.layers.embeddings import Embedding
model = Sequential()

#Embedding層 : 負責將數字list轉為向量list
model.add(Embedding(input_dim=3800,  # input_dim : 輸入的字典維度(共3800個字)
                    output_dim=32,   #output_dim :轉換出來的向量維度
                    input_length=380)) #input_length : 輸入的數字list長度
model.add(Dropout(0.2))

#Flatten平坦層 
model.add(Flatten()) #將多维的向量list一维化 (共有100*32個神經元)

model.add(Dense(units=256,activation='relu'))
model.add(Dropout(0.35))

model.add(Dense(units=1,activation='sigmoid')) #輸出層只有1個神經元(1為正評,0為負評)

model.summary()

model.compile(loss='binary_crossentropy',
              optimizer='adam',metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 380, 32)           121600    
_________________________________________________________________
dropout_1 (Dropout)          (None, 380, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 12160)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               3113216   
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 3,235,073
Trainable params: 3,235,073
Non-trainable params: 0
_________________________________________________________________


In [15]:
train_history = model.fit(x_train, # x_train : 訓練資料的特徵
                          y_train, # y_train : 訓練資料的label
                          batch_size=100,
                          epochs=10,
                          verbose=2,
                          validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
 - 3s - loss: 0.4677 - acc: 0.7631 - val_loss: 0.3303 - val_acc: 0.8642
Epoch 2/10
 - 2s - loss: 0.1928 - acc: 0.9239 - val_loss: 0.3909 - val_acc: 0.8464
Epoch 3/10
 - 2s - loss: 0.0752 - acc: 0.9754 - val_loss: 0.6867 - val_acc: 0.7858
Epoch 4/10
 - 2s - loss: 0.0267 - acc: 0.9927 - val_loss: 0.7166 - val_acc: 0.8194
Epoch 5/10
 - 2s - loss: 0.0143 - acc: 0.9959 - val_loss: 0.9208 - val_acc: 0.7988
Epoch 6/10
 - 2s - loss: 0.0123 - acc: 0.9963 - val_loss: 0.7616 - val_acc: 0.8404
Epoch 7/10
 - 2s - loss: 0.0135 - acc: 0.9955 - val_loss: 1.0829 - val_acc: 0.7974
Epoch 8/10
 - 2s - loss: 0.0144 - acc: 0.9953 - val_loss: 1.2176 - val_acc: 0.7872
Epoch 9/10
 - 2s - loss: 0.0171 - acc: 0.9944 - val_loss: 1.0085 - val_acc: 0.8148
Epoch 10/10
 - 2s - loss: 0.0127 - acc: 0.9955 - val_loss: 0.7938 - val_acc: 0.8532


In [16]:
scores = model.evaluate(x_test, # x_test : 測試資料的特徵
                        y_test, # y_test : 測試資料的Label
                        verbose=1)
scores[1] #模型準確率



0.85048

In [17]:
predit = model.predict_classes(x_test)
predit[:10] #前10筆預測結果

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1]])

In [18]:
predict_classes = predit.reshape(-1) # predict_classes :轉為一維向量
predict_classes[:10]

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1])

In [19]:
評價字典 = {1:'正面的',0:'負面的'}
def display_test_評價(i):
    print(test_text[i])
    print('Label 真實結果:',評價字典[y_test[i]] # y_test[i] : 第i筆影評測試結果
          ,',預測結果',評價字典[predict_classes[i]]) #p redict_classes[i]] : 第i筆影評預測結果
display_test_評價(12502)

First of all I hate those moronic rappers, who could'nt act if they had a gun pressed against their foreheads. All they do is curse and shoot each other and acting like cliché'e version of gangsters.The movie doesn't take more than five minutes to explain what is going on before we're already at the warehouse There is not a single sympathetic character in this movie, except for the homeless guy, who is also the only one with half a brain.Bill Paxton and William Sadler are both hill billies and Sadlers character is just as much a villain as the gangsters. I did'nt like him right from the start.The movie is filled with pointless violence and Walter Hills specialty: people falling through windows with glass flying everywhere. There is pretty much no plot and it is a big problem when you root for no-one. Everybody dies, except from Paxton and the homeless guy and everybody get what they deserve.The only two black people that can act is the homeless guy and the junkie but they're actors by 

In [20]:
# commit : 網路影評文字
commit = '''  
Where do I start. This adaptation of Disney's 1991 Beauty and the Beast was an utter disappointment. Emma Watson as Belle was extremely unconvincing from the start to the end. She had the same expressions as the actress from Twilight. The animators did a terrible job with the Beast. He looked fake and lifeless. They could have used special makeup to create the beast similar to the Grinch where we get to see Jim Carrey's expressions. The side character animations were poorly executed. Overall I felt the film was rushed as there was lack of compassion and chemistry between the characters. There was a lot of CGI and green screen which could have been replaced by normal acting, because then why make an animated version of an animated film? This is by far the worst remake of an animated classic.
'''
def predit_review(commit):
    input_seq = token.texts_to_sequences([commit]) # 將網路影評文字 => 數字list
    pad_input_seq = sequence.pad_sequences(input_seq,maxlen=380)  #數字list截長補短至100
    predit_result = model.predict_classes(pad_input_seq)
    print(commit)
    print(評價字典[predit_result[0][0]])
    
predit_review(commit)

  
Where do I start. This adaptation of Disney's 1991 Beauty and the Beast was an utter disappointment. Emma Watson as Belle was extremely unconvincing from the start to the end. She had the same expressions as the actress from Twilight. The animators did a terrible job with the Beast. He looked fake and lifeless. They could have used special makeup to create the beast similar to the Grinch where we get to see Jim Carrey's expressions. The side character animations were poorly executed. Overall I felt the film was rushed as there was lack of compassion and chemistry between the characters. There was a lot of CGI and green screen which could have been replaced by normal acting, because then why make an animated version of an animated film? This is by far the worst remake of an animated classic.

負面的
