In [2]:
import urllib.request
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Select the GPU Keras runs on and limit how much GPU memory it uses
import os
import keras.backend.tensorflow_backend as KTF
from tensorflow.python.client import device_lib
import tensorflow as tf

# ML is a project-local helper module that is not part of this notebook.
# NOTE(review): presumably init() lists/selects devices and limitGPUByGrowth()
# returns a TensorFlow session with on-demand GPU memory growth -- confirm in ML.
import ML
ML.init()
Sess = ML.limitGPUByGrowth()
# Make Keras use this session
KTF.set_session(Sess)

['/cpu:0', '/gpu:0']


In [3]:
import urllib.request
import tarfile
# Remote location of the aclImdb_v1 sentiment archive
url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
# Local file the archive is saved to (absolute, machine-specific path)
filePath = "E:/WorkSpace/TensorFlowWorkSpace/download_data/aclImdb_v1.tar.gz"

download_progress = 0  # bytes received so far, as last computed by report()

def report(block_no, block_size, file_size):
    """Progress hook for ``urllib.request.urlretrieve``.

    urlretrieve calls the hook once when the connection is established
    (``block_no == 0``, nothing transferred yet) and once after every block,
    so the byte count is derived from ``block_no * block_size`` instead of
    being accumulated per call. The last block is usually shorter than
    ``block_size``; the count is therefore capped at ``file_size`` so the
    completion line is actually printed.

    Args:
        block_no: Number of blocks transferred so far.
        block_size: Size of one block in bytes.
        file_size: Total size in bytes, or a non-positive value when the
            server did not report one.

    Side effects:
        Updates the module-level ``download_progress`` and prints a status
        line every 500 blocks and once when the download is complete.
    """
    global download_progress
    received = block_no * block_size
    size_known = file_size > 0
    if size_known:
        # A partial final block must not push the counter past the real size.
        received = min(received, file_size)
    download_progress = received
    if (block_no % 500 == 0) or (size_known and received == file_size):
        print("Downloaded block %i, %i/%i bytes recieved."% (block_no, download_progress, file_size))

# Download the archive only when it is not already on disk.
if not os.path.isfile(filePath):
    # urlretrieve does not create missing directories, so make sure the
    # destination folder exists before the transfer starts.
    target_dir = os.path.dirname(filePath)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    result, headers = urllib.request.urlretrieve(url,filePath,reporthook=report)
    print("Download complete, saved as %s" % (result))


In [4]:
# Unpack the archive only when the extracted folder is not present yet.
if not os.path.exists('E:/WorkSpace/TensorFlowWorkSpace/download_data/aclImdb'):
    # The context manager closes the archive handle even if extraction fails.
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # on Python versions that support it, pass filter='data' for downloaded files.
    with tarfile.open(filePath,'r:gz') as tfile:  # tfile: the compressed archive
        # extractall() returns None, so its result is not stored.
        tfile.extractall('E:/WorkSpace/TensorFlowWorkSpace/download_data')



In [5]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import re
def rm_html_tags(text):
    """Return ``text`` with every ``<...>`` HTML tag replaced by an empty string."""
    tag_pattern = r'<[^>]+>'  # "<", one or more non-">" characters, then ">"
    return re.sub(tag_pattern, '', text)
    

In [6]:
def read_files(filetype, path="E:/WorkSpace/TensorFlowWorkSpace/download_data/aclImdb/"):
    """Load the reviews of one dataset split together with their labels.

    Args:
        filetype: Name of the split sub-directory, e.g. "train" or "test".
        path: Root directory of the extracted review data; it must contain
            ``<filetype>/pos`` and ``<filetype>/neg``.

    Returns:
        ``(all_labels, all_texts)``: two lists of equal length. A label is 1
        for every file found under ``pos`` and 0 for every file under ``neg``;
        each text is the file content with HTML tags removed by
        ``rm_html_tags``. Positive reviews come first, then negative ones.
    """
    file_list = []
    all_labels = []
    for sub_dir, label in (("pos", 1), ("neg", 0)):
        review_dir = os.path.join(path, filetype, sub_dir)
        # os.listdir gives no ordering guarantee; sorting keeps the sample
        # order reproducible across platforms and runs.
        names = sorted(os.listdir(review_dir))
        file_list += [os.path.join(review_dir, name) for name in names]
        # One label per file actually found, so labels and texts always line
        # up even if a split does not hold exactly 12500 files per class.
        all_labels += [label] * len(names)

    print('read : ',filetype,", files : ",len(file_list))

    all_texts = []
    for fi in file_list:
        # The with-statement closes each file after it has been read.
        with open(fi, encoding="utf8") as file_input:
            all_texts.append(rm_html_tags(file_input.read()))
    return all_labels, all_texts

In [7]:
# Load both splits: y_* are the 0/1 sentiment labels, *_text the cleaned review strings
y_train,train_text = read_files("train")
y_test,test_text = read_files("test")

read :  train , files :  25000
read :  test , files :  25000


In [8]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # make Jupyter display every stand-alone expression in a cell, not only the last one
# Label and text of the first training review
y_train[0]
train_text[0]

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [9]:
token = Tokenizer(num_words=2000) # create a Tokenizer limited to a 2000-word vocabulary
# Build the word index from the training reviews. word_index itself keeps
# every word seen (its displayed output contains indices far above 2000);
# NOTE(review): the num_words limit is presumably applied when texts are
# converted to sequences -- confirm against the Keras Tokenizer docs.
token.fit_on_texts(train_text)


In [11]:
token.document_count # number of reviews the tokenizer has read
# NOTE: this displays the whole word -> index dictionary, which is very large
token.word_index # word dictionary

25000

{'machetes': 45793,
 'calzone': 79488,
 'rising': 4677,
 'syrian': 74143,
 'multy': 55126,
 'shipmate': 64974,
 "guiness's": 48209,
 'acedmy': 61271,
 'socialized': 55452,
 'cada': 62532,
 'ishtar': 17117,
 'informers': 68072,
 'culls': 55389,
 'providency': 61819,
 'promising': 2430,
 "dino's": 25173,
 'heeru': 84300,
 'churchill': 14494,
 'hurl': 17166,
 'sequentially': 70814,
 'prays': 20718,
 'health': 3351,
 'mermaids': 27434,
 "pachebel's": 88334,
 "maia's": 69940,
 'twentyfive': 85173,
 "frasier's": 60422,
 'converging': 39100,
 'humorously': 22174,
 'gayle': 25803,
 'sumptous': 59709,
 '2004s': 56533,
 'rabochiy': 62236,
 'benet': 10748,
 'deeply': 1684,
 "shadyac's": 52518,
 'disapproves': 29337,
 "claus'": 43570,
 'encountering': 18110,
 'lemmon': 4342,
 'extravagantly': 31008,
 'fortunetly': 37268,
 'hema': 84281,
 'applying': 13470,
 'milkman': 46325,
 'burton': 3537,
 'valve': 38512,
 'cid': 32646,
 'staginess': 26320,
 'isolated': 4366,
 'muito': 73590,
 'ok': 606,
 'park

In [19]:
# Convert each review's words into a list of integers (only words present in the dictionary are converted)
x_train_seq = token.texts_to_sequences(train_text) 
x_test_seq = token.texts_to_sequences(test_text)
# Show the first review next to its integer sequence
train_text[0]
x_train_seq[0]

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

[308,
 6,
 3,
 1068,
 208,
 8,
 29,
 1,
 168,
 54,
 13,
 45,
 81,
 40,
 391,
 109,
 137,
 13,
 57,
 149,
 7,
 1,
 481,
 68,
 5,
 260,
 11,
 6,
 72,
 5,
 631,
 70,
 6,
 1,
 5,
 1,
 1530,
 33,
 66,
 63,
 204,
 139,
 64,
 1229,
 1,
 4,
 1,
 222,
 899,
 28,
 68,
 4,
 1,
 9,
 693,
 2,
 64,
 1530,
 50,
 9,
 215,
 1,
 386,
 7,
 59,
 3,
 1470,
 798,
 5,
 176,
 1,
 391,
 9,
 1235,
 29,
 308,
 3,
 352,
 343,
 142,
 129,
 5,
 27,
 4,
 125,
 1470,
 5,
 308,
 9,
 532,
 11,
 107,
 1466,
 4,
 57,
 554,
 100,
 11,
 308,
 6,
 226,
 47,
 3,
 11,
 8,
 214]

In [22]:
# Compare the word count of the first review with the length of its integer
# sequence. The review is referenced explicitly: the previous `str.split( )`
# only worked while a kernel variable named `str` shadowed the builtin and
# raises TypeError after Restart & Run All.
print("單字數 : ",len(train_text[0].split()))
print('數字list : ',len(x_train_seq[0]))

單字數 :  140
數字list :  106


In [23]:
# Truncate or pad every integer sequence to a fixed length of 100:
# x_train holds the processed training data, x_test the processed test data.
x_train, x_test = (
    sequence.pad_sequences(seqs, maxlen=100)
    for seqs in (x_train_seq, x_test_seq)
)

In [36]:
# Show one sequence before and after pad_sequences for two samples:
# sample 1 is longer than 100 entries, sample 6 is the padding example.
for idx, label in ((1, '前截長後數字list : '), (6, '前補短後數字list : ')):
    print('數字list : ', len(x_train_seq[idx]), '\n', x_train_seq[idx], '\n')
    print(label, len(x_train[idx]), '\n', x_train[idx], '\n')

數字list :  335 
 [38, 13, 739, 43, 73, 31, 1828, 14, 149, 17, 111, 3, 1338, 5, 335, 144, 19, 1, 886, 11, 67, 276, 1190, 402, 33, 118, 282, 35, 166, 5, 391, 153, 38, 14, 1, 546, 87, 80, 100, 4, 1, 13, 39, 3, 412, 1199, 133, 40, 179, 137, 13, 1, 321, 19, 358, 5, 1, 38, 44, 25, 371, 5, 126, 52, 19, 1, 1980, 17, 47, 44, 21, 67, 344, 3, 5, 408, 19, 1, 1980, 14, 3, 205, 1, 21, 276, 65, 35, 3, 340, 1, 719, 725, 3, 1264, 19, 1, 1506, 3, 1220, 2, 282, 21, 276, 5, 63, 47, 41, 36, 5, 25, 11, 6, 33, 33, 379, 13, 294, 3, 1022, 128, 33, 43, 282, 7, 1, 178, 362, 5, 93, 3, 15, 3, 5, 63, 44, 26, 66, 408, 7, 1, 1980, 14, 499, 205, 1, 44, 26, 66, 78, 47, 26, 490, 15, 3, 701, 1181, 4, 227, 49, 1, 19, 117, 6, 1367, 19, 1, 886, 15, 3, 19, 23, 5, 23, 171, 843, 117, 26, 187, 1483, 121, 1, 236, 344, 1, 30, 3, 99, 41, 394, 19, 23, 117, 888, 81, 101, 582, 3, 251, 30, 1, 399, 4, 1956, 31, 1230, 33, 184, 154, 37, 340, 2, 37, 2, 33, 22, 454, 338, 5, 1, 1980, 502, 214, 236, 20, 338, 5, 36, 26, 276, 118, 50, 108, 1022