In [None]:
# urlretrieve is a convenience helper: it combines urlopen + file.write
# and saves the response to disk for you.
from urllib.request import urlretrieve
import os
# Added for macOS so that the SSL certificate is not treated as invalid.
# NOTE(review): this replaces the default HTTPS context with an unverified one,
# i.e. certificate checks are skipped for code relying on that default context.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
# Create the data folder if it does not exist yet.
if not os.path.exists("./data"):
    print("data 資料夾不存在, 現在幫你創唷")
    # exist_ok avoids a crash if the folder appears between the check and the call.
    os.makedirs("./data", exist_ok=True)
# Download the archive only if it has not been downloaded before.
filepath = "./data/aclImdb_v1.tar.gz"
if not os.path.exists(filepath):
    print("還沒下載過資料, 現在幫你下載唷")
    # Download under a temporary name and rename only on success; otherwise an
    # interrupted download would leave a truncated archive at `filepath` that
    # the existence check above would accept as complete on the next run.
    partial_path = filepath + ".part"
    try:
        urlretrieve(url, partial_path)
        os.replace(partial_path, filepath)
    finally:
        # Only true when the download failed part-way through.
        if os.path.exists(partial_path):
            os.remove(partial_path)
else:
    print("已下載過")

已下載過


In [2]:
import tarfile
# Extract the archive only if the extracted folder is not there yet.
if not os.path.exists("data/aclImdb"):
    print("還沒解壓縮過, 現在幫你解壓縮")
    # The with-block closes the archive handle even if extraction raises.
    with tarfile.open(filepath, 'r') as tfile:
        tfile.extractall('data')
else:
    print("已解壓縮過")

已解壓縮過


In [3]:
import pandas as pd

# For a tidier display, only 15 rows and 10 columns are shown when printing.
# Feel free to remove the next two lines when practising.
pd.set_option('display.max_rows', 15)
pd.set_option('display.max_columns', 10)

# Walk the pos and neg folders and collect every review with its label
# (1 for files under pos, 0 for files under neg).
# Rows are gathered in a plain list and the DataFrame is built once at the end:
# DataFrame.append copies the whole frame on every call and no longer exists
# in pandas 2.0. Building from records also lets pandas infer an integer
# dtype for the sentiment column.
pos_path = "data/aclImdb/train/pos"
neg_path = "data/aclImdb/train/neg"
train_records = []
for folder, label in ((pos_path, 1), (neg_path, 0)):
    for fname in os.listdir(folder):
        # Skip entries whose name starts with a dot.
        if fname.startswith("."):
            continue
        # The with-block closes each file as soon as it has been read.
        with open(os.path.join(folder, fname), "r", encoding = "utf-8") as f:
            train_records.append((f.read(), label))

train_df = pd.DataFrame(train_records, columns = ["content", "sentiment"])
train_df

Unnamed: 0,content,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1
5,"This isn't the comedic Robin Williams, nor is ...",1
6,Yes its an art... to successfully make a slow ...,1
...,...,...
24993,Although the production and Jerry Jameson's di...,0
24994,Capt. Gallagher (Lemmon) and flight attendant ...,0


In [3]:
# Collect the test reviews with their labels (1 for files under pos, 0 for
# files under neg). Rows are gathered in a list and the DataFrame is built
# once: DataFrame.append copies the whole frame on every call and no longer
# exists in pandas 2.0.
pos_path = "data/aclImdb/test/pos"
neg_path = "data/aclImdb/test/neg"
test_records = []
for folder, label in ((pos_path, 1), (neg_path, 0)):
    for fname in os.listdir(folder):
        # Skip entries whose name starts with a dot.
        if fname.startswith("."):
            continue
        # The with-block closes each file as soon as it has been read.
        with open(os.path.join(folder, fname), "r", encoding = "utf-8") as f:
            test_records.append((f.read(), label))

test_df = pd.DataFrame(test_records, columns = ["content", "sentiment"])

test_df

Unnamed: 0,content,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1
5,"I saw this film on September 1st, 2005 in Indi...",1
6,"Maybe I'm reading into this too much, but I wo...",1
...,...,...
24993,"This is one dreary, inert, self-important bore...",0
24994,"Awful, awful, awful times a hundred still does...",0


In [6]:
from keras.preprocessing.text import Tokenizer
# Create a tokenizer with num_words=2000 and fit it on the training reviews.
token = Tokenizer(num_words=2000)
train_texts = train_df["content"]
token.fit_on_texts(train_texts)
# The printout is omitted here; uncomment the next line to look at the dictionary.
# token.word_index

Using TensorFlow backend.


ImportError: DLL load failed: 找不到指定的程序。

In [None]:
# Run the review texts of both splits through the fitted tokenizer.
x_train_seq, x_test_seq = (
    token.texts_to_sequences(frame["content"])
    for frame in (train_df, test_df)
)
# Display the training sequences as a table.
pd.DataFrame(x_train_seq)

In [None]:
from keras.preprocessing import sequence
# Pad both splits with maxlen=100.
x_train_pad, x_test_pad = (
    sequence.pad_sequences(seqs, maxlen = 100)
    for seqs in (x_train_seq, x_test_seq)
)
# Display the padded training data as a table.
pd.DataFrame(x_train_pad)

In [None]:
# Show the first padded training sequence.
x_train_pad[0]

In [None]:
from keras.models import Sequential
# Import the layers from keras.layers directly: the keras.layers.core and
# keras.layers.embeddings submodule paths are not importable in newer Keras
# releases, while keras.layers exposes the same classes.
from keras.layers import Dense, Dropout, Flatten, Embedding
model = Sequential()
model.add(Embedding(output_dim=32, input_dim=2000, input_length=100))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(units=256, activation='relu'))
model.add(Dropout(0.35))
# Note: this is binary classification, so the final activation is sigmoid.
# sigmoid (positive + negative = 100%), softmax (all classes together = 100%)
model.add(Dense(units=1, activation='sigmoid'))

In [None]:
 # Print the summary of the model built in the previous cell.
 model.summary()

In [None]:
import numpy as np
# Note: this is only binary classification, so the loss is binary_crossentropy.
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

from keras.datasets import imdb
# Pass the labels as a plain float32 array. The sentiment column is not
# guaranteed to have a numeric dtype (a frame assembled row by row ends up with
# object dtype), and Keras may fail to convert such a column to a tensor.
y_train = train_df['sentiment'].astype('float32').values
train_history = model.fit(x_train_pad,
                          y_train,
                          batch_size = 100,
                          epochs = 3,
                          verbose = 2,
                          validation_split = 0.2)

In [None]:
# Pass the labels as a plain float32 array; the sentiment column is not
# guaranteed to have a numeric dtype, and Keras may fail to convert an
# object-dtype column to a tensor.
y_test = test_df['sentiment'].astype('float32').values
# The accuracy is the second element of the returned list.
model.evaluate(x_test_pad, y_test)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(train_history.history["loss"])
plt.plot(train_history.history["val_loss"])
plt.title("Loss Graph")
plt.legend(['loss', 'val_loss'], loc="upper left")

In [None]:
# Show the list of layers of the current model.
model.layers

In [None]:
from keras.models import Model

# Sub-model going from the model's input to the output of its first layer.
first_layer_output = model.layers[0].output
embedding_layer_model = Model(inputs=model.input, outputs=first_layer_output)
# Feed in the first test review to show the dimensions after conversion.
em = embedding_layer_model.predict(x_test_pad[:1])
em

In [None]:
# Report the shape of the converted output and the vector of the first word.
first_word_vector = em[0][0]
print("維度:", em.shape)
print("第一個詞被轉換過的向量:", first_word_vector)

In [None]:
from keras.layers import SimpleRNN

# Build the network from a list of layers, in the order they are applied.
model = Sequential([
    Embedding(output_dim=32, input_dim=2000, input_length=100),
    Dropout(0.2),
    # RNN: remembers 16 states
    SimpleRNN(units=16),
    Dense(units=256, activation='relu'),
    Dropout(0.35),
    # Note: this is binary classification, so the final activation is sigmoid.
    # sigmoid (positive + negative = 100%), softmax (all classes together = 100%)
    Dense(units=1, activation='sigmoid'),
])
# Note: this is only binary classification, so the loss is binary_crossentropy.
compile_options = {
    'loss': 'binary_crossentropy',
    'optimizer': 'adam',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)
model.summary()

In [None]:
# Train the current model on the padded training data; 20% of it is held out
# through validation_split.
train_history = model.fit(x_train_pad,
                          y_train,
                          batch_size = 100,
                          epochs = 3,
                          verbose = 2,
                          validation_split = 0.2)
# Pass the labels as a plain float32 array; the sentiment column is not
# guaranteed to have a numeric dtype, and Keras may fail to convert an
# object-dtype column to a tensor.
y_test = test_df['sentiment'].astype('float32').values
# The accuracy is the second element of the returned list.
model.evaluate(x_test_pad, y_test)