<a href="https://colab.research.google.com/github/LeeKLTW/bert/blob/master/Predicting_Movie_Reviews_with_BERT_on_TF_Hub.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime

W0605 02:54:17.483054 140064389609344 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [0]:
! pip install bert_tensorflow

Collecting bert_tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/a6/66/7eb4e8b6ea35b7cc54c322c816f976167a43019750279a8473d355800a93/bert_tensorflow-1.0.1-py2.py3-none-any.whl (67kB)
[K     |████████████████████████████████| 71kB 2.0MB/s 
Installing collected packages: bert-tensorflow
Successfully installed bert-tensorflow-1.0.1


In [0]:
import bert
# from bert import run_classifier
# from bert import optimization
# from bert import tokenization

In [0]:
OUTPUT_DIR = 'OUTPUT_DIR_NAME'
try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
except:
    pass
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))


***** Model output directory: OUTPUT_DIR_NAME *****


```
aclImdb
├── README
├── imdb.vocab
├── imdbEr.txt
├── test
│   ├── labeledBow.feat
│   ├── neg
│   ├── pos
│   ├── urls_neg.txt
│   └── urls_pos.txt
└── train
    ├── labeledBow.feat
    ├── neg
    ├── pos
    ├── unsup
    ├── unsupBow.feat
    ├── urls_neg.txt
    ├── urls_pos.txt
    └── urls_unsup.txt
  
```



In [0]:
from tensorflow import keras
import os
import re

def load_directory_data(directory):
    data = {}
    data["sentence"] = []
    data["sentiment"] = []
    
    for file_path in os.listdir(directory):
        with tf.gfile.GFile(os.path.join(directory, file_path), "r") as f:
            data["sentence"].append(f.read())
            data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
    data = pd.DataFrame.from_dict(data)
    return data

def load_dataset(directory):
    pos_df = load_directory_data(os.path.join(directory,"pos"))
    neg_df = load_directory_data(os.path.join(directory,"neg"))
    pos_df['polarity'] = 1
    neg_df['polarity'] = 0
    data = pd.concat([pos_df, neg_df]).sample(frac=1.).reset_index(drop=True)
    return data
    
def download_and_load_datasets(force_download=False):
    dataset = keras.utils.get_file(fname="aclImdb.tar.gz",
                                   origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", extract=True)
    train_df = load_dataset(os.path.join(os.path.dirname(dataset), "aclImdb", "train"))
    test_df = load_dataset(os.path.join(os.path.dirname(dataset), "aclImdb", "test"))
    return train_df, test_df
    


In [0]:
train_df, test_df = download_and_load_datasets()

In [0]:
train_data = train_df.sample(5000)
test_data = test_df.sample(5000)

In [21]:
train_data.head()

Unnamed: 0,sentence,sentiment,polarity
2078,Just given the fact that it is based on the mo...,9,1
19596,The film opens with Bill Coles (Melvyn Douglas...,8,1
21368,It's rare that I come across a film this awful...,1,0
19951,Very poor effort that offers pretty much nothi...,3,0
17765,Be careful with this one. Once you get yer mit...,10,1


In [22]:
test_data.head()

Unnamed: 0,sentence,sentiment,polarity
19267,"I hate to admit it, but they were right to sac...",4,0
23314,"1930's comedy mystery about ""The Crooked Circl...",7,1
18116,I remember seeing this movie 34 years ago and ...,10,1
23726,Whoever plays the part of J. Douglas Williamso...,9,1
23014,Detective Burt Williams has been on the trail ...,3,0


In [0]:
DATA_COLUMN = 'sentence'
LABEL_COLUMN = 'polarity'
label_list = [0,1]

In [0]:
# bert.run_classifier.InputExample
train_input_example = train_data.apply(lambda x:bert.run_classifier.InputExample(guid=None, text_a = x[DATA_COLUMN], text_b=None,label=x[LABEL_COLUMN]),axis=1)
test_input_example = test_data.apply(lambda x:bert.run_classifier.InputExample(guid=None, text_a = x[DATA_COLUMN], text_b=None,label=x[LABEL_COLUMN]),axis=1)


In [54]:
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
def create_tokenizer_from_hub_module():
    with tf.Graph().as_default():
        bert_module = hub.Module(BERT_MODEL_HUB)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case =sess.run([tokenization_info["vocab_file"],tokenization_info["do_lower_case"]])
            
            return bert.tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer_from_hub_module()


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0605 04:45:00.023222 140064389609344 saver.py:1483] Saver not created because there are no variables in the graph to restore


In [60]:
tokenizer.tokenize("This here's an example of using the BERT tokenizer")

['this',
 'here',
 "'",
 's',
 'an',
 'example',
 'of',
 'using',
 'the',
 'bert',
 'token',
 '##izer']

['ps',
 '##uti',
 '##l',
 '(',
 'process',
 'and',
 'system',
 'utilities',
 ')',
 'is',
 'a',
 'cross',
 '-',
 'platform',
 'library',
 'for',
 're',
 '##tri',
 '##eving',
 'information',
 'on',
 'running',
 'processes',
 'and',
 'system',
 'utilization',
 '(',
 'cpu',
 ',',
 'memory',
 ',',
 'disks',
 ',',
 'network',
 ',',
 'sensors',
 ')',
 'in',
 'python']

In [55]:
tokenizer.vocab

OrderedDict([('[PAD]', 0),
             ('[unused0]', 1),
             ('[unused1]', 2),
             ('[unused2]', 3),
             ('[unused3]', 4),
             ('[unused4]', 5),
             ('[unused5]', 6),
             ('[unused6]', 7),
             ('[unused7]', 8),
             ('[unused8]', 9),
             ('[unused9]', 10),
             ('[unused10]', 11),
             ('[unused11]', 12),
             ('[unused12]', 13),
             ('[unused13]', 14),
             ('[unused14]', 15),
             ('[unused15]', 16),
             ('[unused16]', 17),
             ('[unused17]', 18),
             ('[unused18]', 19),
             ('[unused19]', 20),
             ('[unused20]', 21),
             ('[unused21]', 22),
             ('[unused22]', 23),
             ('[unused23]', 24),
             ('[unused24]', 25),
             ('[unused25]', 26),
             ('[unused26]', 27),
             ('[unused27]', 28),
             ('[unused28]', 29),
             ('[unused29]', 30),
  