In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 8.7MB/s 
Collecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 23.0MB/s 
Collecting tokenizers==0.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/4c/34/b39eb9994bc3c999270b69c9eea40ecc6f0e97991dba28282b9fd32d44ee/tokenizers-0.9.3-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 42.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |██

In [2]:
#######################################
### -------- Load libraries ------- ###

# Load Huggingface transformers
from transformers import TFBertModel,  BertConfig, BertTokenizerFast

# Then what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

# And pandas for data import + sklearn because you always need sklearn
import pandas as pd
from sklearn.model_selection import train_test_split



In [3]:
!git clone https://github.com/BeFranke/text_gcn

Cloning into 'text_gcn'...
remote: Enumerating objects: 5, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 26830 (delta 0), reused 2 (delta 0), pack-reused 26825[K
Receiving objects: 100% (26830/26830), 919.13 MiB | 22.95 MiB/s, done.
Resolving deltas: 100% (218/218), done.
Checking out files: 100% (26402/26402), done.


In [10]:

#######################################
### --------- Import data --------- ###

# Training split of the Amazon review dataset shipped with the cloned repo
train_csv_path = "text_gcn/data/amazon/train.csv"
data = pd.read_csv(train_csv_path)

# Preview the first rows
data.head()

Unnamed: 0,productId,Title,userId,Helpfulness,Score,Time,Text,Cat1,Cat2,Cat3
0,B000E46LYG,Golden Valley Natural Buffalo Jerky,A3MQDNGHDJU4MK,0/0,3.0,-1,The description and photo on this product need...,grocery gourmet food,meat poultry,jerky
1,B000GRA6N8,Westing Game,unknown,0/0,5.0,860630400,This was a great book!!!! It is well thought t...,toys games,games,unknown
2,B000GRA6N8,Westing Game,unknown,0/0,5.0,883008000,"I am a first year teacher, teaching 5th grade....",toys games,games,unknown
3,B000GRA6N8,Westing Game,unknown,0/0,5.0,897696000,I got the book at my bookfair at school lookin...,toys games,games,unknown
4,B00000DMDQ,I SPY A is For Jigsaw Puzzle 63pc,unknown,2/4,5.0,911865600,Hi! I'm Martine Redman and I created this puzz...,toys games,puzzles,jigsaw puzzles


In [5]:
# Average review length in characters
data["Text"].str.len().mean()

435.201875

In [6]:
# Longest review length in characters
data["Text"].str.len().max()

6589

In [11]:

# Keep only the review text and the level-2 category, and drop every row in
# which either of these two columns is missing.
# The explicit .copy() gives an independent frame, so adding columns to `data`
# later does not write into a slice of the original frame
# (avoids pandas' SettingWithCopyWarning).
data = data[['Text', 'Cat2']].dropna().copy()


In [18]:



# Store the category names as a categorical column.
# If this cell has already run, 'Cat2' holds integer codes rather than names,
# so rebuild from the existing 'Cat2_label' column instead. Re-encoding the
# codes would silently replace the category names with integers; this keeps
# the cell idempotent.
label_source = data['Cat2_label'] if 'Cat2_label' in data.columns else data['Cat2']
data['Cat2_label'] = pd.Categorical(label_source)

# Replace the text categories with their integer codes (the model target)
data['Cat2'] = data['Cat2_label'].cat.codes

data.head()

Unnamed: 0,Text,Cat2,Cat2_label
5599,"My 8-month old got this for Christmas, and it ...",0,action toy figures
23510,We bought and returned the transportation stat...,58,sports outdoor play
18258,THIS WAS ON MY GRANDSONS CHRISTMAS LIST AND HE...,25,fragrance
35970,These batteries were inexpensive and industria...,37,household supplies
17019,this is as cute at home as it is on your web s...,7,beverages


In [19]:

# Split into train and test - stratify over the Cat2 class codes.
# A fixed random_state makes the split reproducible across kernel restarts.
# NOTE: `data` is rebound to the training portion, so re-running this cell
# alone shrinks the training set again; re-run from the data-loading cell.
data, data_test = train_test_split(
    data,
    test_size=0.2,
    stratify=data['Cat2'],
    random_state=42)



In [20]:
#######################################
### --------- Setup BERT ---------- ###

# Pretrained checkpoint used for the config, the tokenizer and the weights
model_name = 'bert-base-uncased'

# Token budget per text
max_length = 100

# Checkpoint configuration, with the per-layer hidden states switched off
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False

# Fast tokenizer belonging to the same checkpoint
tokenizer = BertTokenizerFast.from_pretrained(model_name, config=config)

# Pretrained TensorFlow BERT model
transformer_model = TFBertModel.from_pretrained(model_name, config=config)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [21]:
#######################################
### ------- Build the model ------- ###

# TF Keras documentation: https://www.tensorflow.org/api_docs/python/tf/keras/Model

# Load the MainLayer
bert = transformer_model.layers[0]

# Build your model input: token ids plus the mask that ignores padded positions
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32')
inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}

# Load the Transformers BERT model as a layer in a Keras model.
# Element 1 of the main layer's output is the (None, 768) tensor used as the
# sentence representation (presumably BERT's pooled output).
bert_model = bert(inputs)[1]
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
# Do not hard-code training=False here: that turns the layer into a permanent
# no-op. Without the argument Keras supplies the phase itself, so dropout is
# active during fit() and disabled during evaluate()/predict().
pooled_output = dropout(bert_model)

# Then build your model output: one logit per category. The count comes from
# the categorical dtype, so it covers every known category.
num_classes = len(data['Cat2_label'].cat.categories)
output = Dense(
    units=num_classes,
    kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
    name='Cat2')(pooled_output)

# And combine it all in a model object
model = Model(inputs=inputs, outputs=output, name='BERT_MultiClass')

# Take a look at the model
model.summary()

Model: "BERT_MultiClass"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_mask (InputLayer)     [(None, 100)]        0                                            
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 100)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          ((None, 100, 768), ( 109482240   attention_mask[0][0]             
                                                                 input_ids[0][0]                  
__________________________________________________________________________________________________
pooled_output (Dropout)         (None, 768)          0           bert[0][1]         

In [22]:
#######################################
### ------- Train the model ------- ###

# Adam optimizer with learning-rate decay and gradient-norm clipping
optimizer = Adam(learning_rate=5e-05, epsilon=1e-08, decay=0.01, clipnorm=1.0)

# The loss consumes raw logits (from_logits=True); accuracy is tracked on
# the one-hot targets
loss = CategoricalCrossentropy(from_logits=True)
metric = CategoricalAccuracy('accuracy')

# Wire optimizer, loss and metric into the model
model.compile(optimizer=optimizer, loss=loss, metrics=metric)

# Ready output data for the model.
# num_classes pins the one-hot width to the full number of categories, so it
# always matches the model's output layer even if the highest class code is
# missing from this split.
y_Cat2 = to_categorical(
    data['Cat2'],
    num_classes=len(data['Cat2_label'].cat.categories))

# Tokenize the input (takes some time).
# padding='max_length' pads every sequence to exactly max_length tokens, the
# fixed length the model's Input layers were declared with. padding=True
# would only pad up to the longest sequence in this call.
x = tokenizer(
    text=data['Text'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)


In [23]:

# Fit the model on the tokenized training texts; 20% of the samples are
# held out for validation
train_inputs = {key: x[key] for key in ('input_ids', 'attention_mask')}
history = model.fit(
    x=train_inputs,
    y=y_Cat2,
    validation_split=0.2,
    batch_size=64,
    epochs=2,
)


Epoch 1/2
Epoch 2/2


In [None]:
!mkdir -p saved_model
model.save('saved_model/my_model')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: saved_model/my_model/assets


In [24]:
#######################################
### ----- Evaluate the model ------ ###

# Ready test data.
# num_classes pins the one-hot width to the full number of categories; without
# it the width is inferred from the largest code present in the test split and
# can end up narrower than the model's output layer.
test_y_Cat2 = to_categorical(
    data_test['Cat2'],
    num_classes=len(data_test['Cat2_label'].cat.categories))

# padding='max_length' pads every sequence to exactly max_length tokens, the
# fixed length the model's Input layers were declared with.
test_x = tokenizer(
    text=data_test['Text'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)


In [25]:

# Score the model on the held-out test split
test_inputs = {key: test_x[key] for key in ('input_ids', 'attention_mask')}
model_eval = model.evaluate(x=test_inputs, y=test_y_Cat2)



In [26]:
# Independent copy of the first ten test rows, used for a manual prediction check
aux = data_test.iloc[:10].copy()

In [27]:
# Tokenize the sample rows for prediction.
# padding='max_length' pads every sequence to exactly max_length tokens, the
# fixed length the model's Input layers were declared with. With only a few
# short texts, padding=True can produce sequences shorter than that length.
pred_x = tokenizer(
    text=aux['Text'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

In [28]:
# Class logits for the sample rows
pred_inputs = {key: pred_x[key] for key in ('input_ids', 'attention_mask')}
out = model.predict(x=pred_inputs)

In [29]:
import numpy as np

In [30]:
# Index of the highest-scoring class for each sample row
out.argmax(axis=1)

array([15, 18, 27, 25, 45, 27, 55,  4, 23,  4])

In [31]:
# Show the first row of the test split (text, class code and class name)
data_test.iloc[0]

Text          I've written another review for the other type...
Cat2                                                          8
Cat2_label                                                birds
Name: 16455, dtype: object

In [32]:
# Category names of the label column, taken from its categorical dtype
categories = data["Cat2_label"].dtype.categories

In [33]:
# Store the predicted class code next to the true label of each sample row
aux["predictes"] = out.argmax(axis=1)

In [34]:
# Display the sample rows with true class code/name and predicted class code
aux

Unnamed: 0,Text,Cat2,Cat2_label,predictes
16455,I've written another review for the other type...,8,birds,15
32339,Pampers Baby Dry and Luvs have been our two go...,18,diapering,18
20639,"The clock works, but the display is incredibly...",27,games,27
29326,"Bless Dr Bronner, this is one of the products ...",5,bath body,25
10685,I saved $15 by purchasing from Amazon. The pro...,45,nutrition wellness,45
32338,I picked this up at a church rummage sale for ...,27,games,27
16208,"I have had acne problems for YEARS. Now, at 17...",55,skin care,55
5629,"Oh, i lov this toy!When you lay her down she p...",20,dolls accessories,4
19889,This is a great wrap and easy to use. Very sof...,28,gear,23
5727,"My four year old loves it, but I, on the other...",0,action toy figures,4


In [None]:
# Bundle the saved model directory recursively into a single zip archive
!zip -r my_model.zip saved_model/my_model


  adding: saved_model/my_model/ (stored 0%)
  adding: saved_model/my_model/variables/ (stored 0%)
  adding: saved_model/my_model/variables/variables.index (deflated 81%)
  adding: saved_model/my_model/variables/variables.data-00000-of-00001 (deflated 14%)
  adding: saved_model/my_model/assets/ (stored 0%)
  adding: saved_model/my_model/saved_model.pb (deflated 92%)


In [None]:
# Hand the zipped model to google.colab's file download helper
# (this import only resolves inside a Google Colab runtime)
from google.colab import files
files.download("my_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>