<a href="https://colab.research.google.com/github/Nempickaxe/bert-classfier-huggingface/blob/main/BERT_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#download complaints file
!wget -o /tmp/zipfile http://files.consumerfinance.gov/ccdb/complaints.csv.zip
!unzip complaints.csv.zip
!rm complaints.csv.zip
!ls

Archive:  complaints.csv.zip
replace complaints.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
complaints.csv	sample_data


In [3]:
!pip install transformers



In [18]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from transformers import BertConfig, BertTokenizerFast, TFBertModel

from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

In [47]:
# Read only the three columns used below, draw a random sample of 50,000 rows,
# then discard every row that is missing any of those columns.
# The fixed random_state makes the sample (and everything derived from it)
# reproducible across kernel restarts.
data = (
    pd.read_csv(
        'complaints.csv',
        usecols=['Consumer complaint narrative', 'Product', 'Issue'],
    )
    .sample(50000, random_state=42)
    .dropna()
)

In [48]:
# (rows, columns) left after sampling and dropping incomplete rows.
data.shape

(16924, 3)

In [49]:
# Keep only issues and products that occur at least twice (len(x) > 1); the
# stratified train/test split below cannot handle a class with a single row.
# Removing rare products can push an issue back down to one row (and the other
# way round), so both filters are repeated until a pass removes nothing.
while True:
    rows_before = len(data)
    data = data.groupby('Issue').filter(lambda x: len(x) > 1)
    data = data.groupby('Product').filter(lambda x: len(x) > 1)
    if len(data) == rows_before:
        break

In [50]:
# One encoder per target so that integer labels can be mapped back to the
# original product / issue names later via each encoder's classes_.
le_product, le_issue = LabelEncoder(), LabelEncoder()

product_codes = le_product.fit_transform(data['Product'])
issue_codes = le_issue.fit_transform(data['Issue'])

# Store the integer codes next to the text columns they were derived from.
data['Product_label'] = product_codes
data['Issue_label'] = issue_codes

In [51]:
# Hold out 20% of the rows as a test set, stratified on the issue label.
# random_state pins the shuffle so the same split is produced on every run.
train_data, data_test = train_test_split(
    data,
    test_size=0.2,
    stratify=data[['Issue_label']],
    random_state=42,
)
train_data.shape

(13530, 5)

In [52]:
# Preview the first rows, including the Product_label / Issue_label columns
# added by the label-encoding cell.
data.head()

Unnamed: 0,Product,Issue,Consumer complaint narrative,Product_label,Issue_label
351182,Debt collection,Attempts to collect debt not owed,I received a bill from Credence Collection sta...,7,9
1861462,"Credit reporting, credit repair services, or o...",Improper use of your report,I created an online account with Quicken Loans...,6,56
519609,"Credit reporting, credit repair services, or o...",Incorrect information on your report,I was trying to cosign for my cousin a new veh...,6,58
318020,"Credit reporting, credit repair services, or o...",Incorrect information on your report,Midwest Recovery Services added a collections ...,6,58
1593215,Payday loan,Charged fees or interest I didn't expect,I got a pay day loan from a company that was t...,12,20


In [13]:
# BERT: load the pretrained configuration, tokenizer and TensorFlow encoder.

model_name = 'bert-base-uncased'  # checkpoint shared by config, tokenizer and model
max_length = 100  # sequence length used for the model input and for tokenizer truncation

# Start from the checkpoint's own configuration and switch off the per-layer
# hidden-state outputs.
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False

# Tokenizer and encoder are both created from the same checkpoint and config.
tokenizer = BertTokenizerFast.from_pretrained(model_name, config=config)
bert_pretrained_model = TFBertModel.from_pretrained(model_name, config=config)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [14]:
# Build a two-headed classifier on top of the pretrained BERT encoder:
# one Dense head predicts the issue, the other the product.

# First layer of the loaded TFBertModel (shown as TFBertMainLayer in the summary).
bert_model = bert_pretrained_model.layers[0]

# Fixed-length token-id input; the tokenized text must have exactly max_length columns.
# NOTE(review): only input_ids are passed, no attention mask, so padded positions
# are presumably attended to like real tokens -- confirm whether that is intended.
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
inputs = {'input_ids': input_ids}

bert_output = bert_model(inputs)[1]  # pooled output

# Do not pass training=False here: that would hard-wire the layer into inference
# mode and disable dropout during fit(). Without the argument Keras enables
# dropout while training and turns it off for evaluation / prediction.
dropout_layer = Dropout(config.hidden_dropout_prob, name='dropout_layer')
dropout = dropout_layer(bert_output)

# Each head has one unit per encoded class and no activation, i.e. it emits logits.
issue = Dense(
    units=len(le_issue.classes_),
    name='issue',
    kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
)(dropout)
product = Dense(
    units=len(le_product.classes_),
    name='product',
    kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
)(dropout)

outputs = {'issue': issue, 'product': product}

model = Model(inputs=inputs, outputs=outputs, name='BERTmodel')

model.summary()

Model: "BERTmodel"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 100)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 109482240   input_ids[0][0]                  
__________________________________________________________________________________________________
dropout_layer (Dropout)         (None, 768)          0           bert[0][1]                       
__________________________________________________________________________________________________
issue (Dense)                   (None, 129)          99201       dropout_layer[0][0]              
__________________________________________________________________________________________

In [15]:
# Both heads emit raw logits (their Dense layers have no activation), so the
# cross-entropy losses are configured with from_logits=True. Each head gets its
# own loss and its own accuracy metric, keyed by the output name.
output_heads = ('issue', 'product')
loss = {head: CategoricalCrossentropy(from_logits=True) for head in output_heads}
metrics = {head: CategoricalAccuracy(name='accuracy') for head in output_heads}

optimizer = Adam(learning_rate=5e-5, clipnorm=0.001)

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [53]:
# Tokenize the training narratives into a (rows, max_length) tensor of token ids.
# padding='max_length' always pads to exactly max_length columns; padding=True
# only pads to the longest sequence in the batch, which would not match the
# model's fixed Input(shape=(max_length,)) if every narrative were shorter.
x = tokenizer(
    text=train_data['Consumer complaint narrative'].to_list(),
    add_special_tokens=True,
    padding='max_length',
    truncation=True,  # cut narratives longer than max_length tokens
    max_length=max_length,
    return_tensors='tf',
    return_token_type_ids=False,  # the model is fed input_ids only
    return_attention_mask=False,
    verbose=True)

In [58]:
# Inspect the tokenizer output; it contains only 'input_ids' because token type
# ids and attention masks were switched off in the tokenizer call.
x

{'input_ids': <tf.Tensor: shape=(13530, 100), dtype=int32, numpy=
array([[  101,  1045,  2031, ...,  5517,  2008,   102],
       [  101,  2619, 10312, ...,  2221,  1012,   102],
       [  101,  1045,  2288, ...,     0,     0,     0],
       ...,
       [  101,  1045,  2031, ...,  2382,  2420,   102],
       [  101,  4374,  4455, ...,     0,     0,     0],
       [  101,  2026, 14344, ...,  2026, 22038,   102]], dtype=int32)>}

In [54]:
# Train both heads, holding out 20% of the training rows for validation.
# num_classes fixes the one-hot width to the number of encoded classes; without
# it to_categorical infers the width from the largest label present in
# train_data, which can be narrower than the matching Dense head.
# NOTE(review): the recorded run of this cell stopped with a TypeError whose
# message was not captured -- confirm the cause before relying on this cell.
history = model.fit(x={'input_ids': x['input_ids']},
                    y={'issue': to_categorical(train_data['Issue_label'].tolist(),
                                               num_classes=len(le_issue.classes_)),
                       'product': to_categorical(train_data['Product_label'].tolist(),
                                                 num_classes=len(le_product.classes_))},
                    validation_split=0.2,
                    epochs=10)

Epoch 1/10


TypeError: ignored

In [57]:
# One-hot encode both targets with an explicit width. Without num_classes,
# to_categorical infers the width from the largest label present in train_data,
# which can be narrower than the matching Dense head of the model.
y_issue = to_categorical(
    train_data['Issue_label'].tolist(),
    num_classes=len(le_issue.classes_),
)
y_product = to_categorical(
    train_data['Product_label'].tolist(),
    num_classes=len(le_product.classes_),
)

# Train both heads in batches of 64, holding out 20% of the rows for validation.
# NOTE(review): the recorded run of this cell stopped with a TypeError whose
# message was not captured -- confirm the cause before relying on this cell.
history = model.fit(
    x={'input_ids': x['input_ids']},
    y={'issue': y_issue, 'product': y_product},
    validation_split=0.2,
    batch_size=64,
    epochs=10)

Epoch 1/10


TypeError: ignored