In [20]:
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
from transformers import BertTokenizer
import numpy as np

In [3]:
test_csv = pd.read_csv('datasets/cleaned_test_data.csv')
train_csv = pd.read_csv('datasets/cleaned_trained_data.csv')
test_csv = test_csv.drop(test_csv.index[0])
train_csv = train_csv.drop(train_csv.index[0])

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Downloading (…)solve/main/vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 946kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 29.0kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<?, ?B/s] 


In [7]:
categories = {'Society & Culture' :1,'Science & Mathematics':2,'Health':3, 'Education & Reference':4,
            'Computers & Internet' :5,'Sports' :6,'Business & Finance' :7,'Entertainment & Music' : 8,
            'Family & Relationships':9, 'Politics & Government':10}
train_csv['class']=train_csv['class'].map(categories)
train_csv['class']=train_csv['class'].astype(int)

In [8]:
train_csv

Unnamed: 0,text,class
1,n't optical mouse work glass table ? even surf...,5
2,best off-road motorcycle trail ? long-distance...,6
3,Trans Fat ? reduce ? heard tras fat bad body ....,3
4,many planes Fedex ? heard largest airline worl...,7
5,"san francisco bay area , make sense rent buy ?...",7
...,...,...
768321,"believe hopelessness ? believe religion , lack...",1
768322,get horse 's skeletal muscular systems web pri...,4
768323,quest promote racial equality government/media...,10
768324,Ways sell video games ? Like want sell video g...,7


In [10]:
token = tokenizer.encode_plus(
    train_csv['text'].iloc[0], 
    max_length=256, 
    truncation=True, 
    padding='max_length', 
    add_special_tokens=True,
    return_tensors='tf'
)

In [14]:
X_input_ids = np.zeros((len(train_csv), 256))
X_attn_masks = np.zeros((len(train_csv), 256))

In [18]:
def generate_training_data(df, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(df['text'])):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=256, 
            truncation=True, 
            padding='max_length', 
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [21]:
X_input_ids, X_attn_masks = generate_training_data(train_csv, X_input_ids, X_attn_masks, tokenizer)

0it [00:00, ?it/s]

768325it [24:54, 514.27it/s]


In [43]:
labels = np.zeros((len(train_csv), 10))
labels.shape

(768325, 10)

In [45]:
labels[np.arange(len(train_csv)), train_csv['class'].values-1] = 1

In [56]:
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))
dataset.take(1)

<TakeDataset element_spec=(TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(10,), dtype=tf.float64, name=None))>

In [57]:
def DatasetMapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

In [58]:
dataset = dataset.map(DatasetMapFunction)

In [60]:
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(256,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(256,), dtype=tf.float64, name=None)}, TensorSpec(shape=(10,), dtype=tf.float64, name=None))>

In [61]:
dataset = dataset.shuffle(10000).batch(16, drop_remainder=True)

In [62]:
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(16, 256), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(16, 256), dtype=tf.float64, name=None)}, TensorSpec(shape=(16, 10), dtype=tf.float64, name=None))>

In [64]:
p = 0.8
train_size = int((len(train_csv)//16)*p)

In [66]:
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

In [67]:
from transformers import TFBertModel

In [68]:
model = TFBertModel.from_pretrained('bert-base-cased')

Downloading model.safetensors: 100%|██████████| 436M/436M [03:23<00:00, 2.14MB/s] 
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model 

In [71]:
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(10, activation='softmax', name='output_layer')(intermediate_layer) # softmax -> calcs probs of classes

topic_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
topic_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                         

In [72]:
optim = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [73]:
topic_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [75]:
hist = topic_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2
)

Epoch 1/2
   11/38416 [..............................] - ETA: 239:48:18 - loss: 2.5298 - accuracy: 0.0568

KeyboardInterrupt: 