In [38]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
# !pip install transformers
from transformers import BertTokenizer
from sklearn.model_selection import KFold
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

In [39]:
# Upsampling the 4 class division of Sharma dataset where the maximum class count is 67
sharma_4 = pd.read_csv("Sharma_4_class.csv")

class_counts = sharma_4['seek_response'].value_counts()
sharma_4_upsampled = pd.DataFrame()

for i,j in class_counts.items():
    if (j == 67):
        df_majority = sharma_4[sharma_4["seek_response"]==i]
        
        # Combine majority class with empty class
        sharma_4_upsampled = pd.concat([sharma_4_upsampled,df_majority])
        continue
    else:
        df_minority = sharma_4[sharma_4["seek_response"]==i]
        # Upsample minority class
        df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=67,    # to match majority class
                                 random_state=123) # reproducible results
 
        # Combine minority class with other classes class
        sharma_4_upsampled = pd.concat([sharma_4_upsampled, df_minority_upsampled])

sharma_4_upsampled = sharma_4_upsampled.loc[:, ~sharma_4_upsampled.columns.str.contains('^Unnamed')]
sharma_4_upsampled = sharma_4_upsampled.drop(['seeker_post','seeking?','response_post','level'], axis=1)

sharma_4_upsampled['seek_response'].value_counts()
# sharma_4_upsampled

2    67
3    67
1    67
0    67
Name: seek_response, dtype: int64

In [40]:
train_val, test = train_test_split(sharma_4_upsampled, test_size=0.2)
len(train_val)

214

In [41]:
# find duplicates by performing an inner join
duplicates = train_val.merge(test, on=['seek_response', '2_utterance_convo'], how='inner')

# check the number of duplicates
print(len(duplicates))

46


In [42]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [43]:
token = tokenizer.encode_plus(
    train_val['2_utterance_convo'].iloc[0], 
    max_length=256, 
    truncation=True, 
    padding='max_length', 
    add_special_tokens=True,
    return_tensors='tf'
)

In [44]:
X_input_ids = np.zeros((len(train_val), 256))
X_attn_masks = np.zeros((len(train_val), 256))

In [45]:
def generate_training_data(df_balanced, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(train_val['2_utterance_convo'])):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=256, 
            truncation=True, 
            padding='max_length', 
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [46]:
labels = np.zeros((len(train_val), 4))
labels.shape

(214, 4)

In [47]:
labels[np.arange(len(train_val)), train_val['seek_response'].values] = 1 # one-hot encoded target tensor

In [48]:
def SentimentDatasetMapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

In [49]:
from transformers import TFBertModel

model = TFBertModel.from_pretrained('bert-base-cased') # bert base model with pretrained weights

# defining 2 input layers for input_ids and attn_masks
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(4, activation='softmax', name='output_layer')(intermediate_layer) # softmax -> calcs probs of classes

sentiment_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
sentiment_model.summary()

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                         

In [50]:
optim = tf.keras.optimizers.legacy.Adam(learning_rate=1e-5, decay=1e-6)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')
sentiment_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [51]:
kf = KFold(n_splits=10)
a=1
for train_index, val_index in kf.split(train_val):
    
    df_train = train_val.iloc[train_index]
    df_val = train_val.iloc[val_index]
    
    X_input_ids_train, X_attn_masks_train = generate_training_data(df_train, X_input_ids, X_attn_masks, tokenizer)
    X_input_ids_val, X_attn_masks_val = generate_training_data(df_val, X_input_ids, X_attn_masks, tokenizer)

    # creating a data pipeline using tensorflow dataset utility, creates batches of data for easy loading...
    dataset_train = tf.data.Dataset.from_tensor_slices((X_input_ids_train, X_attn_masks_train, labels))
    dataset_val = tf.data.Dataset.from_tensor_slices((X_input_ids_val, X_attn_masks_val, labels))

    dataset_train = dataset_train.map(SentimentDatasetMapFunction) # converting to required format for tensorflow dataset 
    dataset_val = dataset_val.map(SentimentDatasetMapFunction)
    
    dataset_train = dataset_train.shuffle(10000).batch(16, drop_remainder=True) # batch size, drop any left out tensor
    dataset_val = dataset_val.shuffle(10000).batch(16, drop_remainder=True)
    
    print(a)
    a = a+1
    hist = sentiment_model.fit(dataset_train,validation_data=dataset_val,epochs=2)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

1
Epoch 1/2


2023-02-13 23:24:02.819081: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-02-13 23:24:43.617664: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

2
Epoch 1/2
Epoch 2/2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

3
Epoch 1/2
Epoch 2/2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

4
Epoch 1/2
Epoch 2/2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

5
Epoch 1/2
Epoch 2/2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

6
Epoch 1/2
Epoch 2/2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

7
Epoch 1/2
Epoch 2/2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

8
Epoch 1/2
Epoch 2/2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

9
Epoch 1/2
Epoch 2/2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

10
Epoch 1/2
Epoch 2/2


In [52]:
sentiment_model.save('sentiment_model_4_upsample_10-fold')



INFO:tensorflow:Assets written to: sentiment_model_4_upsample_10-fold/assets


INFO:tensorflow:Assets written to: sentiment_model_4_upsample_10-fold/assets


In [55]:
sentiment_model_4 = tf.keras.models.load_model('sentiment_model_4_upsample_10-fold')

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def prepare_data(input_text, tokenizer):
    token = tokenizer.encode_plus(
        input_text,
        max_length=256, 
        truncation=True, 
        padding='max_length', 
        add_special_tokens=True,
        return_tensors='tf'
    )
    return {
        'input_ids': tf.cast(token.input_ids, tf.float64),
        'attention_mask': tf.cast(token.attention_mask, tf.float64)
    }

def predict_class(processed_data):
    '''predict class of input text
  Args:
    - reviews (list of strings)
  Output:
    - class (list of int)
  '''
    result = []
    for i in processed_data:
        pred = sentiment_model_4.predict(i)
        result.append(np.argmax(pred))
    return result


In [56]:
from sklearn.metrics import classification_report

processed_data = []

for i in test["2_utterance_convo"]:
    processed_data.append(prepare_data(i, tokenizer))
# print(processed_data)
y_pred = predict_class(processed_data)
print(y_pred)
print(classification_report(test["seek_response"].to_list(), y_pred))

2023-02-13 23:55:32.086177: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


[3, 2, 2, 0, 1, 0, 0, 2, 3, 2, 2, 2, 2, 2, 2, 3, 1, 0, 2, 0, 0, 2, 1, 1, 1, 2, 3, 2, 2, 3, 1, 1, 1, 2, 3, 2, 2, 3, 1, 2, 2, 0, 1, 2, 0, 0, 0, 3, 2, 3, 0, 0, 3, 2]
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        11
           1       0.90      1.00      0.95         9
           2       0.82      0.90      0.86        20
           3       0.90      0.64      0.75        14

    accuracy                           0.87        54
   macro avg       0.88      0.89      0.88        54
weighted avg       0.87      0.87      0.86        54

