In [78]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
# !pip install transformers
from transformers import BertTokenizer
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

In [79]:
# Upsampling the 4 class division of Sharma dataset where the maximum class count is 67
sharma_4 = pd.read_csv("Sharma_4_class.csv")

class_counts = sharma_4['seek_response'].value_counts()
sharma_4_upsampled = pd.DataFrame()

for i,j in class_counts.items():
    if (j == 67):
        df_majority = sharma_4[sharma_4["seek_response"]==i]
        
        # Combine majority class with empty class
        sharma_4_upsampled = pd.concat([sharma_4_upsampled,df_majority])
        continue
    else:
        df_minority = sharma_4[sharma_4["seek_response"]==i]
        # Upsample minority class
        df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=67,    # to match majority class
                                 random_state=123) # reproducible results
 
        # Combine minority class with other classes class
        sharma_4_upsampled = pd.concat([sharma_4_upsampled, df_minority_upsampled])

sharma_4_upsampled = sharma_4_upsampled.loc[:, ~sharma_4_upsampled.columns.str.contains('^Unnamed')]
sharma_4_upsampled = sharma_4_upsampled.drop(['seeker_post','seeking?','response_post','level'], axis=1)

sharma_4_upsampled['seek_response'].value_counts()
# sharma_4_upsampled

2    67
3    67
1    67
0    67
Name: seek_response, dtype: int64

In [159]:
train_val, test = train_test_split(sharma_4_upsampled, test_size=0.2)
len(train_val)

214

In [160]:
# train_val_chutiya = sharma_4_upsampled.sample(frac=0.8)
# print(len(train_val_chutiya))
# # remove the rows that occur in both dataframes
# test_chutiya = sharma_4_upsampled.merge(train_val, on=['2_utterance_convo'], how='outer', indicator=True)
# print(test_chutiya['_merge'].value_counts())
# test_chutiya = test_chutiya[test_chutiya['_merge'] == 'left_only']
# test_chutiya.drop(columns=['_merge'], inplace=True)

In [162]:
# find duplicates by performing an inner join
duplicates = train_val.merge(test, on=['seek_response', '2_utterance_convo'], how='inner')

# check the number of duplicates
print(len(duplicates))

41


In [163]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [164]:
token = tokenizer.encode_plus(
    train_val['2_utterance_convo'].iloc[0], 
    max_length=256, 
    truncation=True, 
    padding='max_length', 
    add_special_tokens=True,
    return_tensors='tf'
)

In [165]:
X_input_ids = np.zeros((len(train_val), 256))
X_attn_masks = np.zeros((len(train_val), 256))

In [166]:
def generate_training_data(df_balanced, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(train_val['2_utterance_convo'])):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=256, 
            truncation=True, 
            padding='max_length', 
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [167]:
X_input_ids, X_attn_masks = generate_training_data(train_val, X_input_ids, X_attn_masks, tokenizer)

0it [00:00, ?it/s]

In [168]:
labels = np.zeros((len(train_val), 4))
labels.shape

(214, 4)

In [169]:
np.arange(len(train_val))

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [170]:
train_val['seek_response'].values

array([3, 2, 1, 0, 3, 3, 2, 0, 0, 1, 1, 2, 3, 3, 2, 3, 3, 2, 3, 0, 1, 3,
       0, 3, 1, 0, 0, 1, 0, 3, 2, 0, 2, 0, 0, 0, 0, 1, 1, 0, 3, 2, 1, 1,
       2, 3, 2, 1, 0, 3, 2, 3, 3, 0, 1, 1, 2, 3, 3, 1, 1, 1, 2, 3, 3, 0,
       0, 2, 0, 2, 0, 3, 1, 0, 3, 2, 3, 3, 3, 1, 0, 3, 2, 0, 1, 1, 0, 1,
       0, 1, 1, 2, 2, 0, 2, 2, 2, 2, 0, 3, 2, 2, 1, 1, 0, 1, 2, 2, 3, 3,
       1, 1, 0, 3, 0, 1, 3, 3, 2, 1, 3, 0, 1, 2, 0, 2, 0, 3, 1, 3, 3, 3,
       1, 0, 1, 1, 2, 1, 2, 2, 1, 2, 0, 1, 3, 3, 2, 1, 2, 1, 0, 2, 0, 1,
       2, 2, 1, 0, 2, 3, 3, 2, 0, 2, 3, 0, 0, 0, 0, 2, 2, 0, 0, 1, 3, 1,
       1, 0, 1, 0, 2, 3, 3, 1, 1, 3, 3, 2, 1, 0, 3, 0, 0, 0, 3, 2, 0, 0,
       2, 0, 2, 1, 0, 0, 3, 0, 3, 3, 2, 1, 0, 3, 1, 3])

In [171]:
labels[np.arange(len(train_val)), train_val['seek_response'].values] = 1 # one-hot encoded target tensor

In [172]:
# creating a data pipeline using tensorflow dataset utility, creates batches of data for easy loading...
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))
dataset.take(1) # one sample data

<TakeDataset element_spec=(TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(4,), dtype=tf.float64, name=None))>

In [173]:
def SentimentDatasetMapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

In [174]:
dataset = dataset.map(SentimentDatasetMapFunction) # converting to required format for tensorflow dataset 

In [175]:
dataset = dataset.shuffle(10000).batch(16, drop_remainder=True) # batch size, drop any left out tensor

In [176]:
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(16, 256), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(16, 256), dtype=tf.float64, name=None)}, TensorSpec(shape=(16, 4), dtype=tf.float64, name=None))>

In [177]:
p = 0.75
train_size = int((len(train_val)//16)*p) # for each 16 batch of data we will have len(df)//16 samples, take 80% of that for train.

In [178]:
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

In [179]:
from transformers import TFBertModel

In [180]:
model = TFBertModel.from_pretrained('bert-base-cased') # bert base model with pretrained weights

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [181]:
# defining 2 input layers for input_ids and attn_masks
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(4, activation='softmax', name='output_layer')(intermediate_layer) # softmax -> calcs probs of classes

sentiment_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
sentiment_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                         

In [182]:
optim = tf.keras.optimizers.legacy.Adam(learning_rate=1e-5, decay=1e-6)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [183]:
sentiment_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [184]:
hist = sentiment_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2
)

Epoch 1/2


2023-02-13 23:22:05.068917: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-02-13 23:22:31.352125: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/2


In [185]:
sentiment_model.save('sentiment_model_4_upsampled')



INFO:tensorflow:Assets written to: sentiment_model_4_upsampled/assets


INFO:tensorflow:Assets written to: sentiment_model_4_upsampled/assets


In [186]:
sentiment_model_4 = tf.keras.models.load_model('sentiment_model_4_upsampled')

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def prepare_data(input_text, tokenizer):
    token = tokenizer.encode_plus(
        input_text,
        max_length=256, 
        truncation=True, 
        padding='max_length', 
        add_special_tokens=True,
        return_tensors='tf'
    )
    return {
        'input_ids': tf.cast(token.input_ids, tf.float64),
        'attention_mask': tf.cast(token.attention_mask, tf.float64)
    }

def predict_class(processed_data):
    '''predict class of input text
  Args:
    - reviews (list of strings)
  Output:
    - class (list of int)
  '''
    result = []
    for i in processed_data:
        pred = sentiment_model_4.predict(i)
        result.append(np.argmax(pred))
    return result


In [188]:
from sklearn.metrics import classification_report

processed_data = []

for i in test["2_utterance_convo"]:
    processed_data.append(prepare_data(i, tokenizer))
# print(processed_data)
y_pred = predict_class(processed_data)
print(y_pred)
print(classification_report(test["seek_response"].to_list(), y_pred))

[3, 3, 1, 0, 3, 3, 3, 0, 0, 0, 0, 1, 3, 3, 0, 3, 0, 3, 3, 3, 3, 3, 0, 3, 3, 0, 3, 0, 1, 0, 3, 3, 0, 1, 3, 3, 0, 3, 3, 3, 3, 1, 3, 0, 3, 3, 0, 0, 1, 3, 0, 3, 0, 0]
              precision    recall  f1-score   support

           0       0.26      0.56      0.36         9
           1       0.17      0.07      0.10        15
           2       0.00      0.00      0.00        17
           3       0.24      0.54      0.33        13

    accuracy                           0.24        54
   macro avg       0.17      0.29      0.20        54
weighted avg       0.15      0.24      0.17        54



In [158]:
print(classification_report(test["seek_response"].to_list(), y_pred))

              precision    recall  f1-score   support

           0       0.37      1.00      0.54        83
           1       0.00      0.00      0.00        88
           2       0.46      0.83      0.59       306
           3       0.34      0.04      0.06       337

    accuracy                           0.43       814
   macro avg       0.29      0.47      0.30       814
weighted avg       0.35      0.43      0.30       814

