In [21]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
# !pip install transformers
from transformers import BertTokenizer
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [22]:
sharma_4 = pd.read_csv("Sharma_4_class.csv")



sharma_4['seek_response'].value_counts()

2    67
3    59
1    41
0    34
Name: seek_response, dtype: int64

In [27]:
df_0 = sharma_4[sharma_4["seek_response"]==0]
df_1 = sharma_4[sharma_4["seek_response"]==1]
df_2 = sharma_4[sharma_4["seek_response"]==2]
df_3 = sharma_4[sharma_4["seek_response"]==3]

df_1_downsampled = df_1.sample(df_0.shape[0])
df_2_downsampled = df_2.sample(df_0.shape[0])
df_3_downsampled = df_3.sample(df_0.shape[0])

df = pd.concat([df_0, df_1_downsampled, df_2_downsampled, df_3_downsampled])
# df = pd.concat([df_0_downsampled, df_1])
df["seek_response"].value_counts()

0    34
1    34
2    34
3    34
Name: seek_response, dtype: int64

In [28]:
train_val, test = train_test_split(sharma_4, test_size=0.2)
len(train_val)

160

In [29]:
# find duplicates by performing an inner join
duplicates = train_val.merge(test, on=['seek_response', '2_utterance_convo'], how='inner')

# check the number of duplicates
print(len(duplicates))

0


In [30]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [31]:
token = tokenizer.encode_plus(
    train_val['2_utterance_convo'].iloc[0], 
    max_length=256, 
    truncation=True, 
    padding='max_length', 
    add_special_tokens=True,
    return_tensors='tf'
)

In [32]:
X_input_ids = np.zeros((len(train_val), 256))
X_attn_masks = np.zeros((len(train_val), 256))

In [33]:
def generate_training_data(df_balanced, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(train_val['2_utterance_convo'])):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=256, 
            truncation=True, 
            padding='max_length', 
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [34]:
labels = np.zeros((len(train_val), 4))
labels.shape

(160, 4)

In [35]:
labels[np.arange(len(train_val)), train_val['seek_response'].values] = 1 # one-hot encoded target tensor

In [36]:
def SentimentDatasetMapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

In [37]:
from transformers import TFBertModel

model = TFBertModel.from_pretrained('bert-base-cased') # bert base model with pretrained weights

# defining 2 input layers for input_ids and attn_masks
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(4, activation='softmax', name='output_layer')(intermediate_layer) # softmax -> calcs probs of classes

sentiment_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
sentiment_model.summary()

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                         

In [38]:
optim = tf.keras.optimizers.legacy.Adam(learning_rate=1e-5, decay=1e-6)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')
sentiment_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [39]:
kf = KFold(n_splits=10)
a=1
for train_index, val_index in kf.split(train_val):
    
    df_train = train_val.iloc[train_index]
    df_val = train_val.iloc[val_index]
    
    X_input_ids_train, X_attn_masks_train = generate_training_data(df_train, X_input_ids, X_attn_masks, tokenizer)
    X_input_ids_val, X_attn_masks_val = generate_training_data(df_val, X_input_ids, X_attn_masks, tokenizer)

    # creating a data pipeline using tensorflow dataset utility, creates batches of data for easy loading...
    dataset_train = tf.data.Dataset.from_tensor_slices((X_input_ids_train, X_attn_masks_train, labels))
    dataset_val = tf.data.Dataset.from_tensor_slices((X_input_ids_val, X_attn_masks_val, labels))

    dataset_train = dataset_train.map(SentimentDatasetMapFunction) # converting to required format for tensorflow dataset 
    dataset_val = dataset_val.map(SentimentDatasetMapFunction)
    
    dataset_train = dataset_train.shuffle(10000).batch(16, drop_remainder=True) # batch size, drop any left out tensor
    dataset_val = dataset_val.shuffle(10000).batch(16, drop_remainder=True)
    
    print(a)
    a = a+1
    hist = sentiment_model.fit(dataset_train,validation_data=dataset_val,epochs=2)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

1
Epoch 1/2


2023-02-13 23:34:37.330137: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-02-13 23:35:10.611932: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

2
Epoch 1/2
Epoch 2/2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

3
Epoch 1/2
Epoch 2/2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

4
Epoch 1/2
Epoch 2/2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

5
Epoch 1/2
Epoch 2/2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

6
Epoch 1/2
Epoch 2/2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

7
Epoch 1/2
Epoch 2/2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

8
Epoch 1/2
Epoch 2/2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

9
Epoch 1/2
Epoch 2/2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

10
Epoch 1/2
Epoch 2/2


In [150]:
hist = sentiment_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2
)

Epoch 1/2


2023-02-08 00:54:41.763855: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-02-08 01:03:46.058239: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/2


In [40]:
sentiment_model.save('sentiment_model_4_downsampled_10-fold')



INFO:tensorflow:Assets written to: sentiment_model_4_downsampled_10-fold/assets


INFO:tensorflow:Assets written to: sentiment_model_4_downsampled_10-fold/assets


In [42]:
sentiment_model_4 = tf.keras.models.load_model('sentiment_model_4_downsampled_10-fold')

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def prepare_data(input_text, tokenizer):
    token = tokenizer.encode_plus(
        input_text,
        max_length=256, 
        truncation=True, 
        padding='max_length', 
        add_special_tokens=True,
        return_tensors='tf'
    )
    return {
        'input_ids': tf.cast(token.input_ids, tf.float64),
        'attention_mask': tf.cast(token.attention_mask, tf.float64)
    }

def predict_class(processed_data):
    '''predict class of input text
  Args:
    - reviews (list of strings)
  Output:
    - class (list of int)
  '''
    result = []
    for i in processed_data:
        pred = sentiment_model_4.predict(i)
        result.append(np.argmax(pred))
    return result


In [44]:
test

Unnamed: 0.1,Unnamed: 0,seeker_post,seeking?,response_post,level,seek_response,2_utterance_convo
15,46,Invisible. Went out tonight with a few friends...,1,I can't remember the last time I went out with...,0,2,[CLS] Invisible. Went out tonight with a few f...
118,354,Depression and celebrities. Hey there everyone...,0,"Not Hollywood per se, but Ned Vizzini suicide ...",1,1,[CLS] Depression and celebrities. Hey there ev...
133,398,What helps you lighten up your mood when there...,0,Everyone suggests to exercise but its easier s...,1,1,[CLS] What helps you lighten up your mood when...
184,547,Getting out of your situation or not is a choi...,0,"Well yeah, but what about the relatives and th...",0,0,[CLS] Getting out of your situation or not is ...
7,23,If just one person would call me.... If just o...,1,Pm me if you want to talk,0,2,[CLS] If just one person would call me.... If ...
53,161,Float tanks?. Has anybody had any luck using f...,1,"I tried one once. It was a cool experience, bu...",1,3,[CLS] Float tanks?. Has anybody had any luck u...
1,5,Will things ever start looking up. I'm on a do...,1,Thing will only start looking up when you do. ...,1,3,[CLS] Will things ever start looking up. I'm o...
12,34,I just had a dream where the girl of the dream...,0,And this is why I am sleeping like 12 hours a ...,0,0,[CLS] I just had a dream where the girl of the...
127,382,I have become comfortably numb. afraid of who ...,1,What does that mean in practice? Are you one o...,0,2,[CLS] I have become comfortably numb. afraid o...
86,262,Currently on Lexapro sub and going to be presc...,0,I was on Lexapro and Wellbutrin for the longes...,1,1,[CLS] Currently on Lexapro sub and going to be...


In [45]:
from sklearn.metrics import classification_report

processed_data = []

for i in test["2_utterance_convo"]:
    processed_data.append(prepare_data(i, tokenizer))
# print(processed_data)
y_pred = predict_class(processed_data)
print(y_pred)
print(classification_report(test["seek_response"].to_list(), y_pred))

[2, 2, 1, 0, 2, 2, 3, 2, 2, 3, 2, 3, 3, 0, 0, 2, 2, 3, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 3, 1, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 0]
              precision    recall  f1-score   support

           0       0.40      0.33      0.36         6
           1       0.50      0.10      0.17        10
           2       0.56      0.88      0.68        17
           3       0.43      0.38      0.40         8

    accuracy                           0.51        41
   macro avg       0.47      0.42      0.40        41
weighted avg       0.49      0.51      0.45        41

