In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
# !pip install transformers
from transformers import BertTokenizer
from sklearn.model_selection import KFold

2023-02-14 03:25:59.667450: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Label encoding in CSN.csv: 0 = None, 1 = Seek, 2 = Provide.
df = pd.read_csv(filepath_or_buffer="CSN.csv")

# Class distribution of the raw corpus.
df['label'].value_counts()

0    2995
1    1046
2     966
Name: label, dtype: int64

In [3]:
# Balance the three classes by downsampling every class to the size of the
# smallest one (instead of assuming class 2 is the minority).
# A fixed random_state keeps the sampled rows -- and therefore every result
# further down the notebook -- reproducible across kernel restarts.
class_frames = [df[df['label'] == class_id] for class_id in (0, 1, 2)]
rows_per_class = min(len(class_frame) for class_frame in class_frames)

df = pd.concat(
    [class_frame.sample(rows_per_class, random_state=42) for class_frame in class_frames]
)
df['label'].value_counts()

0    966
1    966
2    966
Name: label, dtype: int64

In [4]:
# Preview the class-balanced frame built in the previous cell.
df

Unnamed: 0,text,label
4342,That is how much time I took off work initally.,0
4142,"Well, he only has two more chemotherapy treatm...",0
910,"Just one little prick, and that's it.",0
4296,On the x-ray they can't tell whether it is inf...,0
1391,Your surgeon will tell you when to start exerc...,0
...,...,...
4975,We will all be along for the ride.,2
4987,I am so very sorry that you are suffering ..,2
4991,"That has got to be the best news ever, well, e...",2
4993,There is really nothing anyone can say that is...,2


In [5]:
# Derive two binary-labelled copies of the balanced corpus and print the
# per-label summary before and after every relabelling step.
df_seek = df.copy()
print(df_seek.groupby('label').describe())
df_response = df.copy()
print(df_response.groupby('label').describe())

# Seeker task: label 2 is folded into class 0, label 1 stays the positive class.
is_label_two = df_seek['label'] == 2
df_seek.loc[is_label_two, 'label'] = 0
print(df_seek.groupby('label').describe())

# Response task: first fold label 1 into class 0, then rename label 2 to 1.
for old_label, new_label in ((1, 0), (2, 1)):
    df_response.loc[df_response['label'] == old_label, 'label'] = new_label
    print(df_response.groupby('label').describe())

       text                                                               
      count unique                                                top freq
label                                                                     
0       966    966    That is how much time I took off work initally.    1
1       966    966  i just feel like i am watching him die and i c...    1
2       966    966  I just wanted to let you know I was here if yo...    1
       text                                                               
      count unique                                                top freq
label                                                                     
0       966    966    That is how much time I took off work initally.    1
1       966    966  i just feel like i am watching him die and i c...    1
2       966    966  I just wanted to let you know I was here if yo...    1
       text                                                               
      count unique       

TRAINING SEEKER SIDE ON CSN

In [6]:
# Load the tokenizer that belongs to the 'bert-base-cased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [7]:
# Smoke-test the tokenizer on the first seeker text: pad/truncate to 256
# tokens, add the special tokens and return TensorFlow tensors.
encode_options = dict(
    add_special_tokens=True,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_tensors='tf',
)
token = tokenizer.encode_plus(df_seek['text'].iloc[0], **encode_options)

Metal device set to: Apple M1 Pro


2023-02-14 03:26:05.493041: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-14 03:26:05.495430: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-02-14 03:26:05.495468: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [8]:
# Pre-allocate two independent (n_rows, 256) buffers: one for token ids, one
# for attention masks.
X_input_ids, X_attn_masks = (np.zeros((len(df), 256)) for _ in range(2))

In [9]:
def generate_training_data(df_balanced, ids, masks, tokenizer):
    """Tokenise ``df_balanced['text']`` into the pre-allocated ``ids``/``masks`` arrays.

    Row ``i`` of ``ids`` and ``masks`` is overwritten in place with the token
    ids and attention mask of the ``i``-th text of ``df_balanced`` (positional
    order, not index label). Rows beyond ``len(df_balanced)`` are left
    untouched, so a caller that passes a subset must slice the returned arrays
    (and its labels) to the same rows.

    Previously the ``df_balanced`` argument was ignored and the global
    ``df_seek`` was always tokenised.

    Args:
        df_balanced: DataFrame with a ``text`` column.
        ids: 2-D array with at least ``len(df_balanced)`` rows and 256 columns.
        masks: 2-D array with the same layout as ``ids``.
        tokenizer: object exposing ``encode_plus`` (e.g. a BertTokenizer).

    Returns:
        The same ``ids`` and ``masks`` objects that were passed in.

    Raises:
        ValueError: if ``ids`` or ``masks`` has fewer rows than ``df_balanced``.
    """
    texts = df_balanced['text']
    if len(texts) > len(ids) or len(texts) > len(masks):
        raise ValueError(
            "ids/masks need at least %d rows, got %d and %d"
            % (len(texts), len(ids), len(masks))
        )

    # total= lets tqdm render a real progress bar instead of a bare counter.
    for i, text in tqdm(enumerate(texts), total=len(texts)):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=256,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        # encode_plus returns (1, 256) tensors; assignment drops the leading axis.
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [10]:
# One row per seeker example, one column per class (binary task -> 2 columns).
labels = np.zeros(shape=(len(df_seek), 2))
labels.shape

(2898, 2)

In [11]:
# One-hot encode the target: row i gets a 1 in the column given by its label.
row_positions = np.arange(len(df_seek))
labels[row_positions, df_seek['label'].to_numpy()] = 1

In [12]:
def SentimentDatasetMapFunction(input_ids, attn_masks, labels):
    """Repackage one dataset element as a ``(features, labels)`` pair.

    The feature dict is keyed by ``'input_ids'`` and ``'attention_mask'``;
    ``labels`` is passed through unchanged.
    """
    features = dict(input_ids=input_ids, attention_mask=attn_masks)
    return features, labels

In [13]:
from transformers import TFBertModel

# BERT base encoder initialised from the pretrained 'bert-base-cased' weights.
model = TFBertModel.from_pretrained('bert-base-cased')

# Two 256-wide int32 inputs, named after the keys of the dataset feature dict.
input_ids, attn_masks = (
    tf.keras.layers.Input(shape=(256,), name=input_name, dtype='int32')
    for input_name in ('input_ids', 'attention_mask')
)

# Encoder output: index 0 is the per-token activation (3D), index 1 the pooled output (2D).
bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1]

# Classification head: 512-unit ReLU layer followed by a 2-way softmax.
intermediate_layer = tf.keras.layers.Dense(
    512, activation='relu', name='intermediate_layer'
)(bert_embds)
output_layer = tf.keras.layers.Dense(
    2, activation='softmax', name='output_layer'
)(intermediate_layer)

sentiment_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
sentiment_model.summary()

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                           

In [14]:
# Training configuration: legacy Adam (lr 1e-5, decay 1e-6), categorical
# cross-entropy on the one-hot targets, categorical accuracy reported as 'accuracy'.
optim = tf.keras.optimizers.legacy.Adam(decay=1e-6, learning_rate=1e-5)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy(name='accuracy')
sentiment_model.compile(loss=loss_func, optimizer=optim, metrics=[acc])

In [None]:
# Tokenise the whole seeker corpus once. Row i of the id/mask arrays and of
# `labels` all describe the i-th row of df_seek, so a fold is obtained by
# indexing the three arrays with the same positions. (Previously the fold
# indices never reached the tensors: every fold trained AND validated on the
# full corpus, and the corpus was re-tokenised twice per fold.)
X_input_ids, X_attn_masks = generate_training_data(df_seek, X_input_ids, X_attn_masks, tokenizer)

# The frame is a concatenation of per-class blocks, so shuffle before
# splitting; random_state keeps the folds reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
for fold, (train_index, val_index) in enumerate(kf.split(df_seek), start=1):

    # tf.data pipelines built from this fold's rows only.
    dataset_train = tf.data.Dataset.from_tensor_slices(
        (X_input_ids[train_index], X_attn_masks[train_index], labels[train_index])
    )
    dataset_val = tf.data.Dataset.from_tensor_slices(
        (X_input_ids[val_index], X_attn_masks[val_index], labels[val_index])
    )

    # Convert (ids, masks, labels) tuples to the ({features}, labels) format Keras expects.
    dataset_train = dataset_train.map(SentimentDatasetMapFunction)
    dataset_val = dataset_val.map(SentimentDatasetMapFunction)

    # Training batches are shuffled and the ragged last batch is dropped;
    # validation keeps every example and needs no shuffling.
    dataset_train = dataset_train.shuffle(10000).batch(16, drop_remainder=True)
    dataset_val = dataset_val.batch(16)

    print(fold)
    # NOTE(review): the same model instance keeps training across folds, so
    # from fold 2 onwards the validation rows have already been seen in
    # training -- treat these scores as training progress, not as CV estimates.
    hist = sentiment_model.fit(dataset_train, validation_data=dataset_val, epochs=2)
    
    

0it [00:00, ?it/s]

0it [00:00, ?it/s]

1
Epoch 1/2


2023-02-14 03:26:21.065675: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-02-14 03:30:12.011174: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/2


0it [00:00, ?it/s]

0it [00:00, ?it/s]

2
Epoch 1/2

In [None]:
# Persist the trained seeker-side model; the evaluation cell reloads it from this path.
sentiment_model.save('sentiment_model_seek_downsampled_10-fold')

In [None]:
# Load the manually annotated seeker posts: keep annotated rows only, drop the
# 'Unnamed' columns and keep the last occurrence of each duplicated post.
test_seek = pd.read_csv("self_eval_seek_2.csv")
test_seek = (
    test_seek[test_seek["seeking?"].notna()]
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
    .drop_duplicates(subset='seeker_post', keep="last")
)

# Collapse the annotation strings to binary codes: only "Seeking(truly)" is positive.
annotation_codes = {
    "Seeking(subtly)": 0,
    "Not Seeking": 0,
    "Not Seeking/Maybe": 0,
    "Seeking(truly)": 1,
}
for annotation, code in annotation_codes.items():
    test_seek.loc[test_seek["seeking?"] == annotation, "seeking?"] = code
test_seek["seeking?"].value_counts()


In [None]:
# Tokenizer matching the checkpoint the model was fine-tuned from, and the
# seeker-side model reloaded from disk (read as a global by predict_class).
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
sentiment_model_seek = tf.keras.models.load_model('sentiment_model_seek_downsampled_10-fold')

def prepare_data(input_text, tokenizer):
    """Tokenise one text into the feature dict consumed by the model.

    The text is padded/truncated to 256 tokens with special tokens added, and
    both ``input_ids`` and ``attention_mask`` are cast to ``tf.float64``.
    """
    encoded = tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )
    input_ids = tf.cast(encoded.input_ids, tf.float64)
    attention_mask = tf.cast(encoded.attention_mask, tf.float64)
    return dict(input_ids=input_ids, attention_mask=attention_mask)

def predict_class(processed_data):
    """Predict the class index of every prepared example.

    All examples are stacked and scored with a single batched ``predict``
    call on the global ``sentiment_model_seek`` instead of one ``predict``
    call per example, which is far slower for the same result.

    Args:
        processed_data: list of feature dicts with ``'input_ids'`` and
            ``'attention_mask'`` entries, as produced by ``prepare_data``
            (each entry is expected to carry a leading batch axis of size 1).

    Returns:
        list of int: arg-max class index per example, in input order; an
        empty list for empty input.
    """
    if not processed_data:
        return []

    # Concatenate the per-example (1, seq_len) tensors into (n, seq_len) arrays.
    features = {
        key: np.concatenate([np.asarray(example[key]) for example in processed_data], axis=0)
        for key in ('input_ids', 'attention_mask')
    }
    probabilities = sentiment_model_seek.predict(features, batch_size=16)
    return np.argmax(probabilities, axis=1).tolist()


In [None]:
from sklearn.metrics import classification_report

# Tokenise every annotated seeker post, predict its class and compare the
# predictions with the manual annotations.
processed_data = [prepare_data(post, tokenizer) for post in test_seek["seeker_post"]]
y_pred = predict_class(processed_data)
print(y_pred)

seek_true = test_seek["seeking?"].to_list()
print(classification_report(seek_true, y_pred))

TRAINING RESPONSE SIDE

In [None]:
# Reload the 'bert-base-cased' tokenizer for the response-side pipeline.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
# Smoke-test the tokenizer on the first response text: pad/truncate to 256
# tokens, add the special tokens and return TensorFlow tensors.
encode_options = dict(
    add_special_tokens=True,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_tensors='tf',
)
token = tokenizer.encode_plus(df_response['text'].iloc[0], **encode_options)

In [None]:
# Fresh (n_rows, 256) buffers for the response-side token ids and attention masks.
X_input_ids, X_attn_masks = (np.zeros((len(df), 256)) for _ in range(2))

In [None]:
def generate_training_data(df_balanced, ids, masks, tokenizer):
    """Tokenise ``df_balanced['text']`` into the pre-allocated ``ids``/``masks`` arrays.

    Row ``i`` of ``ids`` and ``masks`` is overwritten in place with the token
    ids and attention mask of the ``i``-th text of ``df_balanced`` (positional
    order, not index label). Rows beyond ``len(df_balanced)`` are left
    untouched, so a caller that passes a subset must slice the returned arrays
    (and its labels) to the same rows.

    Previously the ``df_balanced`` argument was ignored and the global
    ``df_response`` was always tokenised.

    Args:
        df_balanced: DataFrame with a ``text`` column.
        ids: 2-D array with at least ``len(df_balanced)`` rows and 256 columns.
        masks: 2-D array with the same layout as ``ids``.
        tokenizer: object exposing ``encode_plus`` (e.g. a BertTokenizer).

    Returns:
        The same ``ids`` and ``masks`` objects that were passed in.

    Raises:
        ValueError: if ``ids`` or ``masks`` has fewer rows than ``df_balanced``.
    """
    texts = df_balanced['text']
    if len(texts) > len(ids) or len(texts) > len(masks):
        raise ValueError(
            "ids/masks need at least %d rows, got %d and %d"
            % (len(texts), len(ids), len(masks))
        )

    # total= lets tqdm render a real progress bar instead of a bare counter.
    for i, text in tqdm(enumerate(texts), total=len(texts)):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=256,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        # encode_plus returns (1, 256) tensors; assignment drops the leading axis.
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [None]:
# One row per response example, one column per class (binary task -> 2 columns).
labels = np.zeros(shape=(len(df_response), 2))
labels.shape

In [None]:
# One-hot encode the target: row i gets a 1 in the column given by its label.
row_positions = np.arange(len(df_response))
labels[row_positions, df_response['label'].to_numpy()] = 1

In [None]:
def SentimentDatasetMapFunction(input_ids, attn_masks, labels):
    """Turn an ``(input_ids, attn_masks, labels)`` element into ``(features, labels)``.

    ``features`` maps ``'input_ids'`` and ``'attention_mask'`` to the two
    tensors; ``labels`` is returned as received.
    """
    features = dict(input_ids=input_ids, attention_mask=attn_masks)
    return features, labels

In [None]:
from transformers import TFBertModel

# Fresh BERT base encoder from the pretrained 'bert-base-cased' weights, so the
# response model does not start from the seeker-side fine-tuned weights.
model = TFBertModel.from_pretrained('bert-base-cased')

# Two 256-wide int32 inputs, named after the keys of the dataset feature dict.
input_ids, attn_masks = (
    tf.keras.layers.Input(shape=(256,), name=input_name, dtype='int32')
    for input_name in ('input_ids', 'attention_mask')
)

# Encoder output: index 0 is the per-token activation (3D), index 1 the pooled output (2D).
bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1]

# Classification head: 512-unit ReLU layer followed by a 2-way softmax.
intermediate_layer = tf.keras.layers.Dense(
    512, activation='relu', name='intermediate_layer'
)(bert_embds)
output_layer = tf.keras.layers.Dense(
    2, activation='softmax', name='output_layer'
)(intermediate_layer)

sentiment_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
sentiment_model.summary()

In [None]:
# Same training configuration as the seeker model: legacy Adam (lr 1e-5,
# decay 1e-6), categorical cross-entropy, categorical accuracy as 'accuracy'.
optim = tf.keras.optimizers.legacy.Adam(decay=1e-6, learning_rate=1e-5)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy(name='accuracy')
sentiment_model.compile(loss=loss_func, optimizer=optim, metrics=[acc])

In [None]:
# Tokenise the whole response corpus once. Row i of the id/mask arrays and of
# `labels` all describe the i-th row of df_response, so a fold is obtained by
# indexing the three arrays with the same positions. (Previously the fold
# indices never reached the tensors: every fold trained AND validated on the
# full corpus, and the corpus was re-tokenised twice per fold.)
X_input_ids, X_attn_masks = generate_training_data(df_response, X_input_ids, X_attn_masks, tokenizer)

# The frame is a concatenation of per-class blocks, so shuffle before
# splitting; random_state keeps the folds reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
for fold, (train_index, val_index) in enumerate(kf.split(df_response), start=1):

    # tf.data pipelines built from this fold's rows only.
    dataset_train = tf.data.Dataset.from_tensor_slices(
        (X_input_ids[train_index], X_attn_masks[train_index], labels[train_index])
    )
    dataset_val = tf.data.Dataset.from_tensor_slices(
        (X_input_ids[val_index], X_attn_masks[val_index], labels[val_index])
    )

    # Convert (ids, masks, labels) tuples to the ({features}, labels) format Keras expects.
    dataset_train = dataset_train.map(SentimentDatasetMapFunction)
    dataset_val = dataset_val.map(SentimentDatasetMapFunction)

    # Training batches are shuffled and the ragged last batch is dropped;
    # validation keeps every example and needs no shuffling.
    dataset_train = dataset_train.shuffle(10000).batch(16, drop_remainder=True)
    dataset_val = dataset_val.batch(16)

    print(fold)
    # NOTE(review): the same model instance keeps training across folds, so
    # from fold 2 onwards the validation rows have already been seen in
    # training -- treat these scores as training progress, not as CV estimates.
    hist = sentiment_model.fit(dataset_train, validation_data=dataset_val, epochs=2)

In [None]:
# NOTE(review): `train_dataset` and `val_dataset` are not defined in any cell
# visible in this notebook (the K-fold cell above creates `dataset_train` and
# `dataset_val`). As written this cell looks like a leftover from a version
# without K-fold and would raise NameError on a fresh Restart & Run All --
# confirm whether it should be removed or pointed at the K-fold datasets.
hist = sentiment_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2
)

In [None]:
# Persist the trained response-side model; the evaluation cell reloads it from this path.
sentiment_model.save('sentiment_model_response_downsampled_10-fold')

In [None]:
# Re-print the classification report for the seeker annotations against the
# predictions currently held in y_pred.
seek_true = test_seek["seeking?"].to_list()
print(classification_report(seek_true, y_pred))

In [None]:
# Load the response-side test set and drop the 'Unnamed' columns.
test_response = pd.read_csv("Sharma_response_0_1.csv").loc[
    :, lambda frame: ~frame.columns.str.contains('^Unnamed')
]
test_response

In [None]:
# Tokenizer matching the checkpoint the model was fine-tuned from.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# NOTE(review): despite its name, `sentiment_model_seek` holds the RESPONSE
# model from here on; the name is kept because predict_class reads this global.
sentiment_model_seek = tf.keras.models.load_model('sentiment_model_response_downsampled_10-fold')

def prepare_data(input_text, tokenizer):
    """Encode a single text as the model's feature dict.

    Uses ``encode_plus`` with padding/truncation to 256 tokens and special
    tokens enabled, then casts ``input_ids`` and ``attention_mask`` to
    ``tf.float64``.
    """
    encoded = tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )
    input_ids = tf.cast(encoded.input_ids, tf.float64)
    attention_mask = tf.cast(encoded.attention_mask, tf.float64)
    return dict(input_ids=input_ids, attention_mask=attention_mask)

def predict_class(processed_data):
    """Predict the class index of every prepared example.

    All examples are stacked and scored with a single batched ``predict``
    call on the global ``sentiment_model_seek`` instead of one ``predict``
    call per example, which is far slower for the same result.

    Args:
        processed_data: list of feature dicts with ``'input_ids'`` and
            ``'attention_mask'`` entries, as produced by ``prepare_data``
            (each entry is expected to carry a leading batch axis of size 1).

    Returns:
        list of int: arg-max class index per example, in input order; an
        empty list for empty input.
    """
    if not processed_data:
        return []

    # Concatenate the per-example (1, seq_len) tensors into (n, seq_len) arrays.
    features = {
        key: np.concatenate([np.asarray(example[key]) for example in processed_data], axis=0)
        for key in ('input_ids', 'attention_mask')
    }
    probabilities = sentiment_model_seek.predict(features, batch_size=16)
    return np.argmax(probabilities, axis=1).tolist()

from sklearn.metrics import classification_report

# Tokenise every response post, predict its class and compare the predictions
# with the 'level' column of the test set.
processed_data = [prepare_data(post, tokenizer) for post in test_response["response_post"]]
y_pred = predict_class(processed_data)
print(y_pred)

response_true = test_response["level"].to_list()
print(classification_report(response_true, y_pred))

In [None]:
# Re-print the classification report for the response test set against the
# predictions currently held in y_pred.
response_true = test_response["level"].to_list()
print(classification_report(response_true, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix

# y_pred was produced from test_response["response_post"], so the matching
# ground truth is test_response["level"] (the column every other response
# evaluation in this notebook uses). The previous test_seek["empathetic?"]
# referred to the seeker frame, whose rows do not correspond to y_pred.
print(confusion_matrix(test_response["level"].to_list(), y_pred))