In [None]:
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, f1_score, roc_curve
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW
from tqdm import tqdm
import os
import torch.utils.data as utils
import matplotlib.pyplot as plt
import ast

In [None]:
data2021 = {}
path = 'eRisk2021_T1/data/'

for fname in os.listdir(path):
    if not fname.endswith('.xml'):
        continue
    file_path = os.path.join(path, fname)
    with open(file_path, 'r') as file:
        _id = fname[:-4]
        data2021[_id] = {'time':[], 'text':[]}
        for line in [x.strip() for x in file.read().split('\n')]:
            if line.startswith('<TEXT>'):
                data2021[_id]['text'].append(line[6:-7])
            elif line.startswith('<DATE>'):
                data2021[_id]['time'].append((line[6:-7]))

arr = np.loadtxt('eRisk2021_T1/risk_golden_truth.txt', delimiter=' ', dtype=object)
for row in arr:
  _id, isSad = row
  data2021[_id]['isSad'] = int(isSad)

data2022 = {}
path = 'eRisk2022_T1/data/'

for fname in os.listdir(path):
    if not fname.endswith('.xml'):
        continue
    file_path = os.path.join(path, fname)
    with open(file_path, 'r') as file:
        _id = fname[:-4]
        data2022[_id] = {'time':[], 'text':[]}
        for line in [x.strip() for x in file.read().split('\n')]:
            if line.startswith('<TEXT>'):
                data2022[_id]['text'].append(line[6:-7])
            elif line.startswith('<DATE>'):
                data2022[_id]['time'].append((line[6:-7]))
arr = np.loadtxt('eRisk2022_T1/risk_golden_truth.txt', delimiter='	', dtype=object)
for row in arr:
  _id, isSad = row
  data2022[_id]['isSad'] = int(isSad)

In [None]:
df2021 = pd.DataFrame([(k, v['time'], v['text'], v['isSad']) for k, v in data2021.items()], columns=['ID', 'Timestamp', 'Text', 'Label'])
df2022 = pd.DataFrame([(k, v['time'], v['text'], v['isSad']) for k, v in data2022.items()], columns=['ID', 'Timestamp', 'Text', 'Label'])
df = pd.concat([df2021, df2022], ignore_index=True)

In [None]:
def clean_empty_lists(row):
    if all([x == '' for x in row['Text']]):
        return np.nan
    else:
        return row
df_clean = df.apply(clean_empty_lists, axis=1).dropna()

In [None]:
df_sorted = df_clean.sort_values('Label', ascending=False)
# get all rows with Labels=1
df_label_1 = df_sorted[df_sorted['Label'] == 1]
# get the same amount of random samples from Labels=0
num_samples = len(df_label_1)
df_label_0 = df_sorted[df_sorted['Label'] == 0].sample(n=num_samples)
# conctenate the two DataFrames and shuffle the rows
df_concat = pd.concat([df_label_1, df_label_0]).sample(frac=1).reset_index(drop=True)
df_concat.head()

In [52]:
# Iterate over each row of the DataFrame
for index, row in df_concat.iterrows():
    # Get the list of timestamps and the list of text for this row
    timestamps = row['Timestamp']
    text_list = row['Text']
    cleaned_text_list = [text for text in text_list if text != '']
    if len(cleaned_text_list) == 0:
        continue
    cleaned_timestamps = [timestamps[i] for i in range(len(text_list)) if text_list[i] != '']
    df_concat.at[index, 'Text'] = cleaned_text_list
    df_concat.at[index, 'Timestamp'] = cleaned_timestamps
    
df_concat.head()

Unnamed: 0,ID,Timestamp,Text,Label,embeddings
0,subject483,"[2019-12-02 03:45:11, 2020-03-27 22:35:24, 202...",[I posted asking to join it. But noting yet. I...,1.0,"[[-0.046624403, 0.03369027, 0.03198913, -0.020..."
1,subject8468,"[2020-10-23 08:47:48, 2020-11-03 12:17:25, 202...",[My friend just got a Switch and finally can j...,0.0,"[[-0.049074702, -0.06591597, -0.069207065, 0.0..."
2,subject3732,"[2021-03-09 18:25:47, 2021-03-25 18:08:36, 202...",[Is anyone else investing in Lambo stonks? See...,0.0,"[[0.010822672, -0.08347642, -0.036187872, -0.0..."
3,subject2759,"[2020-02-14 08:54:16, 2020-02-14 10:05:01, 202...",[Hello. Im 28 years old. Right now I'm jobless...,1.0,"[[0.0015010262, -0.07187804, -0.031160373, 0.0..."
4,subject7645,"[2019-10-17 19:35:56, 2019-10-20 17:32:49, 201...","[Clever! ;-), They don't call it a [bully pulp...",0.0,"[[0.020647164, -0.038092427, -0.0014518382, 0...."


In [54]:
# import tensorflow
# import tensorflow_hub as hub
# # Load the pre-trained model
# embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
# df_concat['embeddings'] = 0
# huge_emb_lst = []
# for i in range(len(df_concat)):
#     emb_lst = []
#     for text_piece in df_concat['Text'].iloc[i]:
#         if text_piece != '':
#             embedd = embed([str(text_piece)])
#             embedd = np.squeeze(embedd.numpy())
#         else:
#             embedd = np.zeros(512)
#         emb_lst.append(embedd)
#     # df_concat['embeddings'].iloc[i] = emb_lst
#     huge_emb_lst.append(emb_lst)
# df_concat['embeddings'] = huge_emb_lst

In [None]:
df_concat

In [None]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
# Generate BERT embeddings for each text in the dataset

df_concat['embeddings_bert'] = 0
huge_emb_lst = []
for i in range(336, len(df_concat)):
    emb_lst = []
    for text_piece in df_concat['Text'].iloc[i]:
        embedd = tokenizer(text_piece, padding=True, truncation=True, return_tensors='pt', max_length=512)
        with torch.no_grad():
            outputs = model(**embedd)
        embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        emb_lst.append(embeddings)
    # df_concat['embeddings'].iloc[i] = emb_lst
    huge_emb_lst.append(emb_lst)
df_concat['embeddings_bert'] = huge_emb_lst
df_concat.to_csv('df_concat.csv')

In [None]:
df_concat

Unnamed: 0.1,Unnamed: 0,ID,Timestamp,Text,Label,embeddings_bert
0,0,subject9354,"['2020-10-04 18:02:39', '2020-10-07 15:46:24',...","['me 2.', 'same thing, no double dipping.', 'I...",0.0,"[array([[-2.27833837e-01, -3.56629044e-01, -9...."
1,1,subject6720,"['2018-11-12 16:55:23', '2018-11-13 07:17:56',...","[""[https://www.twitch.tv/videos/334960643](htt...",0.0,"[array([[ 3.61625291e-02, -2.95236349e-01, 1...."
2,2,subject1939,"['2017-05-03 16:49:50', '2017-05-03 22:06:28',...","[""I just lost all of my money gambling and thi...",1.0,"[array([[ 1.22508835e-02, -1.09029643e-01, 2...."
3,3,subject7835,"['2017-12-29 20:00:16', '2017-12-29 23:54:22',...",['Seeking experienced voice actors to particip...,0.0,"[array([[-3.89923854e-03, -2.23664463e-01, 4...."
4,4,subject2804,"['2019-03-09 16:33:46', '2019-11-26 04:57:58',...","['Absolutly', '8', 'Hot!', 'Cute!', 'Very cute...",0.0,"[array([[-5.30905664e-01, -3.51056904e-01, -1...."
...,...,...,...,...,...,...
485,485,subject6517,"['2021-11-10 15:33:32', '2021-11-10 15:34:13',...",['Have you got rock bottom? Do you want to qui...,1.0,"[array([[ 2.34189004e-01, -3.63695145e-01, -5...."
486,486,subject8492,"['2020-08-04 21:42:26', '2020-11-24 20:20:19',...",['Ive missed well over 2 months. There is no p...,0.0,"[array([[ 2.21290961e-02, -1.10986859e-01, 4...."
487,487,subject5487,"['2021-12-20 15:27:03', '2021-12-20 16:10:38',...","[""Hello there, I am a former JW that served as...",0.0,"[array([[ 2.86593795e-01, -5.11154626e-03, 1...."
488,488,subject4802,"['2020-02-04 20:19:32', '2020-02-04 20:54:40',...","['Im feeling absolutely helpless right now, no...",1.0,"[array([[ 8.15021992e-03, 2.23562643e-01, 4...."


In [55]:
df_concat = pd.read_csv('df_concat.csv')

In [56]:
import tensorflow
import tensorflow_hub as hub
# Load the pre-trained model
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
df_concat['embeddings_use'] = 0
huge_emb_lst = []
for i in range(len(df_concat)):
    emb_lst = []
    for text_piece in eval(df_concat['Text'].iloc[i]):
        if text_piece != '':
            embedd = embed([str(text_piece)])
            embedd = np.squeeze(embedd.numpy())
        else:
            embedd = np.zeros(512)
        emb_lst.append(embedd)
    # df_concat['embeddings'].iloc[i] = emb_lst
    huge_emb_lst.append(emb_lst)
df_concat['embeddings_use'] = huge_emb_lst

In [57]:
df_concat

Unnamed: 0.1,Unnamed: 0,ID,Timestamp,Text,Label,embeddings_bert,embeddings_use
0,0,subject9354,"['2020-10-04 18:02:39', '2020-10-07 15:46:24',...","['me 2.', 'same thing, no double dipping.', 'I...",0.0,"[array([[-2.27833837e-01, -3.56629044e-01, -9....","[[0.09742043, -0.10078834, 0.05655467, 0.03573..."
1,1,subject6720,"['2018-11-12 16:55:23', '2018-11-13 07:17:56',...","[""[https://www.twitch.tv/videos/334960643](htt...",0.0,"[array([[ 3.61625291e-02, -2.95236349e-01, 1....","[[0.016732637, -0.07019263, -0.06763446, 0.022..."
2,2,subject1939,"['2017-05-03 16:49:50', '2017-05-03 22:06:28',...","[""I just lost all of my money gambling and thi...",1.0,"[array([[ 1.22508835e-02, -1.09029643e-01, 2....","[[-0.041338928, -0.083965495, -0.0031749755, 0..."
3,3,subject7835,"['2017-12-29 20:00:16', '2017-12-29 23:54:22',...",['Seeking experienced voice actors to particip...,0.0,"[array([[-3.89923854e-03, -2.23664463e-01, 4....","[[-0.05422537, -0.028290626, -0.0151080135, 0...."
4,4,subject2804,"['2019-03-09 16:33:46', '2019-11-26 04:57:58',...","['Absolutly', '8', 'Hot!', 'Cute!', 'Very cute...",0.0,"[array([[-5.30905664e-01, -3.51056904e-01, -1....","[[-0.015544569, -0.03773666, 0.0076418677, 0.0..."
...,...,...,...,...,...,...,...
485,485,subject6517,"['2021-11-10 15:33:32', '2021-11-10 15:34:13',...",['Have you got rock bottom? Do you want to qui...,1.0,"[array([[ 2.34189004e-01, -3.63695145e-01, -5....","[[-0.009765577, -0.08657441, -0.010570338, 0.0..."
486,486,subject8492,"['2020-08-04 21:42:26', '2020-11-24 20:20:19',...",['Ive missed well over 2 months. There is no p...,0.0,"[array([[ 2.21290961e-02, -1.10986859e-01, 4....","[[-0.05554608, -0.098843925, -0.0073674256, -0..."
487,487,subject5487,"['2021-12-20 15:27:03', '2021-12-20 16:10:38',...","[""Hello there, I am a former JW that served as...",0.0,"[array([[ 2.86593795e-01, -5.11154626e-03, 1....","[[-0.002484351, -0.0827845, -0.051220387, -0.0..."
488,488,subject4802,"['2020-02-04 20:19:32', '2020-02-04 20:54:40',...","['Im feeling absolutely helpless right now, no...",1.0,"[array([[ 8.15021992e-03, 2.23562643e-01, 4....","[[-0.018387843, -0.090804316, -0.010818297, -0..."


In [None]:
# import re
# import numpy as np
# big_list_of_arrays = []
# for i in range(len(df_concat)):
#     string = df_concat['embeddings_bert'].iloc[i]
#     embedding_str = string.replace('\n', '')
#     embedding_str = embedding_str.replace('  ', ' ')
#     embedding_str = embedding_str.replace('  ', ' ')
#     embedding_str = embedding_str.replace('  ', ' ')
#     embedding_str = embedding_str.replace(', dtype=float32', '')
#     array_strings = re.findall(r'array\(\[(.*?)\]\)', embedding_str)
#     arrays = [np.array(ast.literal_eval(s)) for s in array_strings]
#     big_list_of_arrays.append(arrays)

In [None]:
df_concat['embeddings_bert_converted'] = big_list_of_arrays

In [61]:
time_lst = []
for i in range(len(df_concat)):
    time_lst.append(ast.literal_eval(df_concat['Timestamp'].iloc[i]))

In [62]:
df_concat['Timestamp'] = time_lst

In [59]:
train_data = np.load('train_data.npy')
val_data = np.load('val_data.npy')
test_data = np.load('test_data.npy')
train_timestamps_padded = np.load('train_timestamps_padded.npy')
val_timestamps_padded = np.load('val_timestamps_padded.npy')
test_timestamps_padded = np.load('test_timestamps_padded.npy')
train_labels = np.load('train_labels.npy')
val_labels = np.load('val_labels.npy')
test_labels = np.load('test_labels.npy')

In [77]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, Dropout, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler
from sklearn.model_selection import train_test_split
from datetime import datetime, timedelta
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Define the number of classes
NUM_CLASSES = 2

# Define the learning rate decay parameters
INIT_LR = 0.001
DECAY_FACTOR = 0.1
DECAY_EPOCHS = 10

# Define the GRU parameters
GRU_HIDDEN_DIM = 256
DROPOUT_RATE = 0.2

# Split the data into training and validation sets
train_texts, val_test_texts, train_labels, val_test_labels, train_timestamps, val_test_timestamps = train_test_split(
    df_concat['embeddings_use'].values,
    df_concat['Label'].values,
    df_concat['Timestamp'].values,
    test_size=0.2,
    random_state=420
)
val_texts, test_texts, val_labels, test_labels, val_timestamps, test_timestamps = train_test_split(
    val_test_texts,
    val_test_labels,
    val_test_timestamps,
    train_size=0.5,
    random_state=420
)

MAX_SEQ_LENGTH = max(len(x) for x in train_texts[:][:])

# Convert the labels to one-hot vectors
train_labels = tf.keras.utils.to_categorical(train_labels, NUM_CLASSES)
val_labels = tf.keras.utils.to_categorical(val_labels, NUM_CLASSES)
test_labels = tf.keras.utils.to_categorical(test_labels, NUM_CLASSES)

# Pad the sequences to the same length
train_data = tf.keras.preprocessing.sequence.pad_sequences(train_texts, maxlen=MAX_SEQ_LENGTH, dtype='float32', padding='post', truncating='post')
val_data = tf.keras.preprocessing.sequence.pad_sequences(val_texts, maxlen=MAX_SEQ_LENGTH, dtype='float32', padding='post', truncating='post')
test_data = tf.keras.preprocessing.sequence.pad_sequences(test_texts, maxlen=MAX_SEQ_LENGTH, dtype='float32', padding='post', truncating='post')

# Convert the timestamps to datetime objects
train_data_timestamps = []
for ts_list in train_timestamps:
    ts_list = [int(datetime.strptime(ts, '%Y-%m-%d %H:%M:%S').timestamp()) for ts in ts_list]
    train_data_timestamps.append(ts_list)

val_data_timestamps = []
for ts_list in val_timestamps:
    ts_list = [int(datetime.strptime(ts, '%Y-%m-%d %H:%M:%S').timestamp()) for ts in ts_list]
    val_data_timestamps.append(ts_list)

test_data_timestamps = []
for ts_list in test_timestamps:
    ts_list = [int(datetime.strptime(ts, '%Y-%m-%d %H:%M:%S').timestamp()) for ts in ts_list]
    test_data_timestamps.append(ts_list)

train_timestamps_padded = tf.keras.preprocessing.sequence.pad_sequences(train_data_timestamps, padding='post', maxlen=MAX_SEQ_LENGTH)
val_timestamps_padded = tf.keras.preprocessing.sequence.pad_sequences(val_data_timestamps, padding='post', maxlen=MAX_SEQ_LENGTH)
test_timestamps_padded = tf.keras.preprocessing.sequence.pad_sequences(test_data_timestamps, padding='post', maxlen=MAX_SEQ_LENGTH)


In [78]:
MAX_SEQ_LENGTH = 1265
# normalizing time inputs was super important!!
max_time = 1./np.max(train_timestamps_padded)
train_timestamps_padded = (train_timestamps_padded*max_time).astype(np.float32)
val_timestamps_padded = (val_timestamps_padded*max_time).astype(np.float32)
test_timestamps_padded = (test_timestamps_padded*max_time).astype(np.float32)

In [79]:
# Define the time decay function
def time_decay(epoch):
    lrate = INIT_LR * pow(DECAY_FACTOR, np.floor((1+epoch)/DECAY_EPOCHS))
    return lrate

# Define the input layers
EmbeddingInput = Input(shape=(MAX_SEQ_LENGTH,512), name='embeddings')
TimeInput = Input(shape=(MAX_SEQ_LENGTH,), name='times')

decay_layer = Lambda(lambda t: tf.math.exp(-(t - tf.roll(t, shift=1, axis=1)) / 86400), name='decay_layer')(TimeInput)

decay_layer_2 = tf.expand_dims(decay_layer, axis=-1, name='decay_layer_2')

gru_layer = GRU(GRU_HIDDEN_DIM, dropout=DROPOUT_RATE, return_sequences=True, name='gru')(EmbeddingInput, mask=EmbeddingInput._keras_mask)
multiply = tf.keras.layers.Multiply(name='multiply')([gru_layer, decay_layer_2])
flatten_layer = tf.keras.layers.Flatten(name='flatten')(multiply)
outputs = Dense(NUM_CLASSES, activation='softmax')(flatten_layer)

model = Model(inputs=[EmbeddingInput, TimeInput], outputs=outputs)

optimizer = Adam(learning_rate=INIT_LR)

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [73]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 times (InputLayer)             [(None, 1265)]       0           []                               
                                                                                                  
 embeddings (InputLayer)        [(None, 1265, 512)]  0           []                               
                                                                                                  
 decay_layer (Lambda)           (None, 1265)         0           ['times[0][0]']                  
                                                                                                  
 gru (GRU)                      (None, 1265, 256)    591360      ['embeddings[0][0]']             
                                                                                            

In [80]:
lr_scheduler = LearningRateScheduler(time_decay)

history = model.fit([train_data, train_timestamps_padded], train_labels, validation_data=([val_data, val_timestamps_padded], val_labels), epochs=10, batch_size=32, callbacks=[lr_scheduler])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [81]:
model.evaluate([test_data, test_timestamps_padded], test_labels)



[0.18137145042419434, 0.918367326259613]

In [82]:
test_preds = model.predict([test_data, test_timestamps_padded])
test_preds = np.argmax(test_preds, axis=1)
test_f1_score = f1_score(np.argmax(test_labels, axis=1), test_preds, average='macro')
print('Test F1 Score:', test_f1_score)
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, recall_score, precision_score
# Calculate AUROC
test_auroc = roc_auc_score(np.argmax(test_labels, axis=1), test_preds)
# Calculate AUPRC
test_auprc = average_precision_score(np.argmax(test_labels, axis=1), test_preds)
# Calculate accuracy
test_accuracy = accuracy_score(np.argmax(test_labels, axis=1), test_preds)
# Calculate recall
test_recall = recall_score(np.argmax(test_labels, axis=1), test_preds, average='macro')
# Calculate precision
test_precision = precision_score(np.argmax(test_labels, axis=1), test_preds, average='macro')

print('Test AUROC:', test_auroc)
print('Test AUPRC:', test_auprc)
print('Test Accuracy:', test_accuracy)
print('Test Recall:', test_recall)
print('Test Precision:', test_precision)
print('Test F1 Score:', test_f1_score)


Test F1 Score: 0.9180602006688963
Test AUROC: 0.9175
Test AUPRC: 0.8737414965986394
Test Accuracy: 0.9183673469387755
Test Recall: 0.9175
Test Precision: 0.9217171717171717
Test F1 Score: 0.9180602006688963
