In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, BatchSampler, random_split
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification

import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import random
from tqdm import tqdm

import numpy as np
import json
import pandas as pd
import os
from collections import defaultdict

import re
import emoji

In [None]:
class EmbDataset(Dataset):
    """Map-style dataset pairing precomputed embeddings with their labels.

    Both containers are stored as given and indexed in parallel, so item
    ``i`` is ``(embeddings[i], labels[i])``.
    """

    def __init__(self, embeddings, labels):
        self.embeddings, self.labels = embeddings, labels

    def __len__(self):
        # The label container defines the dataset size.
        n_items = len(self.labels)
        return n_items

    def __getitem__(self, idx):
        embedding = self.embeddings[idx]
        label = self.labels[idx]
        return embedding, label

def validate(net, device, criterion, val_dl):
    """Evaluate ``net`` on ``val_dl`` and collect loss and classification metrics.

    The model is moved to ``device`` and switched to evaluation mode for the
    pass, so mode-dependent layers (dropout, batch-norm, ...) behave as at
    inference time. The training/eval mode the model had on entry is restored
    before returning. No gradients are tracked.

    Args:
        net: ``torch.nn.Module`` mapping a batch of inputs to class logits.
        device: device on which the forward pass and the loss are computed.
        criterion: callable ``criterion(logits, labels)`` returning a scalar
            tensor; its per-batch values are averaged into ``'loss'``.
        val_dl: iterable of ``(batch_data, batch_labels)`` tensor pairs. The
            labels are converted with ``.numpy()`` before being moved, so
            they must be CPU tensors.

    Returns:
        dict with the mean batch loss (``'loss'``), accuracy (``'acc'``),
        macro and micro F1 / precision / recall, the sklearn text report
        (``'cls_report'``) and the confusion matrix (``'cfm'``).

    Raises:
        ValueError: if ``val_dl`` yields no batches (nothing to concatenate).
    """
    net.to(device)
    # Remember the caller's mode so evaluation does not leak a mode change.
    was_training = net.training
    net.eval()
    loss = 0
    num_batches = 0
    preds = []
    labels = []

    try:
        with torch.no_grad():
            for batch_data, batch_labels in val_dl:
                # Keep the ground truth on the CPU side for sklearn.
                labels.append(batch_labels.numpy())

                batch_data, batch_labels = batch_data.to(device), batch_labels.to(device).long()
                out = net(batch_data)
                batch_loss = criterion(out, batch_labels)

                loss += batch_loss.item()
                num_batches += 1
                batch_predictions = torch.argmax(out, axis=-1)
                preds.append(batch_predictions.cpu().numpy())
    finally:
        net.train(was_training)

    labels = np.concatenate(labels, axis=0)
    preds = np.concatenate(preds, axis=0)
    loss = loss / num_batches
    report = {
        'loss': loss,
        'acc': metrics.accuracy_score(labels, preds),
        'macro_f1': metrics.f1_score(labels, preds, average='macro', zero_division=0),
        'macro_precision': metrics.precision_score(labels, preds, average='macro', zero_division=0),
        'macro_recall': metrics.recall_score(labels, preds, average='macro', zero_division=0),
        'micro_f1': metrics.f1_score(labels, preds, average='micro', zero_division=0),
        'micro_precision': metrics.precision_score(labels, preds, average='micro', zero_division=0),
        'micro_recall': metrics.recall_score(labels, preds, average='micro', zero_division=0),
        'cls_report': metrics.classification_report(labels, preds, zero_division=0),
        'cfm': metrics.confusion_matrix(labels, preds)
    }
    return report

def train_gdro(net, optimizer, device, criterion, train_dl, q, eta=0.1):
    """Run one training pass over ``train_dl`` with a class-reweighted loss.

    For every batch the per-sample cross-entropy is averaged per class present
    in the batch. Each such class weight ``q[c]`` is multiplied by
    ``exp(eta * mean_loss_c)``, ``q`` is renormalised to sum to one, and the
    optimised objective is the ``q``-weighted sum of the class mean losses.

    Args:
        net: ``torch.nn.Module`` mapping a batch of inputs to class logits.
        optimizer: optimizer stepping the parameters of ``net``.
        device: device on which the model and the batches are placed.
        criterion: accepted for signature symmetry with ``validate`` but not
            used; the loss is always per-sample cross-entropy.
        train_dl: iterable of ``(batch_data, batch_labels)`` tensor pairs with
            CPU labels (they are converted with ``.numpy()``).
        q: 1-D tensor of class weights indexed by class id; it is updated
            in place.
        eta: step size of the multiplicative update of ``q``.

    Returns:
        ``(report, q)`` where ``report`` holds the mean weighted batch loss and
        the classification metrics of the predictions made during the pass,
        and ``q`` is the same (mutated) tensor that was passed in.
    """
    net.to(device)
    net.train()
    running_loss = 0
    n_steps = 0
    all_preds, all_targets = [], []

    for inputs, targets in train_dl:
        # Ground truth is collected before the batch leaves the CPU.
        all_targets.append(targets.numpy())

        inputs = inputs.to(device)
        targets = targets.to(device).long()
        optimizer.zero_grad()
        logits = net(inputs)
        per_sample = F.cross_entropy(logits, targets, reduction='none')

        # Mean loss of each class that actually occurs in this batch.
        present = torch.unique(targets)
        class_means = [per_sample[targets == c].mean() for c in present]

        # Multiplicative weight update followed by renormalisation (in place).
        for c, mean_loss in zip(present, class_means):
            q[c] *= (eta * mean_loss).exp().item()
        q /= q.sum()

        # Objective: class means weighted by the freshly updated q.
        weighted = 0
        for c, mean_loss in zip(present, class_means):
            weighted += q[c] * mean_loss

        weighted.backward()
        optimizer.step()

        running_loss += weighted.item()
        n_steps += 1
        all_preds.append(torch.argmax(logits, axis=-1).cpu().numpy())

    labels = np.concatenate(all_targets, axis=0)
    preds = np.concatenate(all_preds, axis=0)

    report = {'loss': running_loss / n_steps}
    report['acc'] = metrics.accuracy_score(labels, preds)
    report['macro_f1'] = metrics.f1_score(labels, preds, average='macro', zero_division=0)
    report['macro_precision'] = metrics.precision_score(labels, preds, average='macro', zero_division=0)
    report['macro_recall'] = metrics.recall_score(labels, preds, average='macro', zero_division=0)
    report['micro_f1'] = metrics.f1_score(labels, preds, average='micro', zero_division=0)
    report['micro_precision'] = metrics.precision_score(labels, preds, average='micro', zero_division=0)
    report['micro_recall'] = metrics.recall_score(labels, preds, average='micro', zero_division=0)
    report['cls_report'] = metrics.classification_report(labels, preds, zero_division=0)
    report['cfm'] = metrics.confusion_matrix(labels, preds)
    return report, q

def run_train_gdro(net, optimizer, criterion, device, train_dl, val_dl, output_dir, max_epochs=30, n_classes=3):
    """Train ``net`` for ``max_epochs`` epochs and checkpoint the best model.

    Each epoch runs ``train_gdro`` followed by ``validate``. Whenever the
    validation macro-F1 strictly improves on the best value so far (starting
    from 0), the model weights are written to ``{output_dir}/model.pt``.

    The checkpoint is built from CPU copies of the state-dict tensors, so the
    live model is never moved off ``device`` as a side effect of saving.

    Args:
        net: model to train.
        optimizer: optimizer bound to the parameters of ``net``.
        criterion: loss callable forwarded to ``train_gdro`` and ``validate``.
        device: device used for training and validation.
        train_dl: training batches.
        val_dl: batches used for model selection.
        output_dir: directory receiving ``model.pt``; created if missing.
        max_epochs: number of epochs to run.
        n_classes: number of classes, i.e. the length of the class-weight
            vector, which starts uniform.

    Returns:
        ``{'train': ..., 'val': ...}`` where each value maps a metric name to
        the list of its per-epoch values.
    """
    os.makedirs(output_dir, exist_ok=True)
    best_val_f1 = 0
    logs = {'train': defaultdict(list), 'val': defaultdict(list)}
    q = torch.ones(n_classes, dtype=torch.float32, device=device) / n_classes

    for epoch in tqdm(range(max_epochs)):
        train_report, q = train_gdro(net, optimizer, device, criterion, train_dl, q=q)
        for k, value in train_report.items():
            logs['train'][k].append(value)

        val_report = validate(net, device, criterion, val_dl)
        for k, value in val_report.items():
            logs['val'][k].append(value)

        if val_report['macro_f1'] > best_val_f1:
            # state_dict() returns a fresh mapping, so replacing its values
            # with CPU copies leaves the model's own parameters untouched.
            state = net.state_dict()
            for key in list(state):
                state[key] = state[key].cpu()
            torch.save(state, f'{output_dir}/model.pt')
            best_val_f1 = val_report['macro_f1']
    return logs


In [None]:
def get_cls_embeddings(all_messages, model, tokenizer, device, m_length=96):
    """Embed each subject's messages with the first-token hidden state.

    Every element of ``all_messages`` (one subject's list of messages) is
    tokenised and encoded as a single batch, padded and truncated to at most
    ``m_length`` tokens.

    Args:
        all_messages: iterable whose items are the message lists of one subject.
        model: encoder whose output exposes ``last_hidden_state``.
        tokenizer: tokenizer matching ``model``.
        device: device on which the forward passes run.
        m_length: maximum number of tokens kept per message.

    Returns:
        A list with one NumPy array per subject, holding the hidden state at
        position 0 of every message (one row per message).
    """
    model.to(device)
    model.eval()
    per_subject = []
    with torch.no_grad():
        for subject_messages in tqdm(all_messages):
            encoded = tokenizer(
                subject_messages,
                padding=True,
                truncation=True,
                max_length=m_length,
                return_tensors='pt',
            ).to(device)
            hidden_states = model(**encoded).last_hidden_state
            first_token = hidden_states[:, 0, :]
            per_subject.append(first_token.cpu().numpy())
    return per_subject

In [None]:
def make_plot(train_scores, val_scores, y_label, figsize=(8,5)):
    """Plot a training curve and a validation curve against the epoch index.

    Args:
        train_scores: per-epoch values drawn as the 'Train' line.
        val_scores: per-epoch values drawn as the 'Val' line.
        y_label: label of the y axis (typically the metric name).
        figsize: figure size forwarded to ``plt.subplots``.

    Returns:
        The ``(fig, ax)`` pair of the created figure.
    """
    fig, ax = plt.subplots(1, 1, figsize=figsize)

    curves = ((train_scores, 'Train'), (val_scores, 'Val'))
    for series, legend_name in curves:
        ax.plot(series, label=legend_name)

    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend()
    return fig, ax

In [None]:
# Text emoticon -> Spanish description that replaces it during preprocessing.
# Entries are applied in insertion order, so keep the order when editing.
textual_emoticons_to_spanish = {
    # without a nose
    ":)": "cara sonriente",
    ":(": "cara triste",
    ";)": "guiño",
    ":D": "cara riendo con los ojos abiertos",
    "XD": "cara riendo con los ojos cerrados",
    "xD": "cara riendo con los ojos cerrados",
    ":P": "cara sacando la lengua",
    "<3": "corazón",
    ":'(": "cara llorando",
    # with a nose
    ":-)": "cara sonriente",
    ":-(": "cara triste",
    ";-)": "guiño",
    ":-D": "cara riendo con los ojos abiertos",
    ":-P": "cara sacando la lengua",
    # miscellaneous
    "(heart)": "corazón",
    ":o": "cara sorprendida",
    ":-o": "cara sorprendida",
    ":/": "cara de duda",
}

# "@" followed by word characters (user mentions).
_MENTION_RE = re.compile(r'@\w+')
# Laughter: at least two repetitions, so a lone "ja"/"js" inside an ordinary
# word (e.g. "trabajar") is left alone.
_LAUGH_JA_RE = re.compile(r'(?:ja){2,}', flags=re.IGNORECASE)
_LAUGH_JS_RE = re.compile(r'(?:js){2,}', flags=re.IGNORECASE)
# Three or more consecutive copies of the same (case-sensitive) Spanish vowel.
_REPEATED_VOWEL_RE = re.compile(r'([aeiouáéíóúAEIOUÁÉÍÓÚ])\1{2,}')


def preprocess(text):
    """Normalise a raw message before it is tokenised.

    Steps, in order:
      1. remove ``@`` followed by word characters (mentions);
      2. collapse laughter written as two or more repetitions of "ja" or
         "js" (any case) to "jaja" / "jsjs";
      3. replace the two-character sequence " @" with "o";
      4. replace text emoticons with their Spanish description;
      5. replace emoji with their Spanish names via ``emoji.demojize``;
      6. collapse runs of three or more identical vowels to a single vowel.

    Args:
        text: the message as a ``str``.

    Returns:
        The normalised message.
    """
    text = _MENTION_RE.sub('', text)
    text = _LAUGH_JA_RE.sub('jaja', text)
    text = _LAUGH_JS_RE.sub('jsjs', text)
    # NOTE(review): this only fires on a space followed by "@" and also eats
    # the space; it looks like it was meant to map a gender-neutral "@" to
    # "o" -- confirm the intent before changing. Behaviour kept as is.
    text = text.replace(" @", "o")
    for emoticon, description in textual_emoticons_to_spanish.items():
        text = text.replace(emoticon, description)
    text = emoji.demojize(text, language='es')
    # Keep one copy of the repeated vowel ("holaaaa" -> "hola").
    text = _REPEATED_VOWEL_RE.sub(r'\1', text)
    return text

In [None]:
def get_cls_embeddings_2(messages, model, tokenizer, device, m_length=96, batch_size=128):
    """Embed a flat sequence of messages with the first-token hidden state.

    The messages are encoded in consecutive chunks of ``batch_size`` items,
    each padded and truncated to at most ``m_length`` tokens, and the
    per-chunk results are stacked in the original order.

    Args:
        messages: sliceable sequence of message strings.
        model: encoder whose output exposes ``last_hidden_state``.
        tokenizer: tokenizer matching ``model``.
        device: device on which the forward passes run.
        m_length: maximum number of tokens kept per message.
        batch_size: number of messages encoded per forward pass.

    Returns:
        A NumPy array with one row per message holding the hidden state at
        position 0.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 or ``messages`` is empty.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size}')
    n_messages = len(messages)
    if n_messages == 0:
        # Without any batch the embedding width is unknown, so nothing
        # sensible can be returned.
        raise ValueError('messages must contain at least one message')

    model.to(device)
    model.eval()
    embeddings = []
    with torch.no_grad():
        for start in tqdm(range(0, n_messages, batch_size)):
            # Slicing clamps at the end of the sequence, so the last chunk may be shorter.
            msg_batch = messages[start:start + batch_size]
            encoded = tokenizer(msg_batch, padding=True, truncation=True, max_length=m_length, return_tensors='pt')
            output = model(**encoded.to(device))
            embeddings.append(output.last_hidden_state[:, 0, :].cpu().numpy())

    return np.concatenate(embeddings, axis=0)

In [None]:
# training data provided by the organizers for the first two tasks
data_dir = '/path/to/data/dir'

# Subject-level label <-> integer id.
LABEL_MAP = {'none':0, 'anxiety':1, 'depression':2}
REVERSE_LABEL_MAP = {0:'none', 1:'anxiety', 2:'depression'}

# Context columns of the gold file, plus an extra 'none' id after the last one.
CONTEXTS = ['addiction','emergency','family','work','social','other']
CONTEXT_MAP = {c:i for i,c in enumerate(CONTEXTS)}
CONTEXT_MAP['none'] = len(CONTEXTS)
REVERSE_CONTEXT_MAP = {CONTEXT_MAP[c]:c for c in CONTEXT_MAP.keys()}

# One entry per subject: preprocessed messages, their dates, the subject label
# ('labels1') and the context indicator columns ('labels2').
data = {'messages':[], 'labels1':[], 'labels2':[], 'dates':[]}

# read data and preprocess text messages
for split in ['trial', 'train']:
    df = pd.read_csv(os.path.join(data_dir, split, 'gold_task2.txt'))
    labels1 = df['label'].map(lambda x: LABEL_MAP[x]).to_list()
    data['labels1'] += labels1

    subjects = df['Subject'].to_list()
    messages = []
    dates = []
    for subject in subjects:
        subject_path = os.path.join(data_dir, split, 'subjects', f'{subject}.json')
        # Context manager so the file handle is closed after each subject.
        with open(subject_path, 'r', encoding='utf-8') as subject_file:
            subject_data = json.load(subject_file)
        messages.append([preprocess(x['message']) for x in subject_data])
        dates.append([x['date'] for x in subject_data])
    data['messages'] += messages
    data['dates'] += dates
    data['labels2'].append(df[CONTEXTS].to_numpy().astype(np.int32))

data['labels2'] = np.concatenate(data['labels2'], axis=0).astype(np.float32)
data['labels1'] = np.array(data['labels1'], dtype=np.int32)

In [None]:
# Use the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

labels = data['labels1']
name = 'pysentimiento/robertuito-sentiment-analysis'
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModel.from_pretrained(name)
model.eval()
model.to(device)

# Split the subjects by label: 0 goes to the "healthy" pool, every other
# label to the "other" pool.
msg_healthy, msg_other = [], []
for subject_msgs, label in zip(data['messages'], data['labels1']):
    if label == 0:
        msg_healthy.append(subject_msgs)
    else:
        msg_other.append(subject_msgs)

# One array per subject, then flattened to one row per message.
embs = get_cls_embeddings(msg_healthy, model, tokenizer, device)
embs_flat = np.concatenate(embs, axis=0) # embeddings of messages from individuals not suffering from depression or anxiety

embs_other = np.concatenate(
    get_cls_embeddings(msg_other, model, tokenizer, device),
    axis=0,
)

In [None]:
task3_data_dir = '/path/to/additional/data'
# Datasets downloaded from https://huggingface.co/datasets/somosnlp-hackathon-2023/suicide-comments-es
# and https://github.com/kvvaldez/spanish_suicide/tree/master

data1 = pd.read_csv(os.path.join(task3_data_dir, 'suicide_comments_es.csv'))
data2 = pd.read_csv(os.path.join(task3_data_dir, 'suicidio_notacion.csv'))

# Keep only the rows flagged 1 in each source and preprocess their text.
pos_samples1 = data1.loc[data1.Label == 1, 'Text'].map(preprocess).to_list()
pos_samples2 = data2.loc[data2.suicidio == 1, 'tweet_clean'].map(preprocess).to_list()

pos_embs1, pos_embs2 = (
    get_cls_embeddings_2(samples, model, tokenizer, device=device, m_length=128)
    for samples in (pos_samples1, pos_samples2)
)
pos_embs = np.concatenate((pos_embs1, pos_embs2), axis=0)

# Negatives (embs_flat) come first, positives (pos_embs) after them.
all_embs = np.concatenate([embs_flat, pos_embs], axis=0)

n_negative = embs_flat.shape[0]
all_labels = np.zeros(all_embs.shape[0], dtype=np.int32)
all_labels[n_negative:] = 1
assert np.sum(all_labels) == pos_embs.shape[0]

In [None]:
save_dir = '/path/save/networks/'

# Fixed 80/20 split (seeded generator); the held-out part is used for model selection.
ds = EmbDataset(all_embs, all_labels)
train_ds, test_ds = random_split(ds, [0.8, 0.2], generator=torch.Generator().manual_seed(1007))
test_dl = DataLoader(test_ds, batch_size=128, shuffle=False, drop_last=False)

# f1s[i][j]: best validation macro-F1 for the i-th batch size and j-th learning rate.
f1s = []
for batch_size in [32, 64, 128]:
    scores = []
    for lr_idx, lr in enumerate([5e-2, 1e-3, 5e-3]):
        output_dir = os.path.join(save_dir, f'linear_preprocess_{batch_size}_{lr_idx}')
        os.makedirs(output_dir, exist_ok=True)

        # Re-seed every run so each configuration starts from the same RNG state.
        for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
            seed_fn(1007)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=False)

        # Linear probe on top of the frozen embeddings.
        net = nn.Linear(all_embs[0].shape[-1], 2, bias=True)
        optimizer = torch.optim.Adam(net.parameters(), lr=lr)
        loss_fn = nn.CrossEntropyLoss()

        logs = run_train_gdro(net,optimizer,loss_fn,device, train_dl, test_dl, output_dir, max_epochs=50, n_classes=2)

        # One train/val curve per scalar metric; the text report and the
        # confusion matrix cannot be drawn as curves.
        for k in logs['train']:
            if k in ('cls_report', 'cfm'):
                continue
            fig, ax = make_plot(logs['train'][k], logs['val'][k], k)
            fig.savefig(f'{output_dir}/{k}.png')
            plt.close(fig)

        np.save(f'{output_dir}/logs.npy', logs, allow_pickle=True)

        # Report the epoch with the best validation macro-F1.
        arg = np.argmax(logs['val']['macro_f1'])
        print(f"batch_size: {batch_size}, lr: {lr}, arg: {arg}")
        print(f"Val macro_f1: {logs['val']['macro_f1'][arg]:.4f} | Train macro_f1: {logs['train']['macro_f1'][arg]:.4f}")
        for part_name, part_key in (('Val', 'val'), ('Train', 'train')):
            print(part_name, logs[part_key]['cfm'][arg], logs[part_key]['cls_report'][arg], sep='\n')

        scores.append(logs['val']['macro_f1'][arg])
    f1s.append(scores)

In [None]:
# Add the subjects suffering from depression or anxiety as negative
# (non-suicidal) examples next to the healthy ones.
embs_flat_ = np.concatenate([embs_flat, embs_other], axis=0)

all_embs = np.concatenate([embs_flat_, pos_embs], axis=0)

all_labels = np.zeros(all_embs.shape[0], dtype=np.int32)
# Positives start after ALL negatives (embs_flat_), not only the healthy ones;
# slicing from embs_flat.shape[0] would label the depression/anxiety rows as 1.
all_labels[embs_flat_.shape[0]:] = 1
assert np.sum(all_labels) == pos_embs.shape[0]

In [None]:
save_dir = '/path/save/networks'

# Fixed 80/20 split (seeded generator); the held-out part is used for model selection.
ds = EmbDataset(all_embs, all_labels)
train_ds, test_ds = random_split(ds, [0.8, 0.2], generator=torch.Generator().manual_seed(1007))
test_dl = DataLoader(test_ds, batch_size=128, shuffle=False, drop_last=False)

# f1s[i][j]: best validation macro-F1 for the i-th batch size and j-th learning rate.
f1s = []
for batch_size in [32, 64, 128]:
    scores = []
    for lr_idx, lr in enumerate([5e-2, 1e-3, 5e-3]):
        output_dir = os.path.join(save_dir, f'linear_preprocess_all_{batch_size}_{lr_idx}')
        os.makedirs(output_dir, exist_ok=True)

        # Re-seed every run so each configuration starts from the same RNG state.
        for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
            seed_fn(1007)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=False)

        # Linear probe on top of the frozen embeddings.
        net = nn.Linear(all_embs[0].shape[-1], 2, bias=True)
        optimizer = torch.optim.Adam(net.parameters(), lr=lr)
        loss_fn = nn.CrossEntropyLoss()

        logs = run_train_gdro(net,optimizer,loss_fn,device, train_dl, test_dl, output_dir, max_epochs=50, n_classes=2)

        # One train/val curve per scalar metric; the text report and the
        # confusion matrix cannot be drawn as curves.
        for k in logs['train']:
            if k in ('cls_report', 'cfm'):
                continue
            fig, ax = make_plot(logs['train'][k], logs['val'][k], k)
            fig.savefig(f'{output_dir}/{k}.png')
            plt.close(fig)

        np.save(f'{output_dir}/logs.npy', logs, allow_pickle=True)

        # Report the epoch with the best validation macro-F1.
        arg = np.argmax(logs['val']['macro_f1'])
        print(f"batch_size: {batch_size}, lr: {lr}, arg: {arg}")
        print(f"Val macro_f1: {logs['val']['macro_f1'][arg]:.4f} | Train macro_f1: {logs['train']['macro_f1'][arg]:.4f}")
        for part_name, part_key in (('Val', 'val'), ('Train', 'train')):
            print(part_name, logs[part_key]['cfm'][arg], logs[part_key]['cls_report'][arg], sep='\n')

        scores.append(logs['val']['macro_f1'][arg])
    f1s.append(scores)