In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, BatchSampler, random_split
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from torch.nn.utils.rnn import pad_sequence
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import random
from tqdm import tqdm

import numpy as np
import json
import pandas as pd
import os
from collections import defaultdict

import re
import emoji

In [None]:
class EmbDataset(Dataset):
    """Map-style dataset pairing each stored embedding with its label."""

    def __init__(self, embeddings, labels):
        # Both containers are kept as given and indexed in parallel.
        self.embeddings, self.labels = embeddings, labels

    def __len__(self):
        """The dataset size is defined by the number of labels."""
        n_items = len(self.labels)
        return n_items

    def __getitem__(self, idx):
        """Return the ``(embedding, label)`` pair stored at position ``idx``."""
        sample = self.embeddings[idx]
        target = self.labels[idx]
        return sample, target

In [None]:
def make_plot(train_scores, val_scores, y_label, figsize=(8,5)):
    """Draw train and validation curves on a single axes.

    Args:
        train_scores: sequence of values plotted under the legend entry 'Train'.
        val_scores: sequence of values plotted under the legend entry 'Val'.
        y_label: text used for the y axis; the x axis is always labelled 'Epoch'.
        figsize: figure size forwarded to ``plt.subplots``.

    Returns:
        The ``(fig, ax)`` pair created by ``plt.subplots``.
    """
    fig, ax = plt.subplots(1, 1, figsize=figsize)

    curves = ((train_scores, 'Train'), (val_scores, 'Val'))
    for series, legend_name in curves:
        ax.plot(series, label=legend_name)

    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend()

    return fig, ax

In [None]:
def get_cls_embeddings(all_messages, model, tokenizer, device, m_length=96):
    """Encode each subject's messages and keep the first-token hidden state.

    Args:
        all_messages: iterable with one list of message strings per subject.
        model: encoder whose output exposes ``last_hidden_state``.
        tokenizer: callable tokenizer; invoked with padding, truncation and
            ``return_tensors='pt'``.
        device: device the model and the tokenized batch are moved to.
        m_length: ``max_length`` passed to the tokenizer for truncation.

    Returns:
        A list with one numpy array per subject, holding the hidden state at
        sequence position 0 for every message of that subject.
    """
    model.to(device)
    model.eval()

    per_subject = []
    with torch.no_grad():
        for subject_messages in tqdm(all_messages):
            # All messages of one subject are tokenized and encoded as a single batch.
            encoded = tokenizer(
                subject_messages,
                padding=True,
                truncation=True,
                max_length=m_length,
                return_tensors='pt',
            )
            result = model(**encoded.to(device))
            first_token = result.last_hidden_state[:, 0, :]
            per_subject.append(first_token.cpu().numpy())

    return per_subject

In [None]:
# Lookup table from ASCII emoticons to a short Spanish description.
# preprocess() applies these as plain substring replacements, iterating the
# dict in insertion order.
textual_emoticons_to_spanish = {
        ":)": "cara sonriente",
        ":(": "cara triste",
        ";)": "guiño",
        ":D": "cara riendo con los ojos abiertos",
        "XD": "cara riendo con los ojos cerrados",
        "xD": "cara riendo con los ojos cerrados",
        ":P": "cara sacando la lengua",
        "<3": "corazón",
        ":'(": "cara llorando",
        ":-)": "cara sonriente",
        ":-(": "cara triste",
        ";-)": "guiño",
        ":-D": "cara riendo con los ojos abiertos",
        ":-P": "cara sacando la lengua",
        "(heart)": "corazón",
        ":o": "cara sorprendida",
        ":-o": "cara sorprendida",
        ":/": "cara de duda"
    }

def preprocess(text):
    """Normalize a raw message before it is tokenized.

    Steps, in order:
      1. remove user mentions (``@`` followed by word characters);
      2. collapse laughter runs such as "jajajaja" / "jsjsjs" to "jaja" / "jsjs";
      3. replace the literal sequence `` @`` with ``o``;
      4. replace textual emoticons with their Spanish description;
      5. replace emoji with their Spanish names via ``emoji.demojize``;
      6. collapse three or more identical consecutive vowels into a single one.

    Args:
        text: the message string.

    Returns:
        The normalized string.
    """
    # Strip user mentions.
    text = re.sub(r'@\w+', '', text)

    # Require at least two repetitions: with `(ja)+` a lone "ja" inside an
    # ordinary word (e.g. "trabajar") was rewritten to "jaja" and corrupted the word.
    text = re.sub(r'(ja){2,}', 'jaja', text, flags=re.IGNORECASE)
    text = re.sub(r'(js){2,}', 'jsjs', text, flags=re.IGNORECASE)

    # NOTE(review): only " @" (with a leading space) is rewritten, and it runs after
    # mentions were stripped -- confirm this matches the intended "@"-as-"o" handling.
    text = text.replace(" @","o")

    # Plain substring replacement of textual emoticons, in dict order.
    for emoticon, description in textual_emoticons_to_spanish.items():
        text = text.replace(emoticon, description)

    text = emoji.demojize(text,language='es')

    # Collapse runs of 3+ identical vowels ("holaaaa" -> "hola"). The back-reference
    # is case-sensitive, so a run must repeat the exact same character.
    text = re.sub(r'([aeiouáéíóúAEIOUÁÉÍÓÚ])\1{2,}', r'\1', text)
    return text

In [None]:
def lstm_collate(batch):
    """Collate ``(sequence, label)`` pairs into padded tensors for the LSTM.

    Args:
        batch: list of items where ``item[0]`` is a sequence convertible to a
            float tensor and ``item[1]`` is an integer label.

    Returns:
        batch_data: float32 tensor produced by ``pad_sequence`` (sequence axis first).
        lens: long tensor of shape (1, N, 1) holding ``len(sequence) - 1`` for
            every item, i.e. the index of its last real timestep.
        labels: long tensor with the N labels.
    """
    targets = torch.tensor([item[1] for item in batch], dtype=torch.long)

    sequences = [torch.tensor(item[0], dtype=torch.float32) for item in batch]
    padded = pad_sequence(sequences)

    # Lengths are turned into last-valid indices and given shape (1, N, 1).
    lengths = torch.tensor(list(map(len, sequences)), dtype=torch.long)
    last_index = lengths[None, :, None] - 1

    return padded, last_index, targets
    

In [None]:
class LSTMClassifier(nn.Module):
    """Single-layer unidirectional LSTM followed by a linear classification head.

    Inputs are sequence-first (``batch_first=False``): ``(S, N, input_size)``.
    """

    def __init__(self, input_size, h_size, output_dim, dropout=0):
        """
        Args:
            input_size: feature size of every timestep.
            h_size: LSTM hidden size.
            output_dim: number of output classes.
            dropout: forwarded to ``nn.LSTM``.
        """
        super().__init__()
        self.input_size = input_size
        self.h_size = h_size
        self.output_dim = output_dim

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=h_size, num_layers=1, batch_first=False,
                           dropout=dropout, bidirectional=False)
        self.classifier = nn.Linear(h_size, output_dim)

    def _zero_state(self, batch_size, like):
        """Build an all-zero ``(h0, c0)`` pair of shape (1, N, H) matching ``like``'s device and dtype."""
        shape = (1, batch_size, self.h_size)
        return (
            torch.zeros(shape, device=like.device, dtype=like.dtype),
            torch.zeros(shape, device=like.device, dtype=like.dtype),
        )

    def forward(self, seq_data, seq_lens, state=None):
        """Classify each sequence from the hidden state at its last real timestep.

        Args:
            seq_data: (S, N, input_size) padded sequences.
            seq_lens: (1, N, 1) long tensor with values in [0, S-1]: the index of
                the last real (non-padding) timestep of every sequence.
            state: optional ``(h0, c0)``; zeros are used when omitted.

        Returns:
            (N, output_dim) logits.
        """
        if state is None:
            state = self._zero_state(seq_data.shape[1], seq_data)

        out_states, _ = self.lstm(seq_data, state) # S, N, H
        # Drop only the sequence axis; a bare squeeze() would also remove the
        # batch axis when N == 1 and break the loss computation.
        pred_states = torch.take_along_dim(out_states, seq_lens, dim=0).squeeze(0) # N, H
        out = self.classifier(pred_states)
        return out

    def predict_all_timesteps(self, seq_data, seq_lens, state=None):
        """Predict a class at every real timestep of every sequence.

        Args:
            seq_data: (S, N, input_size) padded sequences.
            seq_lens: last-real-timestep index per sequence (same convention as
                ``forward``); any shape holding N values in batch order.
            state: optional ``(h0, c0)``; zeros are used when omitted.

        Returns:
            A list with N one-dimensional integer arrays; entry ``i`` has
            ``seq_lens[i] + 1`` predictions (padding timesteps are excluded).
        """
        if state is None:
            state = self._zero_state(seq_data.shape[1], seq_data)

        out_states, _ = self.lstm(seq_data, state) # [S, N, H]
        logits_all = self.classifier(out_states) # [S, N, n_classes]
        pred_all = torch.argmax(logits_all, dim=2).cpu().numpy() # [S, N]

        last_positions = seq_lens.reshape(-1).tolist()
        # seq_lens holds the index of the last real timestep, so the slice must be
        # inclusive (+1); results stay 1-D even for single-timestep sequences.
        return [pred_all[:last + 1, i] for i, last in enumerate(last_positions)]

    def predict_one_step(self, data, states=None):
        """Advance the LSTM by one call and classify from the resulting hidden state.

        Args:
            data: input passed straight to the LSTM; expected as (1, N, E).
            states: optional ``(h, c)`` pair passed straight to the LSTM
                (shape (1, N, H)); zeros are used when omitted.

        Returns:
            predictions: (N,) numpy array of predicted class indices.
            states: ``(hn, cn)`` permuted to (N, 1, H).
                NOTE(review): this layout differs from what the LSTM accepts, so a
                caller feeding it back must permute it again -- confirm with callers.
        """
        if states is None:
            # Batch size comes from `data` (the original referenced an undefined `seq_data`).
            states = self._zero_state(data.shape[1], data)

        _, (hn, cn) = self.lstm(data, states) # ([1, N, H], [1, N, H])
        out = self.classifier(hn.squeeze(0)) # [N, C]
        predictions = torch.argmax(out, dim=-1).cpu().numpy() # [N]

        return predictions, (hn.permute(1,0,2), cn.permute(1,0,2))


In [None]:
def validate_tms_rnn(subject_embs, labels, net, device):
    """Evaluate ``net`` by taking, per subject, the first non-zero per-timestep prediction.

    All subjects are collated into one batch, ``net.predict_all_timesteps`` is
    run on it, and each subject is assigned the first predicted class that is
    not 0; a subject whose predictions are all 0 is assigned class 0.

    Args:
        subject_embs: sequence with one embedding sequence per subject.
        labels: sequence with the true integer label of each subject.
        net: model exposing ``predict_all_timesteps``.
        device: device used for the forward pass.

    Returns:
        Dict with accuracy, macro/micro F1, precision and recall, the text
        classification report ('cls_report') and the confusion matrix ('cfm').
    """
    preds = []
    net.to(device)
    net.eval()
    with torch.no_grad():
        batch_data, batch_lens, _ = lstm_collate(list(zip(subject_embs, labels)))
        batch_data, batch_lens = batch_data.to(device), batch_lens.to(device)
        seq_predictions = net.predict_all_timesteps(batch_data, batch_lens)
        for seq_pred in seq_predictions:
            # A single-timestep result may arrive as a 0-d array; np.nonzero and the
            # indexing below need at least one dimension.
            seq_pred = np.atleast_1d(seq_pred)
            idxs = np.nonzero(seq_pred)[0]
            if len(idxs) == 0: # every prediction is 0
                preds.append(0)
            else:
                preds.append(seq_pred[idxs[0]])

    report = {
        'acc': metrics.accuracy_score(labels, preds),
        'macro_f1': metrics.f1_score(labels, preds, average='macro', zero_division=0),
        'macro_precision': metrics.precision_score(labels, preds, average='macro', zero_division=0),
        'macro_recall': metrics.recall_score(labels, preds, average='macro', zero_division=0),
        'micro_f1': metrics.f1_score(labels, preds, average='micro', zero_division=0),
        'micro_precision': metrics.precision_score(labels, preds, average='micro', zero_division=0),
        'micro_recall': metrics.recall_score(labels, preds, average='micro', zero_division=0),
        'cls_report': metrics.classification_report(labels, preds, zero_division=0),
        'cfm': metrics.confusion_matrix(labels, preds)
    }
    return report


In [None]:
## soft labels
def train_gdro_rnn_sl(net, optimizer, device, criterion, train_dl, q, soft_labels, eta=0.1):
    """Train ``net`` for one epoch with class-wise group-DRO weights and soft targets.

    For every batch the per-sample cross-entropy against ``soft_labels[label]``
    is averaged per class present in the batch. Each such class weight in ``q``
    is multiplied by ``exp(eta * class_mean_loss)``, ``q`` is renormalized to
    sum to one, and the optimized loss is the ``q``-weighted sum of the class means.

    Args:
        net: model called as ``net(batch_data, batch_lens)``.
        optimizer: optimizer stepping ``net``'s parameters.
        device: device used for the model and the batches.
        criterion: accepted for signature compatibility; not used here.
        train_dl: iterable yielding ``(batch_data, batch_lens, batch_labels)``.
        q: tensor of per-class weights, indexed by class id; updated in place.
        soft_labels: tensor indexed by class id giving the target distribution.
        eta: step size of the multiplicative update of ``q``.

    Returns:
        ``(report, q)`` where ``report`` holds the mean batch loss and the
        classification metrics of the epoch, and ``q`` is the updated weight tensor.
    """
    net.to(device)
    net.train()

    epoch_loss = 0
    n_steps = 0
    seen_targets, seen_preds = [], []

    for seqs, last_idx, targets in train_dl:
        targets_np = targets.numpy()
        seen_targets.append(targets_np)
        present_classes = np.unique(targets_np)

        seqs = seqs.to(device)
        targets = targets.to(device)
        last_idx = last_idx.to(device)

        optimizer.zero_grad()
        logits = net(seqs, last_idx)
        per_sample = F.cross_entropy(logits, soft_labels[targets], reduction='none')

        # Mean loss of every class that occurs in this batch.
        group_losses = [(cls, per_sample[targets == cls].mean()) for cls in present_classes]

        # Multiplicative update of the class weights, then renormalize.
        for cls, group_loss in group_losses:
            q[cls] *= (eta * group_loss).exp().item()
        q /= q.sum()

        # Objective: class means weighted by the freshly updated q.
        step_loss = 0
        for cls, group_loss in group_losses:
            step_loss += q[cls] * group_loss

        step_loss.backward()
        optimizer.step()

        epoch_loss += step_loss.item()
        n_steps += 1
        seen_preds.append(torch.argmax(logits, axis=-1).cpu().numpy())

    y_true = np.concatenate(seen_targets, axis=0)
    y_pred = np.concatenate(seen_preds, axis=0)

    report = {
        'loss': epoch_loss / n_steps,
        'acc': metrics.accuracy_score(y_true, y_pred),
    }
    scorers = (
        ('f1', metrics.f1_score),
        ('precision', metrics.precision_score),
        ('recall', metrics.recall_score),
    )
    for averaging in ('macro', 'micro'):
        for short_name, scorer in scorers:
            report[f'{averaging}_{short_name}'] = scorer(y_true, y_pred, average=averaging, zero_division=0)
    report['cls_report'] = metrics.classification_report(y_true, y_pred, zero_division=0)
    report['cfm'] = metrics.confusion_matrix(y_true, y_pred)
    return report, q


In [None]:
def run_train_gdro_rnn_sl(net, optimizer, criterion, device, train_dl, train_embs, train_labels,
                          val_embs, val_labels, soft_labels, output_dir, max_epochs=200, n_classes=3, eta=0.1):
    """Train with ``train_gdro_rnn_sl`` and checkpoint on validation macro-F1.

    After every epoch the model is evaluated with ``validate_tms_rnn`` on the
    validation subjects. Whenever the validation macro-F1 is at least as good as
    the best seen so far, the parameters are written to
    ``{output_dir}/net_params.pt`` and the un-augmented training subjects are
    evaluated as well.

    Args:
        net, optimizer, criterion, device, train_dl, soft_labels, eta:
            forwarded to ``train_gdro_rnn_sl``.
        train_embs, train_labels: training subjects evaluated at each checkpoint.
        val_embs, val_labels: validation subjects evaluated every epoch.
        output_dir: directory receiving the checkpoint; created if missing.
        max_epochs: number of epochs to run.
        n_classes: number of classes; the group weights start uniform over them.

    Returns:
        Dict with per-epoch metric lists under 'train' and 'val', per-checkpoint
        metric lists under 'train_eval', and the epoch of the last saved
        checkpoint under 'epoch'.
    """
    logs = {'train':defaultdict(list), 'val':defaultdict(list), 'train_eval': defaultdict(list), 'epoch':0}
    q = torch.ones(n_classes, dtype=torch.float32, device=device) / n_classes
    best_macro_f1_val = 0
    os.makedirs(output_dir, exist_ok=True)

    for epoch in tqdm(range(max_epochs)):
        train_report, q = train_gdro_rnn_sl(net, optimizer, device, criterion, train_dl, q=q, soft_labels=soft_labels, eta=eta)
        for k, value in train_report.items():
            logs['train'][k].append(value)

        val_report = validate_tms_rnn(val_embs, val_labels, net, device)
        for k, value in val_report.items():
            logs['val'][k].append(value)

        # ">=" keeps the most recent epoch among ties.
        if val_report['macro_f1'] >= best_macro_f1_val:
            best_macro_f1_val = val_report['macro_f1']

            # Save CPU copies of the weights without moving the live model off its
            # device (net.cpu() relocated the whole network at every checkpoint).
            cpu_state = net.state_dict()
            for key in cpu_state:
                cpu_state[key] = cpu_state[key].cpu()
            torch.save(cpu_state, f'{output_dir}/net_params.pt')

            logs['epoch'] = epoch
            train_report_eval = validate_tms_rnn(train_embs, train_labels, net, device)
            for k, value in train_report_eval.items():
                logs['train_eval'][k].append(value)
    return logs
    

In [None]:
class EmbDatasetRNNAug(Dataset):
    '''
    Dataset of per-subject embedding sequences with random augmentation.

    - Label 0 ('none'): a random-length prefix of the subject's messages is
      returned (between 1 message and the whole sequence).
    - Any other label: with probability ``1 - thr_rng`` a random number of
      leading messages (between 1 and ``n_msg``) taken from a random label-0
      subject is prepended to the subject's own messages; otherwise the
      sequence is returned unchanged.

    Randomness comes from the global ``np.random`` state.
    '''
    def __init__(self, embeddings, labels, thr_rng=0.5, n_msg=10):
        self.embeddings = embeddings
        self.labels = labels
        # Pool of label-0 subjects used as a source of neutral messages.
        self.emb0 = [self.embeddings[i] for i in range(len(self.labels)) if self.labels[i] == 0]
        self.thr_rng = thr_rng
        self.n_msg = n_msg

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample, label = self.embeddings[idx], self.labels[idx]

        if label == 0:
            # randint's upper bound is exclusive, so +1 makes the full sequence
            # reachable and keeps 1-message subjects valid. The original wrote
            # len(arr + 1), which adds 1 to the values, not to the length.
            nr_msg = np.random.randint(1, len(sample) + 1)
            return sample[:nr_msg], label

        # Without any label-0 subject there is nothing to prepend.
        if self.emb0 and np.random.uniform() > self.thr_rng:
            neutral = self.emb0[np.random.randint(0, len(self.emb0))] # subject from which to take neutral comments
            max_extra = min(len(neutral), self.n_msg)
            if max_extra >= 1:
                # Inclusive upper bound; also avoids randint(1, 1) for 1-message subjects.
                n_extra = np.random.randint(1, max_extra + 1) # number of extra comments
                return np.concatenate([neutral[:n_extra], sample], axis=0), label

        return sample, label


In [None]:
# training data provided by the organizers for the first two tasks
data_dir = '/path/to/data/dir'

LABEL_MAP = {'none':0, 'anxiety':1, 'depression':2}
REVERSE_LABEL_MAP = {0:'none', 1:'anxiety', 2:'depression'}

CONTEXTS = ['addiction','emergency','family','work','social','other']
CONTEXT_MAP = {c:i for i,c in enumerate(CONTEXTS)}
CONTEXT_MAP['none'] = len(CONTEXTS)
REVERSE_CONTEXT_MAP = {CONTEXT_MAP[c]:c for c in CONTEXT_MAP.keys()}

# One entry per subject: raw messages, class label, context indicators, message dates.
data = {'messages':[], 'labels1':[], 'labels2':[], 'dates':[]}

for split in ['trial', 'train']:
    df = pd.read_csv(os.path.join(data_dir, split, 'gold_task2.txt'))
    labels1 = df['label'].map(lambda x: LABEL_MAP[x]).to_list()
    data['labels1'] += labels1

    subjects = df['Subject'].to_list()
    messages = []
    dates = []
    for subject in subjects:
        subject_path = os.path.join(data_dir, split, 'subjects', f'{subject}.json')
        # Context manager closes each subject file promptly instead of leaking the handle.
        with open(subject_path, 'r', encoding='utf-8') as subject_file:
            subject_data = json.load(subject_file)
        messages.append([x['message'] for x in subject_data])
        dates.append([x['date'] for x in subject_data])
    data['messages'] += messages
    data['dates'] += dates
    data['labels2'].append(df[CONTEXTS].to_numpy().astype(np.int32))

data['labels2'] = np.concatenate(data['labels2'], axis=0).astype(np.float32)
data['labels1'] = np.array(data['labels1'], dtype=np.int32)

In [None]:
# Use CUDA when available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Embed the raw (un-preprocessed) messages of every subject with the pretrained encoder.
labels = data['labels1']
name = 'pysentimiento/robertuito-sentiment-analysis'
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModel.from_pretrained(name)
# get_cls_embeddings also calls eval()/to(device); the calls here are redundant but harmless.
model.eval()
model.to(device)
# embs: list with one numpy array of first-token embeddings per subject.
embs = get_cls_embeddings(data['messages'], model, tokenizer, device)

In [None]:
# Augmenting dataset (thr_rng=0.7, n_msg=10) split 80/20 with a fixed seed.
ds = EmbDatasetRNNAug(embs, labels, 0.7, 10)
train_ds, test_ds = random_split(ds, [0.8, 0.2], generator=torch.Generator().manual_seed(1007))

# Un-augmented embeddings/labels of the held-out subjects, used for validation.
val_embs = [embs[i] for i in test_ds.indices]
val_labels = [labels[i] for i in test_ds.indices]

# Un-augmented embeddings/labels of the training subjects, used for train-set evaluation.
train_embs = [embs[i] for i in train_ds.indices]
train_labels = [labels[i] for i in train_ds.indices]

In [None]:
save_dir = '/path/to/saved/networks'

# Soft targets per class id: class 0 is hard; classes 1 and 2 put 0.1 mass on each other.
soft_labels = torch.tensor([
    [1,  0,  0],
    [0,0.9,0.1],
    [0,0.1,0.9],
], dtype=torch.float32, device=device)
test_dl = DataLoader(test_ds, batch_size=128, shuffle=False, drop_last=False,collate_fn=lstm_collate)

# Grid search over hidden size and batch size; f1s[i][j] is the best validation
# macro-F1 for the i-th hidden size and j-th batch size.
f1s = []
for h_size in [64, 96, 128, 160]:
    scores = []
    for bs in [32, 64, 96]:
        dir_name = 'rnn' + '_gdro' + '_eval' + '_sl_09' + '_aug_07_10' + f'_h_{h_size}_bs_{bs}'
        output_dir = os.path.join(save_dir, dir_name)
        os.makedirs(output_dir, exist_ok=True)

        # Re-seed before every run so each configuration starts from the same RNG state.
        random.seed(1007)
        np.random.seed(1007)
        torch.manual_seed(1007)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        train_dl = DataLoader(train_ds, batch_size=bs, shuffle=True, drop_last=False,collate_fn=lstm_collate)
        net = LSTMClassifier(embs[0].shape[-1], h_size=h_size, output_dim=len(LABEL_MAP))
        optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
        loss_fn = nn.CrossEntropyLoss() # not used either way in gDRO

        logs = run_train_gdro_rnn_sl(net,optimizer,loss_fn,device, train_dl, train_embs, train_labels, val_embs, val_labels,
                                     soft_labels=soft_labels, output_dir=output_dir, max_epochs=100)

        # NOTE(review): 'train_eval' is only logged at checkpoint epochs, so the
        # "*_eval" plots compare series of different lengths.
        for k in logs['train_eval'].keys():
            if k != 'cls_report' and k != 'cfm':
                fig, ax = make_plot(logs['train'][k], logs['val'][k], k)
                fig.savefig(f'{output_dir}/{k}.png')
                plt.close(fig)
                fig, ax = make_plot(logs['train_eval'][k], logs['val'][k], k)
                fig.savefig(f'{output_dir}/{k}_eval.png')
                plt.close(fig)

        np.save(f'{output_dir}/logs.npy', logs, allow_pickle=True)

        # Report the epoch of the saved checkpoint. The checkpoint is refreshed on
        # ties (>=), so np.argmax (first maximum) could point at an earlier epoch
        # than the one 'train_eval'[-1] and the saved weights belong to.
        arg = logs['epoch']
        print(f'h_size: {h_size}, batch_size: {bs}')
        print(arg)
        print(f"Val macro_f1: {logs['val']['macro_f1'][arg]:.4f} | Train_eval macro_f1: {logs['train_eval']['macro_f1'][-1]:.4f}")
        print('Val', logs['val']['cfm'][arg], logs['val']['cls_report'][arg], sep='\n')
        print('Train_eval', logs['train_eval']['cfm'][-1], logs['train_eval']['cls_report'][-1], sep='\n')
        scores.append(logs['val']['macro_f1'][arg])
    f1s.append(scores)

In [None]:
# training data provided by the organizers for the first two tasks
## applies preprocessing
data_dir = '/path/to/data/dir'

LABEL_MAP = {'none':0, 'anxiety':1, 'depression':2}
REVERSE_LABEL_MAP = {0:'none', 1:'anxiety', 2:'depression'}

CONTEXTS = ['addiction','emergency','family','work','social','other']
CONTEXT_MAP = {c:i for i,c in enumerate(CONTEXTS)}
CONTEXT_MAP['none'] = len(CONTEXTS)
REVERSE_CONTEXT_MAP = {CONTEXT_MAP[c]:c for c in CONTEXT_MAP.keys()}

# One entry per subject: preprocessed messages, class label, context indicators, message dates.
data = {'messages':[], 'labels1':[], 'labels2':[], 'dates':[]}

# read data and preprocess text messages
for split in ['trial', 'train']:
    df = pd.read_csv(os.path.join(data_dir, split, 'gold_task2.txt'))
    labels1 = df['label'].map(lambda x: LABEL_MAP[x]).to_list()
    data['labels1'] += labels1

    subjects = df['Subject'].to_list()
    messages = []
    dates = []
    for subject in subjects:
        subject_path = os.path.join(data_dir, split, 'subjects', f'{subject}.json')
        # Context manager closes each subject file promptly instead of leaking the handle.
        with open(subject_path, 'r', encoding='utf-8') as subject_file:
            subject_data = json.load(subject_file)
        messages.append([preprocess(x['message']) for x in subject_data])
        dates.append([x['date'] for x in subject_data])
    data['messages'] += messages
    data['dates'] += dates
    data['labels2'].append(df[CONTEXTS].to_numpy().astype(np.int32))

data['labels2'] = np.concatenate(data['labels2'], axis=0).astype(np.float32)
data['labels1'] = np.array(data['labels1'], dtype=np.int32)

In [None]:
# Use CUDA when available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Embed the preprocessed messages of every subject with the pretrained encoder.
# This rebinds `labels`, `tokenizer`, `model` and `embs`.
labels = data['labels1']
name = 'pysentimiento/robertuito-sentiment-analysis'
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModel.from_pretrained(name)
# get_cls_embeddings also calls eval()/to(device); the calls here are redundant but harmless.
model.eval()
model.to(device)
# embs: list with one numpy array of first-token embeddings per subject.
embs = get_cls_embeddings(data['messages'], model, tokenizer, device)

In [None]:
# Augmenting dataset (thr_rng=0.7, n_msg=10) split 80/20 with a fixed seed.
ds = EmbDatasetRNNAug(embs, labels, 0.7, 10)
train_ds, test_ds = random_split(ds, [0.8, 0.2], generator=torch.Generator().manual_seed(1007))

# Un-augmented embeddings/labels of the held-out subjects, used for validation.
val_embs = [embs[i] for i in test_ds.indices]
val_labels = [labels[i] for i in test_ds.indices]

# Un-augmented embeddings/labels of the training subjects, used for train-set evaluation.
train_embs = [embs[i] for i in train_ds.indices]
train_labels = [labels[i] for i in train_ds.indices]

In [None]:
save_dir = '/path/to/saved/networks'
# Soft targets per class id: class 0 is hard; classes 1 and 2 put 0.1 mass on each other.
soft_labels = torch.tensor([
    [1,  0,  0],
    [0,0.9,0.1],
    [0,0.1,0.9],
], dtype=torch.float32, device=device)
test_dl = DataLoader(test_ds, batch_size=128, shuffle=False, drop_last=False,collate_fn=lstm_collate)

# Grid search over hidden size and batch size on the preprocessed messages;
# f1s[i][j] is the best validation macro-F1 for the i-th hidden size and j-th batch size.
f1s = []
for h_size in [64, 96, 128, 160]:
    scores = []
    for bs in [32, 64, 96]:
        dir_name = 'rnn_preprocess' + '_gdro' + '_eval' + '_sl_09' + '_aug_07_10' + f'_h_{h_size}_bs_{bs}'
        output_dir = os.path.join(save_dir, dir_name)
        os.makedirs(output_dir, exist_ok=True)

        # Re-seed before every run so each configuration starts from the same RNG state.
        random.seed(1007)
        np.random.seed(1007)
        torch.manual_seed(1007)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        train_dl = DataLoader(train_ds, batch_size=bs, shuffle=True, drop_last=False,collate_fn=lstm_collate)
        net = LSTMClassifier(embs[0].shape[-1], h_size=h_size, output_dim=len(LABEL_MAP))
        optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
        loss_fn = nn.CrossEntropyLoss() # not used either way in gDRO

        logs = run_train_gdro_rnn_sl(net,optimizer,loss_fn,device, train_dl, train_embs, train_labels, val_embs, val_labels,
                                     soft_labels=soft_labels, output_dir=output_dir, max_epochs=100)

        # NOTE(review): 'train_eval' is only logged at checkpoint epochs, so the
        # "*_eval" plots compare series of different lengths.
        for k in logs['train_eval'].keys():
            if k != 'cls_report' and k != 'cfm':
                fig, ax = make_plot(logs['train'][k], logs['val'][k], k)
                fig.savefig(f'{output_dir}/{k}.png')
                plt.close(fig)
                fig, ax = make_plot(logs['train_eval'][k], logs['val'][k], k)
                fig.savefig(f'{output_dir}/{k}_eval.png')
                plt.close(fig)

        np.save(f'{output_dir}/logs.npy', logs, allow_pickle=True)

        # Report the epoch of the saved checkpoint. The checkpoint is refreshed on
        # ties (>=), so np.argmax (first maximum) could point at an earlier epoch
        # than the one 'train_eval'[-1] and the saved weights belong to.
        arg = logs['epoch']
        print(f'h_size: {h_size}, batch_size: {bs}')
        print(arg)
        print(f"Val macro_f1: {logs['val']['macro_f1'][arg]:.4f} | Train_eval macro_f1: {logs['train_eval']['macro_f1'][-1]:.4f}")
        print('Val', logs['val']['cfm'][arg], logs['val']['cls_report'][arg], sep='\n')
        print('Train_eval', logs['train_eval']['cfm'][-1], logs['train_eval']['cls_report'][-1], sep='\n')
        scores.append(logs['val']['macro_f1'][arg])
    f1s.append(scores)