In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, BatchSampler, random_split
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import random
from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pandas as pd
import os
import emoji
from collections import defaultdict

import sklearn.metrics as metrics
from sklearn.svm import LinearSVC


In [None]:
# Sanity check: displays whether PyTorch can see a CUDA device in this kernel.
torch.cuda.is_available()

In [None]:
# ASCII emoticon -> Spanish description used to verbalise emoticons in text.
# The mapping is applied with plain substring replacement in insertion order,
# so the order of the entries below is part of the behaviour.
textual_emoticons_to_spanish = {
        ":)": "cara sonriente",
        ":(": "cara triste",
        ";)": "guiño",
        ":D": "cara riendo con los ojos abiertos",
        "XD": "cara riendo con los ojos cerrados",
        "xD": "cara riendo con los ojos cerrados",
        ":P": "cara sacando la lengua",
        "<3": "corazón",
        ":'(": "cara llorando",
        ":-)": "cara sonriente",
        ":-(": "cara triste",
        ";-)": "guiño",
        ":-D": "cara riendo con los ojos abiertos",
        ":-P": "cara sacando la lengua",
        "(heart)": "corazón",
        ":o": "cara sorprendida",
        ":-o": "cara sorprendida",
        ":/": "cara de duda"
    }

In [None]:
def preprocess(text):
    """Normalise one raw chat message before tokenisation.

    Steps, in order:
      1. remove ``@mention`` tokens;
      2. collapse repeated laughter ("jajajaja", "jsjsjs") to "jaja" / "jsjs";
      3. replace the remaining " @" sequences with "o";
      4. verbalise ASCII emoticons via ``textual_emoticons_to_spanish``;
      5. verbalise Unicode emoji in Spanish with ``emoji.demojize``;
      6. collapse runs of three or more identical vowels to a single vowel.

    Args:
        text: the raw message string.

    Returns:
        The normalised message string.
    """
    # Drop user mentions.
    text = re.sub(r'@\w+', '', text)

    # Only collapse *repeated* laughter syllables. Matching a single "ja"/"js"
    # would also rewrite ordinary words that merely contain the syllable
    # (e.g. "trabajar" -> "trabajajar").
    text = re.sub(r'(?:ja){2,}', 'jaja', text, flags=re.IGNORECASE)
    text = re.sub(r'(?:js){2,}', 'jsjs', text, flags=re.IGNORECASE)

    # NOTE(review): presumably meant to resolve the gender-neutral "@" spelling;
    # kept exactly as it was - confirm the intended pattern with the author.
    text = text.replace(" @","o")

    for emoticon, description in textual_emoticons_to_spanish.items():
        text = text.replace(emoticon, description)

    text = emoji.demojize(text,language='es')

    # Case-sensitive on purpose: "aaa" collapses to "a", while "aAa" is left alone.
    text = re.sub(r'([aeiouáéíóúAEIOUÁÉÍÓÚ])\1{2,}', r'\1', text)

    return text

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Diagnosis label <-> integer id.
LABEL_MAP = {'none':0, 'anxiety':1, 'depression':2}
REVERSE_LABEL_MAP = {0:'none', 1:'anxiety', 2:'depression'}

# Context columns read from the gold file (multi-label) and their index maps.
CONTEXTS = ['addiction','emergency','family','work','social','other','none']
CONTEXT_MAP = {c:i for i,c in enumerate(CONTEXTS)}
REVERSE_CONTEXT_MAP = {i:c for i,c in enumerate(CONTEXTS)}

# messages: one list of preprocessed messages per subject
# labels1:  diagnosis id per subject
# labels2:  0/1 context matrix, one row per subject, one column per CONTEXTS entry
data = {'messages':[], 'labels1':[], 'labels2':[]}


for split in ['trial']:
    df = pd.read_csv(f'./data/task2/{split}/gold_task2.txt')
    labels1 = df['label'].map(lambda x: LABEL_MAP[x]).to_list()
    data['labels1'] += labels1

    subjects = df['Subject'].to_list()
    messages = []
    for subject in subjects:
        # Context manager so the file handle is closed deterministically.
        with open(f'./data/task2/{split}/subjects/{subject}.json', 'r', encoding='utf-8') as subject_file:
            subject_data = json.load(subject_file)
        messages.append([preprocess(x['message']) for x in subject_data])
    data['messages'] += messages
    data['labels2'].append(df[CONTEXTS].to_numpy().astype(np.int32))

data['labels2'] = np.concatenate(data['labels2'], axis=0)
data['labels1'] = np.array(data['labels1'], dtype=np.int32)

# Every subject must have exactly one entry in each of the three containers.
assert len(data['messages']) == len(data['labels1']) == len(data['labels2'])

In [None]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
# Load the tokenizer, the sequence-classification model and the bare encoder
# from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("ignacio-ave/beto-sentiment-analysis-spanish")
model_seq = AutoModelForSequenceClassification.from_pretrained("ignacio-ave/beto-sentiment-analysis-spanish")
# eval() and to() both return the module itself, so the calls can be chained.
model = AutoModel.from_pretrained("ignacio-ave/beto-sentiment-analysis-spanish").eval().to(device)
model_seq = model_seq.eval().to(device)
# Last expression of the cell: shows the classification model's architecture.
model_seq

In [None]:
# NOTE(review): this module-level flag is not read by any code visible in this
# notebook; the datasets built further down are given an explicit True/False instead.
isTrain = False

In [None]:
def get_cls_embeddings(all_messages, model, tokenizer, device, m_length=96):
    """Encode every subject's messages and keep the first-token hidden state.

    Args:
        all_messages: iterable with one list of message strings per subject.
        model: encoder whose output exposes ``last_hidden_state``.
        tokenizer: callable turning a list of strings into padded model inputs.
        device: device the model and the inputs are moved to.
        m_length: maximum token length; longer messages are truncated.

    Returns:
        A list with one NumPy array per subject, holding position 0 of the last
        hidden state for each of that subject's messages (the [CLS] position
        for BERT-style tokenizers).
    """
    model.to(device)
    model.eval()

    tokenizer_options = dict(padding=True, truncation=True, max_length=m_length, return_tensors='pt')
    per_subject = []
    with torch.no_grad():
        for subject_messages in tqdm(all_messages):
            # All messages of one subject are encoded as a single batch.
            encoded = tokenizer(subject_messages, **tokenizer_options).to(device)
            hidden_states = model(**encoded).last_hidden_state
            first_token = hidden_states[:, 0, :]
            per_subject.append(first_token.cpu().numpy())
    return per_subject

In [None]:
def validate_rnn(net, device, criterion, val_dl):
    """Evaluate ``net`` on ``val_dl`` and build one metrics report per context.

    ``net`` is expected to return raw logits (the notebook pairs it with
    ``nn.BCEWithLogitsLoss``), so the logits are passed through a sigmoid
    before being thresholded at probability 0.5.

    Args:
        net: model called as ``net(batch_data, batch_lens)``.
        device: device the model and the batches are moved to.
        criterion: loss called as ``criterion(logits, float_targets)``.
        val_dl: iterable yielding ``(batch_data, batch_lens, batch_labels)``
            with ``batch_labels`` a CPU tensor of 0/1 flags, one column per
            entry of ``CONTEXTS``.

    Returns:
        A list with ``len(CONTEXTS)`` dicts. Each dict holds the mean batch
        loss (a Python float, shared by all contexts) and the binary
        accuracy / precision / recall / F1 metrics, classification report and
        confusion matrix of that context.

    Raises:
        ValueError: if ``val_dl`` yields no batches.
    """
    net.to(device)
    net.eval()
    total_loss, num_batches = 0.0, 0
    preds, labels = [], []

    with torch.no_grad():
        for batch_data, batch_lens, batch_labels in val_dl:
            labels.append(batch_labels.numpy())

            batch_data = batch_data.to(device)
            batch_lens = batch_lens.to(device)
            # The loss needs float targets; one cast is enough.
            targets = batch_labels.to(device, dtype=torch.float)

            out = net(batch_data, batch_lens)
            # .item() keeps the running loss a plain float instead of a device tensor.
            total_loss += criterion(out, targets).sum().item()
            num_batches += 1

            # Threshold probabilities, not raw logits.
            probs = torch.sigmoid(out).cpu().numpy()
            preds.append((probs >= 0.5).astype(int))

    if num_batches == 0:
        raise ValueError("val_dl yielded no batches")

    labels = np.concatenate(labels, axis=0)
    preds = np.concatenate(preds, axis=0)
    loss = total_loss / num_batches

    reports = []
    for i in range(len(CONTEXTS)):
        y_true, y_pred = labels[:,i], preds[:,i]
        reports.append({
            'loss': loss,
            'acc': metrics.accuracy_score(y_true, y_pred),
            'macro_f1': metrics.f1_score(y_true, y_pred, average='macro', zero_division=0),
            'macro_precision': metrics.precision_score(y_true, y_pred, average='macro', zero_division=0),
            'macro_recall': metrics.recall_score(y_true, y_pred, average='macro', zero_division=0),
            'micro_f1': metrics.f1_score(y_true, y_pred, average='micro', zero_division=0),
            'micro_precision': metrics.precision_score(y_true, y_pred, average='micro', zero_division=0),
            'micro_recall': metrics.recall_score(y_true, y_pred, average='micro', zero_division=0),
            'cls_report': metrics.classification_report(y_true, y_pred, zero_division=0),
            'cfm': metrics.confusion_matrix(y_true, y_pred)
        })

    return reports

In [None]:
def train_gdro_rnn(net, optimizer, device, criterion, train_dl, q, eta=0.1):
    """Run one training epoch and build one metrics report per context.

    Despite the name, no group-DRO re-weighting is performed here: ``q`` and
    ``eta`` are accepted for interface compatibility only and ``q`` is
    returned unchanged.

    ``net`` is expected to return raw logits (the notebook pairs it with
    ``nn.BCEWithLogitsLoss``), so the logits are passed through a sigmoid
    before being thresholded at probability 0.5.

    Args:
        net: model called as ``net(batch_data, batch_lens)``.
        optimizer: optimizer over ``net``'s parameters.
        device: device the model and the batches are moved to.
        criterion: loss called as ``criterion(logits, float_targets)``; its
            result is back-propagated directly.
        train_dl: iterable yielding ``(batch_data, batch_lens, batch_labels)``
            with ``batch_labels`` a CPU tensor of 0/1 flags, one column per
            entry of ``CONTEXTS``.
        q: passed through untouched.
        eta: unused.

    Returns:
        ``(reports, q)`` where ``reports`` is a list with ``len(CONTEXTS)``
        dicts (mean batch loss as a Python float plus the per-context metrics).

    Raises:
        ValueError: if ``train_dl`` yields no batches.
    """
    net.to(device)
    net.train()
    total_loss = 0.0
    num_batches = 0
    preds = []
    labels = []

    for batch_data, batch_lens, batch_labels in train_dl:
        labels.append(batch_labels.numpy())

        batch_data = batch_data.to(device)
        batch_lens = batch_lens.to(device)
        targets = batch_labels.to(device, dtype=torch.float)

        optimizer.zero_grad()
        out = net(batch_data, batch_lens)

        batch_losses = criterion(out, targets)
        batch_losses.backward()
        optimizer.step()

        # .item() detaches the value, so the running total does not hold on to
        # tensors that still carry autograd history.
        total_loss += batch_losses.sum().item()
        num_batches += 1

        # Threshold probabilities, not raw logits.
        probs = torch.sigmoid(out.detach()).cpu().numpy()
        preds.append((probs >= 0.5).astype(int))

    if num_batches == 0:
        raise ValueError("train_dl yielded no batches")

    labels = np.concatenate(labels, axis=0)
    preds = np.concatenate(preds, axis=0)
    loss = total_loss / num_batches

    reports = []
    for i in range(len(CONTEXTS)):
        y_true, y_pred = labels[:,i], preds[:,i]
        reports.append({
            'loss': loss,
            'acc': metrics.accuracy_score(y_true, y_pred),
            'macro_f1': metrics.f1_score(y_true, y_pred, average='macro', zero_division=0),
            'macro_precision': metrics.precision_score(y_true, y_pred, average='macro', zero_division=0),
            'macro_recall': metrics.recall_score(y_true, y_pred, average='macro', zero_division=0),
            'micro_f1': metrics.f1_score(y_true, y_pred, average='micro', zero_division=0),
            'micro_precision': metrics.precision_score(y_true, y_pred, average='micro', zero_division=0),
            'micro_recall': metrics.recall_score(y_true, y_pred, average='micro', zero_division=0),
            'cls_report': metrics.classification_report(y_true, y_pred, zero_division=0),
            'cfm': metrics.confusion_matrix(y_true, y_pred)
        })
    return reports, q

In [None]:
def run_train_gdro_rnn(net, optimizer, criterion, device, train_dl, val_dl, max_epochs,n_classes):
    """Train for ``max_epochs`` epochs, validating after each one.

    Args:
        net, optimizer, criterion, device: forwarded to ``train_gdro_rnn`` and
            ``validate_rnn``.
        train_dl: loader used for the training pass of every epoch.
        val_dl: loader used for the validation pass of every epoch.
        max_epochs: number of epochs to run.
        n_classes: length of the uniform vector ``q`` handed to ``train_gdro_rnn``.

    Returns:
        ``{'train': ..., 'val': ...}``; each side maps a context index to a
        ``defaultdict(list)`` whose lists hold one value per epoch for every
        key of that context's report.
    """
    n_contexts = len(CONTEXTS)
    logs = {
        phase: {ctx: defaultdict(list) for ctx in range(n_contexts)}
        for phase in ('train', 'val')
    }
    # Uniform starting vector, threaded through the training calls.
    q = torch.ones(n_classes, dtype=torch.float32, device=device) / n_classes

    for _ in tqdm(range(max_epochs)):
        train_report, q = train_gdro_rnn(net, optimizer, device, criterion, train_dl, q=q)
        for ctx in range(n_contexts):
            for key, value in train_report[ctx].items():
                logs['train'][ctx][key].append(value)

        val_report = validate_rnn(net, device, criterion, val_dl)
        for ctx in range(n_contexts):
            for key, value in val_report[ctx].items():
                logs['val'][ctx][key].append(value)

    return logs

In [None]:
class EmbDatasetRNN(Dataset):
    '''
    Per-subject sequences of message embeddings with multi-label context targets.

    The last label column is treated as the 'none' flag. In training mode, a
    sample whose 'none' flag is not set is, with probability ``mix_prob``,
    augmented on the fly: the embedding sequence of another randomly chosen
    sample is prepended to it, the context flags of both samples are OR-ed and
    their 'none' flags are AND-ed. In evaluation mode (``isTrain`` false)
    samples are always returned unchanged.

    Args:
        embeddings: sequence of per-sample arrays that can be concatenated
            along axis 0.
        labels: sequence of per-sample integer 0/1 label arrays.
        isTrain: enables the random augmentation.
        mix_prob: probability of augmenting an eligible sample.
    '''
    def __init__(self, embeddings, labels, isTrain, mix_prob=0.6):
        self.embeddings = embeddings
        self.labels = labels
        self.isTrain = isTrain
        self.mix_prob = mix_prob
        # Last partner drawn for each augmented index; recorded for inspection
        # only and never used to pin the pairing.
        self.mappingDict = dict()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        emb, label = self.embeddings[idx], self.labels[idx]
        n_samples = len(self.embeddings)

        # Unchanged sample: evaluation mode, 'none' samples, or nothing to pair with.
        if not self.isTrain or label[-1] == 1 or n_samples < 2:
            return emb, label
        if random.random() >= self.mix_prob:
            return emb, label

        # Redraw until the partner is a different sample.
        partner = random.randint(0, n_samples - 1)
        while partner == idx:
            partner = random.randint(0, n_samples - 1)
        self.mappingDict[idx] = partner

        label = np.asarray(label)
        partner_label = np.asarray(self.labels[partner])
        mixed_emb = np.concatenate([self.embeddings[partner], emb], axis=0)
        # Union of the context flags of BOTH samples; 'none' survives only when
        # both samples carry it.
        mixed_label = np.concatenate(
            [label[:-1] | partner_label[:-1], label[-1:] & partner_label[-1:]],
            axis=0,
        )
        return mixed_emb, mixed_label


In [None]:
class LSTMClassifier(nn.Module):
    """Single-layer unidirectional LSTM followed by a linear read-out.

    The read-out is applied to the hidden state at the last real (non-padded)
    time step of every sequence and returns raw, un-activated scores.

    Args:
        input_size: feature size of every time step.
        h_size: LSTM hidden size.
        output_dim: number of output scores per sequence.
        dropout: forwarded to ``nn.LSTM``.
    """

    def __init__(self, input_size, h_size, output_dim, dropout=0):
        super().__init__()
        self.input_size = input_size
        self.h_size = h_size
        self.output_dim = output_dim

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=h_size, num_layers=1, batch_first=False,
                           dropout=dropout, bidirectional=False)
        self.classifier = nn.Linear(h_size, output_dim)

    def forward(self, seq_data, seq_lens, state=None):
        """Score every sequence of a padded batch.

        Args:
            seq_data: padded batch of shape (S, N, input_size).
            seq_lens: long tensor of shape (1, N, 1) holding, per sequence, the
                index (0 .. S-1) of its last real time step.
            state: optional ``(h0, c0)`` pair; zeros are used when omitted.

        Returns:
            Tensor of shape (N, output_dim).
        """
        if state is None:
            batch_size = seq_data.shape[1]
            state = (
                seq_data.new_zeros(1, batch_size, self.h_size),
                seq_data.new_zeros(1, batch_size, self.h_size),
            )

        out_states, _ = self.lstm(seq_data, state) # S, N, H
        # Pick one time step per sequence -> (1, N, H). Squeeze ONLY the
        # sequence axis: a bare squeeze() would also drop the batch axis
        # whenever the batch holds a single sequence.
        pred_states = torch.take_along_dim(out_states, seq_lens, dim=0).squeeze(0)
        out = self.classifier(pred_states)

        return out


In [None]:
from torch.nn.utils.rnn import pad_sequence

def lstm_collate(batch):
    """Collate ``(sequence, label)`` pairs into a padded, sequence-first batch.

    Args:
        batch: list of ``(sequence, label)`` pairs, where ``sequence`` is
            array-like with the time steps along its first axis and ``label``
            is an integer array-like of identical shape across the batch.

    Returns:
        ``(batch_data, lens, labels)``:
          * ``batch_data``: float32 tensor, sequences zero-padded to the
            longest one, time axis first (S, N, ...);
          * ``lens``: long tensor of shape (1, N, 1) with the index of the last
            real time step of every sequence (its length minus one);
          * ``labels``: long tensor with the stacked labels.
    """
    # Stack into one ndarray first: building a tensor straight from a list of
    # ndarrays goes through a slow element-wise path and triggers a warning.
    labels = torch.as_tensor(np.stack([label for _, label in batch]), dtype=torch.long)

    sequences = [torch.as_tensor(seq, dtype=torch.float32) for seq, _ in batch]
    batch_data = pad_sequence(sequences)

    lens = torch.tensor([len(seq) for seq in sequences], dtype=torch.long).unsqueeze(0).unsqueeze(-1) # 1, N, 1
    lens -= 1  # length -> index of the last real time step
    return batch_data, lens, labels

In [None]:
def make_plot(train_scores, val_scores, y_label, figsize=(8,5)):
    """Plot a training curve and a validation curve against the epoch index.

    Args:
        train_scores: per-epoch values drawn as the 'Train' line.
        val_scores: per-epoch values drawn as the 'Val' line.
        y_label: label of the y axis.
        figsize: figure size handed to ``plt.subplots``.

    Returns:
        The ``(fig, ax)`` pair of the created figure.
    """
    fig, ax = plt.subplots(1, 1, figsize=figsize)
    for series, legend_name in ((train_scores, 'Train'), (val_scores, 'Val')):
        ax.plot(series, label=legend_name)
    ax.set(xlabel='Epoch', ylabel=y_label)
    ax.legend()

    return fig, ax

In [None]:
import random
model_names = [
    'ignacio-ave/beto-sentiment-analysis-spanish',    
]

# One seed for every RNG involved (data split, on-the-fly augmentation,
# weight initialisation, batch shuffling) so that runs are repeatable.
SEED = 1007
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# The context classifier is trained only on subjects with a diagnosis
# (labels1 != 0).
has_diagnosis = data['labels1'] != 0
labels2 = data['labels2'][has_diagnosis]
messages2 = [x for i,x in enumerate(data['messages']) if has_diagnosis[i]]

# Inverse positive frequency per context, measured on the subjects the model
# actually sees; the max() guards contexts that have no positive example.
positives_per_context = np.sum(labels2, axis=0)
class_positive_weight = torch.tensor(labels2.shape[0] / np.maximum(positives_per_context, 1), device=device, dtype=torch.float32)

for name in model_names:
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModel.from_pretrained(name)
    model.eval()
    model.to(device)
    embs = get_cls_embeddings(messages2, model, tokenizer, device)

    # Split with augmentation disabled: the split must only decide which raw
    # samples go where. Augmenting here would bake random mixes into the held-out
    # set and let train/test samples be concatenated with each other.
    ds = EmbDatasetRNN(embs, labels2, False)
    train_split, test_split = random_split(ds, [0.8, 0.2], generator=torch.Generator().manual_seed(SEED))

    # Re-wrap each split as its own dataset, so that training-time augmentation
    # can only draw partners from the training split.
    train_ds = EmbDatasetRNN(
        [embs[i] for i in train_split.indices],
        [labels2[i] for i in train_split.indices],
        True,
    )
    test_ds = EmbDatasetRNN(
        [embs[i] for i in test_split.indices],
        [labels2[i] for i in test_split.indices],
        False,
    )

    train_dl = DataLoader(train_ds, batch_size=64, shuffle=True, drop_last=False,collate_fn=lstm_collate)
    test_dl = DataLoader(test_ds, batch_size=128, shuffle=False, drop_last=False,collate_fn=lstm_collate)

    net = LSTMClassifier(embs[0].shape[-1], h_size=128, output_dim=len(CONTEXTS))
    optimizer = torch.optim.Adam(net.parameters(), lr=1e-4)
    loss_fn = nn.BCEWithLogitsLoss(pos_weight=class_positive_weight)

    logs = run_train_gdro_rnn(net,optimizer,loss_fn,device, train_dl, test_dl, 100,len(CONTEXTS))

    # For every context, report the epoch with the best validation macro-F1.
    for i in range(len(CONTEXTS)):
        arg = np.argmax(logs['val'][i]['macro_f1'])
        print(f"Context: {CONTEXTS[i]}")
        print(f"Val macro_f1: {logs['val'][i]['macro_f1'][arg]:.4f} | Train macro_f1: {logs['train'][i]['macro_f1'][arg]:.4f}")
        print('Val', logs['val'][i]['cfm'][arg], logs['val'][i]['cls_report'][arg], sep='\n')
        print('Train', logs['train'][i]['cfm'][arg], logs['train'][i]['cls_report'][arg], sep='\n')
