#Install

In [None]:
!pip install transformers --quiet

In [None]:
!pip install simpletransformers --quiet

In [None]:
import simpletransformers

#Imports

In [None]:
import warnings
warnings.simplefilter('ignore')

In [None]:
import os, sys, gc
import random
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import log_loss, f1_score, accuracy_score

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
import torch.nn.functional as F

In [None]:
import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer, BertPreTrainedModel, AdamW, get_linear_schedule_with_warmup

In [None]:
from keras.utils import to_categorical

Using TensorFlow backend.


#Envs

In [None]:
seed = 42

In [None]:
# Apply the notebook-wide seed to every random number generator used below.
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# GPU seeding and the cuDNN flags are only touched when CUDA is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

#Data

In [None]:
path = './../data/'

In [None]:
train = pd.read_csv(path+'extended_train_from_fr_to_english.csv')
test = pd.read_csv(path+'extended_test_from_fr_to_english.csv')
sample = pd.read_csv(path+'SampleSubmission.csv')

In [None]:
train = train.drop_duplicates('text').reset_index(drop=True)

In [None]:
import re
def clean_text(text):
    """Lower-case *text* and expand common English contractions.

    The substitutions run one after another in the order listed, so the more
    specific patterns ("what's", "can't") are handled before the generic
    ones ("'s", "n't"). Leading and trailing spaces are stripped last.

    Returns the normalised string.
    """
    substitutions = (
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Only plain spaces are trimmed; other whitespace is left untouched.
    return cleaned.strip(' ')

# Normalise the text column of the training set [Thanks, Pulkit Jha.]
train['text'] = train['text'].map(clean_text)
# Apply the same normalisation to the test set [Thanks, Pulkit Jha.]
test['text'] = test['text'].map(clean_text)

In [None]:
train.head(15)

Unnamed: 0,ID,text,label,Depression,Alcohol,Suicide,Drugs
0,SUAVK39Z,i feel it was betteer i die happy,0,1,0,0,0
1,9JDAGUV3,why do i hallucinate?,3,0,0,0,1
2,419WR1LQ,i am stressed by lack of financial support at ...,0,1,0,0,0
3,6UY7DX6Q,why is life important?,2,0,0,1,0
4,FYC0FTFB,how can i be helped through depression?,0,1,0,0,0
5,V6VSDJ5I,what are the health effects of depression,0,1,0,0,0
6,9736J4UE,why is everything so difficult to manage in th...,0,1,0,0,0
7,AY8L479Y,i feel emotionally overwhelmed,0,1,0,0,0
8,OSFJV5EC,how to manage alcohol consumption?,1,0,1,0,0
9,U4SGUGGM,is the sky open for us who smoke bhang?,3,0,0,0,1


In [None]:
# Pre-create the four target columns; the block assignment below overwrites them.
train['Depression'] = 0

train['Alcohol'] = 0

train['Suicide'] = 0

train['Drugs'] = 0

# One-hot encode the integer labels. num_classes is pinned to 4 so the encoded
# matrix always has one column per target, even when the highest label value
# happens to be absent from the data (to_categorical would otherwise infer a
# smaller width and the assignment would fail).
train[['Depression', 'Alcohol', 'Suicide', 'Drugs']] = to_categorical(train.label.values, num_classes=4).astype(int)

In [None]:
train = train[['text', 'Depression', 'Alcohol', 'Suicide', 'Drugs', 'label']]

In [None]:
train.head()

Unnamed: 0,text,Depression,Alcohol,Suicide,Drugs,label
0,i feel it was betteer i die happy,1,0,0,0,0
1,why do i hallucinate?,0,0,0,1,3
2,i am stressed by lack of financial support at ...,1,0,0,0,0
3,why is life important?,0,0,1,0,2
4,how can i be helped through depression?,1,0,0,0,0


In [None]:
train.label.value_counts()

0    340
1    126
2     64
3     56
Name: label, dtype: int64

#Utilities

In [None]:
class SDataset(Dataset):
  """Map-style dataset over a tokenizer batch encoding.

  `be` is indexed with the keys 'input_ids', 'attention_mask' and
  'token_type_ids'; `y` optionally holds one target row per example.
  """

  def __init__(self, be, y=None):
    super().__init__()
    self.be, self.y = be, y

  def __len__(self):
    # One example per entry of the encoded input ids.
    return len(self.be['input_ids'])

  def __getitem__(self, idx):
    """Return `(features, target)` for example `idx`.

    `features` is a dict of tensors keyed like the encoding. `target` is a
    float tensor built from `y[idx]`, or the placeholder list `[-1]` when no
    targets were supplied.
    """
    features = {
        key: torch.tensor(self.be[key][idx])
        for key in ('input_ids', 'attention_mask', 'token_type_ids')
    }

    if self.y is None:
      target = [-1]
    else:
      target = torch.tensor(self.y[idx], dtype=torch.float)

    return features, target

In [None]:
class SModel(nn.Module):
  """Pretrained encoder followed by a mean-pooled, 4-way softmax head.

  Args:
    model_name: identifier passed to `AutoModel.from_pretrained`.
    config: model config; `config.hidden_size` sizes the classifier input.
    **kwargs: accepted and ignored.
  """

  def __init__(self, model_name, config, **kwargs):
    super(SModel, self).__init__()

    self.model = AutoModel.from_pretrained(model_name, config=config)
    self.dropout = nn.Dropout(0.1)
    self.classifier = nn.Linear(config.hidden_size, 4)

    nn.init.xavier_normal_(self.classifier.weight)

  def forward(self, input_ids, attention_mask=None, token_type_ids=None):
    """Return class probabilities, one row of 4 values per input example."""
    outputs = self.model(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )

    # First encoder output; assumed to be (batch, seq, hidden) — TODO confirm.
    sequence_output = outputs[0]
    # NOTE: the mean runs over every position on dim 1, padded ones included;
    # attention_mask is not applied to the pooling.
    pooled_output = torch.mean(sequence_output, dim=1)

    pooled_output = self.dropout(pooled_output)
    logits = self.classifier(pooled_output)

    # Normalise over the class axis explicitly; calling softmax without `dim`
    # relies on a deprecated implicit choice that depends on the input rank.
    probs = F.softmax(logits, dim=-1)

    return probs

In [None]:
from collections import Counter

In [None]:
def balance_training(df, threshold=235, random_state=None):
  """Down-sample the label-0 rows of `df` and shuffle the result.

  Args:
    df: DataFrame with an integer `label` column; only labels 0-3 are kept.
    threshold: maximum number of label-0 rows to keep. If fewer rows exist,
      all of them are kept instead of raising.
    random_state: seed for both the sampling and the shuffle. Defaults to the
      notebook-wide `seed`.

  Returns:
    A new DataFrame with a fresh 0..n-1 index.
  """
  if random_state is None:
    random_state = seed

  dep = df[df.label == 0]
  alc = df[df.label == 1]
  sui = df[df.label == 2]
  dru = df[df.label == 3]

  # DataFrame.sample raises when n exceeds the number of rows, so cap it.
  ndep = dep.sample(n=min(threshold, len(dep)), random_state=random_state)

  ndf = pd.concat([sui, dru, ndep, alc], axis=0)
  ndf = ndf.sample(frac=1, random_state=random_state).reset_index(drop=True)

  print('Rebalance ', Counter(ndf.label.values))

  return ndf

In [None]:
def tokenize(corpus, tokenizer, max_length=50):
  """Encode `corpus` with a single `tokenizer.batch_encode_plus` call.

  Padding to `max_length`, attention masks and token type ids are requested.
  Returns whatever `batch_encode_plus` returns.
  """
  # NOTE(review): `pad_to_max_length` and `return_attention_masks` look like
  # keyword names from a specific transformers release — confirm against the
  # installed version.
  encode_options = dict(
      max_length=max_length,
      pad_to_max_length=True,
      return_attention_masks=True,
      return_token_type_ids=True,
  )

  return tokenizer.batch_encode_plus(corpus, **encode_options)

In [None]:
def evaluation(ytrue, y_pred):
  """Return `[log loss, weighted F1]` as a NumPy array.

  `ytrue` and `y_pred` are reduced to class indices with `argmax(1)`; the log
  loss is computed from the unreduced `y_pred` over the fixed labels 0-3.
  """
  true_labels = ytrue.argmax(1)
  pred_labels = y_pred.argmax(1)

  logloss = log_loss(true_labels, y_pred, labels=[0,1,2,3])
  weighted_f1 = f1_score(true_labels, pred_labels, average='weighted')

  return np.array([logloss, weighted_f1])

In [None]:
def loss_fct(y_pred, ytrue, num_labels=4):
  """Mean binary cross-entropy between `y_pred` and `ytrue`.

  `num_labels` is accepted but not used by the computation.
  """
  return F.binary_cross_entropy(y_pred, ytrue)

In [None]:
# Counter(ytrain.argmax(1)).items()

In [None]:
def train_model(epoch, train_dl, model, opt, scheduler, criterion, device='cpu'):
  """Run one training pass over `train_dl` and print the averaged metrics.

  For every batch: forward pass, loss, backward pass, optimizer step, then
  gradient reset. The loss and the `evaluation` scores are averaged over the
  number of batches and printed. Nothing is returned.
  """
  model.train()
  n_batches = len(train_dl)
  running_loss = 0
  running_score = np.zeros((2,))

  for features, targets in train_dl:
    features = {name: tensor.to(device) for name, tensor in features.items()}
    targets = targets.to(device)

    preds = model(**features)
    batch_loss = criterion(preds, targets)

    running_loss += batch_loss.item()
    running_score += evaluation(targets.detach().cpu().numpy(), preds.detach().cpu().numpy())

    batch_loss.backward()
    opt.step()
    # NOTE: `scheduler` is accepted but never stepped in this function.
    model.zero_grad()

  running_score = running_score / n_batches
  running_loss = running_loss / n_batches
  print("[Training] Epoch {} - Loss {:.3f} - logloss: {:.3f} - f1: {:.3f}".format(epoch+1, running_loss, *running_score))

In [None]:
def eval_model(epoch, val_dl, model, criterion, device='cpu'):
  """Evaluate `model` on `val_dl` without gradients and print the metrics.

  Returns the `evaluation` score array averaged over the number of batches
  (log loss first, weighted F1 second).
  """
  model.eval()
  n_batches = len(val_dl)
  running_loss = 0
  running_score = np.zeros((2,))

  with torch.no_grad():
    for features, targets in val_dl:
      features = {name: tensor.to(device) for name, tensor in features.items()}
      targets = targets.to(device)

      preds = model(**features)
      running_loss += criterion(preds, targets).item()
      running_score += evaluation(targets.detach().cpu().numpy(), preds.detach().cpu().numpy())

  running_score = running_score / n_batches
  running_loss = running_loss / n_batches
  print("[Eval] Epoch {} - Loss {:.3f} - logloss: {:.3f} - f1: {:.3f}".format(epoch+1, running_loss, *running_score))

  return running_score

In [None]:
def make_prdiction(n_fold=10, device='cpu'):
  """Average the test-set predictions of the saved fold models.

  Loads `model_{fold}.bin` for each fold in `range(n_fold)`, runs it over the
  module-level `be_test` encoding in batches of 16, and scales each fold's
  output by `1 / n_fold` so that the final sum is the mean across folds.

  Args:
    n_fold: number of fold checkpoints to load.
    device: device the models and batches are moved to.

  Returns:
    NumPy array with one row of class scores per test example.
  """
  test_ds = SDataset(be_test)
  test_dl = DataLoader(test_ds, 16)

  preds = []

  for fold in tqdm(range(n_fold)):
    model = SModel(model_name, config)
    # map_location lets a checkpoint saved on GPU load on a CPU-only host.
    model.load_state_dict(torch.load(f'model_{fold}.bin', map_location=device))
    model.to(device)
    model.eval()

    fold_preds = []

    # Inference only: skip building the autograd graph for every batch.
    with torch.no_grad():
      for (x, _) in test_dl:
        x = {n: p.to(device) for n, p in x.items()}

        out = model(**x)

        fold_preds += (out.detach().cpu().numpy() / n_fold).tolist()

    preds.append(fold_preds)

  return np.sum(preds, axis=0)


In [None]:
def run_fold(n_fold=10, epochs=10, device='cpu'):
  """Train one model per stratified fold and return the mean best log loss.

  Uses the module-level `train`, `xtrain`, `ytrain`, `tokenizer`, `model_name`
  and `config`. For each fold a fresh `SModel` is trained for `epochs` epochs;
  whenever the validation log loss improves, the weights are written to
  `model_{fold_index}.bin`.

  Args:
    n_fold: number of stratified folds.
    epochs: training epochs per fold.
    device: device the model and batches are moved to.

  Returns:
    Best validation log loss of each fold, averaged over `n_fold`.
  """
  # shuffle=True is needed for random_state to have any effect; recent
  # scikit-learn raises ValueError when random_state is set without it.
  fold = StratifiedKFold(n_fold, shuffle=True, random_state=seed)
  cv_score = 0

  for i, (tr, vr) in enumerate(fold.split(xtrain, ytrain.argmax(1))):
    print("Fold ", i)
    best_eval = np.inf

    # split() yields positional indices, so rows are selected by position.
    # Optional: wrap in balance_training(...) to down-sample label 0.
    df_train = train.iloc[tr]
    nytrain = df_train[['Depression', 'Alcohol', 'Suicide', 'Drugs']].values

    X_tra, X_val, y_tra, y_val = df_train.text.values, xtrain[vr], nytrain, ytrain[vr]

    be_tra = tokenize(X_tra, tokenizer)
    be_val = tokenize(X_val, tokenizer)

    train_ds = SDataset(be_tra, y_tra)
    val_ds = SDataset(be_val, y_val)

    # NOTE: training batches are drawn in dataset order (no shuffling).
    train_dl = DataLoader(train_ds, 16)
    val_dl = DataLoader(val_ds, 16)

    model = SModel(model_name, config)
    criterion = nn.BCELoss()

    # Total optimizer steps = batches per epoch times number of epochs.
    t_total = len(train_dl) * epochs

    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.01,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    opt = AdamW(optimizer_grouped_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
            opt, num_warmup_steps=0, num_training_steps=t_total
        )

    model.to(device)
    model.zero_grad()

    for epoch in range(epochs):
      train_model(epoch, train_dl, model, opt, scheduler, criterion, device)
      score = eval_model(epoch, val_dl, model, criterion, device)

      # score[0] is the validation log loss; keep the best checkpoint only.
      if score[0] < best_eval:
        best_eval = score[0]
        torch.save(model.state_dict(), f'model_{i}.bin')
      print()

    cv_score += best_eval
  return cv_score / n_fold

#Tokenization

In [None]:
model_name = 'roberta-base'

In [None]:
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [None]:
xtrain = train.text.values
xtest = test.text.values

In [None]:
ytrain = train[['Depression', 'Alcohol', 'Suicide', 'Drugs']].values

In [None]:
be_train = tokenize(xtrain, tokenizer)
be_test = tokenize(xtest, tokenizer)

##Training

In [None]:
# !rm -r *.bin

In [None]:
cv_score = run_fold(device='cuda')
print("Avg Logloss : ", cv_score)

##Prediction

In [None]:
raw_outputs = make_prdiction(device='cuda')

#Submission

In [None]:
path = './../submissions/'

In [None]:
sample[["Depression","Alcohol","Suicide","Drugs"]] = raw_outputs
sample.to_csv(path + 'roberta-base_translated.csv', index=False)

In [None]:
sample.head()

In [None]:
sample.describe()

In [None]:
sample[["Depression","Alcohol","Suicide","Drugs"]].apply(np.argmax, axis=1).value_counts()