<a href="https://colab.research.google.com/github/MathengeKen/-Basic-Needs-Basic-Rights-Kenya---Tech4MentalHealth-/blob/master/tech4mental(bert)1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install transformers

In [2]:
import torch 
from transformers import BertTokenizer,BertModel,get_linear_schedule_with_warmup
import transformers 
import torch
import torch.nn as nn 
import pandas as pd 
import numpy as np 
from sklearn import model_selection
from sklearn import metrics
from transformers import AdamW
from sklearn.model_selection import StratifiedKFold
import os 
import random

In [3]:
import torch

if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

In [4]:
CONFIG = {
    'MAX_LEN':128,
    'TRAIN_BATCH_SIZE':16,
    'VALID_BATCH_SIZE':16,
    'EPOCHS':3,
    'TOKENIZER':BertTokenizer.from_pretrained('bert-base-cased',lowercase=True,truncation=True)
}

In [5]:
import random
from random import randint
import numpy as np
SEED_VAL  = 1000
# Set the seed value all over the place to make this reproducible.
def seed_all(SEED):
  random.seed(SEED_VAL)
  np.random.seed(SEED_VAL)
  torch.manual_seed(SEED_VAL)
  torch.cuda.manual_seed_all(SEED_VAL)
  os.environ['PYTHONHASHSEED'] = str(SEED_VAL)
  torch.backends.cudnn.deterministic = True

In [6]:
class CustomBert(nn.Module):
  def __init__(self, n_classes):
    super(CustomBert, self).__init__()
    self.bert = BertModel.from_pretrained('bert-base-cased')
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    _, pooled_output = self.bert(
        input_ids = input_ids,
        attention_mask = attention_mask
    )

    output = self.drop(pooled_output)
    return self.out(output)

In [7]:
class BertDataset:
  def __init__(self,tweet,target=None,task='train'):
    self.tweet= tweet
    self.target = target
    self.tokenizer = CONFIG['TOKENIZER']
    self.max_len = CONFIG['MAX_LEN']
    self.task = task
  
  def __len__(self):
    return len(self.tweet)

  def __getitem__(self,item):
    tweet = str(self.tweet[item])
    tweet = ' '.join(tweet.split())


    inputs = self.tokenizer.encode_plus(tweet,
                                        max_length=self.max_len,
                                        pad_to_max_length=True,
                                        add_special_tokens=True,
                                        truncation=True)
    ids = inputs['input_ids']
    mask = inputs['attention_mask']
    


    to_return= {
        'ids':torch.tensor(ids,dtype=torch.long),
        'mask':torch.tensor(mask,dtype=torch.long),
    }
    if (self.task=='train'):

      to_return.update({'target':torch.tensor(self.target[item])})

    return to_return

In [8]:
def loss_fn(outputs,targets):
  criterion =  nn.CrossEntropyLoss()
  return criterion(outputs,targets)

In [9]:
def train_fn(data_loader,model,optimizer,device,sc=None):
  model.train()
  tot_loss = 0
  for bi, d in enumerate(data_loader):
    ids = d['ids']
    mask = d['mask']
    targets = d['target']

    #send them to cuda gpu 
    ids = ids.to(device,dtype=torch.long)
    mask = mask.to(device,dtype=torch.long)
   
    targets = targets.to(device,dtype=torch.long)
    
    optimizer.zero_grad()

    outputs = model(
        ids,
        mask,
    )
    
    loss = loss_fn(outputs,targets)
    tot_loss += loss.item()
    loss.backward()
    optimizer.step()
    if sc:
      sc.step()
  
  print("Training loss for this epoch: ",tot_loss/len(data_loader))

In [10]:
def eval_fn(data_loader,model,device):
  model.eval()
  fin_targets = []
  fin_outputs =[]
  tot_loss = 0
  with torch.no_grad():
    for bi, d in enumerate(data_loader):
      ids = d['ids']
      mask = d['mask']
      
      targets = d['target']

      #send them to cuda gpu 
      ids = ids.to(device,dtype=torch.long)
      mask = mask.to(device,dtype=torch.long)
      
     
      targets = targets.to(device,dtype=torch.long)
      
      

      outputs = model(
          ids,
          mask
      )

      loss = loss_fn(outputs,targets)
      tot_loss+=loss.item()
      fin_targets.extend(targets.cpu().detach().numpy())
      fin_outputs.extend(torch.nn.functional.softmax(outputs).cpu().detach().numpy())
  return fin_outputs,fin_targets,tot_loss/(len(data_loader))

In [11]:
### Prepairing test data
test = pd.read_csv('MHTest.csv')
test_dataset = BertDataset(
    tweet = test.text.values,
    task = 'test'
)

test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size = CONFIG['TRAIN_BATCH_SIZE'],
    num_workers = 4
)

In [12]:
def predict_fn(model):
  fin_outputs = []
  with torch.no_grad():
    for bi, d in enumerate(test_data_loader):
      ids = d['ids']
      mask = d['mask']
  
      #send them to cuda gpu 
      ids = ids.to(device,dtype=torch.long)
      mask = mask.to(device,dtype=torch.long)
      
     
      outputs = model(
          ids,
          mask
      )
      fin_outputs.append(torch.nn.functional.softmax(outputs).cpu().detach().numpy())
      alls = np.vstack(fin_outputs)

  return alls

In [13]:

def run_folds():
    total_folds=5
    all_preds = []
    losses = []
    seed_all(SEED_VAL)
    dfx = pd.read_csv('MHTrain.csv').fillna("none")
    dfx['label'] = dfx['label'].factorize()[0]
    fold=StratifiedKFold(n_splits=total_folds, shuffle=True)
    for i,(train_index, test_index) in enumerate(fold.split(dfx,dfx['label'])):
      print(f'FOLD {i+1}/{total_folds}')
      df_train = dfx.iloc[train_index]
      df_valid = dfx.iloc[test_index]

      train_dataset =BertDataset(
          tweet=df_train.text.values,
          target=df_train.label.values,
          task='train'
      )

      train_data_loader = torch.utils.data.DataLoader(
          train_dataset,
          batch_size=CONFIG['TRAIN_BATCH_SIZE'],
          num_workers=4
      )

      valid_dataset =BertDataset(
          tweet=df_valid.text.values,
          target=df_valid.label.values,
          task='train'
      )

      valid_data_loader = torch.utils.data.DataLoader(
          valid_dataset,
          batch_size=CONFIG['TRAIN_BATCH_SIZE'],
          num_workers=1
      )

      device = torch.device("cuda")
      model = CustomBert(4)
      model.to(device)
      
      param_optimizer = list(model.named_parameters())
      no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
      optimizer_parameters = [
          {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
          {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
      ]

      num_train_steps = int(len(df_train) / CONFIG['TRAIN_BATCH_SIZE'] * CONFIG['EPOCHS'])
      optimizer = AdamW(optimizer_parameters, lr=5e-5)
      
      #scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=num_train_steps)


      best_accuracy = 0
      for epoch in range(CONFIG['EPOCHS']):
          print("----------------EPOCH "+str(epoch+1)+"---------------------")
          train_fn(train_data_loader, model, optimizer, device#scheduler
                  )
          outputs,targets,losss = eval_fn(valid_data_loader ,model, device)
          print("LOSS for this Epoc on val: ",losss)
      losses.append(losss)
      fold_preds = predict_fn(model)
      all_preds.append(fold_preds)
    print("mean losses over all folds: ",np.mean(losses))
    return  all_preds

In [14]:
preds = run_folds()

FOLD 1/5


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…


----------------EPOCH 1---------------------
Training loss for this epoch:  0.9838885991804062




LOSS for this Epoc on val:  0.7138821184635162
----------------EPOCH 2---------------------
Training loss for this epoch:  0.5517459555018333
LOSS for this Epoc on val:  0.5269542168825865
----------------EPOCH 3---------------------
Training loss for this epoch:  0.3307956147097772
LOSS for this Epoc on val:  0.42829550988972187




FOLD 2/5
----------------EPOCH 1---------------------
Training loss for this epoch:  1.0534869009448635
LOSS for this Epoc on val:  0.6819186210632324
----------------EPOCH 2---------------------
Training loss for this epoch:  0.556378385232341
LOSS for this Epoc on val:  0.4192058630287647
----------------EPOCH 3---------------------
Training loss for this epoch:  0.2735674128897728
LOSS for this Epoc on val:  0.4563830839470029
FOLD 3/5
----------------EPOCH 1---------------------
Training loss for this epoch:  1.0688810271601523
LOSS for this Epoc on val:  1.0651965141296387
----------------EPOCH 2---------------------
Training loss for this epoch:  0.5390955286641275
LOSS for this Epoc on val:  0.5329511985182762
----------------EPOCH 3---------------------
Training loss for this epoch:  0.2964496045343338
LOSS for this Epoc on val:  0.4817640744149685
FOLD 4/5
----------------EPOCH 1---------------------
Training loss for this epoch:  1.063847440865732
LOSS for this Epoc on val:  

In [15]:
preds_1 = np.mean(preds,axis=0)

In [16]:
sub=pd.DataFrame()
sub['ID'] = test['ID']
sub['Depression'] = preds_1[:,0]
sub['Alcohol'] = preds_1[:,3]
sub['Suicide'] = preds_1[:,2]
sub['Drugs'] = preds_1[:,1]
sub.head()

Unnamed: 0,ID,Depression,Alcohol,Suicide,Drugs
0,02V56KMO,0.532257,0.082992,0.360476,0.024275
1,03BMGTOK,0.984143,0.003212,0.009787,0.002857
2,03LZVFM6,0.983973,0.002924,0.010216,0.002888
3,0EPULUM5,0.982621,0.003684,0.010483,0.003212
4,0GM4C5GD,0.018825,0.587403,0.035892,0.35788


In [17]:
sub.to_csv("Bert.csv", index=False)

In [18]:
wS = pd.read_csv("Bert.csv")
wS.head()

Unnamed: 0,ID,Depression,Alcohol,Suicide,Drugs
0,02V56KMO,0.532257,0.082992,0.360476,0.024275
1,03BMGTOK,0.984143,0.003212,0.009787,0.002857
2,03LZVFM6,0.983973,0.002924,0.010216,0.002888
3,0EPULUM5,0.982622,0.003684,0.010483,0.003212
4,0GM4C5GD,0.018825,0.587403,0.035892,0.35788
