<a href="https://colab.research.google.com/github/MathengeKen/-Basic-Needs-Basic-Rights-Kenya---Tech4MentalHealth-/blob/master/tech4mental(bert)0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers



In [2]:
import torch 
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
import transformers 
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn 
import pandas as pd 
import numpy as np 
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn import metrics
from transformers import AdamW
from sklearn.model_selection import StratifiedKFold
import os 
import random
from collections import defaultdict

In [3]:
import torch

if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

In [4]:
CONFIG = {
    'MAX_LEN':128,
    'TRAIN_BATCH_SIZE':16,
    'VALID_BATCH_SIZE':16,
    'EPOCHS':3,
    'TOKENIZER':BertTokenizer.from_pretrained('bert-base-cased')
}

In [5]:
import random
from random import randint
import numpy as np
SEED_VAL  = 1000
# Set the seed value all over the place to make this reproducible.
def seed_all(SEED):
  random.seed(SEED_VAL)
  np.random.seed(SEED_VAL)
  torch.manual_seed(SEED_VAL)
  torch.cuda.manual_seed_all(SEED_VAL)
  os.environ['PYTHONHASHSEED'] = str(SEED_VAL)
  torch.backends.cudnn.deterministic = True

In [6]:
class CustomBert(nn.Module):
  def __init__(self, n_classes):
    super(CustomBert, self).__init__()
    self.bert = BertModel.from_pretrained('bert-base-cased')
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    _, pooled_output = self.bert(
        input_ids = input_ids,
        attention_mask = attention_mask
    )

    output = self.drop(pooled_output)
    return self.out(output)

In [7]:
class BertDataset():
  def __init__(self,text,label,task):
    self.text = text
    self.label = label
    self.tokenizer = CONFIG['TOKENIZER']
    self.max_len = CONFIG['MAX_LEN']
    self.task = task

  def __len__(self):
    return len(self.text)

  def __getitem__(self, item):
    text = str(self.text[item])
    label = self.label[item]

    encoding = self.tokenizer.encode_plus(
        text, 
        add_special_tokens = True,
        max_length = self.max_len,
        return_token_type_ids = False,
        pad_to_max_length = True,
        return_attention_mask = True,
        return_tensors ='pt',
        truncation = True
    )

    to_return= {
        'text':text,
        'input_ids':encoding['input_ids'].flatten(),
        'attention_mask':encoding['attention_mask'].flatten()
    }

    if (self.task == 'train'):
      to_return.update({'label':torch.tensor(label, dtype=torch.long)})

    return to_return

In [8]:
def loss_fn(outputs, label):
  criterion = nn.CrossEntropyLoss()
  return criterion(outputs, label)

In [15]:
def train_fn(model,
             dataloader,
             optimizer,
             device,
             scheduler):
  model = model.train()
  
  losses =[]
  correct_predictions = 0

  for d in dataloader:
    input_ids = d["input_ids"].to(device, dtype=torch.long)
    attention_mask = d["attention_mask"].to(device, dtype=torch.long)
    label = d["label"].to(device)

    outputs = model(
        input_ids = input_ids,
        attention_mask = attention_mask
    )

    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, label)
    loss = int(loss)
    losses += loss

    correct_predictions += torch.sum(preds == label)

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

    #accuracy = correct_predictions.double()/n_examples
  return losses/len(dataloader)

In [16]:
def eval_fn(model, dataloader,device):
  model = model.eval()

  losses = []
  correct_predictions = 0

  with torch.no_grad():
    for d in dataloader:
      input_ids = d["input_ids"].to(device, dtype=torch.long)
      attention_mask = d["attention_mask"].to(device, dtype=torch.long)
      label = d["label"].to(device, dtype=torch.long)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)
      

      loss = loss_fn(outputs, label)
      loss = int(loss)

      correct_predictions += torch.sum(preds == label)
      losses += loss

  return losses/len(dataloader)

In [17]:
def predict_fn(model, dataloader):
  final_outputs = []
  with torch.no_grad():
    for d in dataloader:
      input_ids = d["input_ids"].to(device, dtype=torch.long)
      attention_mask = d["attention_mask"].to(device, dtype=torch.long)
      label = d["label"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      
      final_outputs.append(torch.nn.functional.softmax(outputs).cpu().detach().numpy())
      alls = np.vstack(final_outputs)
  return alls

In [18]:
### Prepairing test data
test = pd.read_csv('MHTest.csv')
test_dataset = BertDataset(
        text = test.text.values,
        label = None,
        task = 'predict')

test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size = CONFIG['TRAIN_BATCH_SIZE'],
    num_workers = 4
)

In [19]:
def run_folds():
  total_folds =5
  preds = []
  losses = []
  seed_all(SEED_VAL)
  df = pd.read_csv('MHTrain.csv').fillna('NONE')
  df['label'] = df['label'].factorize()[0]
  fold = StratifiedKFold(n_splits = total_folds, shuffle=True)
  for i, (train_index, test_index) in enumerate(fold.split(df, df['label'])):
    print (f'FOLD {i+1}/{total_folds}')
    df_train = df.iloc[train_index]
    df_valid = df.iloc[test_index]

    train_dataset = BertDataset(
        text = df_train.text.values,
        label = df_train.label.values,
        task = 'train'
    )

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size = CONFIG['TRAIN_BATCH_SIZE'],
        num_workers = 4
    )

    valid_dataset = BertDataset(
        text = df_valid.text.values,
        label = df_valid.label.values,
        task = 'train'
    )

    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size = CONFIG['TRAIN_BATCH_SIZE'],
        num_workers = 4
    )

    device = torch.device("cuda")
    model = CustomBert(4)
    model.to(device)
    
    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    total_steps = int(len(df_train) / CONFIG['TRAIN_BATCH_SIZE'])

    scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=0,
      num_training_steps=total_steps
    )

    %%time
    history = defaultdict(list)
    best_accuracy = 0

    for epoch in range(CONFIG['EPOCHS']):
        print("----------------EPOCH "+str(epoch+1)+"---------------------")
        train_loss = train_fn(model, train_dataloader, optimizer, device, scheduler)
        eval_loss = eval_fn(model, valid_dataloader , device)
        print("TRAIN_LOSS for this Epoch:", train_loss)
        print("EVAL_LOSS for this Epoch:",eval_loss)

        #history['train_acc'].append(train_acc)
        #history['train_loss'].append(train_loss)
        #history['val_acc'].append(val_acc)
        #history['val_loss'].append(val_loss)

        #if val_acc > best_accuracy:
          #torch.save(model.state_dict(), 'best_model_state.bin')
          #best_accuracy = val_acc


    losses.append(eval_loss)
    fold_preds = predict_fn(model, test_dataloader)
    all_preds.append(fold_preds)
    print("mean losses over all folds: ",np.mean(losses))
    return  all_preds

In [20]:
preds = run_folds()

FOLD 1/5
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.72 µs
----------------EPOCH 1---------------------


TypeError: ignored

In [None]:
preds_1 = np.mean(preds,axis=0)

In [None]:
sub=pd.DataFrame()
sub['ID'] = test['ID']
sub['Depression'] = preds_1[:,0]
sub['Alcohol'] = preds_1[:,3]
sub['Suicide'] = preds_1[:,2]
sub['Drugs'] = preds_1[:,1]
sub.head()

In [None]:
sub.to_csv("Bert1.csv", index=False)