# install modules

In [None]:
%%capture
# !pip install pytorch-lightning==1.4.9
!pip install -q pytorch-lightning wandb
!pip install gensim --upgrade
!pip install datasets transformers[sentencepiece]

In [None]:
import numpy as np
import collections
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import seaborn as sns
import re,nltk,json
import random
import torch
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score
from sklearn.metrics import average_precision_score,roc_auc_score, roc_curve, precision_recall_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
SEED = 1234
from pytorch_lightning import seed_everything
seed_everything(SEED)
torch.backends.cudnn.deterministic = True
from tqdm.notebook import tqdm
import os

In [None]:
import torch.nn as nn
from pytorch_lightning import LightningModule,LightningDataModule,Trainer
from torchmetrics.functional import accuracy
from pytorch_lightning.callbacks import LearningRateMonitor,ModelCheckpoint,EarlyStopping,ProgressBar
from pytorch_lightning.loggers import TensorBoardLogger
from torch.optim.lr_scheduler import OneCycleLR
import torch.nn.functional as F

In [None]:
import logging
import datasets
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_metric
import transformers
import torch
import io
import torch.nn.functional as F
import random
import numpy as np
import time
import math
import datetime
import torch.nn as nn
from torch.utils.data import Dataset,TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import (
    AutoModel,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    default_data_collator,
    set_seed,
    get_constant_schedule_with_warmup
)
import sys

In [None]:
# Prefer CUDA when PyTorch can see a GPU; otherwise everything runs on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

In [None]:
# Module-level logger for this notebook.
logger = logging.getLogger(__name__)

# Setup logging: send records to stdout so they show up in the cell output.
# No `level` is passed here, so the root logger keeps whatever level it already has.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)

# mount Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# training on potrika dataset

## create dataset

In [None]:
Economy_40k = pd.read_csv('/content/drive/MyDrive/machine_learning/Potrika Dataset/BalancedDataset/Economy_40k.csv',usecols=[1,2])
Education_40k = pd.read_csv('/content/drive/MyDrive/machine_learning/Potrika Dataset/BalancedDataset/Education_40k.csv',usecols=[1,2])
Entertainment_40k = pd.read_csv('/content/drive/MyDrive/machine_learning/Potrika Dataset/BalancedDataset/Entertainment_40k.csv',usecols=[1,2])
International_40k = pd.read_csv('/content/drive/MyDrive/machine_learning/Potrika Dataset/BalancedDataset/International_40k.csv',usecols=[1,2])
politics_40k = pd.read_csv('/content/drive/MyDrive/machine_learning/Potrika Dataset/BalancedDataset/politics_40k.csv',usecols=[1,2])
National_40k = pd.read_csv('/content/drive/MyDrive/machine_learning/Potrika Dataset/BalancedDataset/National_40k.csv',usecols=[1,2])
ScienceTechnology_40k = pd.read_csv('/content/drive/MyDrive/machine_learning/Potrika Dataset/BalancedDataset/ScienceTechnology_40k.csv',usecols=[1,2])
Sports_40k = pd.read_csv('/content/drive/MyDrive/machine_learning/Potrika Dataset/BalancedDataset/Sports_40k.csv',usecols=[1,2])
Economy_40k.head()

In [None]:
all_combined = pd.concat([Economy_40k,Education_40k,Entertainment_40k,International_40k,politics_40k,
                          National_40k,ScienceTechnology_40k,Sports_40k],axis = 0)
print(Economy_40k.shape,Education_40k.shape,Entertainment_40k.shape,International_40k.shape,politics_40k.shape,
                          National_40k.shape,ScienceTechnology_40k.shape,Sports_40k.shape,all_combined.shape)
all_combined.head()

In [None]:
all_combined.article.iloc[1]

In [None]:
import regex as re
# Quote/comma/full-stop runs that sit directly between two Bengali characters
# (U+0980-U+09FF). Lookarounds keep the neighbouring letters out of the match,
# so a single substitution removes every such run.
_BN_INNER_PUNCT = re.compile(r"(?<=[\u0980-\u09ff])['’‘”“,.]+(?=[\u0980-\u09ff])")


def clean_titles(title):
  """Normalise one headline/article string.

  Steps, in order:
    1. drop quote, comma and full-stop characters wedged between two Bengali
       characters;
    2. collapse runs of curly quotes into a single ASCII apostrophe;
    3. remove ``*``, ``#`` and ``;``;
    4. remove line breaks;
    5. cut everything from the "read more" marker to the end of the text.

  Args:
      title: any value; it is converted with ``str()`` first.

  Returns:
      The cleaned string.
  """
  title = str(title)
  # One pass instead of search -> slice -> search again for every character.
  title = _BN_INNER_PUNCT.sub("", title)
  title = re.sub(r"[’‘”“]+", "'", title)
  title = re.sub(r"[*#;]+", "", title)
  title = re.sub(r"[\n\r]+", "", title)
  title = re.sub(r'আরো পড়ুন.*','',title,flags=re.U|re.S) 
  return title

In [None]:
all_combined = all_combined.drop_duplicates(['article']) 
all_combined['article'] = all_combined['article'].apply(clean_titles) 

In [None]:
all_combined['article_length'] = all_combined['article'].apply(lambda x:len(x.split()))

In [None]:
all_combined.loc[all_combined.article_length<10]

In [None]:
c = collections.Counter(all_combined['article_length'].values)
sorted(c.items(),key = lambda x:x[0])

In [None]:
all_combined = all_combined.loc[all_combined.article_length>5]

In [None]:
all_combined.to_csv('/content/drive/MyDrive/machine_learning/Potrika Dataset/potrika_all_combined.csv',index=None)

In [None]:
all_combined.iloc[:1000,:].to_csv('/content/drive/MyDrive/machine_learning/Potrika Dataset/potrika_all_combined_mini.csv',index=None)

## load dataset

In [None]:
all_combined = pd.read_csv('/content/drive/MyDrive/machine_learning/Potrika Dataset/potrika_all_combined.csv')
all_combined.head()

In [None]:
all_combined.describe(include='all')

In [None]:
from sklearn.model_selection import train_test_split
train,test=train_test_split(all_combined,stratify = all_combined['class'].values,shuffle=True,test_size=0.2,random_state=42)

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train.to_csv('/content/drive/MyDrive/machine_learning/Potrika Dataset/train.csv',index=None)
test.to_csv('/content/drive/MyDrive/machine_learning/Potrika Dataset/test.csv',index=None)

In [None]:
train = pd.read_csv('/content/drive/MyDrive/machine_learning/Potrika Dataset/train.csv')
test = pd.read_csv('/content/drive/MyDrive/machine_learning/Potrika Dataset/test.csv')

In [None]:
train.shape,test.shape

In [None]:
label_list = all_combined['class'].unique()
label_list

## dataloader class

In [None]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import XLMRobertaTokenizer,BertTokenizerFast,BertTokenizer

In [None]:
from torch.utils.data import Dataset,TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
# Hugging Face checkpoint names. A position in this list is the `model_number`
# used by MNISTDataModule and by `sweep_config`, and it must line up with the
# entry at the same position in `tokenizer_list`.
all_checkpoints=['bert-base-multilingual-cased',
                 'sagorsarker/bangla-bert-base',
                 'neuralspace-reverie/indic-transformers-bn-bert',
                 'neuralspace-reverie/indic-transformers-bn-roberta',
                 'distilbert-base-multilingual-cased',
                 'neuralspace-reverie/indic-transformers-bn-distilbert',
                 'monsoon-nlp/bangla-electra',
                 'csebuetnlp/banglabert',
                 'neuralspace-reverie/indic-transformers-bn-xlmroberta'
                 ]

In [None]:
# Each entry is Python source text, not a tokenizer object: it is passed to
# eval() where a tokenizer is needed (MNISTDataModule.__init__ and the labelling
# section), so only the selected checkpoint's tokenizer is ever instantiated.
# Entry i builds the tokenizer for all_checkpoints[i].
tokenizer_list=[]
tokenizer_list.append('AutoTokenizer.from_pretrained(all_checkpoints[0])')
tokenizer_list.append('BertTokenizer.from_pretrained(all_checkpoints[1])')
tokenizer_list.append('BertTokenizer.from_pretrained(all_checkpoints[2])')
tokenizer_list.append('AutoTokenizer.from_pretrained(all_checkpoints[3])')
tokenizer_list.append('AutoTokenizer.from_pretrained(all_checkpoints[4])')
tokenizer_list.append('AutoTokenizer.from_pretrained(all_checkpoints[5])')
tokenizer_list.append('AutoTokenizer.from_pretrained(all_checkpoints[6])')
tokenizer_list.append('AutoTokenizer.from_pretrained(all_checkpoints[7])')
tokenizer_list.append('XLMRobertaTokenizer.from_pretrained(all_checkpoints[8])')

In [None]:
class MNISTDataModule(LightningDataModule):
  """Serves the Potrika train/test CSVs as padded token-id tensors.

  Despite the name, nothing here is MNIST-specific: the module reads
  ``train.csv`` and ``test.csv`` from ``data_dir``, tokenises the ``article``
  column with the tokenizer selected by ``model_number`` and maps the
  ``class`` column to integer ids through ``label_list``.

  Tokenised tensors are cached as ``.npy`` files in ``data_dir``. The cache
  file names include the split, the checkpoint name and ``max_seq_length`` so
  that ids produced for one tokenizer are never fed to a different model.

  Args:
      batch_size: batch size of every DataLoader built by this module.
      num_workers: number of DataLoader worker processes.
      label_list: class names; a label's position in this sequence is its id.
      model_number: index into the module-level ``all_checkpoints`` and
          ``tokenizer_list`` tables.
      max_seq_length: every example is truncated/padded to this many tokens.
      data_dir: directory containing ``train.csv`` / ``test.csv``; the
          ``.npy`` caches are written here as well.
  """

  # Names of the three arrays that make up one cached split.
  _CACHE_PARTS = ('input_ids', 'input_mask_array', 'label_id_array')
  # Number of texts handed to the tokenizer per call.
  _TOKENIZE_CHUNK = 1024

  def __init__(
      self,
      batch_size: int = 64,
      num_workers: int = 4,
      label_list=[0,1,2],
      model_number = 7,
      max_seq_length=64,
      data_dir: str = '/content/drive/MyDrive/machine_learning/Potrika Dataset',
  ):
    super().__init__()
    self.batch_size = batch_size
    self.num_workers = num_workers
    self.label_list = label_list
    self.num_classes = len(self.label_list)
    self.model_name = all_checkpoints[model_number]
    self.max_seq_length = max_seq_length
    self.data_dir = data_dir
    # tokenizer_list holds constructor expressions as strings written in this
    # notebook (not external input); evaluating one builds only the tokenizer
    # that is actually needed.
    self.tokenizer = eval(tokenizer_list[model_number])

    # Both frames are shuffled once with a fixed seed so the row order is reproducible.
    self.traindf = pd.read_csv(os.path.join(self.data_dir, 'train.csv'))
    self.traindf = self.traindf.sample(frac=1,random_state=42).reset_index(drop=True)
    self.testdf = pd.read_csv(os.path.join(self.data_dir, 'test.csv'))
    self.testdf = self.testdf.sample(frac=1,random_state=42).reset_index(drop=True)

    # label -> integer id, in label_list order.
    self.label_map = {label: i for i, label in enumerate(self.label_list)}

    # (text, label) pairs consumed by generate_data_loader.
    self.test_examples = list(zip(self.testdf['article'].values,self.testdf['class'].values))
    self.train_examples = list(zip(self.traindf['article'].values,self.traindf['class'].values))

  def _cache_paths(self, train):
    """Return {array name: .npy path} for the requested split."""
    split = 'train' if train else 'test'
    checkpoint = self.model_name.replace('/', '_')
    stem = f'{split}_{checkpoint}_len{self.max_seq_length}'
    return {part: os.path.join(self.data_dir, f'{stem}_{part}.npy')
            for part in self._CACHE_PARTS}

  def _encode(self, input_examples, label_map):
    """Tokenise (text, label) pairs into (input_ids, attention_mask, label_ids) tensors."""
    texts = [text for text, _ in input_examples]
    input_ids = []
    input_mask_array = []
    for start in range(0, len(texts), self._TOKENIZE_CHUNK):
      encoded = self.tokenizer(
          texts[start:start + self._TOKENIZE_CHUNK],
          add_special_tokens=True,
          max_length=self.max_seq_length,
          padding="max_length",
          truncation=True,
          return_attention_mask=True,
      )
      input_ids.extend(encoded['input_ids'])
      # Use the tokenizer's own mask: "token id > 0" is only right when the
      # pad token happens to have id 0.
      input_mask_array.extend(encoded['attention_mask'])
    label_id_array = [label_map[label] for _, label in input_examples]
    return (torch.tensor(input_ids, dtype=torch.long),
            torch.tensor(input_mask_array, dtype=torch.long),
            torch.tensor(label_id_array, dtype=torch.long))

  def generate_data_loader(self,input_examples,label_map, do_shuffle = False,train = True):
    '''
    Build a DataLoader over (input_ids, attention_mask, label_id) tensors.

    Args:
        input_examples: iterable of (text, label) pairs.
        label_map: dict mapping each label to its integer id.
        do_shuffle: use a RandomSampler when True, a SequentialSampler otherwise.
        train: selects which cache files are read/written (train vs. test split).

    The tensors are loaded from the split's cache when all three files exist;
    otherwise they are built from ``input_examples`` and the cache is written.
    '''
    cache = self._cache_paths(train)
    # A split is only ever loaded from its own cache; a missing train cache
    # must not fall through to the test files.
    if all(os.path.exists(path) for path in cache.values()):
      input_ids = torch.from_numpy(np.load(cache['input_ids']))
      input_mask_array = torch.from_numpy(np.load(cache['input_mask_array']))
      label_id_array = torch.from_numpy(np.load(cache['label_id_array']))
    else:
      input_ids, input_mask_array, label_id_array = self._encode(input_examples, label_map)
      np.save(cache['input_ids'], input_ids.numpy(), allow_pickle=False)
      np.save(cache['input_mask_array'], input_mask_array.numpy(), allow_pickle=False)
      np.save(cache['label_id_array'], label_id_array.numpy(), allow_pickle=False)

    dataset = TensorDataset(input_ids, input_mask_array, label_id_array)
    sampler_cls = RandomSampler if do_shuffle else SequentialSampler

    return DataLoader(
                dataset,
                sampler = sampler_cls(dataset),
                batch_size = self.batch_size,
                pin_memory = True,
                num_workers=self.num_workers)

  def train_dataloader(self):
    """Shuffled DataLoader over the training split."""
    return self.generate_data_loader(self.train_examples,
                                     self.label_map, do_shuffle = True)

  def val_dataloader(self):
    """Sequential DataLoader over the test split (used for validation)."""
    return self.generate_data_loader(self.test_examples, self.label_map,
                                     do_shuffle = False,train=False)

In [None]:
gb_dataset = MNISTDataModule(model_number =7,label_list = label_list)
len(gb_dataset.train_examples)

In [None]:
for batch in gb_dataset.train_dataloader():
  print(batch)
  print(batch[0].shape,batch[1].shape,batch[2].shape)
  break

In [None]:
for batch in gb_dataset.val_dataloader():
  print(batch[0].shape,batch[1].shape,batch[2].shape)
  break

In [None]:
import math
# Run configuration; it is passed to wandb.init(config=...) and then read back
# through wandb.config by the optimizer builder and the training loop.
sweep_config = {
    'learning_rate': 1e-05,
    'batch_size': 64,
    'warmup_proportion': 0.1,
    'num_train_examples': len(gb_dataset.train_examples),
    'model_number': 7,
    'epochs': 10,
    # Batches are moved to this device; fall back to the CPU when no GPU exists.
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
}
sweep_config

# Model

In [None]:
class TransformerForSequenceClassification(nn.Module):
    """
    Simplified version of the same class by HuggingFace.
    See transformers/modeling_distilbert.py in the transformers repository.

    A pretrained encoder is followed by pooling over the sequence dimension,
    dropout and a single linear classification layer.
    """

    def __init__(
        self, pretrained_model_name: str, num_classes: int = 2, dropout: float = 0.5,
        mean_pool: bool=True, mask_padding: bool = True
    ):
        """
        Args:
            pretrained_model_name (str): HuggingFace model name.
                See transformers/modeling_auto.py
            num_classes (int): the number of class labels
                in the classification task
            dropout (float): dropout probability applied to the pooled vector
                before the classifier
            mean_pool (bool): average the token states when True, otherwise
                take the state of the first token
            mask_padding (bool): when mean pooling and an attention mask is
                supplied, average only over positions whose mask is non-zero.
                Pass False to get the plain mean over every position.
        """
        super().__init__()
        config = AutoConfig.from_pretrained(
            pretrained_model_name, num_labels=num_classes
        )
        # Override every dropout setting of the encoder config (anything whose
        # key mentions "dropout" but not "classifier") with 0.3.
        for k in config.to_dict().keys():
          if 'dropout' in k and 'classifier' not in k:
            config.update({k:0.3})

        self.model = AutoModel.from_pretrained(pretrained_model_name, config=config)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.dropout = nn.Dropout(dropout)
        self.mean_pool=mean_pool
        self.mask_padding = mask_padding
        self.num_labels = num_classes
        if torch.cuda.is_available():
          self.model.cuda()
          self.classifier.cuda()
          self.dropout.cuda()

    def _pool(self, hidden_state, attention_mask):
        """Reduce (bs, seq_len, dim) token states to one (bs, dim) vector per example."""
        if not self.mean_pool:
            return hidden_state[:, 0]
        if self.mask_padding and attention_mask is not None:
            # Padding positions carry no content; leave them out of the average.
            mask = attention_mask.unsqueeze(-1).to(hidden_state.dtype)  # (bs, seq_len, 1)
            token_count = mask.sum(dim=1).clamp(min=1.0)  # avoid 0/0 on an all-zero mask
            return (hidden_state * mask).sum(dim=1) / token_count
        return hidden_state.mean(dim=1)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        **kwargs
    ):
        """
        Run the encoder and the classification head.

        Only ``input_ids``, ``attention_mask``, ``head_mask``, ``inputs_embeds``,
        ``output_attentions`` and ``output_hidden_states`` are forwarded to the
        encoder; the remaining arguments are accepted but not used.

        Returns:
            ``(logits, hidden_state)`` when ``labels`` is None, otherwise
            ``(loss, logits, hidden_state)``. The loss is MSE when the model
            has a single label and cross-entropy otherwise.
        """
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        hidden_state = output[0]  # (bs, seq_len, dim)

        pooled_output = self._pool(hidden_state, attention_mask)  # (bs, dim)
        pooled_output = self.dropout(pooled_output)  # (bs, dim)
        logits = self.classifier(pooled_output)  # (bs, num_labels)

        outputs = (logits, hidden_state)
        if labels is not None:
            if self.num_labels == 1:
                loss_fct = nn.MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, hidden_state

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from collections import OrderedDict
def compute_metrics(pred):
    """Score predictions against gold labels.

    Args:
        pred: a ``(preds, labels)`` pair.

    Returns:
        dict with ``accuracy`` plus ``f1``, ``precision`` and ``recall`` exactly
        as returned by ``precision_recall_fscore_support`` called with its
        default arguments.
    """
    predicted, gold = pred
    scores = precision_recall_fscore_support(gold, predicted)
    overall_accuracy = accuracy_score(gold, predicted)
    return dict(
        accuracy=overall_accuracy,
        f1=scores[2],
        precision=scores[0],
        recall=scores[1],
    )

In [None]:
def build_optimizer(model, config):
    """Create the AdamW optimizer and its warm-up schedule.

    Args:
        model: module whose parameters are optimised.
        config: object exposing ``learning_rate``, ``num_train_examples``,
            ``batch_size``, ``epochs`` and ``warmup_proportion``.

    Returns:
        ``(optimizer, scheduler)``. The scheduler warms up for
        ``warmup_proportion`` of the estimated total number of steps.
    """
    optimizer = torch.optim.AdamW(list(model.parameters()), lr=config.learning_rate)

    # Estimated number of optimizer steps over the whole run.
    total_steps = int(config.num_train_examples / config.batch_size * config.epochs)
    warmup_steps = int(total_steps * config.warmup_proportion)

    scheduler = get_constant_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps
    )
    return optimizer, scheduler


## training methods

In [None]:
from sklearn.metrics import classification_report

In [None]:
def format_time(elapsed):
    '''
    Convert a duration in seconds into the string form of a
    ``datetime.timedelta`` (for example ``0:01:05``), after rounding to the
    nearest whole second.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
import collections
def train_epoch(model, dataset, optimizers,config,epoch):
  """Train ``model`` for one pass over the training data, then validate it.

  Args:
      model: callable returning ``(loss, logits, hidden_states)`` when given
          ``input_ids``, ``attention_mask`` and ``labels``.
      dataset: data module providing ``train_dataloader()``,
          ``val_dataloader()`` and ``label_list``.
      optimizers: ``(optimizer, scheduler)`` pair.
      config: object exposing ``batch_size`` and ``device``.
      epoch: epoch number, forwarded to ``validate_model``.

  Returns:
      Whatever ``validate_model`` returns for the validation split.

  Side effects: logs the per-step loss and the per-class training metrics to
  wandb and prints progress to stdout.
  """
  t0 = time.time()
  print_each_n_step = 100
  optimizer, scheduler = optimizers[0], optimizers[1]
  train_dl = dataset.train_dataloader()
  test_dl = dataset.val_dataloader()
  n_steps_per_epoch = math.ceil(len(train_dl.dataset) / config.batch_size)

  # Per-batch prediction/label tensors, kept on the CPU and concatenated once
  # at the end instead of being unpacked element by element.
  batch_preds = []
  batch_labels = []

  model.train()
  for step, batch in enumerate(train_dl):
    # Progress update every print_each_n_step batches.
    if step % print_each_n_step == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dl), elapsed))

    b_input_ids = batch[0].to(config.device)
    b_input_mask = batch[1].to(config.device)
    b_labels = batch[2].to(config.device)
    loss, logits, _ = model(input_ids=b_input_ids,attention_mask=b_input_mask,labels=b_labels)
    preds = torch.argmax(logits,dim=1)

    # Clear gradients left over from the previous step, then backpropagate.
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
    optimizer.step()
    scheduler.step()

    # The loss of the last step of the epoch is not logged on its own.
    if step + 1 < n_steps_per_epoch:
      # Log a plain float rather than a tensor that is still tied to the graph.
      wandb.log({'train/loss_step': loss.item()})

    batch_preds.append(preds.detach().cpu())
    batch_labels.append(b_labels.detach().cpu())

  all_preds = torch.cat(batch_preds).tolist() if batch_preds else []
  all_labels = torch.cat(batch_labels).tolist() if batch_labels else []
  result = classification_report(all_labels,all_preds,target_names =dataset.label_list,output_dict=True)
  print(result)
  accuracy = result.pop('accuracy')
  wandb.log({'train/accuracy':accuracy})

  # Flatten {class: {metric: value}} into "train/<class>_<metric>" keys.
  new_result = {
      f'train/{k}_{keys}': value
      for k, per_class in result.items()
      for keys, value in per_class.items()
  }
  wandb.log(new_result)

  training_time = format_time(time.time() - t0)
  print("  Training epcoh took: {:}".format(training_time))
  validation_metrics = validate_model(model,test_dl,config,epoch,dataset.label_list)
  return validation_metrics

In [None]:
def validate_model(model,test_dl,config,epoch,label_list):
  """Evaluate ``model`` on ``test_dl`` and log the metrics to wandb.

  Args:
      model: callable returning ``(loss, logits, hidden_states)`` when given
          ``input_ids``, ``attention_mask`` and ``labels``.
      test_dl: iterable of ``(input_ids, attention_mask, labels)`` batches.
      config: object exposing ``device``.
      epoch: accepted for symmetry with ``train_epoch``; not used here.
      label_list: class names passed to ``classification_report``.

  Returns:
      dict mapping ``"val/<class>_<metric>"`` to the metric value.

  Side effects: puts the model in eval mode, logs the per-batch loss, the
  accuracy and the flattened report to wandb, and prints to stdout.
  """
  print("Running Test...")
  t0 = time.time()
  nll_loss = torch.nn.CrossEntropyLoss()
  batch_preds = []
  batch_labels = []
  # Evaluation mode: dropout layers behave differently than during training.
  model.eval()

  for batch in test_dl:
    b_input_ids = batch[0].to(config.device)
    b_input_mask = batch[1].to(config.device)
    b_labels = batch[2].to(config.device)
    # No graph is needed for evaluation.
    with torch.no_grad():
      _, logits, _ = model(input_ids=b_input_ids,attention_mask=b_input_mask,labels=b_labels)
      loss = nll_loss(logits, b_labels)

    # Log a plain float rather than a device tensor.
    wandb.log({'val/loss': loss.item()})

    batch_preds.append(torch.argmax(logits, dim=1).cpu())
    batch_labels.append(b_labels.cpu())

  # One concatenation instead of one .item() call per example.
  all_preds = torch.cat(batch_preds).tolist() if batch_preds else []
  all_labels = torch.cat(batch_labels).tolist() if batch_labels else []
  result = classification_report(all_labels,all_preds,target_names =label_list,output_dict=True)
  print('val',result)
  accuracy = result.pop('accuracy')
  wandb.log({'val/accuracy':accuracy})

  # Flatten {class: {metric: value}} into "val/<class>_<metric>" keys.
  new_result = {
      f'val/{k}_{keys}': value
      for k, per_class in result.items()
      for keys, value in per_class.items()
  }
  wandb.log(new_result)
  test_time = format_time(time.time() - t0)
  print("  Test took: {:}".format(test_time))
  return new_result

## train

In [None]:
import wandb
!wandb login

In [None]:
# !rm -r /content/wandb
# !rm -r /content/lightning_logs
!rm -r /content/category_title

In [None]:
!mkdir /content/category_title
!chmod 765 /content/category_title

In [None]:
# NOTE(review): best_test_f1 is assigned here but not read anywhere in the visible cells.
best_test_f1 =0 
seed_everything(42)
# run = wandb.init(project="headline_category_detection",entity='colab-team',config=sweep_config)
# Start a wandb run; from here on hyper-parameters are read back through wandb.config.
run = wandb.init(config=sweep_config)
config = wandb.config
# Data module, model and (optimizer, scheduler) pair all follow config.model_number.
main_dataset = MNISTDataModule(model_number = config.model_number, label_list = label_list)
clickbaitmodel = TransformerForSequenceClassification(
    pretrained_model_name = all_checkpoints[config.model_number],num_classes = len(label_list))
# wandb.watch(clickbaitmodel, log="gradients")
optimizers = build_optimizer(clickbaitmodel, config)

In [None]:
# Short checkpoint name (the part after the organisation prefix); it names both
# the saved file and the wandb artifact.
model_name = all_checkpoints[config.model_number].split('/')[-1]
for epoch in tqdm(range(config.epochs)):
    validation_metrics = train_epoch(clickbaitmodel,main_dataset,optimizers,config,epoch+1)
    # test_f1 = validation_metrics['val/epoch_f1']
    PATH=f'/content/category_title/{model_name}_{epoch}.pt'
    torch.save({
          'epoch': epoch,
          'model': clickbaitmodel.state_dict(),
          'optimizer_state_dict': optimizers[0].state_dict(),
          }, PATH)
    model_artifact = wandb.Artifact(
          f"{model_name}_{epoch}", type=f'{model_name}',
          description=f"{model_name}_{epoch}",
          metadata=dict(config))
    # Upload the file that was just written, whatever the checkpoint is called.
    model_artifact.add_file(PATH)
    run.log_artifact(model_artifact)
    # NOTE(review): this break ends the loop after the first epoch even though
    # config.epochs may be larger; remove it to train for every epoch.
    break

In [None]:
wandb.finish()

## labelling category with best model

In [None]:
import wandb
!wandb login

In [None]:
run = wandb.init()
artifact = run.use_artifact('colab-team/headline_category_detection/banglabert_7:v0', type='banglabert')
artifact_dir = artifact.download()

In [None]:
import math
# Run configuration for the labelling session; it is passed to
# wandb.init(config=...) and read back through wandb.config.
sweep_config = {
    'learning_rate': 1e-05,
    'batch_size': 64,
    'warmup_proportion': 0.1,
    # NOTE(review): hard-coded example count; it only feeds the warm-up step
    # estimate in build_optimizer.
    'num_train_examples': 15056,
    'model_number': 7,
    'epochs': 10,
    # Batches are moved to this device; fall back to the CPU when no GPU exists.
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
}
sweep_config

In [None]:
best_test_f1 =0 
seed_everything(42)
# run = wandb.init(project="headline_category_detection",entity='colab-team',config=sweep_config)
run = wandb.init(config=sweep_config)
config = wandb.config
clickbaitmodel = TransformerForSequenceClassification(
    pretrained_model_name = all_checkpoints[config.model_number],num_classes = len(label_list))
optimizers = build_optimizer(clickbaitmodel, config)

In [None]:
PATH = "/content/artifacts/banglabert_7:v0/banglabert_7.pt"
# map_location lets a checkpoint that was saved from a GPU run be loaded on a
# runtime without CUDA.
checkpoint = torch.load(PATH, map_location='cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
clickbaitmodel.load_state_dict(checkpoint['model'])
optimizer = optimizers[0]
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [None]:
dataset = pd.read_csv('/content/drive/MyDrive/machine_learning/clickbait_identification/latestFixed.csv')
dataset.head()

In [None]:
unlabelled_dataset = pd.read_csv('/content/drive/MyDrive/machine_learning/clickbait_identification/combined.csv')
unlabelled_dataset.head()

In [None]:
unlabelled_dataset.shape

In [None]:
input_ids = []
input_mask_array = []
model_number = 7
tokenizer = eval(tokenizer_list[model_number])
max_seq_length = 64

In [None]:
# Tokenise the titles of `dataset` into a TensorDataset for inference.
# Everything is rebuilt from scratch here, so the cell can be re-run and does
# not depend on what an earlier run left in `input_ids` / `input_mask_array`.
train_examples = list(dataset['cleaned_title'].values)
encoded_titles = tokenizer(train_examples, add_special_tokens=True, max_length=max_seq_length,
                           padding="max_length", truncation=True, return_attention_mask=True)
input_ids = torch.tensor(encoded_titles['input_ids'])
# Use the tokenizer's own mask instead of assuming that the pad token has id 0.
input_mask_array = torch.tensor(encoded_titles['attention_mask'])
# Placeholder labels: the model's forward pass is called with labels, but the
# true categories are unknown at this point.
label_id_array = torch.tensor([0]*dataset.shape[0], dtype=torch.long)
tensordataset = TensorDataset(input_ids, input_mask_array,label_id_array)

In [None]:
# Tokenise the titles of `unlabelled_dataset` into a TensorDataset for inference.
# Everything is rebuilt from scratch: appending to `input_ids` would fail once
# an earlier cell has turned it into a tensor, and would otherwise mix two
# datasets in one list.
train_examples = list(unlabelled_dataset['cleaned_title'].values)
encoded_titles = tokenizer(train_examples, add_special_tokens=True, max_length=max_seq_length,
                           padding="max_length", truncation=True, return_attention_mask=True)
input_ids = torch.tensor(encoded_titles['input_ids'])
# Use the tokenizer's own mask instead of assuming that the pad token has id 0.
input_mask_array = torch.tensor(encoded_titles['attention_mask'])
# Placeholder labels: the model's forward pass is called with labels, but the
# true categories are unknown at this point.
label_id_array = torch.tensor([0]*unlabelled_dataset.shape[0], dtype=torch.long)
tensordataset = TensorDataset(input_ids, input_mask_array,label_id_array)

In [None]:
len(tensordataset)

In [None]:
sampler = SequentialSampler
datadl =  DataLoader(
                tensordataset,  # The training samples.
                sampler = sampler(tensordataset), 
                batch_size = 128,
                pin_memory = True,
                num_workers=4) 

In [None]:
for batch in datadl:
  print(batch)
  break

In [None]:
for batch in datadl:
  print(tokenizer.batch_decode(batch[0],skip_special_tokens=True))
  # print(dataset.loc[:5,'cleaned_title'])
  break

In [None]:
def validate_model(model,test_dl,config,epoch,label_list):
  """Run ``model`` over ``test_dl`` in eval mode and return the predicted class ids.

  NOTE(review): this redefines the ``validate_model`` declared in the training
  section, which returns a metrics dict instead; after this cell runs,
  ``train_epoch`` would receive a list of predictions from it.

  Args:
      model: callable returning ``(loss, logits, hidden_states)`` when given
          ``input_ids``, ``attention_mask`` and ``labels``.
      test_dl: iterable of ``(input_ids, attention_mask, labels)`` batches.
      config: object exposing ``device``.
      epoch: not used.
      label_list: not used.

  Returns:
      list of int: the argmax class id for every example, in loader order.
  """
  print("Running Test...")
  t0 = time.time()
  batch_preds = []
  # Evaluation mode: dropout layers behave differently than during training.
  model.eval()

  for batch in test_dl:
    b_input_ids = batch[0].to(config.device)
    b_input_mask = batch[1].to(config.device)
    b_labels = batch[2].to(config.device)
    # Labels are still passed so the model returns its three-element tuple;
    # only the logits are used.
    with torch.no_grad():
      _, logits, _ = model(input_ids=b_input_ids,attention_mask=b_input_mask,labels=b_labels)
    batch_preds.append(torch.argmax(logits, dim=1).cpu())

  # One concatenation instead of one .item() call per example.
  all_preds = torch.cat(batch_preds).tolist() if batch_preds else []
  test_time = format_time(time.time() - t0)
  print("  Test took: {:}".format(test_time))
  return all_preds

In [None]:
config,label_list

In [None]:
preds = validate_model(clickbaitmodel,datadl,config,0,label_list)
len(preds)

In [None]:
preds[:10]

In [None]:
dataset['category'] = [label_list[i] for i in preds]
dataset.head()

In [None]:
unlabelled_dataset['category'] = [label_list[i] for i in preds]
unlabelled_dataset.head()

In [None]:
dataset.to_csv('/content/drive/MyDrive/machine_learning/clickbait_identification/latestFixedCategory.csv',index=None)

In [None]:
collections.Counter(dataset.category.values)

In [None]:
collections.Counter(unlabelled_dataset.category.values)

In [None]:
unlabelled_dataset.to_csv('/content/drive/MyDrive/machine_learning/clickbait_identification/combined_category.csv',index=None)