In [37]:
!pip install transformers seqeval[gpu] pandas numpy sklearn

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'


  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [15 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip requirements files
        (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
      - if the 'sklearn' package is used by one of your dependencies,
        it would be great if you take some time to track which package uses
        'sklearn' instead of 'scikit-learn' and report it to their issue tracker
      - as a last resort, set the environment variable
        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
      
      More information is available at
      https://github.com/scikit-learn/sklearn-pypi-packag

# Imports

In [38]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForTokenClassification, PreTrainedTokenizer
import mlflow
from seqeval.metrics import classification_report

In [39]:
# Pick the compute device once: use a CUDA GPU when one is visible, otherwise the CPU.
from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cpu


# Load the dataset

In [40]:
# Read the labelled requests; each row holds a sentence and a comma-separated tag string.
data = pd.read_csv(filepath_or_buffer="./data/labeled_doctor_req_chatgpt.csv")
data.head()

Unnamed: 0,sentence,label
0,یک دکتر مرد خوب برای ارتوپد در کرمانشاه سراغ د...,"O,O,B-gen,O,O,O,O,B-cit,O,O,O"
1,آیا می توانید یک دکتر برای آسم معرفی کنید ؟,"O,O,O,O,O,O,O,O,O,O"
2,کسی یک دکتر مرد برای افسردگی در خرم آباد یا مش...,"O,O,O,B-gen,O,O,O,B-cit,I-cit,O,B-cit,O,O,O"
3,آیا می توانید یک دکتر برای روانپزشکی معرفی کنید ؟,"O,O,O,O,O,O,O,O,O,O"
4,آیا می توانید یک دکتر برای زنان معرفی کنید ؟,"O,O,O,O,O,O,B-gen,O,O,O"


In [41]:
# Tag string -> integer class id; ids follow the order of the tags listed here.
label2id = {
    tag: class_id
    for class_id, tag in enumerate(['O', 'B-gen', 'I-gen', 'B-cit', 'I-cit'])
}

In [42]:
# Integer class id -> tag string (the inverse of the tag -> id table).
id2label = dict(enumerate(['O', 'B-gen', 'I-gen', 'B-cit', 'I-cit']))

# Preprocess

In [43]:
# Preprocessing / batching configuration.
MAX_LEN = 128          # every example is padded or truncated to this many word pieces
TRAIN_BATCH_SIZE = 4   # batch size of the training DataLoader
VALID_BATCH_SIZE = 2   # batch size of the evaluation DataLoader

# Tokenizer of the same checkpoint that initialize_model() loads for fine-tuning.
tokenizer = BertTokenizer.from_pretrained(
    'HooshvareLab/bert-fa-base-uncased'
)



## Define functions

In [44]:
def tokenize_and_preserve_labels(sentence: str, text_labels: str, tokenizer: PreTrainedTokenizer) -> tuple[list[str], list[str]]:
    """Split a sentence into word pieces and repeat each word's label for every piece.

    Args:
        sentence: Whitespace-separated words.
        text_labels: Comma-separated labels, one per word of ``sentence``.
        tokenizer: Tokenizer whose ``tokenize`` method splits a single word into word pieces.

    Returns:
        ``(word_pieces, labels)`` - two lists of equal length. Words and labels are paired
        with ``zip``, so any surplus words or labels are ignored.
    """
    words = sentence.strip().split()
    word_labels = text_labels.split(",")

    tokenized_sentence, labels = [], []
    for word, label in zip(words, word_labels):
        pieces = tokenizer.tokenize(word)
        tokenized_sentence += pieces
        # every piece of the word inherits the word-level label
        labels += [label] * len(pieces)

    return tokenized_sentence, labels

In [45]:
class dataset(Dataset):
    """Token-classification dataset built from a DataFrame with ``sentence`` and ``label`` columns.

    Each item is tokenized into word pieces, wrapped in ``[CLS]`` / ``[SEP]``, and padded or
    truncated to ``max_len`` positions.
    """

    def __init__(self, dataframe: pd.DataFrame, tokenizer: PreTrainedTokenizer, max_len: int) -> None:
        """Store the frame, the tokenizer and the fixed sequence length.

        Args:
            dataframe: Frame with a ``sentence`` column (whitespace-separated words) and a
                ``label`` column (comma-separated tags, one per word).
            tokenizer: Tokenizer used to split words and map word pieces to ids.
            max_len: Length of every returned sequence, special tokens included.
        """
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index: int):
        """Return the ``index``-th example as a dict of ``ids``, ``mask`` and ``targets`` tensors."""
        # step 1: tokenize (and adapt corresponding labels).
        # Positional lookup, so the frame does not need a 0..n-1 index.
        sentence = self.data["sentence"].iloc[index]
        word_labels = self.data["label"].iloc[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # step 2: truncate BEFORE adding the special tokens so [SEP] always survives
        body_budget = max(self.max_len - 2, 0)
        tokenized_sentence = tokenized_sentence[:body_budget]
        labels = labels[:body_budget]

        # step 3: add special tokens and their outside labels.
        # The [SEP] label must be appended: insert(-1, ...) would place it *before* the last
        # real label and shift that label onto [SEP].
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]
        labels = ["O"] + labels + ["O"]

        # step 4: pad up to max_len
        pad_count = self.max_len - len(tokenized_sentence)
        tokenized_sentence = tokenized_sentence + ["[PAD]"] * pad_count
        labels = labels + ["O"] * pad_count

        # step 5: attention mask is 1 on real tokens and 0 on padding
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # step 6: convert tokens and labels to integer ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
        label_ids = [label2id[label] for label in labels]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(attn_mask, dtype=torch.long),
            'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        """Number of rows in the underlying frame."""
        return self.len

## Split into train and test sets

In [46]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_size = 0.8
sampled_rows = data.sample(frac=train_size, random_state=200)
test_dataset = data.drop(index=sampled_rows.index).reset_index(drop=True)
train_dataset = sampled_rows.reset_index(drop=True)

print(f"FULL Dataset: {data.shape}")
print(f"TRAIN Dataset: {train_dataset.shape}")
print(f"TEST Dataset: {test_dataset.shape}")

# Wrap both frames so they yield padded id / mask / target tensors.
training_set = dataset(dataframe=train_dataset, tokenizer=tokenizer, max_len=MAX_LEN)
testing_set = dataset(dataframe=test_dataset, tokenizer=tokenizer, max_len=MAX_LEN)

FULL Dataset: (1000, 2)
TRAIN Dataset: (800, 2)
TEST Dataset: (200, 2)


In [47]:
# Sanity check: display the encoded tensors ('ids', 'mask', 'targets') of the first training example.
training_set[0]

{'ids': tensor([    2,  3777,  4283,  3949,  2831,  7169,  2793, 59086,  1012,     4,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

In [48]:
# Show the first 30 word pieces of the first training example next to their labels
first_example = training_set[0]
preview_tokens = tokenizer.convert_ids_to_tokens(first_example["ids"][:30])
preview_label_ids = first_example["targets"][:30]
for token, label in zip(preview_tokens, preview_label_ids):
  print('{0:10}  {1}'.format(token, id2label[label.item()]))

[CLS]       O
دنبال       O
دکتر        O
خوبی        O
برای        O
افسردگی     O
می          O
گردم        O
.           O
[SEP]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O


In [49]:
# DataLoader options; batches are loaded in the main process (num_workers=0).
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

# Define and track models with MLflow

In [50]:
# Fine-tuning hyperparameters; the run cell logs all three as MLflow params.
EPOCHS = 2             # full passes over the training loader
LEARNING_RATE = 1e-5   # Adam step size
MAX_GRAD_NORM = 10     # max_norm passed to gradient clipping in train()

In [51]:
# Store runs under the local "mlflow" directory and group them in the "NER" experiment.
mlflow.set_tracking_uri(uri="mlflow")
mlflow.set_experiment(experiment_name="NER")

<Experiment: artifact_location='file:///d:/Learning/Hammasir Camp/Project/mlflow/900433234344106285', creation_time=1724746678302, experiment_id='900433234344106285', last_update_time=1724746678302, lifecycle_stage='active', name='NER', tags={}>

## Model, training, evaluation, and inference functions

In [52]:
def initialize_model(id2label: dict, label2id: dict, training_set: dataset) -> tuple[BertForTokenClassification, torch.Tensor]:
    """Load the pretrained checkpoint with a token-classification head and probe its initial loss.

    Args:
        id2label: Class id -> tag mapping stored in the model config; its size sets ``num_labels``.
        label2id: Tag -> class id mapping stored in the model config.
        training_set: Dataset whose first example is used for a single probing forward pass.

    Returns:
        ``(model, initial_loss)`` - the model moved to the module-level ``device`` and the loss
        tensor it produces on the first training example.
    """
    model = BertForTokenClassification.from_pretrained(
        'HooshvareLab/bert-fa-base-uncased',
        id2label=id2label,
        label2id=label2id,
        num_labels=len(id2label),
    )
    model.to(device)

    # Turn the first example into a batch of one on the target device.
    sample = training_set[0]
    probe = {key: sample[key].unsqueeze(0).to(device) for key in ("ids", "mask", "targets")}

    outputs = model(input_ids=probe["ids"], attention_mask=probe["mask"], labels=probe["targets"])
    return model, outputs[0]

In [53]:
# Training function: one pass over the training loader (80% of the dataset) to fine-tune the BERT model
def train(optimizer: torch.optim.Adam, max_norm: int, training_loader: DataLoader, model: BertForTokenClassification) -> tuple[BertForTokenClassification, float, float]:
    """Run one training epoch and report the mean loss and mean per-batch token accuracy.

    Args:
        optimizer: Optimizer bound to ``model``'s parameters.
        max_norm: Maximum gradient norm used for clipping before each optimizer step.
        training_loader: Loader yielding dicts with ``ids``, ``mask`` and ``targets`` tensors.
        model: Model to update in place; it is switched to training mode.

    Returns:
        ``(model, epoch_loss, tr_accuracy)`` - the same model object, the loss averaged over
        batches, and the accuracy on non-padding positions averaged over batches.
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()
        nb_tr_steps += 1

        if idx % 100 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy on positions where the attention mask is 1
        # (this includes the [CLS] and [SEP] positions)
        flattened_targets = targets.view(-1)  # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, dim=1)  # shape (batch_size * seq_len,)
        active_accuracy = mask.view(-1) == 1
        active_targets = torch.masked_select(flattened_targets, active_accuracy)
        active_predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_accuracy += accuracy_score(active_targets.cpu().numpy(), active_predictions.cpu().numpy())

        # backward pass: clear stale gradients, compute fresh ones, THEN clip them.
        # Clipping must sit between backward() and step(); clipping before backward()
        # only touches the previous step's gradients and has no effect on this update.
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=max_norm
        )
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    return model, epoch_loss, tr_accuracy

In [54]:
def valid(model: BertForTokenClassification, testing_loader: DataLoader, device: str, id2label: dict, label2id: dict) -> tuple[list[str], list[str], float, float]:
    """Evaluate the model on a loader without updating its weights.

    Args:
        model: Model to evaluate; it is switched to evaluation mode.
        testing_loader: Loader yielding dicts with ``ids``, ``mask`` and ``targets`` tensors.
        device: Device the batches are moved to.
        id2label: Class id -> tag mapping used to decode targets and predictions.
        label2id: Accepted for symmetry with ``id2label``; not read by this function.

    Returns:
        ``(labels, predictions, eval_loss, eval_accuracy)`` - gold and predicted tags for all
        non-padding positions, flattened across batches, followed by the loss and the accuracy
        averaged over batches.
    """
    model.eval()

    running_loss = 0
    running_accuracy = 0
    step_count = 0
    gold_ids, predicted_ids = [], []

    with torch.no_grad():
        for step, batch in enumerate(testing_loader):
            input_ids = batch['ids'].to(device, dtype=torch.long)
            attention = batch['mask'].to(device, dtype=torch.long)
            gold = batch['targets'].to(device, dtype=torch.long)

            result = model(input_ids=input_ids, attention_mask=attention, labels=gold)
            running_loss += result.loss.item()
            step_count += 1

            if step % 100 == 0:
                loss_step = running_loss / step_count
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # Score only positions with attention mask 1 (real tokens plus [CLS]/[SEP]).
            keep = attention.view(-1) == 1
            token_predictions = torch.argmax(result.logits.view(-1, model.num_labels), axis=1)
            kept_gold = torch.masked_select(gold.view(-1), keep)
            kept_predictions = torch.masked_select(token_predictions, keep)

            gold_ids.extend(kept_gold)
            predicted_ids.extend(kept_predictions)

            running_accuracy += accuracy_score(kept_gold.cpu().numpy(), kept_predictions.cpu().numpy())

    # Decode class ids back to tag strings.
    labels = [id2label[class_id.item()] for class_id in gold_ids]
    predictions = [id2label[class_id.item()] for class_id in predicted_ids]

    return labels, predictions, running_loss / step_count, running_accuracy / step_count

In [55]:
def predict(sentence: str, model: BertForTokenClassification, tokenizer: BertTokenizer, id2label: dict, device: str) -> tuple[str, list[str]]:
  """Tag a raw sentence and return its re-joined text with one predicted tag per word.

  Args:
    sentence: Raw input text.
    model: Fine-tuned token-classification model; it is switched to evaluation mode.
    tokenizer: Tokenizer matching ``model``.
    id2label: Class id -> tag mapping.
    device: Device the input tensors are moved to.

  Returns:
    ``(text, word_level_predictions)`` - the word pieces joined back into words (special
    tokens removed) and the tag predicted for the first word piece of each word.
  """
  special_tokens = ('[CLS]', '[SEP]', '[PAD]')
  inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

  # move to the target device
  ids = inputs["input_ids"].to(device)
  mask = inputs["attention_mask"].to(device)

  # forward pass: inference only, so disable dropout and gradient tracking
  model.eval()
  with torch.no_grad():
    outputs = model(input_ids=ids, attention_mask=mask)
  logits = outputs[0]

  active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
  flattened_predictions = torch.argmax(active_logits, dim=1) # shape (batch_size*seq_len,) - predictions at the token level

  tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
  token_predictions = [id2label[i] for i in flattened_predictions.cpu().tolist()]
  wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

  # Keep one prediction per word: skip special tokens and continuation pieces.
  # Continuation pieces start with "##" (no leading space), so the prefix test must be "##".
  word_level_predictions = [
    prediction
    for token, prediction in wp_preds
    if not token.startswith("##") and token not in special_tokens
  ]

  # we join tokens, if they are not special ones
  str_rep = " ".join([token for token, _ in wp_preds if token not in special_tokens]).replace(" ##", "")
  return str_rep, word_level_predictions

# Base Model

In [57]:
# Train and evaluate the baseline inside one MLflow run; autologging is off, so everything
# recorded for this run is logged explicitly below.
mlflow.transformers.autolog(disable=True)
with mlflow.start_run(run_name='base_line_city_gender'):
    mlflow.log_params({
        'EPOCHS': EPOCHS,
        'LEARNING_RATE': LEARNING_RATE,
        'MAX_GRAD_NORM': MAX_GRAD_NORM
    })
    model, initial_loss = initialize_model(label2id=label2id, id2label=id2label, training_set=training_set)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
    for i in range(EPOCHS):
        model, epoch_loss, tr_accuracy = train(optimizer, MAX_GRAD_NORM, training_loader, model)
        mlflow.log_metric(f'loss_epoch{i+1}', epoch_loss)
        mlflow.log_metric(f'accuracy_epoch{i+1}', tr_accuracy)
    labels, predictions, eval_loss, eval_accuracy = valid(model, testing_loader, device, id2label, label2id)
    mlflow.log_metric('eval_loss', eval_loss)
    mlflow.log_metric('eval_accuracy', eval_accuracy)
    # classification_report returns a formatted string, which log_metric (numeric values only)
    # cannot store; keep it as a text artifact instead.
    report = classification_report([labels], [predictions])
    mlflow.log_text(report, 'classification_report.txt')
    # The transformers flavor expects a pipeline or a dict of components, not a bare model.
    mlflow.transformers.log_model(
        {'model': model, 'tokenizer': tokenizer},
        'model',
        task='token-classification',
    )

pytorch_model.bin:   2%|1         | 10.5M/654M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training loss per 100 training steps: 1.3812986612319946
Training loss per 100 training steps: 0.08356071243726529


In [None]:
# Try the fine-tuned model on an unseen, colloquial request.
sentence = "سلام یه متخصص پوست و مو خانوم تو هاشمیه مشهد برام پیدا کن که این هفته وقت داشته باشه"
sentence_out, word_level_predictions = predict(
    sentence=sentence, model=model, tokenizer=tokenizer, id2label=id2label, device=device
)
print(sentence_out)
print(word_level_predictions)

NameError: name 'predict' is not defined