## Evaluate notebook

In [1]:
import tensorflow as tf
import pandas as pd
import evaluate
import os
dir_root = os.getcwd()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
accuracy_ev = evaluate.load("accuracy")
print(accuracy_ev.description)


Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
Accuracy = (TP + TN) / (TP + TN + FP + FN)
 Where:
TP: True positive
TN: True negative
FP: False positive
FN: False negative



In [48]:
# model_tf = tf.saved_model.load(os.path.join(os.getcwd(), 'models/tfsample/'))
# model_tf = tf.keras.models.load_model(os.path.join(os.getcwd(), 'models/modeltf/'))
from transformers import AutoModelForSequenceClassification
model_1 = AutoModelForSequenceClassification.from_pretrained('models/huggingfacemodel/', num_labels=6, from_tf=True)


All TF 2.0 model weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.


In [49]:
model_1

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

In [3]:
dataset_df = pd.read_csv(os.path.join(dir_root, 'data/interim/trainset.csv'), converters={'NDD':str})
labels_set = set(dataset_df.labels.to_list())

In [4]:
from datasets import load_dataset, Features, ClassLabel, Value

In [5]:
traincsv = os.path.join(dir_root, 'data/interim/trainsethugf.csv')
testcsv = os.path.join(dir_root, 'data/interim/testsethugf.csv')
validcsv = os.path.join(dir_root, 'data/interim/validsethugf.csv')
# class_names = ["RoboADomicilio", "RoboAPersonas", "RoboAUnidadesEconomicas", "RoboDeBienesAccesoriosYAutoPartes", "RoboDeCarros", "RoboDeMotos"]
class_names = list(labels_set)
robo_features = Features({'relato': Value('string'), 'labels': ClassLabel(names=class_names)})
dataset = load_dataset("csv", data_files={'train': traincsv, 'test': testcsv, 'validation':validcsv}, features=robo_features)

Using custom data configuration default-8ad38b8b415aeff6


Downloading and preparing dataset csv/default to C:\Users\entea\.cache\huggingface\datasets\csv\default-8ad38b8b415aeff6\0.0.0\51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58...


Downloading data files: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 3001.65it/s]
Extracting data files: 100%|████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 596.21it/s]
                                 

Dataset csv downloaded and prepared to C:\Users\entea\.cache\huggingface\datasets\csv\default-8ad38b8b415aeff6\0.0.0\51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58. Subsequent calls will reuse this data.


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 54.47it/s]


In [6]:
seqlen = dataset_df.relato.apply(lambda x: len(x.split()))

In [7]:
from transformers import DistilBertTokenizer, AutoTokenizer, DistilBertTokenizerFast
# model_name = 'xlm-roberta-large'
# model_name = 'bert-base-cased'
# model_name = 'bert-base-multilingual-uncased-sentiment'
model_name = 'distilbert-base-multilingual-cased'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

In [8]:
def tokenizer_func(examples):
  return tokenizer(examples["relato"],
                   max_length=seqlen.max(),
                   padding = "max_length",
                   truncation=True)

tokenized_dataset = dataset.map(tokenizer_func, batched=True)

100%|████████████████████████████████████████████████████████████████████████████████| 274/274 [01:03<00:00,  4.30ba/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [00:24<00:00,  3.68ba/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 69/69 [00:18<00:00,  3.72ba/s]


In [9]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [10]:
tf_test_set = tokenized_dataset["test"].shuffle(seed=42).select(range(4000)).to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [11]:
# from transformers import create_optimizer

# batch_size = 16
# num_epochs = 50
# batches_per_epoch = len(tokenized_dataset["train"]) // batch_size
# total_train_steps = int(batches_per_epoch * num_epochs)
# optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [12]:
# model_tf.compile(optimizer=optimizer, metrics = ['accuracy'])

In [13]:
# model_tf.evaluate(tf_test_set)

In [14]:
dataset_df = pd.read_csv(os.path.join(dir_root, 'data/raw/dataset.csv'))
dataset_df.sample(5)

Unnamed: 0,NDD,RELATO,cantidad_palabras,Tipo_Delito_PJ,TARGET,LABELS_ROBO A DOMICILIO,LABELS_ROBO A PERSONAS,LABELS_ROBO A UNIDADES ECONOMICAS,"LABELS_ROBO DE BIENES, ACCESORIOS Y AUTOPARTES DE VEHICULOS",LABELS_ROBO DE CARROS,LABELS_ROBO DE MOTOS,LABELS
67134,80401822040064,es el caso señor fiscal que el dia lunes 11 de...,110,ROBO,5,1,0,0,0,0,0,ROBO A DOMICILIO
69094,80601821020060,se ingresa formal denuncia escrita parte polic...,38,ROBO,5,1,0,0,0,0,0,ROBO A DOMICILIO
200281,90701820020286,es el caso señor fiscal que el día de hoy 19 d...,120,ROBO,4,0,0,0,1,0,0,"ROBO DE BIENES, ACCESORIOS Y AUTOPARTES DE VEH..."
126691,90101817082686,es el caso señor fiscal que el 13 de agosto de...,91,ROBO,4,0,0,0,1,0,0,"ROBO DE BIENES, ACCESORIOS Y AUTOPARTES DE VEH..."
92338,90101815066900,es el caso señor fiscal que el dia 19 de junio...,195,ROBO,6,0,1,0,0,0,0,ROBO A PERSONAS


In [15]:
dataset_df.groupby(['TARGET', 'LABELS'])['NDD'].nunique().reset_index()

Unnamed: 0,TARGET,LABELS,NDD
0,1,ROBO A UNIDADES ECONOMICAS,30291
1,2,ROBO DE CARROS,35327
2,3,ROBO DE MOTOS,48044
3,4,"ROBO DE BIENES, ACCESORIOS Y AUTOPARTES DE VEH...",66173
4,5,ROBO A DOMICILIO,72008
5,6,ROBO A PERSONAS,179826


In [16]:
set(tokenized_dataset['test']['labels'])

{0, 1, 2, 3, 4, 5}

In [17]:
from datasets import ClassLabel
labels_set = set(dataset_df.LABELS.to_list())
ClassLabel(names=class_names)

ClassLabel(num_classes=6, names=['ROBO A DOMICILIO', 'ROBO A PERSONAS', 'ROBO DE BIENES, ACCESORIOS Y AUTOPARTES DE VEHICULOS', 'ROBO DE CARROS', 'ROBO A UNIDADES ECONOMICAS', 'ROBO DE MOTOS'], id=None)

should I suppose that:

In [18]:
labels_dict = dict(zip(range(6), labels_set))
labels_dict.items()

dict_items([(0, 'ROBO A DOMICILIO'), (1, 'ROBO A PERSONAS'), (2, 'ROBO DE BIENES, ACCESORIOS Y AUTOPARTES DE VEHICULOS'), (3, 'ROBO DE CARROS'), (4, 'ROBO A UNIDADES ECONOMICAS'), (5, 'ROBO DE MOTOS')])

In [50]:
from evaluate import evaluator
metric = evaluate.load("accuracy")
eval = evaluator("text-classification")

results = eval.compute(model_1, 
                       data=dataset["test"].shuffle(seed=42).select(range(4000)), 
                       metric=metric, input_column="relato", 
                       label_column="labels", 
                       tokenizer = tokenizer,
                      label_mapping = labels_dict)

print(results)

KeyError: 'LABEL_2'

Trying to get a tensorflow model that could be used

In [51]:
from transformers import TFDistilBertForSequenceClassification
model_tf = TFDistilBertForSequenceClassification.from_pretrained('models/huggingfacemodel/', num_labels=6)

Some layers from the model checkpoint at models/huggingfacemodel/ were not used when initializing TFDistilBertForSequenceClassification: ['dropout_99']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at models/huggingfacemodel/ and are newly initialized: ['dropout_139']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
model_tf.load_weights('models/modelweights/')

PermissionError: [Errno 13] Unable to open file (unable to open file: name = 'models/modelweights/', errno = 13, error message = 'Permission denied', flags = 0, o_flags = 0)

In [32]:
from transformers import create_optimizer
model_tf.layers[0].trainable = False
batch_size = 16
num_epochs = 50
batches_per_epoch = len(tokenized_dataset["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
model_tf.compile(optimizer=optimizer, metrics = ['accuracy'])

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [33]:
model_tf.summary()

Model: "tf_distil_bert_for_sequence_classification_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
distilbert (TFDistilBertMain multiple                  134734080 
_________________________________________________________________
pre_classifier (Dense)       multiple                  590592    
_________________________________________________________________
classifier (Dense)           multiple                  4614      
_________________________________________________________________
dropout_59 (Dropout)         multiple                  0         
Total params: 135,329,286
Trainable params: 595,206
Non-trainable params: 134,734,080
_________________________________________________________________


In [34]:
model_tf.evaluate(tf_test_set)



[3.342100143432617, 0.0702499970793724]

Loading weights:

In [36]:
model_name

'distilbert-base-multilingual-cased'

In [35]:
modelw = TFDistilBertForSequenceClassification.from_pretrained(model_name, num_labels=6)

Some layers from the model checkpoint at distilbert-base-multilingual-cased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'activation_13', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_79']
You should probably TRAIN this model on a down-stream ta

In [38]:
modelw.layers[0].trainable = False
modelw.compile(optimizer=optimizer, metrics = ['accuracy'])

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [39]:
modelw.summary()

Model: "tf_distil_bert_for_sequence_classification_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
distilbert (TFDistilBertMain multiple                  134734080 
_________________________________________________________________
pre_classifier (Dense)       multiple                  590592    
_________________________________________________________________
classifier (Dense)           multiple                  4614      
_________________________________________________________________
dropout_79 (Dropout)         multiple                  0         
Total params: 135,329,286
Trainable params: 595,206
Non-trainable params: 134,734,080
_________________________________________________________________


In [46]:
modelw.load_weights('models/modelweights/checkpoint')

OSError: Unable to open file (file signature not found)