# 1st Experiment classifying GRIs into Concepts: 200,300,400

> This script is used to train a multiclass classifier using Bert and some other flavors, also we provide the evaluation of each strategy. For better performance we use a TPU backend.


> To run on Google Drive, the input data (training and validation) is expected in the folder named SQUAD MATERIAL.

OBS.: Need to treat words like im- pact 

In [1]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 37.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [2]:
#!cat /proc/cpuinfo
#!cat /proc/meminfo

In [2]:
!pip install tokenizers
!pip install transformers

import json
import pandas as pd
import string
import re
import gc
import os
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers

!pip install sentencepiece
from transformers import BertTokenizer, TFBertModel
from transformers import DistilBertTokenizer, TFDistilBertModel
from transformers import RobertaTokenizer, TFRobertaModel
from transformers import ElectraTokenizer, TFElectraModel
# from transformers import FunnelTokenizer, TFFunnelForTokenClassification
from transformers import AlbertTokenizer, TFAlbertModel

Collecting tokenizers
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 5.4 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.10.3
Collecting transformers
  Downloading transformers-4.10.3-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 5.4 MB/s 
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 69.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 70.8 MB/s 
Installing collected packages: sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uni

In [3]:
# Accessing local drive

from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/NLP_tese/Exp1

Mounted at /content/drive
/content/drive/MyDrive/NLP_tese/Exp1


In [4]:
import nltk
from nltk.corpus import stopwords
from functools import reduce

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
GOOD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    STOPWORDS = set(stopwords.words('english'))
def lower(text):
    return text.lower()
def replace_special_characters(text):
    return REPLACE_BY_SPACE_RE.sub(' ', text)
def filter_out_uncommon_symbols(text):
    return GOOD_SYMBOLS_RE.sub('', text)
def remove_stopwords(text):
    return ' '.join([x for x in text.split() if x and x not in STOPWORDS])
def strip_text(text):
    return text.strip()
def remove_GRI(text):
    regex = re.compile(r'GRI|gri', re.UNICODE)
    return re.sub(regex, '', text)
def remove_articles(text):
    regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
    return re.sub(regex, ' ', text)
 
PREPROCESSING_PIPELINE = [
#                          remove_articles,
#                          remove_GRI,
                          lower,
#                          replace_special_characters,
#                          filter_out_uncommon_symbols,
#                          remove_stopwords,
#                          strip_text
                          ]
# Anchor method
def text_prepare(text, filter_methods=None):
    """
    Applies a list of pre-processing functions in sequence (reduce).
    Note that the order is important here!
    """

    filter_methods = filter_methods if filter_methods is not None else PREPROCESSING_PIPELINE

    return reduce(lambda txt, f: f(txt), filter_methods, text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
## Implementing our new creat_dataframe function 
def aggregate_cat(txt):
  txt_100 = re.sub('^1[0-9][0-9]\-[0-9]+', '100', txt.strip())
  txt_200 = re.sub('^2[0-9][0-9]\-[0-9]+', '200', txt_100)
  txt_300 = re.sub('^3[0-9][0-9]\-[0-9]+', '300', txt_200)
  txt_400 = re.sub('^4[0-9][0-9]\-[0-9]+', '400', txt_300)
  return (txt_400)

def create_dataframe(ds, tokenizer, maxlen=512, stride=30, split=0.9, use_token_type_ids=True):
  samples = []
  # We have our ds with the format of <Filename,GRI,Text>
  #let's start aggregating our 200,300 and 400 categories
  print(ds.dtypes)
  ds['GRI'] = ds['GRI'].apply(lambda txt: aggregate_cat(txt))
  ds['GRI'] = pd.to_numeric(ds['GRI']).astype('int32') 
  # Let's remove the 100 category
  ds.drop(ds[ds['GRI'] == 100].index, inplace=True)
  ds = ds.reset_index(drop=True)
  ds['GRI'] = ds['GRI'].astype('int32') 
  # We clean our text data
  ds['Text'] = ds['Text'].apply(lambda txt: text_prepare(txt))
  ###
  for index, row in ds.iterrows():
    tmp_a = row['Text']
    tmp_gri = row['GRI']
    tmp_filename = row['Filename']
    tmp_tokens = tokenizer(tmp_a)
    tok_len = len(tmp_tokens["input_ids"])
    idx = tmp_tokens["input_ids"].index(tokenizer.sep_token_id)
    # Split between train and validation according to the data field in the original dataframe.
    tmp_split = "train" if index < len(ds['Text']) * split else "validation"
    # If the total length exceeds the window length, the context is split with a partial overlap governed by stride.
    if tok_len > maxlen:
      print("##Warning: token length larger than the maximum ({}). Splitting answer into partially overlapped chunks...".format(tok_len))
      print(tmp_a)
    else:
      if use_token_type_ids:
        samples.append({"filename": tmp_filename, "text": tmp_a, "gri": tmp_gri, "input_ids": tmp_tokens["input_ids"], "token_type_ids": tmp_tokens["token_type_ids"], "attention_mask": tmp_tokens["attention_mask"], "split": tmp_split})
      else:
        samples.append({"filename": tmp_filename, "text": tmp_a, "gri": tmp_gri, "input_ids": tmp_tokens["input_ids"], "attention_mask": tmp_tokens["attention_mask"], "split": tmp_split})

  return pd.DataFrame(samples)

# Returns tensors for the model, starting from the provided dataframe (which contains prewindowed entries).
def generate_samples(df, max_len, use_token_type_ids=True):
  input_ids = np.zeros((len(df), max_len), dtype=np.int32)
  attention_mask = np.zeros((len(df), max_len), dtype=np.bool)
  class_ESG = np.zeros((len(df),3), dtype=np.bool)

  print(df.dtypes)
  if use_token_type_ids:
    token_type_ids = np.zeros((len(df), max_len), dtype=np.uint8)

  for i in range(len(df)):
    tmp_length = len(df["input_ids"].loc[i])
    input_ids[i,:tmp_length] = df["input_ids"].loc[i]
    attention_mask[i,:tmp_length] = df["attention_mask"].loc[i]
    if use_token_type_ids:
      token_type_ids[i,:tmp_length] = df["token_type_ids"].loc[i]
    # we want class_ESG with vector(3) with [1,0,0] if 200, [0,1,0]
    if df['gri'].loc[i] == 200:
      class_ESG[i,0] = 1
    elif df['gri'].loc[i] == 300:
      class_ESG[i,1] = 1
    else :
      class_ESG[i,2] = 1
  # End for

  if use_token_type_ids:
    return [input_ids, token_type_ids, attention_mask], [class_ESG]
  else:
    return [input_ids, attention_mask], [class_ESG]


### Preprocessing and windowing

### Model

In [6]:
# Modular model topology. Takes an encoder layer and connects it to text head. 
# https://machinelearningmastery.com/multi-class-classification-tutorial-keras-deep-learning-library/

def model_topology(encoder, max_len=512, use_pooler=True, use_token_type_ids=True):
  input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
  attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)

  if use_token_type_ids:
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    transformer = encoder(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
  else:
    transformer = encoder(input_ids, attention_mask=attention_mask)
  # we predict 3 classes (200,300,400), so units=3
  if use_pooler:
    class_ESG = layers.Dense(units=3, activation="sigmoid")(transformer["pooler_output"])
  else:
    transformer = layers.Flatten()(transformer["last_hidden_state"])
    class_ESG = layers.Dense(units=3, activation="sigmoid")(transformer)

  optimizer = keras.optimizers.Adam(learning_rate=5e-5) 
  # From https://towardsdatascience.com/multi-label-multi-class-text-classification-with-bert-transformer-and-keras-c6355eccb63a
  # We have this suggestion for the optimizer
  # optimizer = Adam( learning_rate=5e-05, epsilon=1e-08, decay=0.01, clipnorm=1.0)

  if use_token_type_ids:
    model = keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[class_ESG])
  else:
    model = keras.Model(inputs=[input_ids, attention_mask], outputs=[class_ESG])

  model.compile(loss=["binary_crossentropy"], optimizer=optimizer, metrics=['accuracy'])
# The new model
# https://stackoverflow.com/questions/58565394/what-is-the-difference-between-sparse-categorical-crossentropy-and-categorical-c
# I still don't know if it is better sparse or the simple categorical, read above
#  model.compile(loss=["categorical_crossentropy", "categorical_crossentropy", "categorical_crossentropy"], loss_weights=[0.33, 0.33, 0.33], optimizer=optimizer, metrics=['accuracy'])
  model.summary()
  return model

### Evaluation metrics

In [7]:
# Average exact match score (accuracy) for a list of answers.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

# Evaluation method. Given a test dataframe and a model (with its own parameters), predicts the outputs, optionally prints them, and computes the metrics.
def eval(df, model, max_len, tokenizer, use_token_type_ids=True, print_output=False):
  #list of predictions
  pred_ans=[]
  true_ans=[]
  if use_token_type_ids:
    [input_ids, token_type_ids, attention_mask], class_ESG = generate_samples(df, max_len, use_token_type_ids) 
    pred_class_ESG = model.predict([input_ids, token_type_ids, attention_mask])
  else:
    [input_ids, attention_mask], [class_ESG] = generate_samples(df, max_len, use_token_type_ids) 
    pred_class_ESG = model.predict([input_ids, attention_mask])

  for i in range(len(df)):
    if max(pred_class_ESG[i]) > 0.5:
      true_ans.append(df['gri'].loc[i])
      if pred_class_ESG[i][0] > 0.5:
        pred_ans.append(200)
      elif pred_class_ESG[i][1] > 0.5:
        pred_ans.append(300)
      elif pred_class_ESG[i][2] > 0.5:
        pred_ans.append(400)
      else:
        print("Error no class assigned\n")
  if print_output:
    for i in range(len(df)):
      print("Sentence: {}".format(df["text"].loc[i]))
      print("True category: {}, Prediction 200: {:+.2f}, Prediction 300: {:+.2f}, Prediction 400: {:+.2f}".format(
          df['gri'].loc[i], pred_class_ESG[i][0],pred_class_ESG[i][1],pred_class_ESG[i][2]))

  print("=======================================")
  print("Accuracy:\t\t{:+.4f}%".format(accuracy_score(true_ans, pred_ans)*100))
  print("F1-score:\t\t{:+.4f}%".format(f1_score(true_ans, pred_ans, average="macro")*100))
  print("Precision:\t\t{:+.4f}%".format(precision_score(true_ans, pred_ans, average="macro")*100))
  print("Recall:\t\t\t{:+.4f}%".format(recall_score(true_ans, pred_ans, average="macro")*100))
  return accuracy_score(true_ans, pred_ans), f1_score(true_ans, pred_ans, average="macro"), recall_score(true_ans, pred_ans, average="macro")
  

In [8]:
## Let's check the model -- Training
use_tpu = True
# max_lengths = [256, 384, 512] # Windows with which each model will be trained.
max_lengths = [512] # Windows with which each model will be trained.

if not os.path.exists("weights"):
  os.mkdir("weights")

with open("summExp1.jsonl", "r") as f:
  ts_file = pd.read_json(f,lines=True)

# Models which are going to be trained.
models = {
    # "bert-large-uncased": {"tokenizer": BertTokenizer, "encoder": TFBertModel, "weights": "bert-large-uncased", "use_token_type_ids": True, "use_pooler": True} ,
    # "bert-uncased": {"tokenizer": BertTokenizer, "encoder": TFBertModel, "weights": "bert-base-uncased", "use_token_type_ids": True, "use_pooler": True} ,
      "bert-base-multilingual-uncased": {"tokenizer": BertTokenizer, "encoder": TFBertModel, "weights": "bert-base-multilingual-uncased", "use_token_type_ids": True, "use_pooler": True} #,
  #  "roberta": {"tokenizer": RobertaTokenizer, "encoder": TFRobertaModel, "weights": "roberta-base", "use_token_type_ids": False, "use_pooler": True},
## not working memory    "roberta-large": {"tokenizer": RobertaTokenizer, "encoder": TFRobertaModel, "weights": "roberta-large", "use_token_type_ids": False, "use_pooler": True}
  # "electra-base": {"tokenizer": ElectraTokenizer, "encoder": TFElectraModel, "weights": "google/electra-base-discriminator", "use_token_type_ids": False, "use_pooler": False},
  # "electra-large-generator": {"tokenizer": ElectraTokenizer, "encoder": TFElectraModel, "weights": "google/electra-large-generator", "use_token_type_ids": False, "use_pooler": False}
    # "electra-small": {"tokenizer": ElectraTokenizer, "encoder": TFElectraModel, "weights": "google/electra-small-discriminator", "use_token_type_ids": False, "use_pooler": False},
## not working    "funnel-small": {"tokenizer": FunnelTokenizer, "encoder": TFFunnelForTokenClassification, "weights": "funnel-transformer/small", "use_token_type_ids": False, "use_pooler": False},
## not working (lasthidenstate)    "funnel-intermediate": {"tokenizer": FunnelTokenizer, "encoder": TFFunnelForTokenClassification, "weights": "funnel-transformer/intermediate", "use_token_type_ids": False, "use_pooler": False},
  #  "distilbert-uncased": {"tokenizer": DistilBertTokenizer, "encoder": TFDistilBertModel, "weights": "distilbert-base-uncased", "use_token_type_ids": False, "use_pooler": False} ,
  #  "distilbert-cased": {"tokenizer": DistilBertTokenizer, "encoder": TFDistilBertModel, "weights": "distilbert-base-cased", "use_token_type_ids": False, "use_pooler": False},
#   "distilbert-multilingual": {"tokenizer": DistilBertTokenizer, "encoder": TFDistilBertModel, "weights": "distilbert-base-multilingual-cased", "use_token_type_ids": False, "use_pooler": False}
  # "albert-base-v2": {"tokenizer": AlbertTokenizer, "encoder": TFAlbertModel, "weights": "albert-base-v2", "use_token_type_ids": False, "use_pooler": False}
    # "albert-large-v2": {"tokenizer": AlbertTokenizer, "encoder": TFAlbertModel, "weights": "albert-large-v2", "use_token_type_ids": False, "use_pooler": False}
}

if not os.path.exists("transformers.csv"):
  with open("transformers.csv", "w") as f:
    f.write("Name\tAccuracy\tF1\tLoss-history\r\n")

if use_tpu:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)

# Train each model for each window length and record the performance on the validation set.
for max_len in max_lengths:
  for name, model in models.items():
    gc.collect()
    with open("summExp1.jsonl", "r") as f:
        ts_file = pd.DataFrame()
        ts_file = pd.read_json(f,lines=True)
    tokenizer = model["tokenizer"].from_pretrained(model["weights"])

    # Each model uses a different tokenizer, so every time, the train/validation dataframe needs to be rebuilt. Albeit slow, this improves randomization.
    df = create_dataframe(ts_file, tokenizer, max_len, 30, 0.9, use_token_type_ids=model["use_token_type_ids"])
    total = len(df)
    validation_len = len(df.where(df["split"] == "validation").dropna())
    print("Total: {}, train: {}, validation: {}, ratio: {}".format(total, total - validation_len, validation_len, (total - validation_len) / total))
    train_df = df.where(df["split"] == "train").dropna().drop("split", axis=1).sample(frac=1).reset_index()
    val_df = df.where(df["split"] == "validation").dropna().drop("split", axis=1).sample(frac=1).reset_index()
    
    if model["use_token_type_ids"]:
      [input_ids, token_type_ids, attention_mask], [class_ESG] = generate_samples(train_df, max_len, use_token_type_ids=True)
    else:
      [input_ids, attention_mask], [class_ESG] = generate_samples(train_df, max_len, use_token_type_ids=False)

    if use_tpu:
      with strategy.scope():
        encoder = model["encoder"].from_pretrained(model["weights"])
        nn = model_topology(encoder, max_len,use_pooler=model["use_pooler"], 
                            use_token_type_ids=model["use_token_type_ids"])
    else:
      encoder = model["encoder"].from_pretrained(model["weights"])
      nn = model_topology(encoder, max_len,use_pooler=model["use_pooler"],
                          use_token_type_ids=model["use_token_type_ids"])
    # For the moment, we don't want so use the save model, add "_2" to the file name check
    if not os.path.exists("weights/{}-{}.h5".format(name, max_len)):
      print("Training {}...".format(name))
      if model["use_token_type_ids"]:
        history = nn.fit([input_ids, token_type_ids, attention_mask], class_ESG, batch_size=64, epochs=10, verbose=1)
      else:
        history = nn.fit([input_ids, attention_mask], class_ESG, batch_size=64, epochs=10, verbose=1)
      acc, f1, rec = eval(val_df, nn, max_len, tokenizer, use_token_type_ids=model["use_token_type_ids"])

      with open("transformers.csv", "a") as f:
        f.write("{}-{}\t{}\t{}\t{}\r\n".format(name, max_len, acc, f1, history.history["loss"]))

      print("{}-{}\t{}\t{}\t{}\r\n".format(name, max_len, acc, f1, history.history["loss"]))
      nn.save_weights("weights/{}-{}.h5".format(name, max_len))
    else:
      print("{} already trained.".format(name))
      nn.load_weights("weights/{}-{}.h5".format(name, max_len))
      ## Still we need to know how to transfer the load_weights into a variable to get the loss and write
      ## on our transformers.csv file
      acc, f1, rec = eval(val_df, nn, max_len, tokenizer, use_token_type_ids=model["use_token_type_ids"],
                          print_output=True)
    with open("transformers.csv", "a") as f:
      print("{}-{}\t{}\t{}\t{}\r\n".format(name, max_len, acc, f1, "0"))
      #   f.write("{}-{}\t{}\t{}\t{}\tN/A\r\n".format(name, max_len, iou, acc, f1))


INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Initializing the TPU system: grpc://10.61.208.10:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.61.208.10:8470


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (693 > 512). Running this sequence through the model will result in indexing errors


Filename    object
GRI         object
Text        object
Obs         object
dtype: object
the income received by the communities during the year exceeded r$ 10 million. that is 55% higher than the amount received in 2015 and, in ad- dition to payment for supplies (r$ 5.8 million), it included the sharing of benefits derived from access to traditional knowledge (r$ 3.07 million), support for community infrastructure, training and payment for use of their image.  we have measured the positive impacts created by our relationship with local families, such as income gen- eration through the purchase of in- puts, investment in developing higher value-added production chains and transfer of technical skills through the training we provide. now that they are much better organized, the communi- ties are entering into partnerships with other companies, as well as natura, to supply inputs obtained from the brazil- ian biodiversity. this development was achieved using a sustainable produc- tion an

Downloading:   0%|          | 0.00/999M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-multilingual-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported


Cause: while/else statement not yet supported


Cause: while/else statement not yet supported




Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 167356416   input_1[0][0]                    
                                                                 input_2[0][0]                

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=uint8>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=bool>, <tf.Tensor 'cond_8/Identity_3:0' shape=(None, 3) dtype=bool>]








INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=uint8>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=bool>, <tf.Tensor 'cond_8/Identity_3:0' shape=(None, 3) dtype=bool>]










Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
index               int64
filename           object
text               object
gri               float64
input_ids          object
token_type_ids     object
attention_mask     object
dtype: object


INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 512) dtype=uint8>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 512) dtype=bool>]










Accuracy:		+84.9315%
F1-score:		+81.1794%
Precision:		+81.3083%
Recall:			+81.5657%
bert-base-multilingual-uncased-512	0.8493150684931506	0.8117944147355912	[0.5407583713531494, 0.35366320610046387, 0.2657684087753296, 0.20697268843650818, 0.20719319581985474, 0.16005904972553253, 0.11494284123182297, 0.10466306656599045, 0.07935904711484909, 0.06696967780590057]

bert-base-multilingual-uncased-512	0.8493150684931506	0.8117944147355912	0



In [9]:
## HW characteristics
!cat /proc/cpuinfo
!cat /proc/meminfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 63
model name	: Intel(R) Xeon(R) CPU @ 2.30GHz
stepping	: 0
microcode	: 0x1
cpu MHz		: 2299.998
cache size	: 46080 KB
physical id	: 0
siblings	: 40
core id		: 0
cpu cores	: 20
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid xsaveopt arat md_clear arch_capabilities
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs
bogomips	: 4599.99
clflush size	: 64
cache_alignment	: 64
address sizes	: 46 bits physical, 48 bits virtual
power management:

processor

In [10]:
! nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.

