<a href="https://colab.research.google.com/github/JessicaVicentini99/AgenDay/blob/master/Lime_Experimento_Final_Classificacao.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
!pip install transformers



In [2]:
from transformers import AutoModelForPreTraining  # Or BertForPreTraining for loading pretraining heads


In [3]:
import tensorflow as tf
# from transformers import AutoTokenizer, TFAutoModelForSequenceClassification #TFBertForSequenceClassification
from transformers import AutoModel, AutoTokenizer, BertTokenizer, BertModel, TFAutoModelForSequenceClassification #TFBertForSequenceClassification
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import datetime
import os
import json
import time
import matplotlib.pyplot as plt
from IPython.display import display, HTML
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns
from tensorflow.keras.optimizers import Adam
import csv


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Create Directories

In [5]:
def create_dir(path):
  """Create the directory `path` if nothing exists at that location yet.

  Missing parent directories are created as well, and a directory that
  appears between the existence check and the creation (for example when
  two sessions share the same Drive folder) is not treated as an error.

  Args:
    path: Directory path to create.
  """
  if not os.path.exists(path):
    # makedirs (unlike mkdir) builds intermediate folders; exist_ok avoids
    # failing if the directory is created concurrently.
    os.makedirs(path, exist_ok=True)

In [6]:
def create_directories(base_path):
  """Create the experiment's output folders under `base_path`.

  Args:
    base_path: Root folder of the experiment.

  Returns:
    Dict mapping a logical name (e.g. 'models_path') to the folder path.
  """
  metrics_root = base_path + '/metrics'
  lime_root = base_path + '/lime_explanations'

  # Folders are created in insertion order, so each parent folder is listed
  # before its sub-folders.
  directories = {
    'models_path': base_path + '/models',
    'checkpoints_path': base_path + '/checkpoints',
    'metrics_path': metrics_root,
    'classification_report_path': metrics_root + '/classification_report',
    'graphs_path': metrics_root + '/graphs',
    'history_path': metrics_root + '/history',
    'lime_explanations': lime_root,
    'lime_html': lime_root + '/lime_html',
  }
  # One HTML folder per (real class, predicted class) combination.
  for suffix in ('1_1', '0_0', '1_0', '0_1'):
    directories['lime_html_' + suffix] = lime_root + '/lime_html_' + suffix

  for folder_path in directories.values():
    create_dir(folder_path)
  return directories

In [7]:
base_path = '/content/drive/MyDrive/Arquivos/Mestrado/Qualificação/Experimentos Pos Qualificacao/Fake_Br_corpus_novo_experimento'

In [8]:
create_dir(base_path)

In [9]:
directories = create_directories(base_path)

## Geração do nome do modelo

In [10]:
def generate_model_name(models_path):
  """Build the name for the next model: 'model_<index>__<dd_mm_YYYY-HH_MM>'.

  The index is one more than the highest index found among the files in
  `models_path` named 'model_<index>_...'. Indices are compared as integers
  (so 'model_10' ranks above 'model_9'), and files that do not follow the
  naming pattern are ignored instead of raising.

  Args:
    models_path: Folder that holds previously saved models.

  Returns:
    The new model name, without file extension.
  """
  indices = []
  for file_name in os.listdir(models_path):
    parts = file_name.split('_')
    if len(parts) > 1 and parts[0] == 'model' and parts[1].isdecimal():
      indices.append(int(parts[1]))

  # With no matching file the numbering starts at 1.
  model_index = max(indices, default=0) + 1
  timestamp = datetime.datetime.now().strftime('%d_%m_%Y-%H_%M')
  return f"model_{model_index}__{timestamp}"

In [11]:
def generate_metrics_files_name(model_name, directories):
  """Build the output file paths associated with one model.

  Args:
    model_name: Model name used as file-name prefix.
    directories: Dict of folder paths as returned by create_directories.

  Returns:
    Dict mapping an artifact name (e.g. 'graph_acc') to its file path.
  """
  models_dir = directories['models_path']
  checkpoints_dir = directories['checkpoints_path']
  metrics_dir = directories['metrics_path']
  report_prefix = directories['classification_report_path'] + '/' + model_name
  graph_prefix = directories['graphs_path'] + '/' + model_name
  history_dir = directories['history_path']
  lime_dir = directories['lime_explanations']

  return {
    'model': f"{models_dir}/{model_name}.h5",
    # '{epoch:02d}' is kept as a literal placeholder in the returned path.
    'checkpoint': f"{checkpoints_dir}/{model_name}" + '_checkpoint-{epoch:02d}.h5',
    'metrics_csv': f"{metrics_dir}/models_metrics.csv",
    'classification_report_txt': report_prefix + '.txt',
    'classification_report_json': report_prefix + '.json',
    'graph_acc': graph_prefix + '_acc.png',
    'graph_loss': graph_prefix + '_loss.png',
    'graph_acc_and_loss': graph_prefix + '_acc_and_loss.png',
    'confusion_matrix': graph_prefix + '_confusion_matrix.png',
    'history': f"{history_dir}/{model_name}_history.json",
    'lime_explanations': f"{lime_dir}/lime_explanations.csv",
  }

In [12]:
model_save_name = generate_model_name(directories['models_path'])
files_name = generate_metrics_files_name(model_save_name, directories)

# Model Configs

In [13]:
# Hugging Face checkpoint used for the tokenizer and the classifier
# (alternative noted by the author: 'bert-base-uncased').
MODEL_NAME = 'neuralmind/bert-base-portuguese-cased'
MAX_LEN = 200

# Matches the batch size passed to model.fit in the training cell, so the
# value written to the metrics CSV by save_model_metrics reflects the run.
BATCH_SIZE = 6
EPOCHS = 30
LEARNING_RATE = 1e-6  # other candidate values noted by the author: 1e-5, 3e-5
DROPOUT_RATE = 0.1

# Load Dataset



In [14]:
def load_dataset(csv_path="/content/drive/MyDrive/Arquivos/Mestrado/Qualificação/full_text_fake_br.csv"):
  """Load the 'label' and 'content' columns of the corpus CSV.

  Args:
    csv_path: Path to the UTF-8 encoded CSV file. Defaults to the corpus
      location on the mounted Google Drive.

  Returns:
    A pandas DataFrame holding only the 'label' and 'content' columns.
  """
  return pd.read_csv(
    csv_path,
    usecols=['label', 'content'],
    encoding='utf-8'
  )

In [15]:
dataset = load_dataset()

In [65]:
# The longest text measured in *characters* is only a loose upper bound on its
# token count, and BERT cannot process more than 512 positions. Using the raw
# character count (tens of thousands) as sequence length makes the padded
# inputs and attention matrices exhaust GPU memory, so clamp it to 512.
MAX_LEN = min(int(dataset['content'].str.len().max()) - 2, 512)

In [17]:
# MAX_LEN=200

In [18]:
# MAX_LEN=5000

In [19]:
dataset["label"]=dataset["label"].astype("int")

In [20]:
dataset.head()

Unnamed: 0,content,label
0,Ex de Luiza Brunet é ouvido em audiência do ca...,1
1,Mercosul suspende direitos políticos da Venezu...,1
2,Relator diz que concessão de asilo a Battisti ...,1
3,"Ele não precisa ser caçado, diz advogado do go...",1
4,Petrobras tem interesse em encontrar parceiros...,1


In [21]:
dataset["label"].unique()


array([1, 0])

## Divisao Dataset

In [66]:
train,test = train_test_split(dataset,test_size=0.2,random_state=42,stratify=dataset["label"])

In [23]:
print(train.shape,test.shape)

(5760, 2) (1440, 2)


# Load Model

In [24]:
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [67]:
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [26]:
tr = train["content"]


In [27]:
tr[0]

'Ex de Luiza Brunet é ouvido em audiência do caso no qual é acusado de agressão. Lírio Parisotto é acusado de duas lesões corporais contra a modelo: em 2015 e 2016. Outras duas testemunhas, que faltaram em audiência do ano passado, também prestaram depoimento no Fórum da Barra Funda..  O empresário Lírio Parisotto foi ouvido nesta segunda-feira (13) ao Fórum Criminal da Barra Funda, na Zona Oeste de São Paulo, em nova audiência do caso no qual ele é julgado pela acusação de agredir a ex-mulher, a atriz Luiza Brunet. Parisotto não quis falar com a imprensa. A segunda audiência de instrução durou cerca de duas horas. Além do empresário, prestaram depoimento um perito e uma mulher que presenciou uma viagem do casal para o exterior. O advogado de Parisotto, Celso Vilardi, comentou o depoimento do empresário: "Ele tem uma versão só, nunca mentiu. Uma versão harmônica", disse. Ele questiona as provas apresentadas pela acusação: "Uma hora o atestado médico fala em fratura do dedo. Agora, a ac

In [28]:
# encoded_input = tokenizer.encode_plus(text=tr[0], max_length=MAX_LEN, truncation=True, return_tensors='tf')


In [29]:
# encoded_input

# Tratamento das entradas

In [30]:
# def get_masks(text, max_length):
#     """Mask for padding"""
#     tokens = tokenizer.tokenize(text)
#     length = len(tokens)
#     if length > max_length:
#       tokens = tokens[:max_length]
#     tokens = ["[CLS]"] + tokens + ["[SEP]"]


#     return np.asarray([1]*len(tokens) + [0] * (max_length - len(tokens)))


In [31]:
# vec_get_masks = np.vectorize(get_masks, signature = '(),()->(n)')

In [32]:
# def get_segments(text, max_length):
#     """Segments: 0 for the first sequence, 1 for the second"""
#     tokens = tokenizer.tokenize(text)
#     length = len(tokens)
#     if length > max_length:
#         tokens = tokens[:max_length]
#     tokens = ["[CLS]"] + tokens + ["[SEP]"]

#     segments = []
#     current_segment_id = 0
#     with_tags = ["[CLS]"] + tokens + ["[SEP]"]
#     token_ids = tokenizer.convert_tokens_to_ids(tokens)

#     for token in tokens:
#         segments.append(current_segment_id)
#         if token == "[SEP]":
#             current_segment_id = 1
#     return np.asarray(segments + [0] * (max_length - len(tokens)))

In [33]:
# vec_get_segments = np.vectorize(get_segments, signature = '(),()->(n)')


In [34]:
# def get_ids(text, tokenizer, max_length):
#     """Token ids from Tokenizer vocab"""
#     tokens = tokenizer.tokenize(text)
#     length = len(tokens)
#     if length > max_length:
#       tokens = tokens[:max_length]
#     tokens = ["[CLS]"] + tokens + ["[SEP]"]


#     token_ids = tokenizer.convert_tokens_to_ids(tokens)
#     input_ids = np.asarray(token_ids + [0] * (max_length - len(tokens)))
#     return input_ids

In [35]:
# vec_get_ids = np.vectorize(get_ids, signature = '(),(),()->(n)')


In [36]:
# def prepare(text_array, tokenizer, max_length = 200):

#     ids = vec_get_ids(text_array,
#                       tokenizer,
#                       max_length).squeeze()
#     masks = vec_get_masks(text_array,
#                       max_length).squeeze()
#     segments = vec_get_segments(text_array,
#                       max_length).squeeze()

#     return ids, segments, masks

In [37]:
train.head()

Unnamed: 0,content,label
4264,Marqueteiro do PT está inquieto na cadeia. Ele...,0
3482,População busca fósseis de dinossauros em Nova...,1
4757,CHEGA! Vitória de Trump serve de aviso para a ...,0
1942,Tribunal nega liberdade para ex-gerente de Eng...,1
327,Chuva recorde no Rio causa 4 mortes e deixa es...,1


In [38]:
train["content"][0]

'Ex de Luiza Brunet é ouvido em audiência do caso no qual é acusado de agressão. Lírio Parisotto é acusado de duas lesões corporais contra a modelo: em 2015 e 2016. Outras duas testemunhas, que faltaram em audiência do ano passado, também prestaram depoimento no Fórum da Barra Funda..  O empresário Lírio Parisotto foi ouvido nesta segunda-feira (13) ao Fórum Criminal da Barra Funda, na Zona Oeste de São Paulo, em nova audiência do caso no qual ele é julgado pela acusação de agredir a ex-mulher, a atriz Luiza Brunet. Parisotto não quis falar com a imprensa. A segunda audiência de instrução durou cerca de duas horas. Além do empresário, prestaram depoimento um perito e uma mulher que presenciou uma viagem do casal para o exterior. O advogado de Parisotto, Celso Vilardi, comentou o depoimento do empresário: "Ele tem uma versão só, nunca mentiu. Uma versão harmônica", disse. Ele questiona as provas apresentadas pela acusação: "Uma hora o atestado médico fala em fratura do dedo. Agora, a ac

In [68]:
tr = train["content"]
te = test["content"]

In [40]:
# MAX_LEN=15000

In [69]:
# Tokenize the training texts into fixed-length tensors (padded/truncated to MAX_LEN).
encoded_samples_train = tokenizer.batch_encode_plus(
    tr,
    add_special_tokens=True,  # add special tokens such as [CLS] and [SEP]
    max_length=MAX_LEN,           # maximum sequence length
    padding="max_length",     # pad every sequence up to max_length
    truncation=True,          # truncate sequences longer than max_length
    return_tensors="tf"       # return TensorFlow tensors
)

In [70]:
# Tokenize the test texts with the same settings used for the training texts.
encoded_samples_test = tokenizer.batch_encode_plus(
    te,
    add_special_tokens=True,  # add special tokens such as [CLS] and [SEP]
    max_length=MAX_LEN,           # maximum sequence length
    padding="max_length",     # pad every sequence up to max_length
    truncation=True,          # truncate sequences longer than max_length
    return_tensors="tf"       # return TensorFlow tensors
)

In [43]:
# encoded_samples = []


In [44]:
# for sample in tr:
#     encoded_sample = tokenizer.encode_plus(
#         sample,
#         add_special_tokens=True,  # Adicione tokens especiais como [CLS], [SEP]
#         max_length=MAX_LEN,           # Defina o comprimento máximo desejado
#         padding="max_length",     # Preencha/trunque para o comprimento máximo
#         truncation=True,          # Truncar a sequência se exceder o comprimento máximo
#         return_tensors="tf"       # Retorne tensores do PyTorch
#     )
#     encoded_samples.append(encoded_sample)

In [45]:
# ids_train, segments_train, masks_train = prepare(tr,
#                                                  tokenizer, MAX_LEN)
# ids_test, segments_test, masks_test = prepare(te,
#                                                tokenizer, MAX_LEN)

In [46]:
    # ValueError: Input 0 of layer "model_3" is incompatible with the layer: expected shape=(None, 46074), found shape=(24, 46096)

In [47]:
# max_id_length = max([len(ids) for ids in ids_train])
# print("Tamanho máximo de ids_train:", max_id_length)

In [48]:
# max_id_length = max([len(ids) for ids in masks_train])
# print("Tamanho máximo de ids_train:", max_id_length)

Token indices sequence length is longer than the specified maximum sequence length for this model (914 > 512). Running this sequence through the model will result in indexing errors

# Criacao do Modelo

In [71]:
# Three int32 inputs of length MAX_LEN: token ids, attention mask and segment ids.
input_word_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_word_ids')
input_mask = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_mask')
segment_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32,name="segment_ids")
# Pretrained checkpoint with a 2-label sequence-classification head.
bert_model = TFAutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
  )
output = bert_model([input_word_ids, input_mask, segment_ids])
# Earlier experiments with an extra dense head (disabled):
# output = output
# output = tf.keras.layers.Dense(32,activation='relu')(output)
# output = tf.keras.layers.Dropout(DROPOUT_RATE)(output)
logits = output.logits  # take the logits from the BERT classifier output
# Softmax turns the two logits into class probabilities.
probs = tf.keras.layers.Softmax()(logits)

# output = tf.keras.layers.Dense(1, activation='sigmoid')(x)
# Final Keras model: the three inputs above -> class probabilities.
model = tf.keras.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=[probs])


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier', 'bert/pooler/dense/kernel:0', 'bert/pooler/dense/bias:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
# optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

In [51]:
# model.compile(
#             optimizer=tf.keras.optimizers.Adam(lr=LEARNING_RATE),
#             loss='sparse_categorical_crossentropy',
#             metrics=['accuracy'])



In [72]:
model.compile(Adam(learning_rate=LEARNING_RATE), loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [53]:
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 200)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 200)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 200)]        0           []                               
                                                                                                  
 tf_bert_for_sequence_classific  TFSequenceClassifie  108924674  ['input_word_ids[0][0]',         
 ation (TFBertForSequenceClassi  rOutput(loss=None,               'input_mask[0][0]',         

# Carregar pesos do modelo

In [54]:
# model.load_weights('/content/drive/MyDrive/Arquivos/Mestrado/Qualificação/Fake_Br_corpus_second_experiment/models/model_2__28_05_2023-18_49.h5')


# Treinamento do Modelo

In [55]:
train["label"] = train["label"]

In [56]:
encoded_samples_train['input_ids']

<tf.Tensor: shape=(5760, 200), dtype=int32, numpy=
array([[  101,   496,  6685, ...,  1640, 12897,   102],
       [  101,  4150,  1163, ...,  3334,   107,   102],
       [  101,   187, 22340, ...,   171,   771,   102],
       ...,
       [  101, 12075,   125, ...,   179,   368,   102],
       [  101, 14215,  5054, ...,   125,   532,   102],
       [  101, 11124,  1331, ..., 12385,   107,   102]], dtype=int32)>

In [57]:
input_word_ids = encoded_samples_train['input_ids']
# No trailing comma here: a trailing comma would wrap the tensor in a 1-tuple.
input_mask = encoded_samples_train['attention_mask']
segment_ids = encoded_samples_train['token_type_ids']

In [58]:
labels = np.array(train.label)

In [59]:
# labels = labels.astype(int)  # Converte para inteiros
labels = tf.keras.utils.to_categorical(labels, num_classes=2)


In [60]:
labels = labels.astype(int)  # Converte para inteiros


In [61]:
encoded_samples_train

{'input_ids': <tf.Tensor: shape=(5760, 200), dtype=int32, numpy=
array([[  101,   496,  6685, ...,  1640, 12897,   102],
       [  101,  4150,  1163, ...,  3334,   107,   102],
       [  101,   187, 22340, ...,   171,   771,   102],
       ...,
       [  101, 12075,   125, ...,   179,   368,   102],
       [  101, 14215,  5054, ...,   125,   532,   102],
       [  101, 11124,  1331, ..., 12385,   107,   102]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(5760, 200), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(5760, 200), dtype=int32, numpy=
array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=in

In [73]:
# Wall-clock timestamps around training; the difference is logged with the metrics.
start_time = time.time()

# Inputs are passed by name; the keys match the names of the model's Input layers.
history = model.fit(
          x={
              "input_word_ids": encoded_samples_train["input_ids"],
              "input_mask": encoded_samples_train["attention_mask"],
              "segment_ids": encoded_samples_train["token_type_ids"]
          },
          y=train['label'],
          validation_split=0.20,  # fraction of the training data used for validation
          epochs = EPOCHS,
          batch_size = 6  # literal value; not read from the BATCH_SIZE constant
          )
end_time = time.time()


Epoch 1/30


ResourceExhaustedError: ignored

https://www.kaggle.com/code/andreshg/nlp-glove-bert-tf-idf-lstm-explained#8.-BERT

In [None]:
model.save_weights(
    '/content/drive/MyDrive/Arquivos/Mestrado/Qualificação/Fake_Br_corpus_novo_experimento/weights'
)


In [None]:
model.save(
    '/content/drive/MyDrive/Arquivos/Mestrado/Qualificação/Fake_Br_corpus_novo_experimento/full_model'
)

# Avaliação do modelo

## Execução do modelo no conjunto de testes

In [None]:
test["label"] = test["label"].astype(float)

In [None]:
# Evaluate on the test split using the tokenizer output for the test texts,
# mapped to the model's Input layers by name (same mapping as in training).
test_loss, test_acc = model.evaluate(
    x={
        "input_word_ids": encoded_samples_test["input_ids"],
        "input_mask": encoded_samples_test["attention_mask"],
        "segment_ids": encoded_samples_test["token_type_ids"]
    },
    y=test["label"]
)


In [None]:
# threshold = 0.5


In [None]:
# y_pred = model.predict([ids_test, masks_test, segments_test])

# y_pred_bool = np.argmax(y_pred, axis=1)


# Geração de Relatórios

In [None]:
# Class probabilities for the test split (one row per sample, one column per class).
y_pred = model.predict({
    "input_word_ids": encoded_samples_test["input_ids"],
    "input_mask": encoded_samples_test["attention_mask"],
    "segment_ids": encoded_samples_test["token_type_ids"]
})
# Predicted class = index of the highest probability in each row.
y_pred_bool = np.argmax(y_pred, axis=1)

In [None]:
y_pred_bool

In [None]:
def show_confusion_matrix(confusion_matrix):
  """Draw `confusion_matrix` as an annotated blue heatmap.

  Args:
    confusion_matrix: Matrix of integer counts to plot.

  Returns:
    The object returned by `sns.heatmap`.
  """
  heat_axes = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")

  # Keep the row labels horizontal and tilt the column labels.
  y_tick_labels = heat_axes.yaxis.get_ticklabels()
  heat_axes.yaxis.set_ticklabels(y_tick_labels, rotation=0, ha='right')
  x_tick_labels = heat_axes.xaxis.get_ticklabels()
  heat_axes.xaxis.set_ticklabels(x_tick_labels, rotation=30, ha='right')

  plt.ylabel('True class')
  plt.xlabel('Predicted class')
  return heat_axes

In [None]:
def generate_classification_report(y_pred, y_pred_bool, test, ids_test, masks_test, segments_test, files_name):
  """Write the classification report and confusion matrix for the test split.

  The report is saved as JSON and as text, printed, and the confusion matrix
  is saved as a heatmap image.

  Args:
    y_pred: Unused; kept for backward compatibility with existing callers.
    y_pred_bool: Predicted class index for each test sample.
    test: DataFrame with the true classes in its 'label' column.
    ids_test, masks_test, segments_test: Unused; kept for backward
      compatibility with existing callers.
    files_name: Dict with the 'classification_report_json',
      'classification_report_txt' and 'confusion_matrix' output paths.
  """
  y_true = test["label"]

  # scikit-learn expects (y_true, y_pred); passing them the other way round
  # swaps precision and recall in the report.
  report = classification_report(y_true, y_pred_bool)
  report_dict = classification_report(y_true, y_pred_bool, output_dict=True)

  with open(files_name['classification_report_json'], 'w') as f:
    json.dump(report_dict, f)

  with open(files_name['classification_report_txt'], 'w') as f:
    f.write(report)

  print(report)

  # Confusion matrix (rows: true class, columns: predicted class). Fixing the
  # labels keeps it 2x2 even if one class is absent, so it fits the frame below.
  cm = confusion_matrix(y_true, y_pred_bool, labels=[0, 1])
  df_cm = pd.DataFrame(cm, index=['0','1'], columns=['0','1'])
  heatmap = show_confusion_matrix(df_cm)
  heatmap.figure.savefig(files_name['confusion_matrix'])


In [None]:
# The three tensor arguments come from the tokenizer output for the test texts.
generate_classification_report(
    y_pred,
    y_pred_bool,
    test,
    encoded_samples_test["input_ids"],
    encoded_samples_test["attention_mask"],
    encoded_samples_test["token_type_ids"],
    files_name
)

# Salvar metricas do modelo

In [None]:
def save_model_metrics(model_save_name, files_name, history, start_time,
                       end_time, epochs, batch_size, learning_rate, dropout,
                       test_loss, test_acc, with_stop_words):
  """Append this run's metrics to the shared metrics CSV and save its history.

  Args:
    model_save_name: Name of the model, stored in the 'model' column.
    files_name: Dict with the 'metrics_csv' and 'history' output paths.
    history: Object whose `.history` dict holds the per-epoch 'loss',
      'accuracy', 'val_loss' and 'val_accuracy' lists.
    start_time: Training start time in seconds (as returned by time.time()).
    end_time: Training end time in seconds.
    epochs, batch_size, learning_rate, dropout: Hyperparameters to record.
    test_loss, test_acc: Evaluation results on the test split.
    with_stop_words: Flag recorded as-is in the metrics row.
  """
  # Training duration in seconds and in minutes.
  time_seconds = round(end_time - start_time, 2)
  time_minutes = round(time_seconds / 60, 2)

  # Train/validation values are those of the last epoch.
  metrics = {
    'model': model_save_name,
    'loss': history.history['loss'][-1],
    'accuracy': history.history['accuracy'][-1],
    'val_loss': history.history['val_loss'][-1],
    'val_accuracy': history.history['val_accuracy'][-1],
    'time': str(end_time - start_time),
    'time_seconds': time_seconds,
    'time_minutes': time_minutes,
    'epochs': epochs,
    'batch_size': batch_size,
    'learning_rate': learning_rate,
    'dropout': dropout,
    'test_loss': test_loss,
    'test_acc': test_acc,
    'with_stop_words': with_stop_words
  }

  if os.path.isfile(files_name['metrics_csv']):
      # The file exists: continue from the rows saved by previous runs.
      df_metrics = pd.read_csv(files_name['metrics_csv'])
  else:
      # First run: start from an empty frame.
      df_metrics = pd.DataFrame()

  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
  df_metrics = pd.concat([df_metrics, pd.DataFrame([metrics])], ignore_index=True)
  df_metrics.to_csv(files_name['metrics_csv'], index=False)

  with open(files_name['history'], 'w') as f:
    json.dump(history.history, f)

  display(df_metrics.tail())



In [None]:
save_model_metrics(model_save_name, files_name, history, start_time, end_time,
                   EPOCHS, BATCH_SIZE, LEARNING_RATE, DROPOUT_RATE, test_loss,test_acc, True)


# salvar grafico do treinamento

In [None]:
def save_model_graphs(model_save_name, files_name, history):
  """Plot and save the training curves stored in `history.history`.

  Three images are written: accuracy, loss, and a side-by-side figure with
  both. The first two figures are also shown after being saved.

  Args:
    model_save_name: Not used inside the function.
    files_name: Dict with the 'graph_acc', 'graph_loss' and
      'graph_acc_and_loss' output paths.
    history: Object whose `.history` dict holds the per-epoch 'accuracy',
      'val_accuracy', 'loss' and 'val_loss' lists.
  """
  curves = history.history
  epochs_range = list(range(1, len(curves['accuracy'])+1))

  # Standalone figures: accuracy first, then loss.
  single_plots = (
    ('accuracy', 'Accuracy', 'graph_acc'),
    ('loss', 'Loss', 'graph_loss'),
  )
  for metric, label, target in single_plots:
    plt.plot(epochs_range, curves[metric])
    plt.plot(epochs_range, curves['val_' + metric])
    plt.title('Training and Validation ' + label)
    plt.ylabel(label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='lower right')

    # Save before showing the figure.
    plt.savefig(files_name[target])
    plt.show()

  # Combined figure: loss on the left panel, accuracy on the right panel.
  fig, axs = plt.subplots(1, 2, figsize=(15,5))
  panels = (
    (axs[0], 'loss', 'Model Loss', 'Loss', 'train_loss', 'val_loss'),
    (axs[1], 'accuracy', 'Model Accuracy', 'Accuracy', 'train_acc', 'val_acc'),
  )
  for panel, metric, title, y_label, train_label, val_label in panels:
    panel.plot(epochs_range, curves[metric], label=train_label)
    panel.plot(epochs_range, curves['val_' + metric], label=val_label)
    panel.set_title(title)
    panel.set_xlabel('Epoch')
    panel.set_ylabel(y_label)
    panel.legend()

  # Save the combined figure to a file.
  plt.savefig(files_name['graph_acc_and_loss'])

In [None]:
save_model_graphs(model_save_name, files_name, history)

In [None]:
# np.shape(ids_test)

In [None]:
# y_pred

In [None]:
# # y_pred_bool = np.where(y_pred >= threshold, 1, 0)
# y_pred_bool = np.argmax(y_pred, axis=1)
# from sklearn.metrics import classification_report

# print(classification_report(y_pred_bool, test["label"]))

In [None]:
# from sklearn.metrics import confusion_matrix
# import seaborn as sns


In [None]:
# np.shape(y_pred_bool)

# LIME

https://www.kaggle.com/code/arinjaypathak/fine-tuned-bert-lime-youtube-comment-sentiment

In [None]:
!pip install lime

In [None]:
import lime
from lime.lime_text import LimeTextExplainer

## LimeTestes

## Função de predição

In [None]:
def predict_proba(arr):
    """Return the model's class probabilities for raw texts (LIME classifier_fn).

    The texts are tokenized with the same settings used for the training
    data, so the inputs have the sequence length the model was built with.

    Args:
        arr: A single text or an iterable of texts.

    Returns:
        The output of `model.predict`: one row of class probabilities per text.
    """
    # Accept a bare string as a one-element batch.
    texts = [arr] if isinstance(arr, str) else list(arr)
    encoded = tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding="max_length",
        truncation=True,
        return_tensors="tf"
    )
    # Keys match the names of the model's Input layers.
    return model.predict({
        "input_word_ids": encoded["input_ids"],
        "input_mask": encoded["attention_mask"],
        "segment_ids": encoded["token_type_ids"]
    })

In [None]:
def save_lime_explain(model_save_name, files_name, pos, sample_index, sample_text, original_class, prediction_class):
  """Append one explained sample to the shared LIME explanations CSV.

  Args:
    model_save_name: Name of the model that produced the prediction.
    files_name: Dict with the 'lime_explanations' CSV path.
    pos: Position of the sample in the iteration over the test set.
    sample_index: DataFrame index label of the sample.
    sample_text: Text that was explained.
    original_class: True class of the sample.
    prediction_class: Class predicted by the model.
  """
  metrics = {
    'model_name': model_save_name,
    'pos': pos,
    'index': sample_index,
    'real_class': original_class,
    'pred_class': prediction_class,
    'text': sample_text
  }

  if os.path.isfile(files_name['lime_explanations']):
      # The file exists: continue from the rows saved previously.
      df_lime = pd.read_csv(files_name['lime_explanations'])
  else:
      # First sample: start from an empty frame.
      df_lime = pd.DataFrame()

  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
  df_lime = pd.concat([df_lime, pd.DataFrame([metrics])], ignore_index=True)
  df_lime.to_csv(files_name['lime_explanations'], index=False)
  display(df_lime.tail())


In [None]:
def explain_instance(directories, test, y_pred_bool, model_save_name, files_name, start_pos=683):
  """Generate and save a LIME explanation for each test sample.

  For every processed sample the explanation is shown in the notebook, saved
  as HTML and as CSV in the folder for its (real class, predicted class)
  pair, and a summary row is appended to the shared explanations CSV.

  Args:
    directories: Dict of folder paths, including the 'lime_html_<r>_<p>' keys.
    test: DataFrame with 'content' and 'label' columns.
    y_pred_bool: Predicted class per test sample, in the row order of `test`.
    model_save_name: Model name used as file-name prefix.
    files_name: Dict of file paths passed on to save_lime_explain.
    start_pos: Position of the first sample to explain; earlier rows are
      skipped. The default of 683 preserves the previous hard-coded resume
      point; pass 0 to explain the whole test set.
  """
  class_names = [0, 1]
  explainer = LimeTextExplainer(class_names=class_names)
  for i, (index, row) in enumerate(test.iterrows()):
    if i < start_pos:
      continue
    real_class = int(row['label'])
    pred_class = y_pred_bool[i]
    file_name = '/' + model_save_name + '_' + str(i) + '_pos_' + str(index) + '_index_'+ str(real_class) + '_orig_class_'+ str(pred_class) +'pred_class__lime_explain'
    html_path = 'lime_html_' +  str(real_class) + '_' + str(pred_class)
    # Explain the instance.
    exp = explainer.explain_instance(row['content'], predict_proba)
    exp.show_in_notebook(text=True)
    # Save the LIME HTML report.
    exp.save_to_file(directories[html_path] + file_name + '.html')
    # Save the explanation's (feature, weight) list as CSV.
    result_list = exp.as_list()
    csv_file_name = directories[html_path] + file_name + '.csv'
    with open(csv_file_name, mode='w', newline='', encoding='utf-8') as result_file:
      writer = csv.writer(result_file)
      writer.writerows(result_list)
    # Record text, real class and prediction in the CSV that covers all samples.
    save_lime_explain(model_save_name, files_name, i, index, row['content'], row['label'], pred_class)
    # Progress: number of test rows visited so far.
    print(i + 1)
    print('-------------------------------------')


In [None]:
explain_instance(directories, test, y_pred_bool, model_save_name, files_name)

In [None]:
# Removed an empty, body-less redefinition of explain_instance(explainer, directories, test):
# a `def` with no body is a SyntaxError, and a second definition with this name
# would shadow the working explain_instance defined earlier in the notebook.


# experimentos de teste Lime

In [None]:
# y_pred = model.predict([ids_test, masks_test, segments_test])
# row_index = dataset.loc[dataset['label'] == 0].index[0]
# row = dataset.loc[row_index, :]
# ids_train, segments_train, masks_train = prepare(tr,
#                                                  tokenizer, MAX_LEN)

## funcao de predicao

In [None]:
def predict_proba_test(arr):
    """Return the model's class probabilities for raw texts (manual LIME tests).

    The texts are tokenized with the same settings used for the training
    data, so the inputs have the sequence length the model was built with.

    Args:
        arr: A single text or an iterable of texts.

    Returns:
        The output of `model.predict`: one row of class probabilities per text.
    """
    # Accept a bare string as a one-element batch.
    texts = [arr] if isinstance(arr, str) else list(arr)
    encoded = tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding="max_length",
        truncation=True,
        return_tensors="tf"
    )
    # Keys match the names of the model's Input layers.
    return model.predict({
        "input_word_ids": encoded["input_ids"],
        "input_mask": encoded["attention_mask"],
        "segment_ids": encoded["token_type_ids"]
    })

In [None]:
row_index

In [None]:
dataset.iloc[4,:]

In [None]:
# sample_text = dataset.iloc[4,:]
sample_text = dataset['content'][4]
sample_text = dataset['content'][4]
      # dataset['content'][3600]

samples = [
     dataset['content'][4],
     dataset['content'][3600]
]

In [None]:
samples = [
     dataset['content'][6660],
]

In [None]:
dataset.iloc[3600,:]

In [None]:
test['content'][6660]

In [None]:
test["label"]

In [None]:
resp = predict_proba_test(samples)

In [None]:
resp

In [None]:
resp

In [None]:
resp = predict_proba(dataset['content'][4])

In [None]:
# test = np.where(resp >= threshold, 1, 0)
# Predicted class per sample: argmax over the class axis (axis=1), the same
# reduction used to compute y_pred_bool.
np.argmax(resp, axis=1)

## instanciando explainer

In [None]:
class_names = [0, 1]
explainer = LimeTextExplainer(class_names=class_names)


In [None]:
explainer.explain_instance(dataset['content'][3600],predict_proba).show_in_notebook(text=True)


In [None]:
explainer.explain_instance(dataset['content'][4],predict_proba).show_in_notebook(text=True)
