# Área de imports

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
! pip install texthero simpletransformers

In [3]:
import time
import torch
import pandas as pd
import texthero as hero
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# Variável que indica a disponibilidade de uma GPU com CUDA na máquina / ambiente
cuda_available = torch.cuda.is_available()

# Carregamento dos dados

In [5]:
df = pd.read_csv('https://raw.githubusercontent.com/LucasRotsen/tcc_case_study_tasks/main/data/task_sample.csv', sep='|', names=["title", "body", "labels"])

# Modelagem

#### Atribuindo um valor numérico para os labels

In [6]:
df['labels'] = [1 if row == 'Bug' else 0 for row in df['labels']]

#### Pré-processamento do texto

In [7]:
# Função para remover "stopwords", dígitos, e pontuação

def text_cleansing(df: pd.DataFrame, column_name: str):
    custom_pipeline = [hero.preprocessing.lowercase,
                       hero.preprocessing.remove_digits,
                       hero.preprocessing.remove_punctuation,
                       hero.remove_stopwords,
                       hero.remove_whitespace]

    return hero.clean(df[column_name], custom_pipeline)

In [8]:
# Juntando as duas colunas de texto em uma só

df['text'] = df['title'] + ' ' + df['body']

In [9]:
# Fazendo a limpeza do texto

df['text'] = text_cleansing(df, 'text')

In [10]:
# Mantendo apenas as colunas que serão utilizadas no treinamento

df = df[['text', 'labels']]

#### Dividindo o conjunto de dados entre treino e teste

In [11]:
train, test = train_test_split(df, test_size=0.3)

#### Instanciando o modelo

Observação importante: 
- A utilização de uma GPU com CUDA habilitado é fortemente indicada para esta parte do tutorial e para a realização do projeto
- A ferramenta online [Google Colaboratory](https://colab.research.google.com/) disponibiliza gratuitamente um ambiente com GPU + CUDA para utilização em notebooks
- Há um tutorial detalhado de como utilizar o Google Colab no repositório deste tutorial

In [12]:
model = ClassificationModel(
    "roberta", 
    "roberta-base",
    use_cuda=cuda_available
)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

#### Treinando o modelo

In [13]:
start = time.time()
model.train_model(train)
end = time.time()

print(f'Model training took {end - start} seconds!')

  0%|          | 0/10500 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1313 [00:00<?, ?it/s]


Non-finite norm encountered in torch.nn.utils.clip_grad_norm_; continuing anyway. Note that the default behavior will change in a future release to error out if a non-finite total norm is encountered. At that point, setting error_if_nonfinite=false will be required to retain the old behavior.



Model training took 1229.5338406562805 seconds!


#### Avaliando o modelo

In [14]:
start = time.time()
result, model_outputs, wrong_predictions = model.eval_model(test)
end = time.time()

print(f'Model evaluation took {end - start} seconds!')

  0%|          | 0/4500 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/563 [00:00<?, ?it/s]

Model evaluation took 158.58650064468384 seconds!


#### Visualizando as métricas de avaliação

In [15]:
result

{'auprc': 0.8301762781002655,
 'auroc': 0.8705328242669549,
 'eval_loss': 0.46050973895599745,
 'fn': 466,
 'fp': 447,
 'mcc': 0.591443725407088,
 'tn': 1980,
 'tp': 1607}

#### Fazendo predições com o modelo treinado

In [16]:
# Tabela de correspondência
cor_tab = {0: 'non-bug', 1: 'bug'}

In [17]:
text = "[DevTools Bug] Could not inspect element with id '2'. Error thrown:Cached data for element '2' not found"

In [18]:
validation_df = pd.DataFrame({"text": [text]})
validation_df['text'] = text_cleansing(df=validation_df, column_name='text')

In [19]:
# Fazendo a predição a partir de uma mensagem
predictions, raw_outputs = model.predict(validation_df['text'].iloc[0])

  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [20]:
cor_tab[predictions[0]]

'bug'