# Attempt 2: huggingface transformers

After doing some research, it is clear that we can also use huggingface to train a model, outputting the probabilities and then performing auc-roc

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:

RUN=1
#load csv files
train = pd.read_csv('../data/train_split.csv')
valid = pd.read_csv('../data/valid_split.csv')

## data preprocessing

In [3]:
label_cols = ['toxic',	'severe_toxic'	,'obscene',	'threat',	'insult', 	'identity_hate']

In [4]:
binary_dfs_train = {}
for label in label_cols:
    train_copy = train.copy()
    #stratify binary_dfs_train['toxic'] with train as 3% of the data
    subtrain, _ = train_test_split(train_copy, test_size=0.97, random_state=42, stratify=train_copy[label], )
    binary_dfs_train[label] = subtrain.copy()
    binary_dfs_train[label]['label_name'] = binary_dfs_train[label][label].apply(lambda x: label if x==1 else 'other')
    binary_dfs_train[label] = binary_dfs_train[label][['id', 'comment_text', label]]
    #rename the label column to 'label'
    binary_dfs_train[label].rename(columns={label:'label'}, inplace=True)

In [5]:
binary_dfs_val = {}
for label in label_cols:
    val_copy = valid.copy()
    #stratify binary_dfs_val['toxic'] with 10% of the data
    subvalid,_ = train_test_split(val_copy, test_size=0.9, random_state=42, stratify=val_copy[label] )
    binary_dfs_val[label] = subvalid.copy()
    binary_dfs_val[label]['label_name'] = binary_dfs_val[label][label].apply(lambda x: label if x==1 else 'other')
    binary_dfs_val[label] = binary_dfs_val[label][['id', 'comment_text', label]]
    #rename the label column to 'label'
    binary_dfs_val[label].rename(columns={label:'label'}, inplace=True)

## model training

In [6]:
#train a huggingface classifier
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from transformers import Trainer, TrainingArguments

#instantiate the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
#check if cuda is available
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [8]:
!wandb disabled

W&B disabled.


In [9]:
import datasets


binary_datasets_train = {}
for key in binary_dfs_train.keys():
    
    #convert the data into a Dataset object
    binary_datasets_train[key] = datasets.Dataset.from_pandas(binary_dfs_train[key])
    binary_datasets_train[key] = binary_datasets_train[key].map(lambda batch: tokenizer(batch['comment_text'], truncation=True, padding='max_length',max_length=512), batched=True,batch_size=2056)
    
    binary_datasets_train[key].set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    

Map:   0%|          | 0/3829 [00:00<?, ? examples/s]

Map: 100%|██████████| 3829/3829 [00:00<00:00, 5287.33 examples/s]
Map: 100%|██████████| 3829/3829 [00:00<00:00, 4562.59 examples/s]
Map: 100%|██████████| 3829/3829 [00:00<00:00, 5345.20 examples/s]
Map: 100%|██████████| 3829/3829 [00:00<00:00, 5337.37 examples/s]
Map: 100%|██████████| 3829/3829 [00:00<00:00, 5362.58 examples/s]
Map: 100%|██████████| 3829/3829 [00:00<00:00, 5337.81 examples/s]


In [10]:
valid['toxic'].value_counts()

toxic
0    28855
1     3059
Name: count, dtype: int64

In [11]:
binary_datasets_val = {}
for key in binary_dfs_val.keys():
    #instantiate the model and use the label list to define the labels
    model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


    #convert the data into a Dataset object
    binary_datasets_val[key] = datasets.Dataset.from_pandas(binary_dfs_val[key])
    binary_datasets_val[key] = binary_datasets_val[key].map(lambda batch: tokenizer(batch['comment_text'], truncation=True, padding='max_length',max_length=512), batched=True,batch_size=2056)
    
    binary_datasets_val[key].set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    #drop features from the dataset other than the input_ids, attention_mask, and label columns
    #binary_datasets_val[key].drop('id', axis=1, inplace=True)
    #binary_datasets_val[key].drop('comment_text', axis=1, inplace=True)
    #binary_datasets_val[key].drop('label', axis=1, inplace=True)

    

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 3191/3191 [00:00<00:00, 5651.23 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 3191/3191 [00:00<00:00, 5482.91 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_cla

In [12]:
#import trainer and training arguments
from transformers import Trainer, TrainingArguments
for key in binary_datasets_train.keys():
    #instantiate the model and use the label list to define the labels
    model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
    model.to(device)

    #instantiate the training arguments
    training_args = TrainingArguments(
        output_dir=f'../results/{key}/{RUN}',          # output directory
        num_train_epochs=3,              # total number of training epochs
        per_device_train_batch_size=8,  # batch size per device during training
        per_device_eval_batch_size=64,   # batch size for evaluation
        learning_rate=5e-05,             # learning rate
        evaluation_strategy='epoch',
        save_strategy='epoch',
        fp16=True,
    )

    #instantiate the trainer
    trainer = Trainer(
        model=model,                         # the instantiated 🤗 Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=binary_datasets_train[key],         # training dataset
        eval_dataset=binary_datasets_val[key],             # evaluation dataset
    )   

    #train the model
    trainer.train()

    #evaluate the model
    trainer.evaluate()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
                                                  
 33%|███▎      | 479/1437 [02:10<03:23,  4.70it/s]

{'eval_loss': 0.14464622735977173, 'eval_runtime': 23.6325, 'eval_samples_per_second': 135.026, 'eval_steps_per_second': 2.116, 'epoch': 1.0}


 35%|███▍      | 500/1437 [02:17<03:44,  4.17it/s]  

{'loss': 0.1841, 'learning_rate': 3.267223382045929e-05, 'epoch': 1.04}


                                                  
 67%|██████▋   | 958/1437 [04:27<01:41,  4.73it/s]

{'eval_loss': 0.2423940896987915, 'eval_runtime': 23.5824, 'eval_samples_per_second': 135.313, 'eval_steps_per_second': 2.12, 'epoch': 2.0}


 70%|██████▉   | 1000/1437 [04:39<01:42,  4.28it/s] 

{'loss': 0.0698, 'learning_rate': 1.5309672929714683e-05, 'epoch': 2.09}


                                                   
100%|██████████| 1437/1437 [06:45<00:00,  4.65it/s]

{'eval_loss': 0.238447368144989, 'eval_runtime': 23.3822, 'eval_samples_per_second': 136.471, 'eval_steps_per_second': 2.138, 'epoch': 3.0}


100%|██████████| 1437/1437 [06:47<00:00,  3.53it/s]


{'train_runtime': 408.7908, 'train_samples_per_second': 28.1, 'train_steps_per_second': 3.515, 'train_loss': 0.09528765615358399, 'epoch': 3.0}


100%|██████████| 50/50 [00:23<00:00,  2.14it/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  8%|▊         | 112/1437 [00:27<05:21,  4.12it/s]

KeyboardInterrupt: 