# Fine Tune Transformer from HuggingFace DistilElectra

**Note: This notebook is run in Paperspace platform**

In [41]:
%pip install transformers
%pip install torch
%pip install imblearn
%pip install wandb --upgrade


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0mNote: you may need to restart the kernel to use updated packages.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0mNote: you may need to restart the kernel to use updated packages.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0mNote: y

In [42]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, classification_report

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback, set_seed

# Progress bar
from tqdm._tqdm_notebook import tqdm_notebook
from tqdm import tqdm
tqdm_notebook.pandas()

## Import Clean Text Data

In [43]:
# Note: Change the name of data set used for feature creation
data_set='bully_data_clean_with_stopword'
    
    
# Import Data Set #
bully_data_cleaned = pd.read_csv('bully_data_clean_with_stopword.csv', encoding='utf8')                                            
bully_data_cleaned = bully_data_cleaned[~bully_data_cleaned['text_check'].isna()]
bully_data_cleaned = bully_data_cleaned[bully_data_cleaned['text_check'] != ""]
#bully_data_cleaned = bully_data_cleaned[bully_data_cleaned['role']!='None']
bully_data_cleaned = bully_data_cleaned[['label','text_check']]
bully_data_cleaned['label'] = bully_data_cleaned['label'].progress_apply(lambda x: 1 if x =="Cyberbullying" else 0)
bully_data_cleaned.rename(columns = {'text_check':'text'}, inplace = True)
bully_data_cleaned = bully_data_cleaned.reset_index(drop=True)

  0%|          | 0/120932 [00:00<?, ?it/s]

In [44]:
bully_data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120932 entries, 0 to 120931
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   label   120932 non-null  int64 
 1   text    120932 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.8+ MB


In [45]:
bully_data_cleaned['label'].value_counts()

0    115556
1      5376
Name: label, dtype: int64

## Define pretrained tokenizer and model

In [46]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification


model_name = "lsanochkin/distilelectra-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
set_seed(1127)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)



Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/lsanochkin/distilelectra-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/8ad3f3e323c9bbccccb02a886541e18096e4439fe4bef351ad2a69b265245ebf.1b39ff9035b433c890d769efdd02fb72db0fcdc9429a75adbd43ca0f6edb2e5d
Model config ElectraConfig {
  "_name_or_path": "lsanochkin/distilelectra-base",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu

## Preprocess data and Fine Tune Transformers


In [47]:
# Data: Text Input and Label 
X = list(bully_data_cleaned["text"])
y = list(bully_data_cleaned["label"])


# Create torch dataset #
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

# Define Trainer parameters 
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall_cb = recall_score(y_true=labels, y_pred=pred, average='binary', pos_label=1)
    precision_cb = precision_score(y_true=labels, y_pred=pred, average='binary', pos_label=1)
    f1_cb = f1_score(y_true=labels, y_pred=pred, average='binary', pos_label=1)
    
    recall_ncb = recall_score(y_true=labels, y_pred=pred, average='binary', pos_label=0)
    precision_ncb = precision_score(y_true=labels, y_pred=pred, average='binary', pos_label=0)
    f1_ncb = f1_score(y_true=labels, y_pred=pred, average='binary', pos_label=0)
    
    recall_overall = recall_score(y_true=labels, y_pred=pred, average='macro')
    precision_overall = precision_score(y_true=labels, y_pred=pred, average='macro')
    f1_overall = f1_score(y_true=labels, y_pred=pred, average='macro')


    return {"accuracy": accuracy, 
            "precision_cb": precision_cb, "recall_cb": recall_cb, "f1_cb": f1_cb,
            "precision_ncb": precision_ncb, "recall_ncb": recall_ncb, "f1_ncb": f1_ncb,
            "precision_overall": precision_overall, "recall_overall": recall_overall, "f1_overall": f1_overall}


# Plot Confusion Matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          title=None):
    '''
    This function will make a pretty plot of an sklearn Confusion Matrix cm using a Seaborn heatmap visualization.

    Arguments
    ---------
    cf:            confusion matrix to be passed in

    group_names:   List of strings that represent the labels row by row to be shown in each square.

    categories:    List of strings containing the categories to be displayed on the x,y axis. Default is 'auto'

    count:         If True, show the raw number in the confusion matrix. Default is True.

    normalize:     If True, show the proportions for each category. Default is True.

    cbar:          If True, show the color bar. The cbar values are based off the values in the confusion matrix.
                   Default is True.

    xyticks:       If True, show x and y ticks. Default is True.

    xyplotlabels:  If True, show 'True Label' and 'Predicted Label' on the figure. Default is True.

    sum_stats:     If True, display summary statistics below the figure. Default is True.

    figsize:       Tuple representing the figure size. Default will be the matplotlib rcParams value.

    cmap:          Colormap of the values displayed from matplotlib.pyplot.cm. Default is 'Blues'
                   See http://matplotlib.org/examples/color/colormaps_reference.html
                   
    title:         Title for the heatmap. Default is None.

    '''


    # CONFUSION MATRIX IN PERCENTAGE
    cf_pct = cf.astype('float')/cf.sum(axis=1)[:, np.newaxis]
    # CODE TO GENERATE TEXT INSIDE EACH SQUARE
    blanks = ['' for i in range(cf.size)]

    if group_names and len(group_names)==cf.size:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks

    if percent:
        group_percentages = ["{0:.2%}".format(value) for value in cf_pct.flatten()]
    else:
        group_percentages = blanks

    box_labels = [f"{v1}{v2}{v3}".strip() for v1, v2, v3 in zip(group_labels,group_counts,group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0],cf.shape[1])


    # CODE TO GENERATE SUMMARY STATISTICS & TEXT FOR SUMMARY STATS
    if sum_stats:
        #Accuracy is sum of diagonal divided by total observations
        accuracy  = np.trace(cf) / float(np.sum(cf))

        #if it is a binary confusion matrix, show some more stats
        if len(cf)==2:
            #Metrics for Binary Confusion Matrices
            precision = cf[1,1] / sum(cf[:,1])
            recall    = cf[1,1] / sum(cf[1,:])
            f1_score  = 2*precision*recall / (precision + recall)
            stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(
                accuracy,precision,recall,f1_score)
        else:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    else:
        stats_text = ""


    # SET FIGURE PARAMETERS ACCORDING TO OTHER ARGUMENTS
    if figsize==None:
        #Get default figure size if not set
        figsize = plt.rcParams.get('figure.figsize')

    if xyticks==False:
        #Do not show categories if xyticks is False
        categories=False


    # MAKE THE HEATMAP VISUALIZATION
    plt.figure(figsize=figsize)
    sns.heatmap(cf_pct,annot=box_labels,fmt="",cmap=cmap,cbar=cbar,xticklabels=categories,yticklabels=categories)

    if xyplotlabels:
        plt.ylabel('True label')
        plt.xlabel('Predicted label' + stats_text)
    else:
        plt.xlabel(stats_text)
    
    if title:
        plt.title(title)


# Run cross-validation 
def run_cross_validation(model_name='DistilElectra',
                         X=X,
                         y=y, 
                         splits=5,
                         epoch=4,
                         checkpoint=False):
    
    kfold = StratifiedShuffleSplit(n_splits=splits, test_size=0.1, random_state=1127)
    n_fold = 1

    print("Developing Model with Cross validation for: " + model_name)
    for train, test in tqdm(kfold.split(X, y)):

        print("Running for Fold: ",n_fold)
        train_index = list(train)
        test_index = list(test)

        X_train = [X[i] for i in train_index]
        y_train = [y[i] for i in train_index]
        X_val = [X[i] for i in test_index]
        y_val = [y[i] for i in test_index]

        # Tokenize
        X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
        X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

        # Create torch dataset
        train_dataset = Dataset(X_train_tokenized, y_train)
        val_dataset = Dataset(X_val_tokenized, y_val)

        # Define Trainer
        args = TrainingArguments(
            output_dir="content/drive/MyDrive_binary/output_" + model_name + "/fold"+str(n_fold),
            evaluation_strategy="epoch",
            save_strategy="epoch",
            num_train_epochs=epoch,
            seed=1127,
            load_best_model_at_end=True,
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        )

        if n_fold <= 4:
            checkpoint_temp = True
        else:
            checkpoint_temp = False

        trainer.train(resume_from_checkpoint=checkpoint)  # Add gradient_clip_val
        print("Complete for fold", n_fold)
        n_fold = n_fold + 1


# Run Hold Out Test

# Train and Test Set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state=1127)

# Train and Validate Set
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, stratify=y_train, random_state=1127)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

# Create torch dataset
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_test)

def run_hold_out_split(model_name='DistilElectra',
                       epoch=8,
                       train_dataset=train_dataset,
                       eval_dataset=val_dataset,
                       checkpoint=False):
  
    print("Developing Model with Hold Out Splits for: " + model_name)
    # Fine Tune Transformer
    # Define Trainer
    args = TrainingArguments(
      output_dir="content/drive/MyDrive_binary/output_" + model_name + "/holdout",
      evaluation_strategy="epoch",
      save_strategy="epoch",
      #eval_steps=500,
      #per_device_train_batch_size=1,
      #per_device_eval_batch_size=1,
      num_train_epochs=epoch, #1 was okay
      seed=1127,
      load_best_model_at_end=True,
    )

    trainer = Trainer(
    # model_init=model_init,
    args=args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
    )

    trainer.train(resume_from_checkpoint=checkpoint)
    print("Complete for hold-out validate set")


In [48]:
# Predict (Hold Out Test) 

from sklearn.metrics import confusion_matrix
def compute_metrics_holdout(model_name='DistilElectra',
                            model_path='content/drive/MyDrive_binary/output_DistilElectra/holdout/checkpoint-3820', 
                            average_method='binary',
                            X_test=X_test):
  
    X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

    # Create torch dataset
    test_dataset = Dataset(X_test_tokenized)

    # Load trained model
    model_pred = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)

    # Define test trainer
    test_trainer = Trainer(model_pred)

    # Make prediction
    raw_pred, _, _ = test_trainer.predict(test_dataset)

    # Preprocess raw predictions
    y_pred = np.argmax(raw_pred, axis=1)

    # Compute metrics
    precision_cb = precision_score(y_test, y_pred, average=average_method, pos_label=1)
    recall_cb = recall_score(y_test, y_pred, average=average_method, pos_label=1)
    f1_cb = f1_score(y_test, y_pred, average=average_method, pos_label=1)

    precision_ncb = precision_score(y_test, y_pred, average=average_method, pos_label=0)
    recall_ncb = recall_score(y_test, y_pred, average=average_method, pos_label=0)
    f1_ncb = f1_score(y_test, y_pred, average=average_method, pos_label=0)

    precision_overall = precision_score(y_test, y_pred, average='macro')
    recall_overall = recall_score(y_test, y_pred, average='macro')
    f1_overall = f1_score(y_test, y_pred, average='macro')

    # Print Results
    print("Classification Report:")
    print(classification_report(y_test,y_pred))
    print()
    print("Label 1: Cyberbullying")
    print("Precision: ", precision_cb)
    print("Recall: ", recall_cb)
    print("F-measure: ", f1_cb)
    print()
    print("Label 0: Non-Cyberbullying")
    print("Precision: ", precision_ncb)
    print("Recall: ", recall_ncb)
    print("F-measure: ", f1_ncb)
    print()
    print("Macro Metrics")
    print("Precision: ", precision_overall)
    print("Recall: ", recall_overall)
    print("F-measure: ", f1_overall)
    print()

    # Confusion Matrix
    conf_mat = confusion_matrix(y_test,y_pred)
    categories = ['Non-Cyberbullying', 'Cyberbullying']
    labels = ['True Negative','',
            '','True Positive']

    make_confusion_matrix(conf_mat, 
                        group_names=labels,
                        categories=categories, 
                        figsize=(8,5), 
                        cbar=True, 
                        title='Fine Tuned ' + model_name + ' for Cyberbullying Detection', 
                        cmap='YlGnBu', 
                        sum_stats=False)



## Cross Validation (K = 5)

In [None]:
run_cross_validation(model_name='DistilElectra',
                         X=X,
                         y=y, 
                         splits=5,
                         epoch=4,
                         checkpoint=False)

Developing Model with Cross validation for: DistilElectra


0it [00:00, ?it/s]

Running for Fold:  1


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 108838
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 27212
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy,Precision Cb,Recall Cb,F1 Cb,Precision Ncb,Recall Ncb,F1 Ncb,Precision Overall,Recall Overall,F1 Overall
1,0.1082,0.109363,0.964941,0.623913,0.533457,0.57515,0.978425,0.985029,0.981716,0.801169,0.759243,0.778433
2,0.1044,0.113,0.966926,0.643154,0.576208,0.607843,0.980365,0.985116,0.982735,0.811759,0.780662,0.795289
3,0.0834,0.111184,0.968331,0.684086,0.535316,0.600626,0.978583,0.988491,0.983512,0.831334,0.761903,0.792069
4,0.0728,0.116585,0.968745,0.68018,0.561338,0.615071,0.979742,0.987712,0.983711,0.829961,0.774525,0.799391


***** Running Evaluation *****
  Num examples = 12094
  Batch size = 16
Saving model checkpoint to content/drive/MyDrive_binary/output_DistilElectra/fold1/checkpoint-6803
Configuration saved in content/drive/MyDrive_binary/output_DistilElectra/fold1/checkpoint-6803/config.json
Model weights saved in content/drive/MyDrive_binary/output_DistilElectra/fold1/checkpoint-6803/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 12094
  Batch size = 16
Saving model checkpoint to content/drive/MyDrive_binary/output_DistilElectra/fold1/checkpoint-13606
Configuration saved in content/drive/MyDrive_binary/output_DistilElectra/fold1/checkpoint-13606/config.json
Model weights saved in content/drive/MyDrive_binary/output_DistilElectra/fold1/checkpoint-13606/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 12094
  Batch size = 16
Saving model checkpoint to content/drive/MyDrive_binary/output_DistilElectra/fold1/checkpoint-20409
Configuration saved in content/drive/MyDriv

Complete for fold 1
Running for Fold:  2


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 108838
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 27212
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy,Precision Cb,Recall Cb,F1 Cb,Precision Ncb,Recall Ncb,F1 Ncb,Precision Overall,Recall Overall,F1 Overall
1,0.1173,0.09709,0.971639,0.732697,0.570632,0.641588,0.980214,0.990308,0.985235,0.856456,0.78047,0.813412
2,0.0829,0.099585,0.972135,0.746929,0.565056,0.643386,0.979978,0.991087,0.985501,0.863453,0.778071,0.814444
3,0.0848,0.109311,0.972052,0.738095,0.576208,0.647182,0.980469,0.990481,0.98545,0.859282,0.783345,0.816316
4,0.0599,0.114097,0.970977,0.72104,0.566914,0.634755,0.980036,0.989789,0.984888,0.850538,0.778352,0.809822


***** Running Evaluation *****
  Num examples = 12094
  Batch size = 16
Saving model checkpoint to content/drive/MyDrive_binary/output_DistilElectra/fold2/checkpoint-6803
Configuration saved in content/drive/MyDrive_binary/output_DistilElectra/fold2/checkpoint-6803/config.json
Model weights saved in content/drive/MyDrive_binary/output_DistilElectra/fold2/checkpoint-6803/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 12094
  Batch size = 16
Saving model checkpoint to content/drive/MyDrive_binary/output_DistilElectra/fold2/checkpoint-13606
Configuration saved in content/drive/MyDrive_binary/output_DistilElectra/fold2/checkpoint-13606/config.json
Model weights saved in content/drive/MyDrive_binary/output_DistilElectra/fold2/checkpoint-13606/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 12094
  Batch size = 16
Saving model checkpoint to content/drive/MyDrive_binary/output_DistilElectra/fold2/checkpoint-20409
Configuration saved in content/drive/MyDriv

Complete for fold 2
Running for Fold:  3


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 108838
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 27212
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy,Precision Cb,Recall Cb,F1 Cb,Precision Ncb,Recall Ncb,F1 Ncb,Precision Overall,Recall Overall,F1 Overall
1,0.0965,0.12419,0.972548,0.828025,0.483271,0.610329,0.976401,0.995327,0.985773,0.902213,0.739299,0.798051


***** Running Evaluation *****
  Num examples = 12094
  Batch size = 16
Saving model checkpoint to content/drive/MyDrive_binary/output_DistilElectra/fold3/checkpoint-6803
Configuration saved in content/drive/MyDrive_binary/output_DistilElectra/fold3/checkpoint-6803/config.json
Model weights saved in content/drive/MyDrive_binary/output_DistilElectra/fold3/checkpoint-6803/pytorch_model.bin


### [Continue] Cross Validation (K = 5)

In [None]:
run_cross_validation(model_name='DistilElectra',
                         X=X,
                         y=y, 
                         splits=5,
                         epoch=4,
                         checkpoint=True)

## Hold Out Split

In [None]:
#holdout
run_hold_out_split(model_name='DistilElectra',
                       epoch=4,
                       train_dataset=train_dataset,
                       eval_dataset=val_dataset,
                       checkpoint=False)

In [None]:
#holdout
run_hold_out_split(model_name='DistilElectra',
                       epoch=4,
                       train_dataset=train_dataset,
                       eval_dataset=val_dataset,
                       checkpoint=True)

### Test Split Confusion Matrix

#### Epoch 1

In [None]:
compute_metrics_holdout(model_name='DistilElectra',
                        model_path='content/drive/MyDrive_binary/output_DistilElectra/holdout/checkpoint-6803', 
                        average_method='binary',
                        X_test=X_test)

#### Epoch 2

In [None]:
compute_metrics_holdout(model_name='DistilElectra',
                        model_path='content/drive/MyDrive_binary/output_DistilElectra/holdout/checkpoint-13606', 
                        average_method='binary',
                        X_test=X_test)

#### Epoch 3

In [None]:
compute_metrics_holdout(model_name='DistilElectra',
                        model_path='content/drive/MyDrive_binary/output_DistilElectra/holdout/checkpoint-20409', 
                        average_method='binary',
                        X_test=X_test)

#### Epoch 4

In [None]:
compute_metrics_holdout(model_name='DistilElectra',
                        model_path='content/drive/MyDrive_binary/output_DistilElectra/holdout/checkpoint-27212', 
                        average_method='binary',
                        X_test=X_test)