In [1]:
# @title Installing Transformers
from IPython.display import clear_output
!git clone https://github.com/facebookresearch/SentEval

from google.colab import drive
drive.mount('/content/drive')
clear_output()

DRIVE_PATH = "/content/drive/MyDrive/cs678project/"

In [2]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [3]:
# !export LC_ALL=C.UTF-8
!ls /content/drive/MyDrive/cs678project/Fine-Tuned-Models/

cola-1k-123    cola-full-42   mnli-full-42    qqp-2.5k-1234  sst2-2.5k-1234
cola-1k-42     mnli-1k-123    mrpc-1k-42      qqp-7k-123     sst2-2.5k-42
cola-2.5k-123  mnli-1k-42     mrpc-2.5k-42    qqp-7k-1234    sst2-7k-123
cola-2.5k-42   mnli-2.5k-123  mrpc-full-1234  qqp-full-42    sst2-7k-1234
cola-7k-123    mnli-2.5k-42   qqp-1k-123      sst2-1k-1234   sst2-7k-42
cola-7k-42     mnli-7k-123    qqp-1k-1234     sst2-1k-42     sst-full-42
cola-full-123  mnli-7k-42     qqp-2.5k-123    sst2-2.5k-123


In [4]:
!pip install transformers checklist

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
!pip install transformers --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
# @title Importing Requirements

from transformers import (
    BertConfig,
    BertTokenizer,
    TFBertModel,
    BertModel,
    glue_processors,
    glue_convert_examples_to_features,
    set_seed
)
from transformers.optimization_tf import create_optimizer

import tensorflow as tf
import tensorflow_datasets
import numpy as np
import copy 
import os
import pandas as pd

from checklist.editor import Editor
from checklist.perturb import Perturb
from checklist.test_suite import TestSuite


In [7]:
# @title Hyperparameters
BATCH_SIZE =  64# @param {type:"integer"}
EPOCHS =  10#@param {type:"integer"}
MAX_LENGTH =   64#@param {type:"integer"}

TASK = "mnli" #@param ["cola", "sst", "mrpc", "sts", "qqp", "mnli", "qnli", "rte"]

MODEL_SIZE = "full"#@param ["full", "7k", "2.5k", "1k"]
PROBE = "BigramShift" #@param ['Length','BigramShift', 'TopConst', 'Tense','SubjNumber', 'ObjNumber', 'OddManOut', 'CoordinationInversion']
MODEL_SEED = "42" #@param [42, 123, 1234]
PROBE_SEED = "60" #@param [40, 50, 60]
num_labels = 2

if PROBE == "TopConst":
  num_labels = 20
elif PROBE == 'Length':
    num_labels = 6

LEARNING_RATE =  3e-4 #@param {type:"number"}
WARMUP_RATIO =   0.1 #@param {type:"number"}
LAYER = "12" #@param [1,2,3,4, 5,6, 7,8, 9,10, 11, 12] 
LAYER = int(LAYER)

#####YOU CAN REPLACE THIS LINE WITH YOUR SAVED MODEL'S PATH####
SAVED_MODELS_DIR = f"{DRIVE_PATH}/Fine-Tuned-Models/" + TASK + '-' + MODEL_SIZE + '-' + str(MODEL_SEED)


DATA_NAME = ""
if PROBE == "Length":
  DATA_NAME = "sentence_length.txt"
elif PROBE == "BigramShift":
  DATA_NAME = "bigram_shift.txt"
elif PROBE == "TopConst":
  DATA_NAME = "top_constituents.txt"
elif PROBE == "Tense":
  DATA_NAME = "past_present.txt"
elif PROBE == "SubjNumber":
  DATA_NAME = "subj_number.txt"
elif PROBE == "ObjNumber":
  DATA_NAME = "obj_number.txt"
elif PROBE == "OddManOut":
  DATA_NAME = "odd_man_out.txt"
elif PROBE == "CoordinationInversion":
  DATA_NAME = "coordination_inversion.txt"

set_seed(int(PROBE_SEED))

In [8]:
!pip install googletrans==4.0.0-rc1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [9]:
!ls /content/SentEval/data/probing

bigram_shift.txt	    past_present.txt	 top_constituents.txt
coordination_inversion.txt  README.md		 tree_depth.txt
obj_number.txt		    sentence_length.txt  word_content.txt
odd_man_out.txt		    subj_number.txt


In [10]:
with open("/content/SentEval/data/probing/" + DATA_NAME, "r", encoding="utf-8") as file_object:
    split, label, text = [], [], []
    for line in file_object:
        tmp = line.strip().split('\t')
        split.append(tmp[0])
        label.append(tmp[1])
        text.append(tmp[2])

df = pd.DataFrame(list(zip(split, label, text)), columns=["split", "label", "text"])

if PROBE != 'Length':
    df["label"] = df["label"].factorize()[0]

df_train = df[df["split"] == "tr"]
df_val = df[df["split"] == "va"]
df_test = df[df["split"] == "te"]

In [11]:
df.columns

Index(['split', 'label', 'text'], dtype='object')

In [12]:
import pandas as pd
from tqdm import tqdm
import googletrans
print(googletrans.LANGUAGES)
from googletrans import Translator
translator = Translator()

def batch_translate(texts, src_language, dest_language):
    translator = Translator()
    translations = []
    for text in texts:
        translation = translator.translate(text, src=src_language, dest=dest_language)
        translations.append(translation.text)
    return translations

from pprint import pprint
src_language = 'en'
dest_language = 'fr'
batch_size = 100

# # create an empty column in the DataFrame to store the translations
df['translation'] = ''

# # loop over the DataFrame in batches and translate the texts
for i in tqdm(range(0, len(df[:200]), batch_size)):
    # print(i)
    batch = df.iloc[i:i+batch_size]['text'].tolist()
    batch_translations = batch_translate(batch,src_language, dest_language)
    df.loc[i:i+batch_size-1, 'translation'] = batch_translations

{'af': 'afrikaans', 'sq': 'albanian', 'am': 'amharic', 'ar': 'arabic', 'hy': 'armenian', 'az': 'azerbaijani', 'eu': 'basque', 'be': 'belarusian', 'bn': 'bengali', 'bs': 'bosnian', 'bg': 'bulgarian', 'ca': 'catalan', 'ceb': 'cebuano', 'ny': 'chichewa', 'zh-cn': 'chinese (simplified)', 'zh-tw': 'chinese (traditional)', 'co': 'corsican', 'hr': 'croatian', 'cs': 'czech', 'da': 'danish', 'nl': 'dutch', 'en': 'english', 'eo': 'esperanto', 'et': 'estonian', 'tl': 'filipino', 'fi': 'finnish', 'fr': 'french', 'fy': 'frisian', 'gl': 'galician', 'ka': 'georgian', 'de': 'german', 'el': 'greek', 'gu': 'gujarati', 'ht': 'haitian creole', 'ha': 'hausa', 'haw': 'hawaiian', 'iw': 'hebrew', 'he': 'hebrew', 'hi': 'hindi', 'hmn': 'hmong', 'hu': 'hungarian', 'is': 'icelandic', 'ig': 'igbo', 'id': 'indonesian', 'ga': 'irish', 'it': 'italian', 'ja': 'japanese', 'jw': 'javanese', 'kn': 'kannada', 'kk': 'kazakh', 'km': 'khmer', 'ko': 'korean', 'ku': 'kurdish (kurmanji)', 'ky': 'kyrgyz', 'lo': 'lao', 'la': 'lat

100%|██████████| 2/2 [00:16<00:00,  8.43s/it]


In [13]:
dfx = df[["label", "translation"]].copy()
print(df.columns)
df.columns


dfx.rename(columns={"translation":"text"}, inplace=True)

Index(['split', 'label', 'text', 'translation'], dtype='object')


In [14]:
# df.drop(columns=["translation"], inplace =True)

In [15]:
# @title Probing Model

class ProbeModel(tf.keras.Model):
  def __init__(self, bert_model, num_labels, layer, *inputs, **kwargs):
    super(ProbeModel, self).__init__(name="ProbeModel")
    self.bert = bert_model
    self.bert.trainable = False
    self.dropout = tf.keras.layers.Dropout(0.1)
    self.layer = layer
    self.classifier = tf.keras.layers.Dense(
                      num_labels,
                      kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
                      name="classifier")
    
  
  def call(self, inputs, **kwargs):

    outputs = self.bert(inputs, **kwargs)
    pooled_out = outputs[2][self.layer]
    pooled_out = pooled_out[:,0,:]
    
    droped_out = self.dropout(pooled_out, training=kwargs.get("training", False))
    output = self.classifier(droped_out)
    return output

In [16]:
# @title Tokenizing Dataset
def tokenization(dataframe, tokenizer, max_length):
    input_ids, attention_mask, token_type_ids, labels = [], [], [], []
    for _ , row in dataframe.iterrows():
        inputs = tokenizer.encode_plus(
            row["text"],
            add_special_tokens=True,
            max_length=max_length,
            pad_to_max_length=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            truncation=True
        )

        input_ids.append(inputs["input_ids"])
        attention_mask.append(inputs["attention_mask"])
        token_type_ids.append(inputs["token_type_ids"])
        labels.append(row["label"])

    return np.array(input_ids), np.array(attention_mask), np.array(token_type_ids), np.array(labels)


In [17]:
!pip install -U transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [18]:

# @title Loading the Model

config = BertConfig.from_pretrained('bert-base-multilingual-cased', num_labels=2)
config.output_hidden_states = True
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased', config=config)
bert_model.output_hidden_states = True

Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [19]:
dfx

Unnamed: 0,label,text
0,0,"Une semaine, elle était avec l'homme, juste un..."
1,0,"Il a versé son Dieu à Heart, et après quelques..."
2,0,Nous ne pouvons pas lui mettre sur les lieux p...
3,1,Je détestais même entendre ce nom maintenant.
4,0,C'est mon Noël.
...,...,...
119995,1,
119996,1,
119997,1,
119998,1,


In [20]:
train_input_ids, train_attention_mask, train_token_type_ids, train_labels = tokenization(df_train, tokenizer, MAX_LENGTH)
val_input_ids, val_attention_mask, val_token_type_ids, val_labels = tokenization(df_val, tokenizer, MAX_LENGTH)
test_input_ids, test_attention_mask, test_token_type_ids, test_labels = tokenization(df_test, tokenizer, MAX_LENGTH)



In [21]:
french_test_input_ids, french_test_attention_mask, french_test_token_type_ids, french_test_labels = tokenization(dfx[:200], tokenizer, MAX_LENGTH)

In [22]:
# %%
# @title Preparing Probing Model
probe_input = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32, name="probe_input")
probe_mask = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32, name="probe_mask")

probe_output = bert_model(probe_input, attention_mask=probe_mask, token_type_ids=None)[2][LAYER]
probe_output = probe_output[:, 0, :]
probe_output = tf.keras.layers.Dense(num_labels, activation='softmax')(probe_output)

probe_model = tf.keras.Model(inputs=[probe_input, probe_mask], outputs=probe_output)
probe_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 probe_input (InputLayer)       [(None, 64)]         0           []                               
                                                                                                  
 probe_mask (InputLayer)        [(None, 64)]         0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  177853440   ['probe_input[0][0]',            
                                thPoolingAndCrossAt               'probe_mask[0][0]']             
                                tentions(last_hidde                                               
                                n_state=(None, 64,                                            

In [23]:
# # %%
# # @title Compiling Probing Model
optimizer, _ = create_optimizer(init_lr=LEARNING_RATE, num_train_steps=EPOCHS * (len(train_labels) // BATCH_SIZE), num_warmup_steps=int(WARMUP_RATIO * (EPOCHS * (len(train_labels) // BATCH_SIZE))))
probe_model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics=["accuracy"])

In [24]:
# @title Callback
class ModelCheckpoint(tf.keras.callbacks.Callback):
  def __init__(self, monitor, save_path):
    super(ModelCheckpoint, self).__init__()
    self.monitor = monitor
    self.save_path = save_path
    self.bestScore = -np.Inf
    self.bestLoss = np.Inf

  def on_epoch_end(self, epoch, logs):
    score = logs.get(self.monitor)
    loss = logs.get("val_loss")
    if score > self.bestScore or (score == self.bestScore and loss < self.bestLoss):
      path = os.path.join(TASK, str(epoch+1))
      os.makedirs(path)
      self.model.save_weights(path+'/best_weights.h5')
      self.bestScore = score
      self.bestLoss = loss
      print("\nModel saved as the best model")

monitor = "val_accuracy"
checkpoint = ModelCheckpoint(monitor, SAVED_MODELS_DIR)


In [25]:
# %%
# @title Training Probing Model
probe_history = probe_model.fit(
    x=[train_input_ids, train_attention_mask],
    y=train_labels,
    validation_data=([val_input_ids, val_attention_mask], val_labels),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    callbacks=[checkpoint]  # Add the custom callback here
)


Epoch 1/10




Model saved as the best model
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
# @title Retrieving the best model
import os 
list_of_dirs = os.listdir('/content/' + TASK)
 

final_list = list(map(int, list_of_dirs))
best_model = max(final_list)

model_path ="/content/"+ TASK + '/' + str(best_model) + '/best_weights.h5'
probe_model.load_weights(model_path)

In [27]:
# %%
# @title Evaluating Probing Model
evaluation = probe_model.evaluate(x=[test_input_ids, test_attention_mask], y=test_labels, batch_size=BATCH_SIZE, verbose=1)
print("Test Loss: ", evaluation[0])
print("Test Accuracy: ", evaluation[1])

Test Loss:  0.7277625799179077
Test Accuracy:  0.5864999890327454


In [28]:
# @title Evaluating Probing Model with french
evaluation = probe_model.evaluate(x=[french_test_input_ids, french_test_attention_mask], y=french_test_labels, batch_size=BATCH_SIZE, verbose=1)
print("Test Loss: ", evaluation[0])
print("Test Accuracy: ", evaluation[1])

Test Loss:  0.8631762862205505
Test Accuracy:  0.5099999904632568


In [29]:
# Create a prediction function that takes a list of texts and returns a list of predictions
def predict(texts):
    input_ids, attention_mask, _, _ = tokenization(pd.DataFrame({'  ': texts}), tokenizer, MAX_LENGTH)
    predictions = probe_model.predict([input_ids, attention_mask])
    # Convert predictions to class labels if required
    return predictions

In [30]:
import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb
from checklist.test_types import MFT, INV, DIR
from checklist.test_suite import TestSuite
from checklist.expect import Expect

In [31]:
import checklist
import spacy
import itertools

import checklist.editor
import checklist.text_generation
from checklist.test_types import MFT, INV, DIR
from checklist.expect import Expect
from checklist.test_suite import TestSuite
import numpy as np
import spacy
from checklist.perturb import Perturb

In [32]:
from checklist.pred_wrapper import PredictorWrapper

In [33]:
CUDA_LAUNCH_BLOCKING = 1
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"