In [1]:
%load_ext autoreload

In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [47]:
%autoreload
from trainer import *
from SupContrastLoss import MultiPosConLoss
from RankingLoss import MultipleNegativesSymmetricRankingLoss

In [48]:
def train_datadreamer_ta2(fold, output_folder, used_loss, luar_model_path='./rrivera1849', batch_size=128, epochs=25):
    train_num_rows, train_data_generator = get_performs_data_generator(fold, "train")
    dev_num_rows, dev_data_generator = get_performs_data_generator(fold, "dev")
    
    with DataDreamer(output_folder):
        dataset = DataSource(
            "Train Data",
            data=train_data_generator,
            total_num_rows=train_num_rows,
        )
        dev_dataset = DataSource(
            "Dev Data",
            data=dev_data_generator,
            total_num_rows=dev_num_rows,
        )
    
        trainer = get_luar_trainer()(
            "LUAR Trainer",
            model_name=luar_model_path,
            peft_config=LoraConfig(),
            trust_remote_code=True,
            device='cuda:0',
            dtype="bfloat16",
            force=False, #so we can resume training
        )

        #loss = SupConLoss if used_loss=='SupConLoss' else losses.MultipleNegativesSymmetricRankingLoss
        loss = MultiPosConLoss if used_loss=='SupConLoss' else losses.MultipleNegativesSymmetricRankingLoss

        if used_loss == 'SupConLoss':
            trainer.train_with_labeled_pairs(
                train_anchors=dataset.output["anchors"],
                train_others= dataset.output["others"],
                train_labels=dataset.output["labels"],
                validation_anchors=dev_dataset.output["anchors"],
                validation_others=dev_dataset.output["others"],
                validation_labels=dev_dataset.output["labels"],
                epochs=epochs,    
                batch_size=batch_size,
                logging_steps=0.2,
                loss=loss,
                learning_rate=0.0005,
                early_stopping_threshold=0.001,
                early_stopping_patience=5,
                accelerator_config={
                    "dispatch_batches": False,
                },
                callbacks=[EpochTrackerCallback()]
            )
        else:
            trainer.train_with_positive_pairs(
                train_anchors=dataset.output["anchors"],
                train_positives=dataset.output["positives"],
                validation_anchors=dev_dataset.output["anchors"],
                validation_positives=dev_dataset.output["positives"],
                epochs=epochs,    
                batch_size=batch_size,
                loss=loss,
                learning_rate=0.0005,
                early_stopping_threshold=0.001,
                early_stopping_patience=5,
                accelerator_config={
                    "dispatch_batches": False,
                },
                callbacks=[EpochTrackerCallback()]
            )

### Training original TA2 system on performers' data:

- Train 3 epochs on the performers data
- Resume training on HRS dataset from phase 1 and 2

In [49]:
def train_datadreamer_ta2__on_performers_data(fold, output_folder, used_loss, luar_model_path='./rrivera1849', batch_size=128, epochs=25):
    train_num_rows, train_data_generator = get_performs_data_generator(fold, "train")
    dev_num_rows, dev_data_generator = get_performs_data_generator(fold, "dev")
    
    with DataDreamer(output_folder):
        dataset = DataSource(
            "Train Data",
            data=train_data_generator,
            total_num_rows=train_num_rows,
        )
        dev_dataset = DataSource(
            "Dev Data",
            data=dev_data_generator,
            total_num_rows=dev_num_rows,
        )
    
        trainer = get_luar_trainer()(
            "LUAR Trainer",
            model_name=luar_model_path,
            peft_config=LoraConfig(),
            trust_remote_code=True,
            device='cuda:0',
            dtype="bfloat16",
            force=False, #so we can resume training if things shutsdown
        )
        
        trainer.train_with_positive_pairs(
            train_anchors=dataset.output["anchors"],
            train_positives=dataset.output["positives"],
            validation_anchors=dev_dataset.output["anchors"],
            validation_positives=dev_dataset.output["positives"],
            epochs=epochs,    
            batch_size=batch_size,
            loss=losses.MultipleNegativesSymmetricRankingLoss,
            learning_rate=0.0005,
            early_stopping_threshold=0.001,
            early_stopping_patience=5,
            eval_strategy='steps',
            logging_strategy='steps',
            save_strategy='steps',
            logging_steps=200,
            save_steps=600,
            eval_steps=200,
            save_total_limit=3,
            resume_from_checkpoint=True,
            overwrite_output_dir=False,
            accelerator_config={
                "dispatch_batches": False,
            },
            callbacks=[EpochTrackerCallback()]
        )

In [40]:
# output_path = '/mnt/swordfish-pool2/milad/datadreamer-ta2/'
# luar_model_path = '/mnt/swordfish-pool2/milad/rrivera1849'
# fold = "/mnt/swordfish-pool2/milad/hiatus-data/performers-data/tmp-data/*/{split}"

output_path = '/burg/old_dsi/users/ma4608/ajay-ta2-system/output'
luar_model_path = '/burg/old_dsi/users/ma4608/ajay-ta2-system/training_source/rrivera1849'
fold = "/burg/old_dsi/users/ma4608/hiatus_data/sadiri/*/{split}"

In [8]:
train_datadreamer_ta2__on_performers_data(fold, output_path + '/original_ta2_performers_data_model_50k', 'MultipleNegativesSymmetricRankingLoss', luar_model_path=luar_model_path, batch_size=100, epochs=3)

/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/gmane/train_queries_filtered.jsonl
/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/realnews/train_queries_filtered.jsonl
/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/bookcorpus/train_queries_filtered.jsonl
/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/gmane/train_candidates_filtered.jsonl
/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/realnews/train_candidates_filtered.jsonl
/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/bookcorpus/train_candidates_filtered.jsonl
Dataset Statistics: train 103605
/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/gmane/dev_queries_filtered.jsonl
/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/realnews/dev_queries_filtered.jsonl
/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/bookcorpus/dev_queries_filtered.jsonl
/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/gmane/dev_candidates_filtered.jsonl
/burg/old_dsi/users

[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: /burg/old_dsi/users/ma4608/ajay-ta2-system/output/original_ta2_performers_data_model_50k


Dataset Statistics: dev 8284


[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Train Data' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Train Data' will run lazily. 🥱
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Dev Data' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Dev Data' will run lazily. 🥱
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Trainer 'LUAR Trainer' is running (resumed). ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'LUAR Trainer / Tokenize Train Anchors' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'LUAR Trainer / Tokenize Train Anchors' will run lazily. 🥱
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'LUAR Trainer / Tokenize Train Positives' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'LUAR Trainer / Tokenize Train Positives' will run lazily. 🥱
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'LUAR Trainer / Tokenize Validation Anchors' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'LUAR Trainer / 

The repository for /burg/old_dsi/users/ma4608/ajay-ta2-system/training_source/rrivera1849 contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co//burg/old_dsi/users/ma4608/ajay-ta2-system/training_source/rrivera1849.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] [LUAR Trainer (/burg/old_dsi/users/ma4608/ajay-ta2-system/training_source/rrivera1849)] Finished loading.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] [ 🤗 Accelerate] Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] [LUAR Trainer (/burg/old_dsi/users/ma4608/ajay-ta2-system/training_source/rrivera1849)] Train Epoch: 2.507232401157184 -- {'loss': 1.2535, 'grad_norm': 1.5801076889038086, 'learning_rate': 8.212793314046931e-05}
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Dev Data' finished running lazily. 🎉
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'LUAR Trainer / Tokenize Validation Anchors' finished running lazily. 🎉
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'LUAR Trainer / Tokenize Validation Positives' finished running lazily. 🎉
[ [35m🤖 Dat

The repository for /burg/old_dsi/users/ma4608/ajay-ta2-system/training_source/rrivera1849 contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co//burg/old_dsi/users/ma4608/ajay-ta2-system/training_source/rrivera1849.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] [LUAR Trainer (/burg/old_dsi/users/ma4608/ajay-ta2-system/training_source/rrivera1849)] Finished loading.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Trainer 'LUAR Trainer' finished and is saved to disk. 🎉
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Done. ✨ Results in folder: /burg/old_dsi/users/ma4608/ajay-ta2-system/output/original_ta2_performers_data_model_50k


In [10]:
print('done')

done


Now training on training split of HRS1&2
 - For this, I copied original_ta2_performers_data_model_50k to original_ta2_performers_data_model_hrs_continued
 - Then continued training for further epcohs

In [50]:
output_path = '/burg/old_dsi/users/ma4608/ajay-ta2-system/output'
luar_model_path = '/burg/old_dsi/users/ma4608/ajay-ta2-system/training_source/rrivera1849'
fold = "/burg/old_dsi/users/ma4608/hiatus_data/hrs_data_combined/{split}"

In [51]:
train_num_rows, train_data_generator = get_data_generator_for_combined_hrs(fold, "", "train")
dev_num_rows, dev_data_generator = get_data_generator_for_combined_hrs(fold,  "", "dev")

dict_keys(['documentID', 'fullText', 'languages', 'lengthWords'])


KeyError: 'authorIDs'

In [12]:
train_datadreamer_ta2(fold, output_path + '/original_ta2_performers_data_model_hrs_continued', 'MultipleNegativesSymmetricRankingLoss', luar_model_path=luar_model_path, batch_size=100, epochs=25)

[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: /burg/old_dsi/users/ma4608/ajay-ta2-system/output/original_ta2_performers_data_model_hrs_continued
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Train Data' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Train Data' will run lazily. 🥱
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Train Data' finished running lazily. 🎉
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Dev Data' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Dev Data' will run lazily. 🥱
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Dev Data' finished running lazily. 🎉
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Done. ✨ Results in folder: /burg/old_dsi/users/ma4608/ajay-ta2-system/output/original_ta2_performers_data_model_hrs_continued


Dataset Statistics: train 0
Dataset Statistics: dev 0


ValueError: Expected single column only, got []

### Train original TA2 system

In [25]:
# output_path = '/mnt/swordfish-pool2/milad/datadreamer-ta2/'
# luar_model_path = '/mnt/swordfish-pool2/milad/rrivera1849'
output_path = '/burg/dsi/users/ma4608/ajay-ta2-system/output'
luar_model_path = '/burg/dsi/users/ma4608/ajay-ta2-system/training_source/rrivera1849'

In [26]:
#fold = "/mnt/swordfish-pool2/milad/hiatus-data/train-test-dev split/{split}/Official Query-candidate format/"
fold = "/burg/dsi/users/ma4608/ajay-ta2-system/training_source/data/train-test-dev-split/{split}/official-query-candidate-format/"

In [7]:
train_num_rows, train_data_generator = get_data_generator(fold, "cross_genre_all", "train", split_percent=ast.literal_eval("None"))
dev_num_rows, dev_data_generator = get_data_generator(fold, "cross_genre_all", "dev", split_percent=ast.literal_eval("None"))

Dataset Statistics: cross_genre_all train None 575 4142
Dataset Statistics: cross_genre_all dev None 79 635


In [8]:
train_datadreamer_ta2(fold, output_path + '/original_ta2_model', 'MultipleNegativesSymmetricRankingLoss', luar_model_path=luar_model_path, batch_size=64, epochs=25)

[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: /burg/dsi/users/ma4608/ajay-ta2-system/output/original_ta2_model
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Train Data' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Train Data' will run lazily. 🥱
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Dev Data' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Dev Data' will run lazily. 🥱
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Trainer 'LUAR Trainer' was previously run and saved, but was outdated. 😞
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Trainer 'LUAR Trainer' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'LUAR Trainer / Tokenize Train Anchors' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'LUAR Trainer / Tokenize Train Anchors' will run lazily. 🥱
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'LUAR Trainer / Tokenize Train Positives' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35m

The repository for /burg/dsi/users/ma4608/ajay-ta2-system/training_source/rrivera1849 contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co//burg/dsi/users/ma4608/ajay-ta2-system/training_source/rrivera1849.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] [LUAR Trainer (/burg/dsi/users/ma4608/ajay-ta2-system/training_source/rrivera1849)] Finished loading.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] [ 🤗 Accelerate] Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'LUAR Trainer / Tokenize Validation Anchors' finished running lazily. 🎉
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'LUAR Trainer / Tokenize Validation Positives' finished running lazily. 🎉
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] [LUAR Trainer (/burg/dsi/users/ma4608/ajay-ta2-system/training_source/rrivera1849)] Eval Epoch: 0.0 -- {'loss': 2.626582384109497, 'model_preparation_time': 0.0923, 'runtime': 9.619, 'samples_per_second': 16.426, 'steps_per_second': 0.312}
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] [LUAR Trainer (/burg/dsi/users/

The repository for /burg/dsi/users/ma4608/ajay-ta2-system/training_source/rrivera1849 contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co//burg/dsi/users/ma4608/ajay-ta2-system/training_source/rrivera1849.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] [LUAR Trainer (/burg/dsi/users/ma4608/ajay-ta2-system/training_source/rrivera1849)] Finished loading.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Trainer 'LUAR Trainer' finished and is saved to disk. 🎉
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Done. ✨ Results in folder: /burg/dsi/users/ma4608/ajay-ta2-system/output/original_ta2_model


### Train original TA2 system with Supervised Contrastive Loss:

In [9]:
# output_path = '/mnt/swordfish-pool2/milad/datadreamer-ta2/'
# luar_model_path = '/mnt/swordfish-pool2/milad/rrivera1849'
output_path = '/burg/dsi/users/ma4608/ajay-ta2-system/output'
luar_model_path = '/burg/dsi/users/ma4608/ajay-ta2-system/training_source/rrivera1849'

In [10]:
fold = "/burg/dsi/users/ma4608/ajay-ta2-system/training_source/data/train-test-dev-split/{split}/official-query-candidate-format/"

In [11]:
train_num_rows, train_data_generator = get_data_generator_for_supervised_contrastive_learning(fold, "cross_genre_all", "train", split_percent=ast.literal_eval("None"))
dev_num_rows, dev_data_generator = get_data_generator_for_supervised_contrastive_learning(fold, "cross_genre_all", "dev", split_percent=ast.literal_eval("None"))

number of authors 8896
number of authors after filtering  1651
Dataset Statistics: 1651 9761
number of authors 1351
number of authors after filtering  270
Dataset Statistics: 270 1570


In [12]:
train_datadreamer_ta2(fold, output_path + '/supcon_ta2_model', 'SupConLoss', luar_model_path=luar_model_path, batch_size=128, epochs=25)

[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: /burg/dsi/users/ma4608/ajay-ta2-system/output/supcon_ta2_model
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Train Data' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Train Data' will run lazily. 🥱
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Dev Data' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Dev Data' will run lazily. 🥱
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Trainer 'LUAR Trainer' was previously run and saved, but was outdated. 😞
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Trainer 'LUAR Trainer' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'LUAR Trainer / Tokenize Train Anchors' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'LUAR Trainer / Tokenize Train Anchors' will run lazily. 🥱
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'LUAR Trainer / Tokenize Train Others' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[

The repository for /burg/dsi/users/ma4608/ajay-ta2-system/training_source/rrivera1849 contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co//burg/dsi/users/ma4608/ajay-ta2-system/training_source/rrivera1849.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] [LUAR Trainer (/burg/dsi/users/ma4608/ajay-ta2-system/training_source/rrivera1849)] Finished loading.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] [ 🤗 Accelerate] Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Dev Data' finished running lazily. 🎉
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'LUAR Trainer / Tokenize Validation Anchors' finished running lazily. 🎉
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'LUAR Trainer / Tokenize Validation Others' finished running lazily. 🎉
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] [LUAR Trainer (/burg/dsi/users/ma4608/ajay-ta2-system/training_source/rrivera1849)] Eval Epoch: 0.0 -- {'loss': 4.055605411529541, 'model_preparation_time': 0.0581, 'runtime': 20.6433, 'samples_per_second': 76.054, 'steps_per_secon

OutOfMemoryError: CUDA out of memory. Tried to allocate 768.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 530.69 MiB is free. Including non-PyTorch memory, this process has 39.04 GiB memory in use. Of the allocated memory 37.21 GiB is allocated by PyTorch, and 1.33 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)