In [None]:
!pip -q install evaluate

In [1]:
from datasets import Dataset
import pandas as pd
import evaluate
import numpy as np
from transformers import (
    AutoModelForSequenceClassification, TrainingArguments, Trainer,
    DataCollatorWithPadding, AutoTokenizer, set_seed, EarlyStoppingCallback
)
import os
import argparse
import torch
from huggingface_hub import login

def preprocess_function(examples, **fn_kwargs):
    """Tokenize the ``text`` entries of a batch of examples.

    Args:
        examples: mapping with a ``"text"`` entry holding the raw texts.
        **fn_kwargs: must contain ``tokenizer``, the callable used to tokenize.

    Returns:
        Whatever the tokenizer returns when called with ``truncation=True``
        (no explicit ``max_length`` is passed).
    """
    tokenizer = fn_kwargs['tokenizer']
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

def get_data(train_path, dev_path, test_path, random_seed):
    """Load the train, dev and test splits from JSON-lines files.

    Args:
        train_path: path of the training JSON-lines file.
        dev_path: path of the validation JSON-lines file.
        test_path: path of the test JSON-lines file.
        random_seed: accepted for interface compatibility; not used here.

    Returns:
        Tuple ``(train_df, val_df, test_df)`` of pandas DataFrames, one row per
        line of the corresponding file.
    """
    train_df, val_df, test_df = (
        pd.read_json(split_path, lines=True)
        for split_path in (train_path, dev_path, test_path)
    )
    return train_df, val_df, test_df

def _get_f1_metric():
    """Return the Hugging Face ``f1`` metric, loading it only on first use."""
    metric = getattr(_get_f1_metric, "_cached", None)
    if metric is None:
        # Imported locally under an alias: a later cell of this notebook defines
        # a function that is also named ``evaluate``; once that cell has run the
        # global name no longer refers to the library.
        import evaluate as hf_evaluate
        metric = hf_evaluate.load("f1")
        _get_f1_metric._cached = metric
    return metric


def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 score for a Trainer evaluation step.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores; the predicted class is the argmax over axis 1.

    Returns:
        A new dict with the entries produced by the ``f1`` metric's
        ``compute`` call (``average="macro"``).
    """
    predictions, labels = eval_pred
    predicted_classes = np.argmax(predictions, axis=1)

    # The metric object is loaded once and reused, instead of being reloaded on
    # every evaluation.
    scores = _get_f1_metric().compute(
        predictions=predicted_classes, references=labels, average="macro"
    )
    return dict(scores)


def fine_tune(train_df, valid_df, checkpoints_path, id2label, label2id, model, train_tem_args):
    """Fine-tune a sequence-classification model and save the best checkpoint.

    Args:
        train_df: pandas DataFrame of training examples; its ``text`` column is
            tokenized (presumably it also carries the label column the Trainer
            learns from -- confirm against the data files).
        valid_df: pandas DataFrame of validation examples, same layout.
        checkpoints_path: directory for intermediate checkpoints; the final
            model is written to ``<checkpoints_path>/best``.
        id2label: mapping from class id to label name.
        label2id: mapping from label name to class id; its length sets
            ``num_labels``.
        model: model name or path handed to ``from_pretrained``.
        train_tem_args: dict with the keys ``lr``, ``train_batch``,
            ``val_batch``, ``epochs`` and ``weight_decay``.

    Side effects:
        Logs in to the Hugging Face Hub when the ``HF_TOKEN`` environment
        variable is set, trains the model and writes checkpoints to disk.
    """
    torch.cuda.empty_cache()

    # pandas dataframe to huggingface Dataset
    train_dataset = Dataset.from_pandas(train_df)
    valid_dataset = Dataset.from_pandas(valid_df)

    # Credentials come from the environment; never hard-code a token in the
    # notebook. Without a token the login step is skipped.
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        login(token=hf_token)

    # get tokenizer and model from huggingface
    tokenizer = AutoTokenizer.from_pretrained(model)
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model, num_labels=len(label2id), id2label=id2label, label2id=label2id,
        ignore_mismatched_sizes=True,
    )

    # Only fall back to EOS as padding when the tokenizer has no pad token
    # (e.g. GPT-2). Overwriting an existing pad token would also change the
    # pad_token_id stored in the config that is saved with the checkpoint.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    if classifier.config.pad_token_id is None:
        classifier.config.pad_token_id = tokenizer.pad_token_id

    # tokenize data for train/valid
    tokenized_train_dataset = train_dataset.map(
        preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer}
    )
    tokenized_valid_dataset = valid_dataset.map(
        preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer}
    )

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Evaluate / log / save four times per epoch. The step count is clamped to
    # at least 1 so that a very small training set does not yield 0 steps.
    steps_per_epoch = len(train_dataset) // train_tem_args['train_batch']
    eval_steps = max(1, steps_per_epoch // 4)

    # NOTE(review): recent transformers releases deprecate
    # ``evaluation_strategy`` in favour of ``eval_strategy``; switch once the
    # installed version is pinned.
    training_args = TrainingArguments(
        output_dir=checkpoints_path,
        learning_rate=train_tem_args['lr'],
        per_device_train_batch_size=train_tem_args['train_batch'],
        per_device_eval_batch_size=train_tem_args['val_batch'],
        num_train_epochs=train_tem_args['epochs'],
        weight_decay=train_tem_args['weight_decay'],
        save_strategy="steps",  # Save based on steps
        logging_steps=eval_steps,  # Log every eval_steps
        evaluation_strategy="steps",
        eval_steps=eval_steps,  # Evaluate every eval_steps (explicit, same cadence as saving)
        save_total_limit=4,  # Keep at most 4 checkpoints on disk
        save_steps=eval_steps,  # Save model every eval_steps
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
    )

    early_stopping = EarlyStoppingCallback(
        early_stopping_patience=5,  # Stop after 5 evaluations without improvement
        early_stopping_threshold=0.001  # Minimum improvement threshold
    )

    trainer = Trainer(
        model=classifier,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_valid_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping],
    )

    trainer.train()

    # save best model (load_best_model_at_end=True restores it after training)
    best_model_path = os.path.join(checkpoints_path, 'best')
    os.makedirs(best_model_path, exist_ok=True)
    trainer.save_model(best_model_path)


def test(test_df, model_path, id2label, label2id):
    """Predict class ids for ``test_df`` with a saved model.

    Args:
        test_df: pandas DataFrame whose ``text`` column is tokenized and fed to
            the model.
        model_path: directory (or name) the tokenizer and model are loaded from.
        id2label: mapping from class id to label name.
        label2id: mapping from label name to class id; its length sets
            ``num_labels``.

    Returns:
        numpy array with the predicted class id of every example (argmax over
        the last axis of the model outputs). No classification report is
        computed.
    """
    # load tokenizer from saved model
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # load best model
    # NOTE(review): ``ignore_mismatched_sizes=True`` lets loading proceed when
    # the checkpoint's weights do not match the requested shapes; for
    # inference a mismatch should probably be an error -- confirm.
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_path, num_labels=len(label2id), id2label=id2label, label2id=label2id,
        ignore_mismatched_sizes=True,
    )

    # Only fall back to EOS as padding when the tokenizer has no pad token
    # (e.g. GPT-2); a tokenizer that already defines one keeps it.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    if classifier.config.pad_token_id is None:
        classifier.config.pad_token_id = tokenizer.pad_token_id

    test_dataset = Dataset.from_pandas(test_df)
    tokenized_test_dataset = test_dataset.map(
        preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer}
    )
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # create Trainer (used here only to run batched prediction)
    trainer = Trainer(
        model=classifier,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # turn the raw model outputs into class ids
    predictions = trainer.predict(tokenized_test_dataset)
    preds = np.argmax(predictions.predictions, axis=-1)

    return preds

# Training English

In [None]:
!pip install -q --upgrade transformers
!pip install -q wandb==0.16.6

In [2]:
# Start Weights & Biases (W&B) in disabled mode.
import torch
import wandb
wandb.init(mode="disabled")

# Clear the Kaggle working directory and the local caches before training.
# NOTE: /kaggle/working/ itself cannot be removed; the cell output shows
# "Device or resource busy" for this command.
!rm -r /kaggle/working/
!rm -rf ~/.cache/huggingface
# ~/.cache/ is the parent of ~/.cache/huggingface, so this also covers the line above.
!rm -rf ~/.cache/

print("Training started----------------------------------------------")

# Release PyTorch's cached GPU memory before the training cell runs.
torch.cuda.empty_cache()

  pid, fd = os.forkpty()


rm: cannot remove '/kaggle/working/': Device or resource busy
Training started----------------------------------------------


In [None]:
# ---- Input / output locations ----
train_path = '/kaggle/input/coling-25-task-1/coling25task1/en_train.jsonl'
dev_path = '/kaggle/input/coling-25-task-1/coling25task1/en_dev.jsonl'
test_path = '/kaggle/input/coling-25-task-1/coling25task1/en_devtest_text_id_only.jsonl'
checkpoints_path = '/kaggle/working/checkpoints'
prediction_path = '/kaggle/working/subtask_a_pred.jsonl'

# ``model`` names the output folder / archive; ``model_name`` is the checkpoint
# that is actually handed to fine_tune below.
model = "roberta-base"
model_name = '/kaggle/input/eng-ensemble-coling-final/eng_roberta-base'

# ---- Reproducibility ----
random_seed = 41
set_seed(random_seed)

# ---- Hyper-parameters read by fine_tune ----
train_tem_args = dict(epochs=3, lr=2e-5, weight_decay=0.01, train_batch=4, val_batch=16)

# ---- Label mapping ----
id2label = {0: "human", 1: "machine"}
label2id = {name: idx for idx, name in id2label.items()}

# Fraction of the train and dev rows that is kept (36%).
reduce_val = 0.36


def _subsample_sorted_by_word_count(frame, fraction):
    """Keep a fixed-seed random ``fraction`` of the rows, ordered by ascending word count of ``text``."""
    subset = frame.sample(n=int(len(frame) * fraction), random_state=42)
    word_counts = subset['text'].apply(lambda text: len(text.split()))
    return (
        subset.assign(text_length=word_counts)
        .sort_values(by='text_length', ascending=True)
        .drop(columns=['text_length'])
    )


# ---- Data for the train/dev/test sets ----
train_df, valid_df, test_df = get_data(train_path, dev_path, test_path, random_seed)
train_df = _subsample_sorted_by_word_count(train_df, reduce_val)
valid_df = _subsample_sorted_by_word_count(valid_df, reduce_val)

print("data loaded--------------")

# ---- Train the detector; checkpoints go to "<model>/<random_seed>" ----
fine_tune(train_df, valid_df, f"{model}/{random_seed}", id2label, label2id, model_name, train_tem_args)

print("Training Done --------------")

data loaded--------------
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Map:   0%|          | 0/219876 [00:00<?, ? examples/s]

Map:   0%|          | 0/94232 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss,Validation Loss


In [None]:
import shutil
import os

# Folder containing the best model written by the training cell.
dir_to_zip = f"{model}/{random_seed}/best/"

# The archive is named after the last path component of ``model``.
*_, last_word = model.split('/')
output_zip = f"eng_{last_word}.zip"

# make_archive appends the ".zip" suffix itself, so it gets the name without it.
archive_base = output_zip.replace('.zip', '')
shutil.make_archive(base_name=archive_base, format='zip', root_dir=dir_to_zip)

print(f"Zipped contents of {dir_to_zip} into {output_zip}")


In [None]:
# Order the test rows by ascending word count of ``text``. The helper column is
# added on a copy and dropped again, so re-running the cell is harmless.
test_df = (
    test_df.assign(text_length=test_df['text'].apply(lambda x: len(x.split())))
    .sort_values(by='text_length', ascending=True)
    .drop(columns=['text_length'])
)

# Run the saved best model on the test set; ``predictions`` holds one class id
# per row of ``test_df``, in row order.
predictions = test(test_df, f"{model}/{random_seed}/best/", id2label, label2id)

# Write the submission file: one JSON object per line with ``id`` and ``label``.
predictions_df = pd.DataFrame({'id': test_df.id, 'label': predictions})
predictions_df.to_json(prediction_path, lines=True, orient='records')

print("Prediction Done --------------")

In [None]:
def test_2nd(test_df, model_path, id2label, label2id):
    """Predict class ids for ``test_df`` with a saved model and sanity-check the count.

    Args:
        test_df: pandas DataFrame with ``id`` and ``text`` columns.
        model_path: directory (or name) the tokenizer and model are loaded from.
        id2label: mapping from class id to label name.
        label2id: mapping from label name to class id; its length sets
            ``num_labels``.

    Returns:
        numpy array of predicted class ids (argmax over the last axis of the
        model outputs). The array is also printed.

    Raises:
        ValueError: if the number of predictions differs from the number of ids.
    """
    # Tokenizer and model both come from the saved checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id,
    )

    tokenized_test_dataset = Dataset.from_pandas(test_df).map(
        preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer}
    )

    # The Trainer is used here only to run batched prediction.
    trainer = Trainer(
        model=classifier,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
    )

    raw_output = trainer.predict(tokenized_test_dataset)
    preds = np.argmax(raw_output.predictions, axis=-1)
    print(preds)

    # One prediction per input row is expected.
    if len(preds) == len(test_df.id):
        return preds
    raise ValueError("Mismatch between the number of predictions and references.")

In [None]:
# Draw a fixed-seed 3% sample of the dev set; ``test_df`` keeps every column of
# the sampled rows, ``sampled_df`` only the model inputs.
# NOTE(review): this path has no ``coling25task1/`` component, unlike
# ``dev_path`` in the training cell -- confirm which location is correct.
test_df = pd.read_json('/kaggle/input/coling-25-task-1/en_dev.jsonl', lines=True).pipe(
    lambda dev_df: dev_df.sample(n=int(len(dev_df) * 0.03), random_state=42)
)

sampled_df = test_df.loc[:, ['id', 'text']]
sampled_df

In [None]:
print("Predicting for score --------------")

# Predict with the best model saved by the training cell.
best_model_dir = f"{model}/{random_seed}/best/"
predictions = test_2nd(sampled_df, best_model_dir, id2label, label2id)

# Pair every sampled id with its predicted label and store the result as JSON lines.
predictions_df = pd.DataFrame({'id': sampled_df['id'], 'label': predictions})
predictions_df.to_json("score_df.jsonl", orient='records', lines=True)

print("Predicting for score Done --------------")


# Format Checker

In [None]:
import os
import argparse
import logging
import json
import pandas as pd


# logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
COLUMNS = ['id', 'label']


def check_format(file_path):
  """Check that a prediction file is a well-formed JSON-lines submission.

  The file must exist, be readable as JSON lines, contain every column listed
  in ``COLUMNS`` without missing values, and use only the labels 0 and 1.
  Each failure is reported through ``logging.error``.

  Args:
    file_path: path of the prediction file.

  Returns:
    True when all checks pass, False otherwise.
  """
  if not os.path.exists(file_path):
    logging.error("File doesnt exists: {}".format(file_path))
    return False

  try:
    submission = pd.read_json(file_path, lines=True)
  except Exception:
    # ``Exception`` instead of a bare ``except`` so that KeyboardInterrupt and
    # SystemExit are not swallowed.
    logging.error("File is not a valid json file: {}".format(file_path))
    return False

  # A missing column is reported as such rather than as invalid JSON.
  missing_columns = [column for column in COLUMNS if column not in submission.columns]
  if missing_columns:
    logging.error("Missing column(s) {} in file {}".format(missing_columns, file_path))
    return False
  submission = submission[COLUMNS]

  for column in COLUMNS:
    if submission[column].isna().any():
      logging.error("NA value in file {} in column {}".format(file_path, column))
      return False

  if not submission['label'].isin(range(0, 2)).all():
    logging.error("Unknown Label in file {}".format(file_path))
    logging.error("Unique Labels in the file are {}".format(submission['label'].unique()))
    return False

  return True


    
# Validate the submission file written by the prediction cell.
pred_file_path = prediction_path

check_result = check_format(pred_file_path)
if check_result:
    result = 'Format is correct'
else:
    result = 'Something wrong in file format'
print(result)

# Score

In [None]:
import logging.handlers
import argparse
from sklearn.metrics import f1_score, accuracy_score
import pandas as pd
import sys
# sys.path.append('.')
# from format_checker import check_format


def evaluate(pred_fpath, gold_fpath):
  """Score predictions against gold labels.

  Despite the ``*_fpath`` names, both arguments are used as pandas DataFrames
  with ``id`` and ``label`` columns. They are joined on ``id``; the gold frame
  and the merged frame are printed along the way.

  Args:
    pred_fpath: DataFrame with the predicted labels.
    gold_fpath: DataFrame with the gold labels.

  Returns:
    Tuple ``(macro_f1, micro_f1, accuracy)`` computed on the merged rows.
  """
  print(gold_fpath)

  merged_df = pred_fpath.merge(gold_fpath, on=['id'], suffixes=('_pred', '_gold'))
  print(merged_df)

  gold, pred = merged_df['label_gold'], merged_df['label_pred']
  f1_by_average = {
      average: f1_score(gold, pred, average=average, zero_division=0)
      for average in ("macro", "micro")
  }
  accuracy = accuracy_score(gold, pred)

  return f1_by_average["macro"], f1_by_average["micro"], accuracy


def validate_files(pred_files):
  """Return True when ``check_format`` accepts ``pred_files``; otherwise log an error and return False."""
  if check_format(pred_files):
    return True

  logging.error('Bad format for pred file {}. Cannot score.'.format(pred_files))
  return False


# Both names hold in-memory DataFrames here, not file paths: the predictions
# from the previous cell and the sampled dev rows with their labels.
pred_file_path, gold_file_path = predictions_df, test_df

logging.info('Prediction file format is correct')

scores = evaluate(pred_file_path, gold_file_path)
macro_f1, micro_f1, accuracy = scores
logging.info("macro-F1={:.5f}\tmicro-F1={:.5f}\taccuracy={:.5f}".format(*scores))

In [None]:
# Display the three scores computed by the scoring cell as this cell's output.
macro_f1, micro_f1, accuracy 