In [None]:
# Install PyTorch and TensorBoard
%pip install "torch==2.2.2" tensorboard

# Install Hugging Face libraries
%pip install --upgrade "transformers==4.40.0" "datasets==2.18.0" "accelerate==0.29.3" "evaluate==0.4.1" "bitsandbytes==0.43.1" "huggingface_hub==0.22.2" "trl==0.8.6" "peft==0.10.0"


In [None]:
import os
import random
import functools
import csv
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import evaluate

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score

from scipy.stats import pearsonr
from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

from huggingface_hub import login
# Prefer a token supplied through the HF_TOKEN environment variable so that no real
# credential has to be pasted into (and saved with) the notebook. The original
# placeholder is kept as the fallback value.
api_token = os.environ.get("HF_TOKEN", 'use your own token')
login(api_token)

In [None]:
import pandas as pd

def custom_sample_balanced_data(df, lang_column, label_column, lang_limits=None, random_state=42):
    """Down-sample selected languages to a 50:50 label split and shuffle the result.

    Args:
        df: Input DataFrame containing at least ``lang_column`` and ``label_column``.
        lang_column: Name of the column holding the language identifier.
        label_column: Name of the column holding the binary label (0 or 1).
        lang_limits: Optional mapping ``language -> maximum number of rows`` to keep
            for that language. Listed languages are sampled so that both labels get
            the same number of rows (at most ``limit // 2`` each, and never more than
            the rarer label has). Languages that are not listed are kept in full.
            ``None`` is treated as an empty mapping, i.e. nothing is down-sampled.
        random_state: Seed used for the per-label sampling and for the final shuffle.

    Returns:
        A new, shuffled DataFrame with a fresh ``0..n-1`` index. When nothing is
        selected, an empty DataFrame with the columns of ``df`` is returned.
    """
    # The declared default is None, which used to crash on the membership test below.
    limits = {} if lang_limits is None else lang_limits

    # Collect the per-language pieces and concatenate once at the end instead of
    # growing a DataFrame inside the loop (which re-copies all rows each iteration).
    pieces = []
    for lang in df[lang_column].unique():
        lang_df = df[df[lang_column] == lang]

        if lang not in limits:
            # Languages without a limit are kept untouched.
            pieces.append(lang_df)
            continue

        label_0 = lang_df[lang_df[label_column] == 0]
        label_1 = lang_df[lang_df[label_column] == 1]

        # Largest per-label count that keeps the split 50:50 and respects the limit.
        n_samples_per_label = min(len(label_0), len(label_1), limits[lang] // 2)

        # A limited language with no usable rows for one label is dropped entirely.
        if n_samples_per_label > 0:
            pieces.append(label_0.sample(n=n_samples_per_label, random_state=random_state))
            pieces.append(label_1.sample(n=n_samples_per_label, random_state=random_state))

    if not pieces:
        # pd.concat() rejects an empty list; keep the input schema for the caller.
        return df.iloc[0:0].reset_index(drop=True)

    balanced_df = pd.concat(pieces, ignore_index=True)

    # Shuffle so rows of different languages/labels are mixed.
    return balanced_df.sample(frac=1, random_state=random_state).reset_index(drop=True)



In [None]:
# # Define the system message for MGT detection
# system_message = """
# You are a model trained to determine whether a given text is machine-generated or human-written. 
# Analyze the linguistic patterns, structure, and content of the text. 
# Based on your analysis, respond with either 'machine-generated' or 'human-written.'
# """

# # Load Datasets
# train_dataset = load_dataset('json', data_files='training_data.jsonl', split="train")
# valid_dataset = load_dataset('json', data_files='validation_data.jsonl', split="train")


# # Preprocess datasets for training
# def format_mgt_data(examples):
#     return (examples['text'],truncation=)

# train_dataset_mapped = train_dataset.map(format_mgt_data, batched=True)
# valid_dataset_mapped = valid_dataset.map(format_mgt_data, batched=True)


In [None]:
# train_dataset_small[2]

In [None]:
class CustomTrainer(Trainer):
    """Trainer whose loss is a cross-entropy with optional per-class weights.

    Args:
        *args: Forwarded unchanged to ``Trainer.__init__``.
        class_weights: Optional sequence or tensor with one weight per class. When
            given, it is passed as ``weight`` to ``F.cross_entropy``; when ``None``
            the loss is an unweighted cross-entropy.
        **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        if class_weights is not None:
            # as_tensor() accepts lists, arrays and existing tensors alike;
            # torch.tensor() on a tensor (which the notebook passes) emits a
            # copy-construct warning.
            self.class_weights = torch.as_tensor(class_weights, dtype=torch.float32).to(self.args.device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the (optionally class-weighted) cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch dict; its ``"labels"`` entry is removed before the forward
                pass and used as the loss target.
            return_outputs: When true, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: Unused. Accepted so the override also works with
                transformers releases whose Trainer passes this extra argument.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        # cross_entropy expects integer class indices as targets.
        labels = inputs.pop("labels").long()

        outputs = model(**inputs)
        logits = outputs.get('logits')

        weight = None
        if self.class_weights is not None:
            # Keep the weights on the same device as the logits.
            weight = self.class_weights.to(logits.device)

        # Compute the loss in float32 so the float32 weights always match the
        # logits' dtype, even if the model emits half-precision logits.
        loss = F.cross_entropy(logits.float(), labels, weight=weight)

        return (loss, outputs) if return_outputs else loss


In [None]:
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig,
    DataCollatorWithPadding, AutoTokenizer, set_seed, EarlyStoppingCallback
)

def preprocess_function(examples, **fn_kwargs):
    """Tokenize the ``"text"`` entry of a batch, truncated to at most 512 tokens.

    Args:
        examples: Mapping with a ``"text"`` entry (the batch handed over by
            ``Dataset.map``).
        **fn_kwargs: Must contain ``tokenizer``, the callable used for tokenization.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    tokenize = fn_kwargs['tokenizer']
    texts = examples["text"]
    return tokenize(texts, truncation=True, max_length=512)

def get_data(train_path, dev_path, test_path, random_seed):
    """Load the train, dev and test splits from JSON-lines files.

    Args:
        train_path: Path of the training JSON-lines file.
        dev_path: Path of the validation JSON-lines file.
        test_path: Path of the test JSON-lines file.
        random_seed: Accepted for interface compatibility; not used by this function.

    Returns:
        Tuple ``(train_df, val_df, test_df)`` of pandas DataFrames.
    """
    split_paths = (train_path, dev_path, test_path)
    return tuple(pd.read_json(path, lines=True) for path in split_paths)

def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 score for a Trainer evaluation step.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions`` holds the
            per-class scores (argmax is taken over axis 1) and ``labels`` the
            reference class ids.

    Returns:
        Dict with a single ``"f1"`` entry (macro-averaged F1 as a float).
    """
    # Imported here so the function does not depend on the module-level name
    # `evaluate`, which a later cell of this notebook rebinds to a scoring function.
    # Using scikit-learn directly also avoids re-loading the metric on every
    # evaluation call.
    from sklearn.metrics import f1_score

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # NOTE(review): assumes the class scores come first when the model returns
        # several outputs — confirm for models other than the one used here.
        predictions = predictions[0]
    predicted_labels = np.argmax(predictions, axis=1)

    return {"f1": float(f1_score(labels, predicted_labels, average="macro"))}

def fine_tune(train_df, valid_df, checkpoints_path, id2label, label2id, model_name, train_tem_args, class_weights):
    """Fine-tune a 4-bit quantized sequence classifier with LoRA adapters.

    Args:
        train_df: Training DataFrame; its ``text`` column is tokenized by
            ``preprocess_function``.
        valid_df: Validation DataFrame with the same columns as ``train_df``.
        checkpoints_path: Output directory for the Trainer checkpoints. The final
            model is saved to ``<checkpoints_path>/best``.
        id2label: Mapping from class id to class name, stored in the model config.
        label2id: Mapping from class name to class id; its length is the number of
            labels of the classification head.
        model_name: Name or path of the base checkpoint to fine-tune.
        train_tem_args: Dict with the keys ``'lr'``, ``'train_batch'``,
            ``'val_batch'``, ``'epochs'`` and ``'weight_decay'``.
        class_weights: Currently unused — the stock ``Trainer`` is used, so the loss
            is unweighted. NOTE(review): switch to
            ``CustomTrainer(..., class_weights=class_weights)`` if a class-weighted
            loss is wanted.

    Returns:
        Path of the directory the final model was saved to.
    """
    torch.cuda.empty_cache()

    train_dataset = Dataset.from_pandas(train_df)
    valid_dataset = Dataset.from_pandas(valid_df)

    quantization_config = BitsAndBytesConfig(
        load_in_4bit = True, # enable 4-bit quantization
        bnb_4bit_quant_type = 'nf4', # NormalFloat4 quantization data type
        bnb_4bit_use_double_quant = True, # also quantize the quantization constants
        bnb_4bit_compute_dtype = torch.bfloat16 # dtype used for the matrix multiplications
    )

    lora_config = LoraConfig(
        r = 16, # the dimension of the low-rank matrices
        lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
        target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
        lora_dropout = 0.05, # dropout probability of the LoRA layers
        bias = 'none', # do not train any bias weights
        task_type = 'SEQ_CLS'
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True,
        revision='main'
    )

    # Model configurations for training
    model.config.use_cache = False
    model.config.pretraining_tp = 1

    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)

    tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

    # Reuse the EOS token for padding; assigning the token also sets pad_token_id.
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

    # Tokenize datasets
    tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer})
    tokenized_valid_dataset = valid_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer})

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Log, evaluate and checkpoint every 10% of the total training steps: a float
    # below 1 is interpreted by TrainingArguments as a ratio of the total steps.
    eval_steps = 0.1

    training_args = TrainingArguments(
        output_dir=checkpoints_path,
        learning_rate=train_tem_args['lr'],
        per_device_train_batch_size=train_tem_args['train_batch'],
        per_device_eval_batch_size=train_tem_args['val_batch'],
        num_train_epochs=train_tem_args['epochs'],
        weight_decay=train_tem_args['weight_decay'],
        save_strategy="steps",  # Save based on steps
        logging_steps=eval_steps,  # Log every eval_steps
        evaluation_strategy="steps",
        eval_steps=eval_steps,  # Same value the Trainer would otherwise copy from logging_steps
        save_total_limit=2,  # Keep at most 2 checkpoints on disk
        save_steps=eval_steps,  # Save model every eval_steps
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
    )

    early_stopping = EarlyStoppingCallback(
        early_stopping_patience=5,  # Stop after 5 evaluations without improvement
        early_stopping_threshold=0.001  # Minimum improvement threshold
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_valid_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping],  # Add early stopping callback
    )

    trainer.train()

    # Save the model held by the trainer after training (load_best_model_at_end=True).
    best_model_path = os.path.join(checkpoints_path, 'best')
    os.makedirs(best_model_path, exist_ok=True)
    trainer.save_model(best_model_path)

    return best_model_path
    


In [None]:
from transformers import set_seed


# Set paths and parameters
train_path = '/kaggle/input/coling-25-task-1/en_train.jsonl'
dev_path = '/kaggle/input/coling-25-task-1/en_dev.jsonl'
test_path = '/kaggle/input/coling-25-task-1/en_devtest_text_id_only.jsonl'
# NOTE(review): the training cell passes `trained_model` as the checkpoint directory,
# so `checkpoints_path` is not used there.
checkpoints_path = '/kaggle/working/checkpoints'

scoring_path = '/kaggle/input/short-coling/en_dev_for_score.jsonl'

# Multilingual alternative:
# train_path = '/kaggle/input/short-mul-lang-coling/multilingual_train_short.jsonl'
# dev_path = '/kaggle/input/short-mul-lang-coling/multilingual_dev_short.jsonl'

# Base checkpoint to fine-tune. Alternatives that were previously assigned here and
# immediately overwritten:
#   "mistralai/Mistral-7B-Instruct-v0.1"
#   "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"
#   "meta-llama/Prompt-Guard-86M"
#   "meta-llama/Meta-Llama-3-8B"
model = "/kaggle/input/llama-3/transformers/8b-hf/1"
# NOTE(review): `mod3l` looks like a typo for `model`. It is left unchanged so the
# base (non-chat) checkpoint above stays the one in use — confirm the intent.
mod3l = "/kaggle/input/llama-3/transformers/8b-chat-hf/1"

# Output directory of the fine-tuned model (relative to the working directory)
trained_model = "llama_3_hf_custom"

prediction_path = '/kaggle/working/subtask_a_pred.jsonl'
random_seed = 42

# Set seed
set_seed(random_seed)

train_tem_args = {
    'epochs': 3,
    'lr': 1e-4,
    'weight_decay': 0.01,
    'train_batch': 4,
    'val_batch': 16,
}

id2label = {0: "human", 1: "machine"}
label2id = {"human": 0, "machine": 1}

# get data for train/dev/test sets
train_df, valid_df, test_df = get_data(train_path, dev_path, test_path, random_seed)

# Optional per-language balancing:
# lang_limits = {'en': 12000, 'zh': 9000}
# train_df = custom_sample_balanced_data(train_df, lang_column='lang', label_column='label', lang_limits=lang_limits)
# lang_limits = {'en': 9000, 'zh': 7000}
# valid_df = custom_sample_balanced_data(valid_df, lang_column='lang', label_column='label', lang_limits=lang_limits)

# Fraction of the train/dev rows to keep (1 keeps everything, 0.005 keeps 0.5%).
reduced_val = 0.005
train_df = train_df.sample(n=int(len(train_df) * reduced_val), random_state=42)
valid_df = valid_df.sample(n=int(len(valid_df) * reduced_val), random_state=42)

print(train_df['lang'].value_counts(),valid_df['lang'].value_counts())

print("data loaded--------------")

# Inverse class frequencies (ordered by label id), normalized to sum to 1.
class_weights=(1/train_df.label.value_counts(normalize=True).sort_index()).tolist()
class_weights=torch.tensor(class_weights)
class_weights=class_weights/class_weights.sum()
class_weights



In [None]:
# # Initialize Weights & Biases (W&B) in disabled mode.
import torch
import wandb
wandb.init(mode="disabled")

!rm -r /kaggle/working/
!rm -rf ~/.cache/huggingface
!rm -rf ~/.cache/

print("Training started----------------------------------------------")

torch.cuda.empty_cache()
from huggingface_hub import login
api_token = 'use your own'
login(api_token)
    

In [None]:
# Train the detector; checkpoints and the final model go to the `trained_model` directory.
print("Training Start --------------")

fine_tune(
    train_df=train_df,
    valid_df=valid_df,
    checkpoints_path=trained_model,
    id2label=id2label,
    label2id=label2id,
    model_name=model,
    train_tem_args=train_tem_args,
    class_weights=class_weights,
)

print("Training Done --------------")

In [None]:
import shutil
import os

# Directory produced by training that should be archived
dir_to_zip = trained_model

# Last path component of the base checkpoint (currently not used in the archive name)
last_word = model.split('/')[-1]

# shutil.make_archive() appends the ".zip" extension itself, so build the base name
# first and derive the file name from it. Stripping ".zip" with str.replace() would
# also remove any ".zip" occurring inside the directory name.
archive_base = f"eng_{trained_model}"
output_zip = f"{archive_base}.zip"

# Create a zip file from the directory
shutil.make_archive(archive_base, 'zip', dir_to_zip)

print(f"Zipped contents of {dir_to_zip} into {output_zip}")

In [None]:
def test(test_df, model_path, id2label, label2id):
    """Predict class ids for ``test_df`` with a model loaded directly from ``model_path``.

    Note: a later cell of this notebook defines another ``test`` with an additional
    ``base_model_path`` parameter; once that cell has run, it replaces this function.

    Args:
        test_df: DataFrame whose ``text`` column is tokenized by ``preprocess_function``.
        model_path: Directory (or name) both the tokenizer and the model are loaded from.
        id2label: Mapping from class id to class name, stored in the model config.
        label2id: Mapping from class name to class id; its length is the number of labels.

    Returns:
        Array with the arg-max class id of every row.
    """
    # Tokenizer and model both come from the saved model directory.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True,
    )

    tokenized_test_dataset = Dataset.from_pandas(test_df).map(
        preprocess_function,
        batched=True,
        fn_kwargs={'tokenizer': tokenizer},
    )

    # A Trainer is only used here as a convenient batched prediction loop.
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
    )

    prediction_output = trainer.predict(tokenized_test_dataset)
    return np.argmax(prediction_output.predictions, axis=-1)

In [None]:
from peft import PeftModel, LoraConfig, get_peft_model
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForSequenceClassification, Trainer, DataCollatorWithPadding
import torch
import numpy as np
from datasets import Dataset

def preprocess_function(examples, **fn_kwargs):
    """Tokenize ``examples["text"]`` with the tokenizer passed as ``tokenizer=...``.

    Same contract as the ``preprocess_function`` defined earlier in this notebook:
    inputs are truncated to at most 512 tokens and the tokenizer's result is returned.
    """
    tokenizer_fn = fn_kwargs['tokenizer']
    batch_texts = examples["text"]
    encoded = tokenizer_fn(batch_texts, truncation=True, max_length=512)
    return encoded

def test(test_df, model_path, id2label, label2id, base_model_path=None):
    """Predict class ids for ``test_df`` with a fine-tuned LoRA adapter.

    Args:
        test_df: DataFrame whose ``text`` column is tokenized by ``preprocess_function``.
        model_path: Directory containing the saved LoRA adapter.
        id2label: Mapping from class id to class name, stored in the model config.
        label2id: Mapping from class name to class id; its length is the number of labels.
        base_model_path: Name or path of the base checkpoint the adapter was trained
            on. When omitted, it is read from the adapter's own configuration.

    Returns:
        Array with the arg-max class id of every row.
    """
    if base_model_path is None:
        # The adapter config records the base checkpoint it was trained on.
        from peft import PeftConfig
        base_model_path = PeftConfig.from_pretrained(model_path).base_model_name_or_path

    # Same 4-bit quantization settings as used for training.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,  # 4-bit quantization
        bnb_4bit_quant_type='nf4',  # NormalFloat4 quantization data type
        bnb_4bit_use_double_quant=True,  # double quantization
        bnb_4bit_compute_dtype=torch.bfloat16  # use bf16 for computation
    )

    # Load the base model with quantization config
    base_model = AutoModelForSequenceClassification.from_pretrained(
        base_model_path,
        quantization_config=quantization_config,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True
    )

    # Attach the saved adapter directly to the plain base model. The earlier version
    # first wrapped the model with a fresh get_peft_model() adapter, which nests two
    # PEFT wrappers; loading onto the base model is the documented PEFT flow.
    # The adapter is kept attached rather than merged.
    # NOTE(review): merging into 4-bit weights is lossy, and Trainer is expected to
    # reject a purely quantized model without adapters — confirm with the pinned versions.
    model = PeftModel.from_pretrained(base_model, model_path)

    # Load the tokenizer the same way fine_tune() does so that training and
    # inference see identical tokenization.
    tokenizer = AutoTokenizer.from_pretrained(base_model_path, add_prefix_space=True)

    # Reuse the EOS token for padding.
    tokenizer.pad_token_id = tokenizer.eos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

    # Convert the test DataFrame to a Hugging Face Dataset and tokenize it
    test_dataset = Dataset.from_pandas(test_df)
    tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer})
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # A Trainer is only used here as a convenient batched prediction loop.
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    # Perform inference and take the arg-max class per row
    predictions = trainer.predict(tokenized_test_dataset)
    preds = np.argmax(predictions.predictions, axis=-1)

    return preds


In [None]:
# test detector model
# fine_tune() saves the final model under "<output dir>/best", so the adapter is
# loaded from there rather than from the output directory itself.
best_model_path = os.path.join(trained_model, 'best')
predictions = test(test_df, best_model_path, id2label, label2id, model)

predictions_df = pd.DataFrame({'id': test_df.id, 'label': predictions})
predictions_df.to_json(prediction_path, lines=True, orient='records')

print("Prediction Done --------------")

# Prediction for score

In [None]:
# Load the labelled file used for scoring. This rebinds `test_df`, which until now
# held the frame loaded from `test_path`.
test_df = pd.read_json(scoring_path, lines=True)

# Only the id and text columns are handed to the model.
test_df_tem = test_df.loc[:, ['id', 'text']]

# Predict on every row. To work on a random 10% subset instead, use:
# num_samples = int(len(test_df_tem) * 0.1)
# sampled_df = test_df_tem.sample(n=num_samples, random_state=42)
sampled_df = test_df_tem

sampled_df

In [None]:
print("Predicting for score --------------")
# fine_tune() saves the final model under "<output dir>/best".
best_model_path = os.path.join(trained_model, 'best')

# The `test` function defined most recently takes the base checkpoint as its fifth
# argument; calling it with four arguments raised a TypeError.
predictions = test(sampled_df, best_model_path, id2label, label2id, model)

print(len(predictions),len(sampled_df))
predictions_df = pd.DataFrame({'id': sampled_df.id, 'label': predictions})
predictions_df.to_json("score_df.jsonl", lines=True, orient='records')

print("Predicting for score Done --------------")

# Format Checking

In [None]:
import os
import argparse
import logging
import json
import pandas as pd


# logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
# Columns every prediction file must provide.
COLUMNS = ['id', 'label']


def check_format(file_path):
  """Check that ``file_path`` is a JSON-lines prediction file in the expected format.

  The file must exist, be readable as JSON lines, contain the columns listed in
  ``COLUMNS`` without missing values, and use only the labels 0 and 1.

  Args:
    file_path: Path of the prediction file to check.

  Returns:
    ``True`` when all checks pass, otherwise ``False`` (the reason is logged as an error).
  """
  if not os.path.exists(file_path):
    logging.error("File doesnt exists: {}".format(file_path))
    return False

  try:
    submission = pd.read_json(file_path, lines=True)[COLUMNS]
  except Exception:
    # Covers unparsable content as well as missing columns. `Exception` (rather than
    # a bare except) lets KeyboardInterrupt/SystemExit propagate.
    logging.error("File is not a valid json file: {}".format(file_path))
    return False

  for column in COLUMNS:
    if submission[column].isna().any():
      logging.error("NA value in file {} in column {}".format(file_path, column))
      return False

  # Only the two class ids 0 and 1 are valid labels.
  if not submission['label'].isin(range(0, 2)).all():
    logging.error("Unknown Label in file {}".format(file_path))
    logging.error("Unique Labels in the file are {}".format(submission['label'].unique()))
    return False

  return True


    
# Validate the prediction file written by the prediction cell.
pred_file_path = prediction_path

check_result = check_format(pred_file_path)
if check_result:
  result = 'Format is correct'
else:
  result = 'Something wrong in file format'
print(result)

# Scoring

In [None]:
import logging.handlers
import argparse
from sklearn.metrics import f1_score, accuracy_score
import pandas as pd
import sys
# sys.path.append('.')
# from format_checker import check_format


def _load_labels(source):
  """Return the ``id``/``label`` columns of a DataFrame or of a JSON-lines file path."""
  frame = source if isinstance(source, pd.DataFrame) else pd.read_json(source, lines=True)
  return frame[['id', 'label']]


# NOTE: this function shares its name with the `evaluate` package imported at the top
# of the notebook; after this cell has run, `evaluate` refers to this function.
def evaluate(pred_fpath, gold_fpath):
  """Score predictions against gold labels.

  Args:
    pred_fpath: Predictions, either a DataFrame with ``id`` and ``label`` columns or
      the path of a JSON-lines file with those columns.
    gold_fpath: Gold labels, in the same two accepted forms.

  Returns:
    Tuple ``(macro_f1, micro_f1, accuracy)`` computed over the ids present in both inputs.

  Raises:
    ValueError: If predictions and gold labels have no id in common.
  """
  pred_labels = _load_labels(pred_fpath)
  gold_labels = _load_labels(gold_fpath)

  # Inner join on id; the two label columns become label_pred / label_gold.
  merged_df = pred_labels.merge(gold_labels, on=['id'], suffixes=('_pred', '_gold'))
  if merged_df.empty:
    raise ValueError("No overlapping ids between predictions and gold labels; cannot score.")

  macro_f1 = f1_score(merged_df['label_gold'], merged_df['label_pred'], average="macro", zero_division=0)
  micro_f1 = f1_score(merged_df['label_gold'], merged_df['label_pred'], average="micro", zero_division=0)
  accuracy = accuracy_score(merged_df['label_gold'], merged_df['label_pred'])

  return macro_f1, micro_f1, accuracy


def validate_files(pred_files):
  """Return ``True`` when ``pred_files`` passes ``check_format``; otherwise log an error and return ``False``."""
  format_ok = check_format(pred_files)
  if format_ok:
    return True
  logging.error('Bad format for pred file {}. Cannot score.'.format(pred_files))
  return False


# Despite their names, both variables hold DataFrames here: the predictions produced
# for the scoring set and the labelled scoring frame.
pred_file_path = predictions_df
gold_file_path = test_df

logging.info('Prediction file format is correct')
scores = evaluate(pred_fpath=pred_file_path, gold_fpath=gold_file_path)
macro_f1, micro_f1, accuracy = scores
summary = "macro-F1={:.5f}\tmicro-F1={:.5f}\taccuracy={:.5f}".format(macro_f1, micro_f1, accuracy)
logging.info(summary)
print(macro_f1, micro_f1, accuracy)

In [None]:
# Show the scores computed by the scoring cell as this cell's output.
macro_f1, micro_f1, accuracy