In [1]:
import os
# Restrict this process to physical GPUs 0, 2 and 3. Kept above the torch
# import below -- presumably so the setting is in place before CUDA is
# initialised (TODO confirm nothing earlier touches CUDA).
os.environ["CUDA_VISIBLE_DEVICES"] = "0, 2, 3"

import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from torch.utils.data import Dataset
import logging

from datasets import load_dataset

from datasets import load_dataset

# Source files: CompLex single-word lexical complexity data, tab-separated.
url = "https://raw.githubusercontent.com/MMU-TDMLab/CompLex/master/train/lcp_single_train.tsv"
test_url = "https://raw.githubusercontent.com/MMU-TDMLab/CompLex/refs/heads/master/test-labels/lcp_single_test.tsv"


def _load_tsv(source):
    """Load one tab-separated file through the generic `csv` dataset builder."""
    return load_dataset("csv", data_files=source, delimiter="\t")


# Each call yields a DatasetDict whose only split is named 'train'.
train_data = _load_tsv(url)
print(train_data)

# Note: the data used for evaluation comes from the test-labels file.
val_data = _load_tsv(test_url)
print(val_data)




DatasetDict({
    train: Dataset({
        features: ['id', 'corpus', 'sentence', 'token', 'complexity'],
        num_rows: 7232
    })
})
DatasetDict({
    train: Dataset({
        features: ['id', 'corpus', 'sentence', 'token', 'complexity'],
        num_rows: 887
    })
})


In [2]:

from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig
#from roberta import RobertaForSequenceClassification

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
from huggingface_hub import login


# Authenticate with the Hugging Face Hub so the checkpoint below can be fetched.
# The access token is read from the HF_TOKEN environment variable instead of
# being written into the notebook; when the variable is unset, login() receives
# None and asks for the token interactively.
# NOTE(review): the token that used to be hard-coded here has been exposed and
# should be revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

model_name ='meta-llama/Llama-2-7b-hf'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the padding token (presumably because this tokenizer ships
# without one -- TODO confirm).
tokenizer.pad_token = tokenizer.eos_token
# Upper bound on sequence length reported by the tokenizer.
tokenizer.model_max_length = 1000



In [3]:
from datasets import DatasetDict


def generate_prompt(data_point) -> str:
    """
    Build the instruction prompt asking for the lexical complexity of a word.

    The returned text ends with "The complexity score is " (trailing space
    included); the score itself is not part of the prompt.

    Args:
        data_point (dict): A mapping containing 'sentence' (the context
            sentence) and 'token' (the target word in that sentence).
    Returns:
        str: The formatted prompt as a string.
    """
    # The line break and the leading spaces inside the literal are part of the
    # prompt text, so the layout below is kept exactly as it was.
    return f"""### Instruction: Given a sentence and a word in this sentence, you need to calculate a score between 0 and 1 to represent the lexical
complexity of this word.
            ### Sentence: {data_point['sentence']}
            ### Word: {data_point['token']}
            ### Output: The complexity score is """  # noqa: E501


def add_label_column(example):
    """
    Add the fields used for training to one dataset row.

    Written to be passed to `Dataset.map`. The row is updated in place and
    returned with three extra keys:
        'labels': the 'complexity' value converted to float,
        'output': the string form of that float,
        'input':  the prompt built by `generate_prompt` for this row.

    Args:
        example: A row providing 'complexity', 'sentence' and 'token'.
    Returns:
        The same row object, with 'labels', 'output' and 'input' set.
    """
    score = float(example['complexity'])

    example['labels'] = score
    example['output'] = str(score)
    example['input'] = generate_prompt(example)
    return example

# Select the single 'train' split produced by load_dataset("csv", ...) and add
# the label/prompt columns. The isinstance guards keep this cell re-runnable:
# after the first run these names already hold a Dataset, and indexing that
# with ['train'] again would be treated as a (missing) column lookup.
if isinstance(train_data, DatasetDict):
    train_data = train_data['train']
if isinstance(val_data, DatasetDict):
    val_data = val_data['train']

train_data = train_data.map(add_label_column)
val_data = val_data.map(add_label_column)

In [4]:
# Display the mapped training split; 'labels', 'output' and 'input' should now
# appear among its features.
train_data

Dataset({
    features: ['id', 'corpus', 'sentence', 'token', 'complexity', 'labels', 'output', 'input'],
    num_rows: 7232
})

In [5]:
# Spot-check training row 2: prompt, numeric label, and target string.
print(*(train_data[column][2] for column in ('input', 'labels', 'output')))

### Instruction: Given a sentence and a word in this sentence, you need to calculate a score between 0 and 1 to represent the lexical
complexity of this word.
            ### Sentence: The man, the lord of the land, said to us, 'By this I will know that you are honest men: leave one of your brothers with me, and take grain for the famine of your houses, and go your way.
            ### Word: brothers
            ### Output: The complexity score is  0.0499999999999999 0.0499999999999999


In [6]:
# Spot-check evaluation row 2: prompt, numeric label, and target string.
print(*(val_data[column][2] for column in ('input', 'labels', 'output')))

### Instruction: Given a sentence and a word in this sentence, you need to calculate a score between 0 and 1 to represent the lexical
complexity of this word.
            ### Sentence: the ten sons of Haman the son of Hammedatha, the Jew's enemy, but they didn't lay their hand on the plunder.
            ### Word: hand
            ### Output: The complexity score is  0.2 0.2


In [7]:
from generator.modeling import PredictorCausalLM
from generator.collator import DataCollator
from generator import metrics
from generator.training import GenTrainer

In [8]:
# Start from the checkpoint's own configuration, then attach the extra fields
# (presumably read by the project-local PredictorCausalLM -- their meaning is
# defined there, not in this notebook).
config = AutoConfig.from_pretrained(model_name)
config.update(
    {
        "dense_representation": 10,
        # Same id the tokenizer uses for padding.
        "pad_token_id": tokenizer.pad_token_id,
        # NOTE(review): attribute name kept exactly as spelled; confirm it
        # matches what the model code looks up.
        "nub_of_token_generation": 59,
    }
)


In [9]:
# Build the project-local model from the customised config with one output label.
# NOTE(review): the recorded "Loading checkpoint shards" output suggests the
# constructor loads pretrained weights itself -- confirm in generator.modeling.
model = PredictorCausalLM(config, num_labels=1)  # Instantiate model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
import leader

# Apply the project-local `leader` PEFT routine to the model (method='column', rank=3).
# NOTE(review): the return value is discarded, so this presumably modifies
# `model` in place -- confirm in the `leader` module.
leader.PEFT(model, method='column', rank=3)

In [11]:
# Collator from the project-local `generator` package, given the tokenizer
# configured earlier.
# NOTE(review): presumably tokenises and pads the 'input'/'output' text per
# batch -- confirm in generator.collator.
data_collator = DataCollator(tokenizer=tokenizer)

In [12]:
from transformers import TrainingArguments, Trainer
from generator import metrics
import time
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir='dir',
    # --- optimisation ---
    learning_rate=2e-3,
    weight_decay=0.0,
    lr_scheduler_type="cosine",
    warmup_steps=200,
    num_train_epochs=10,
    # --- batching: one example per device, gradients accumulated over 5 steps ---
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=5,
    # Keep the extra dataset columns so they reach the data collator.
    remove_unused_columns=False,
    # --- logging / evaluation: no eval_steps is given, so evaluation should
    # follow the logging interval ---
    logging_steps=400,
    evaluation_strategy="steps",
    # --- checkpointing: switched off; the two save_* values below should
    # therefore have no effect ---
    save_strategy="no",
    save_total_limit=2,
    save_steps=40_000_000,
    load_best_model_at_end=False,
)

# Metric callable from the project-local `generator.metrics` module.
compute_metrics = metrics.RegressionMetrics(tokenizer)

trainer = GenTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
    # Project-specific GenTrainer option; see generator.training for its meaning.
    max_steps_for_sampling=500,
)




In [13]:
# Launch fine-tuning. The recorded run took roughly 15 hours
# (train_runtime of about 54,100 s in the saved output) and, with
# save_strategy="no", no checkpoint is configured to be written.
trainer.train()



Step,Training Loss,Validation Loss,Mae,Mse,Rmse,Accuracy,R2,Pearson,Spearman's rank
400,79.7121,9.965739,0.097247,0.017406,0.131931,0.006764,-0.06147,0.018561,-0.046027
800,47.5377,9.594646,0.088437,0.01378,0.11739,0.05186,0.159616,0.597022,0.566815
1200,45.5042,9.54204,0.075278,0.009871,0.099355,0.040586,0.397997,0.684174,0.478469
1600,43.8724,9.600289,0.070124,0.007949,0.089158,0.048478,0.515224,0.749862,0.680958
2000,42.4219,9.719041,0.068569,0.007534,0.086796,0.015784,0.54057,0.759382,0.683227
2400,41.2421,9.649955,0.068173,0.007657,0.087506,0.03044,0.533026,0.734873,0.572139
2800,37.1952,10.08499,0.067014,0.006998,0.083656,0.016911,0.573218,0.758334,0.650813
3200,34.605,10.445763,0.069073,0.007375,0.085878,0.012401,0.550236,0.748356,0.659606
3600,31.3672,11.599126,0.065961,0.006889,0.083002,0.011274,0.579856,0.765728,0.683899
4000,28.8334,12.66724,0.068141,0.007356,0.085767,0.007892,0.551399,0.771027,0.703842




TrainOutput(global_step=4820, training_loss=40.3579042680036, metrics={'train_runtime': 54099.7089, 'train_samples_per_second': 1.337, 'train_steps_per_second': 0.089, 'total_flos': 260047393474560.0, 'train_loss': 40.3579042680036, 'epoch': 9.980920779759437})

In [14]:
# Inspect GPU memory and utilisation after training. The tokenizers warning in
# the recorded output is emitted when this shell command forks the process; per
# that warning, setting TOKENIZERS_PARALLELISM explicitly avoids it.
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Fri Jan 24 17:33:00 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.216.03             Driver Version: 535.216.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA H100 80GB HBM3          On  | 00000000:05:00.0 Off |                    0 |
| N/A   36C    P0             184W / 700W |  76361MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA H100 80GB HBM3          On  | 00000000:06:00.0 Off |  

In [15]:
# Second GPU status snapshot, taken one second after the first according to the
# recorded timestamps.
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Fri Jan 24 17:33:01 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.216.03             Driver Version: 535.216.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA H100 80GB HBM3          On  | 00000000:05:00.0 Off |                    0 |
| N/A   36C    P0             119W / 700W |  76361MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA H100 80GB HBM3          On  | 00000000:06:00.0 Off |  