In [None]:
# Shell out to `nvidia-smi` to show the GPU/driver status before any model is loaded.
!nvidia-smi

In [6]:
from datasets import load_dataset

# Load the "fake_news_filipino" dataset by name; later cells read its
# "train" split and its "article" text column.
dataset = load_dataset("fake_news_filipino")

In [7]:
# Hold out 20% of the original "train" split for evaluation. The fixed seed
# makes the shuffle reproducible, so every rerun is scored on the same rows.
# NOTE: this rebinds `dataset`; re-running only this cell would split the
# already-reduced train split again, so re-run the loading cell first.
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType
import torch
from datasets import load_dataset
import os
from torch.utils.data import DataLoader
from tqdm import tqdm

In [9]:
# Load the Falcon-7B-Instruct tokenizer
from transformers import AutoTokenizer, DataCollatorWithPadding
# Same checkpoint id that the config/model cells further down use.
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct", add_prefix_space=True)
# Pad on the right and reuse the EOS token as the padding token.
# NOTE(review): presumably the checkpoint ships no dedicated pad token — confirm.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): set as a plain attribute after construction; confirm this
# tokenizer class actually honours `add_eos_token`.
tokenizer.add_eos_token = True
# Raw text column to drop from the dataset once it has been tokenized.
col_to_delete = ['article']

def preprocessing_function(examples, max_length=128):
    """Tokenize the ``'article'`` texts of a batch of examples.

    Args:
        examples: Mapping with an ``'article'`` entry holding the text(s) to
            tokenize (the batch handed over by ``dataset.map(..., batched=True)``).
        max_length: Maximum number of tokens kept per text; longer texts are
            truncated. Defaults to 128, the value that was previously
            hard-coded, so existing one-argument callers behave the same.

    Returns:
        Whatever the module-level ``tokenizer`` returns for these texts.
    """
    return tokenizer(examples['article'], truncation=True, max_length=max_length)

# Tokenize every split in batches and drop the raw text column listed in `col_to_delete`.
tokenized_datasets = dataset.map(preprocessing_function, batched=True, remove_columns=col_to_delete)
# Ask the datasets to return torch tensors when rows are accessed.
tokenized_datasets.set_format("torch")

# Data collator for padding a batch of examples to the maximum length seen in the batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/2564 [00:00<?, ? examples/s]

Map:   0%|          | 0/642 [00:00<?, ? examples/s]

In [10]:
from transformers import AutoConfig
# Checkpoint id reused by the config here and by the model load in a later cell.
model_name="tiiuae/falcon-7b-instruct"
# Fetch the checkpoint's configuration so fields can be overridden before the model is built.
config = AutoConfig.from_pretrained(model_name)
# Bare last expression: the notebook echoes the loaded config for inspection.
config

FalconConfig {
  "_name_or_path": "tiiuae/falcon-7b-instruct",
  "alibi": false,
  "apply_residual_connection_post_layernorm": false,
  "architectures": [
    "FalconForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "tiiuae/falcon-7b-instruct--configuration_falcon.FalconConfig",
    "AutoModel": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconModel",
    "AutoModelForCausalLM": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconForCausalLM",
    "AutoModelForQuestionAnswering": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconForQuestionAnswering",
    "AutoModelForSequenceClassification": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconForSequenceClassification",
    "AutoModelForTokenClassification": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconForTokenClassification"
  },
  "bias": false,
  "bos_token_id": 11,
  "eos_token_id": 11,
  "hidden_dropout": 0.0,
  "hidden_size": 4544,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "max

In [11]:
# Overrides applied to the loaded config before the model is instantiated.
# Entries are assigned in the listed order; `setattr(config, name, value)` is
# the same operation as writing `config.<name> = <value>`.
config_overrides = {
    "_name_or_path": model_name,
    "hidden_size": 4544,
    "num_hidden_layers": 32,
    "n_head": 71,
    "num_labels": 2,
    "pad_token_id": tokenizer.pad_token_id,
    "hidden_dropout": 0.1,
    # `transform` and `text` are presumably read by the falconSKT model
    # classes — TODO confirm against that module.
    "transform": False,
    # NOTE(review): the wording says positive/negative while the dataset is
    # named fake_news_filipino — confirm this prompt text is intended.
    "text": 'Classify the text as positive or negative, text:',
}
for field_name, field_value in config_overrides.items():
    setattr(config, field_name, field_value)

In [12]:
from falconSKT import  PrefixForSequenceClassification, PromptForSequenceClassification
# Instantiate the classifier from the Falcon checkpoint using the customised config.
# NOTE(review): the class comes from the local `falconSKT` module, so what it does with the
# extra config fields is not visible here. The recorded output reports `score.*` and
# `prompt_encoder.embedding.weight` as newly initialised (not present in the checkpoint).
model = PrefixForSequenceClassification.from_pretrained(
    model_name,
    config=config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Prefix sequence length:  11


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of PrefixForSequenceClassification were not initialized from the model checkpoint at tiiuae/falcon-7b-instruct and are newly initialized: ['score.bias', 'prompt_encoder.embedding.weight', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Size of the whole model, as reported by the model itself.
total_parameters = model.num_parameters()

# Add up the element counts of every tensor that is flagged to receive gradients.
trainable_parameters = 0
for weight in model.parameters():
    if weight.requires_grad:
        trainable_parameters += weight.numel()

# Trainable share of the model, in percent.
percentage_trainable = (trainable_parameters / total_parameters) * 100

for report_line in (
    f"Total Parameters: {total_parameters}",
    f"Trainable Parameters: {trainable_parameters}",
    f"Percentage Trainable: {percentage_trainable:.20f}%",
):
    print(report_line)

Total Parameters: 6921779778
Trainable Parameters: 59074
Percentage Trainable: 0.00085345101830253578%


In [15]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import mean_squared_error
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback, IntervalStrategy
from sklearn.metrics import r2_score, accuracy_score, matthews_corrcoef
import numpy as np

def compute_metrics(p):
    """Compute classification accuracy for one evaluation pass.

    Args:
        p: Evaluation result exposing ``predictions`` (the class logits, or a
            tuple whose first element is the logits) and ``label_ids`` (the
            reference class ids).

    Returns:
        dict: ``{"acc": accuracy}`` where accuracy is the fraction of examples
        whose arg-max class equals the label.
    """
    logits = p.predictions
    # When a model returns extra outputs next to the logits, the predictions
    # arrive as a tuple; arg-maxing the tuple itself would be wrong.
    # Assumes the logits are its first element (usual transformers layout).
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)
    labels = p.label_ids
    return {"acc": accuracy_score(labels, preds)}

# Training configuration. Learning rate, mixed precision and optimizer are
# left at the library defaults (earlier experiments with them were disabled).
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./rfalcon_task_prefix_sk',
    logging_dir='./logs',

    # Run length and per-device batch sizes.
    num_train_epochs=15,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,

    # Log, evaluate and checkpoint on the same 100-step cadence.
    logging_strategy="steps",
    logging_steps=100,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,

    # Keep at most two checkpoints on disk and reload the best one when training ends.
    save_total_limit=2,
    load_best_model_at_end=True,
)

In [16]:
# Early stopping with a patience of 7 evaluation rounds.
early_stopping = EarlyStoppingCallback(early_stopping_patience=7)

# Wire model, data, metric and callback into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# NOTE(review): the recorded run warned that the best checkpoint's
# pytorch_model.bin could not be located, so the best model may not have been
# reloaded at the end — check what the checkpoint directories contain.
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Acc
100,0.81,0.570703,0.778816
200,0.5468,0.477829,0.82243
300,0.4729,0.43327,0.841121
400,0.4385,0.410608,0.858255
500,0.4236,0.396386,0.85514
600,0.4065,0.380426,0.861371
700,0.4079,0.370605,0.864486
800,0.3841,0.364172,0.84891
900,0.3804,0.361236,0.870717
1000,0.3731,0.350219,0.876947


Could not locate the best model at ./rfalcon_task_prefix_sk/checkpoint-1900/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=1935, training_loss=0.4078879600347475, metrics={'train_runtime': 9928.1168, 'train_samples_per_second': 3.874, 'train_steps_per_second': 0.195, 'total_flos': 1.9572171877131264e+17, 'train_loss': 0.4078879600347475, 'epoch': 15.0})

In [None]:
# Second `train()` call on the same `trainer` object in the same session.
# NOTE(review): the recorded losses here start below the values logged in the
# first run, which suggests this continues from the already-trained weights
# instead of starting fresh — confirm that is intended.
trainer.train()

Step,Training Loss,Validation Loss,Acc
100,0.3315,0.320502,0.884735
200,0.351,0.314951,0.884735
300,0.3159,0.31258,0.889408
400,0.3089,0.308553,0.889408
500,0.312,0.307433,0.88785
600,0.3021,0.298492,0.889408
700,0.3184,0.293765,0.890966
800,0.291,0.287614,0.894081
900,0.2943,0.291719,0.889408
1000,0.293,0.285068,0.897196
