In [None]:
# Show the GPU(s) visible to this runtime before loading a 7B-parameter model.
!nvidia-smi

In [None]:
# Print the version of the installed CUDA compiler.
!nvcc --version

In [None]:
!pip install transformers accelerate datasets scikit-learn sentencepiece

In [None]:
from datasets import load_dataset

# Load the `fake_news_filipino` dataset; only its "train" split is used in the next cell.
dataset = load_dataset("fake_news_filipino")

In [None]:
# Hold out 20% of the original "train" split for evaluation.
# A fixed seed keeps the split (and therefore every metric reported below) reproducible.
# Note: `dataset` is rebound here, so re-running this cell alone would split the
# already-reduced train set again; re-run the loading cell first.
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

In [None]:
# Inspect the resulting train/test splits and their row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'article'],
        num_rows: 2564
    })
    test: Dataset({
        features: ['label', 'article'],
        num_rows: 642
    })
})

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType
import torch
from datasets import load_dataset
import os
from torch.utils.data import DataLoader
from tqdm import tqdm

In [None]:
# Load the Falcon-7B-Instruct tokenizer (same checkpoint name as the model loaded further down).
from transformers import AutoTokenizer, DataCollatorWithPadding
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct", add_prefix_space=True)
# Pad on the right and reuse the EOS token as the padding token.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): `add_eos_token` is an option of Llama-style tokenizers; confirm that
# setting it has any effect on the tokenizer class loaded for Falcon.
tokenizer.add_eos_token = True
# Raw-text column to drop once it has been tokenized.
col_to_delete = ['article']

def preprocessing_function(examples, max_length=128):
    """Tokenize the ``article`` text of a batch of examples.

    Args:
        examples: Batch passed in by ``dataset.map(..., batched=True)``; only
            its ``'article'`` entry is read.
        max_length: Maximum number of tokens kept per article; longer inputs
            are truncated. Defaults to 128, the value that used to be
            hard-coded, so existing callers behave exactly as before.

    Returns:
        The result of calling the module-level ``tokenizer`` on the batch.
        No padding is requested here; batches are padded later by the data
        collator.
    """
    return tokenizer(examples['article'], truncation=True, max_length=max_length)

# Tokenize both splits in batches and drop the raw text column afterwards.
tokenized_datasets = dataset.map(preprocessing_function, batched=True, remove_columns=col_to_delete)
# Have the datasets return torch tensors when indexed.
tokenized_datasets.set_format("torch")

# Data collator for padding a batch of examples to the maximum length seen in the batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/2564 [00:00<?, ? examples/s]

Map:   0%|          | 0/642 [00:00<?, ? examples/s]

In [None]:
# Check which columns remain after tokenization.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 2564
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 642
    })
})

In [None]:
from transformers import AutoConfig
# Checkpoint name used for the configuration here and for the model weights further down.
model_name="tiiuae/falcon-7b-instruct"
config = AutoConfig.from_pretrained(model_name)
# Display the loaded configuration.
config

FalconConfig {
  "_name_or_path": "tiiuae/falcon-7b-instruct",
  "alibi": false,
  "apply_residual_connection_post_layernorm": false,
  "architectures": [
    "FalconForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "tiiuae/falcon-7b-instruct--configuration_falcon.FalconConfig",
    "AutoModel": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconModel",
    "AutoModelForCausalLM": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconForCausalLM",
    "AutoModelForQuestionAnswering": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconForQuestionAnswering",
    "AutoModelForSequenceClassification": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconForSequenceClassification",
    "AutoModelForTokenClassification": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconForTokenClassification"
  },
  "bias": false,
  "bos_token_id": 11,
  "eos_token_id": 11,
  "hidden_dropout": 0.0,
  "hidden_size": 4544,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "max

In [None]:
# Adjust the loaded configuration before it is passed to the model constructor.
config._name_or_path=model_name
# NOTE(review): hidden_size=4544 already appears in the configuration printed above;
# these three assignments look like restatements of the checkpoint's own values.
# Confirm they are required by the falconSKT model classes.
config.hidden_size=4544
config.num_hidden_layers=32
config.n_head=71
# Two output classes for the classification head.
config.num_labels=2
# Padding id taken from the tokenizer (whose pad token was set to its EOS token).
config.pad_token_id=tokenizer.pad_token_id
config.hidden_dropout = 0.1
# `transform` and `text` are presumably custom options consumed by the falconSKT
# prompt model (the load log reports "Prompt sequence length: 11") — TODO confirm.
config.transform=False
# NOTE(review): the wording says "positive or negative" while the dataset is
# `fake_news_filipino`; confirm the prompt text matches the intended task.
config.text='classify the text as positive or negative, text:'

In [None]:
from falconSKT import  PrefixForSequenceClassification, PromptForSequenceClassification
# Build the prompt-based sequence classifier from the pretrained Falcon checkpoint
# using the adjusted config. Per the load log, `score.*` and
# `prompt_encoder.embedding.weight` are not in the checkpoint and start freshly initialized.
model = PromptForSequenceClassification.from_pretrained(
    model_name,
    config=config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Prompt sequence length:  11


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of PromptForSequenceClassification were not initialized from the model checkpoint at tiiuae/falcon-7b-instruct and are newly initialized: ['score.weight', 'prompt_encoder.embedding.weight', 'score.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Size of the whole model versus the part that will actually receive gradients.
total_parameters = model.num_parameters()
trainable_parameters = sum(
    weight.numel() for weight in model.parameters() if weight.requires_grad
)

# Trainable share expressed as a percentage of all parameters.
percentage_trainable = (trainable_parameters / total_parameters) * 100

# One line per figure; the percentage is printed with 20 decimals because it is tiny.
print(
    f"Total Parameters: {total_parameters}",
    f"Trainable Parameters: {trainable_parameters}",
    f"Percentage Trainable: {percentage_trainable:.20f}%",
    sep="\n",
)

Total Parameters: 6921779778
Trainable Parameters: 59074
Percentage Trainable: 0.00085345101830253578%


In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import mean_squared_error
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback, IntervalStrategy
from sklearn.metrics import r2_score, accuracy_score, matthews_corrcoef
import numpy as np

def compute_metrics(p):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        p: Evaluation result exposing ``predictions`` (the class logits, or a
            tuple whose first element is the logits) and ``label_ids``.

    Returns:
        dict: ``{"acc": accuracy}``, where accuracy is the fraction of examples
        whose arg-max class equals the label.
    """
    logits = p.predictions
    # When the model returns more than the logits, the predictions arrive as a
    # tuple; taking argmax over the whole tuple would be wrong, so keep the logits only.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)
    labels = p.label_ids

    return {"acc": accuracy_score(labels, preds)}

# Training configuration: 10 epochs with batches of 10, and logging, evaluation and
# checkpointing all on the same 100-step cadence. Learning rate, precision and
# optimizer are not passed, so the library defaults apply.
training_args = TrainingArguments(
    # Output locations
    output_dir='./rfalcon_task_prompt',
    logging_dir='./logs',
    # Schedule and batch sizes
    num_train_epochs=10,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    # Log, evaluate and checkpoint every 100 steps
    logging_strategy="steps",
    logging_steps=100,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    # Keep at most two checkpoints and reload the best one when training ends
    save_total_limit=2,
    load_best_model_at_end=True,
)

In [None]:
# Confirm the columns and size of the split used for evaluation.
tokenized_datasets['test']

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 642
})

In [None]:
# Wire the model, data, metric and early stopping into a Trainer, then train.
trainer = Trainer(
    model=model,

    args=training_args,
    train_dataset=tokenized_datasets['train'],
    # NOTE(review): the 'test' split also drives early stopping and best-checkpoint
    # selection, so scores on it are not an unbiased held-out estimate.
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics, # reports accuracy under the key "acc"
    # Early stopping with a patience of 7 evaluations.
    callbacks = [EarlyStoppingCallback(early_stopping_patience=7)],
    data_collator=data_collator,
)

trainer.train()

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Acc
100,0.746,0.609614,0.672897
200,0.5631,0.517491,0.830218
300,0.5181,0.491138,0.766355
400,0.4812,0.452652,0.827103
500,0.4507,0.436989,0.84891
600,0.4246,0.414144,0.858255
700,0.423,0.404807,0.825545
800,0.4153,0.399259,0.825545
900,0.4111,0.380693,0.853583
1000,0.399,0.374257,0.864486


Could not locate the best model at ./rfalcon_task_prompt/checkpoint-2500/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=2570, training_loss=0.4115899720544481, metrics={'train_runtime': 8794.6864, 'train_samples_per_second': 2.915, 'train_steps_per_second': 0.292, 'total_flos': 1.3048114584754176e+17, 'train_loss': 0.4115899720544481, 'epoch': 10.0})

In [None]:
# Second training pass on the same `trainer`. The logged loss starts well below the
# first run's, i.e. it continues from the already-trained weights rather than from scratch.
# NOTE(review): with Restart & Run All this trains twice — confirm that is intended.
trainer.train()

Step,Training Loss,Validation Loss,Acc
100,0.3493,0.328621,0.886293
200,0.3564,0.32157,0.889408
300,0.3663,0.315313,0.897196
400,0.3336,0.312389,0.897196
500,0.3314,0.319523,0.880062
600,0.3064,0.309928,0.890966
700,0.3417,0.30319,0.88785
800,0.3275,0.307578,0.875389
900,0.3289,0.295829,0.903427
1000,0.317,0.294327,0.897196
