<a href="https://colab.research.google.com/github/Jeremy-su1/ai-algorithm/blob/main/CLASSIFICATION1_basic_LoRA_shkim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Install the Hugging Face libraries that the cells below import (-q keeps pip output quiet).
!pip install -q transformers
!pip install -q peft
!pip install -q evaluate

# AG NEWS DataSet Label 추가

In [6]:
from datasets import load_dataset, ClassLabel, DatasetDict

# Load AG News and show the label schema it ships with.
agnews_dataset = load_dataset("fancyzhx/ag_news")
print(agnews_dataset["train"].features)

# Extend the existing label names with one extra class, "StackOverFlow".
new_labels = [*agnews_dataset["train"].features["label"].names, "StackOverFlow"]
new_class_label = ClassLabel(names=new_labels)

# Build a copy of the train features whose 'label' column uses the extended
# ClassLabel, then cast the dataset to that schema.
updated_features = agnews_dataset["train"].features.copy()
updated_features["label"] = new_class_label
agnews_dataset = agnews_dataset.cast(updated_features)

# Show the schema again to confirm the new class is present.
print(agnews_dataset["train"].features)



{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'], id=None)}
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech', 'StackOverFlow'], id=None)}


# TrainSet에 데이터 추가

In [7]:
from datasets import Dataset, DatasetDict
from datasets import load_dataset, concatenate_datasets
from datasets import ClassLabel, Value

# Load the Stack Overflow training CSV; a single data file is exposed as the 'train' split.
sof_dataset_train = load_dataset(
    'csv',
    data_files='/content/drive/MyDrive/LLMEmbed/convert_train.csv',
    split='train'
)

# Five-class label schema: the four AG News classes plus 'StackOverFlow'.
class_label = ClassLabel(num_classes=5, names=['World', 'Sports', 'Business', 'Sci/Tech', 'StackOverFlow'])

# Take 30,000 rows after a seeded shuffle.
sof_train_samples = sof_dataset_train.shuffle(seed=42).select(range(30000))
# Copy 'Concat_Text' into 'text', set every label to the 'StackOverFlow' class id,
# and drop the original CSV columns.
sof_train_samples = sof_train_samples.map(lambda example: {'text': example['Concat_Text'], 'label': class_label.str2int('StackOverFlow')}, remove_columns=['Id', 'Title', 'Body', 'Tags', 'CreationDate', 'Y', 'Cleaned_tag ', 'Cleaned_tag', 'Array_Tag', 'Filtered_Array_Tag', 'Filtered_Tag', 'Concat_Text'])

# Cast the integer 'label' column to the ClassLabel feature so its schema
# matches the AG News dataset before concatenation.
sof_train_samples = sof_train_samples.cast_column('label', class_label)

# Append the Stack Overflow samples to the AG News training split.
combined_train_dataset = concatenate_datasets([agnews_dataset["train"], sof_train_samples])

# Inspect the result.
print(combined_train_dataset)

# Count examples per label id, starting every class at zero so that a class
# with no examples still appears in the printed summary.
label_counts = combined_train_dataset.features['label'].num_classes
label_counter = {label: 0 for label in range(label_counts)}

# Read only the 'label' column rather than iterating over whole rows, so the
# text of every example is not materialised just to count labels.
for label in combined_train_dataset['label']:
    label_counter[label] += 1

print(label_counter)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/30000 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 150000
})
{0: 30000, 1: 30000, 2: 30000, 3: 30000, 4: 30000}


# TestSet에 데이터 추가

In [8]:
# Load the Stack Overflow validation CSV; a single data file is exposed as the 'train' split.
sof_dataset_test = load_dataset(
    'csv',
    data_files='/content/drive/MyDrive/LLMEmbed/convert_valid.csv',
    split='train'
)
# Take 1,900 rows after a seeded shuffle.
sof_test_samples = sof_dataset_test.shuffle(seed=42).select(range(1900))
# Copy 'Concat_Text' into 'text', set every label to the 'StackOverFlow' class id,
# and drop the original CSV columns.
sof_test_samples = sof_test_samples.map(lambda example: {'text': example['Concat_Text'], 'label': class_label.str2int('StackOverFlow')}, remove_columns=['Id', 'Title', 'Body', 'Tags', 'CreationDate', 'Y', 'Cleaned_tag ', 'Cleaned_tag', 'Array_Tag', 'Filtered_Array_Tag', 'Filtered_Tag', 'Concat_Text'])

# Cast the 'label' column of sof_test_samples to the ClassLabel feature so its
# schema matches the AG News dataset before concatenation.
sof_test_samples = sof_test_samples.cast_column('label', class_label)

# Append the Stack Overflow samples to the AG News test split.
combined_test_dataset = concatenate_datasets([agnews_dataset["test"], sof_test_samples])

# Inspect the result.
print(combined_test_dataset)

# Count examples per label id, starting every class at zero so that a class
# with no examples still appears in the printed summary.
label_counts = combined_test_dataset.features['label'].num_classes
label_counter = {label: 0 for label in range(label_counts)}

# Read only the 'label' column rather than iterating over whole rows, so the
# text of every example is not materialised just to count labels.
for label in combined_test_dataset['label']:
    label_counter[label] += 1

print(label_counter)


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1900 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1900 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 9500
})
{0: 1900, 1: 1900, 2: 1900, 3: 1900, 4: 1900}


# Tokenizing

In [9]:
from transformers import AutoTokenizer

# Bundle the combined train/test splits into a single DatasetDict.
new_agnews_dataset = DatasetDict(
    train=combined_train_dataset,
    test=combined_test_dataset,
)

# Show the splits and their sizes.
print(new_agnews_dataset)

# Tokenizer matching the roberta-large checkpoint used for the classifier.
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-large")

def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with a "text" entry, as passed by ``Dataset.map``.

    Returns:
        Whatever ``tokenizer`` returns when called on ``examples["text"]`` with
        truncation enabled and ``padding="max_length"``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

# Tokenize both splits in batches.
tokenized_datasets = new_agnews_dataset.map(tokenize_function, batched=True)

# Keep a seeded random subset of 1,500 examples from each split
# (train first, then test) for fine-tuning and evaluation.
small_train_dataset, small_eval_dataset = (
    tokenized_datasets[split_name].shuffle(seed=42).select(range(1500))
    for split_name in ("train", "test")
)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 9500
    })
})


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Map:   0%|          | 0/150000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9500 [00:00<?, ? examples/s]

# Model / LoRA config

In [10]:
from peft import LoraConfig, TaskType
from peft import get_peft_model
from transformers import RobertaForSequenceClassification

# LoRA settings for sequence classification: rank 8, alpha 16, dropout 0.1.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
)

# Load roberta-large with a 5-way classification head and wrap it with the
# LoRA adapters described by lora_config.
model = get_peft_model(
    RobertaForSequenceClassification.from_pretrained(
        "FacebookAI/roberta-large",
        num_labels=5,
    ),
    lora_config,
)

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Fine-tuning

In [11]:
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer

# Load the four evaluation metrics (in this order) that compute_metrics uses.
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

def compute_metrics(eval_pred):
    """Compute accuracy plus macro-averaged precision, recall and F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Accuracy takes no averaging mode; the other three are macro-averaged.
    accuracy_result = accuracy_metric.compute(predictions=predictions, references=labels)
    metrics = {"accuracy": accuracy_result["accuracy"]}

    macro_metrics = (
        ("precision", precision_metric),
        ("recall", recall_metric),
        ("f1", f1_metric),
    )
    for key, metric in macro_metrics:
        result = metric.compute(predictions=predictions, references=labels, average="macro")
        metrics[key] = result[key]
    return metrics

import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and deprecated the old keyword. The pip installs in this notebook are
# unpinned, so pick whichever keyword the installed TrainingArguments accepts.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Evaluate at the end of every epoch, for 10 epochs; outputs go to ./test_trainer.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=10,
    **{eval_strategy_kwarg: "epoch"},
)

# Fine-tune the LoRA-wrapped model on the 1,500-example subsets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.298815,0.914667,0.917435,0.915077,0.915445
2,No log,0.252989,0.927333,0.928727,0.928091,0.928169
3,0.558100,0.308857,0.922667,0.926295,0.924673,0.923343
4,0.558100,0.267285,0.927333,0.92792,0.927983,0.927727
5,0.558100,0.270609,0.928,0.928669,0.92939,0.928539
6,0.234600,0.268396,0.926667,0.927993,0.927612,0.927513
7,0.234600,0.273008,0.933333,0.934306,0.934293,0.934089
8,0.203900,0.268408,0.932667,0.932992,0.933582,0.933219
9,0.203900,0.275231,0.930667,0.932169,0.931697,0.93155
10,0.203900,0.272445,0.931333,0.932603,0.932252,0.932153


TrainOutput(global_step=1880, training_loss=0.30163496707348114, metrics={'train_runtime': 1340.8089, 'train_samples_per_second': 11.187, 'train_steps_per_second': 1.402, 'total_flos': 1.406395266048e+16, 'train_loss': 0.30163496707348114, 'epoch': 10.0})

# Inference

In [14]:
from transformers import pipeline

# Run the pipeline on whatever device the fine-tuned model already lives on,
# instead of hard-coding GPU 0, so the cell also works on a CPU-only runtime.
classifier = pipeline(
    "text-classification",
    model=trainer.model,
    tokenizer=tokenizer,
    device=next(trainer.model.parameters()).device,
)

# One sample per class. Backslashes that belong to the text are written as
# "\\" so that Python does not interpret sequences such as "\b" (backspace)
# or warn about invalid escapes such as "\$", "\d" and "\y".
sample_texts = [
    "US trade deficit swells in June The US trade deficit has exploded 19 to a record \\$55.8bn as oil costs drove imports higher, according to a latest figures.",
    "Indians Beat Twins 7-1, Nearing AL Lead (AP) AP - The Cleveland Indians pulled within one game of the AL Central lead, scoring four runs in the first inning and beating the Minnesota Twins 7-1 Saturday night behind home runs by Travis Hafner and Victor Martinez.",
    "Pilgrims Crowd Field for Mass With Pope LOURDES, France - A frail Pope John Paul II celebrated an open-air Mass on Sunday as several hundred thousand pilgrims, many in wheelchairs, crowded onto a field near a French shrine to the Virgin Mary that is associated with miraculous cures of the sick. The Mass was a highlight of the Pope's two-day visit to Lourdes, a town in the Pyrenees where Roman Catholic tradition says St...",
    "T. Rex Had Teen Growth Spurt, Scientists Say (Reuters) Reuters - Tyrannosaurus Rex grew incredibly fast\\during a teenaged growth spurt that saw the dinosaur expand its\\bulk by six times, but the fearsome beasts \"lived fast and died\\young,\" researchers said on Wednesday.",
    "working of compareTo() method of Comparable interface            I have one Employee class and the requirement is to sort the objects using comparable interface. The output with this code is :The difference of this id and other id is..** 6  other id**1The difference of this id and other id is..** 3  other id**6The difference of this id and other id is..** 3  other id**6The difference of this id and other id is..** 3  other id**1The difference of this id and other id is..** 11  other id**3The difference of this id and other id is..** 11  other id**6",
]

# Class id -> human-readable name, in the same order as the ClassLabel names.
label_map = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Sci/Tech",
    4: "StackOverFlow"
}

for i, text in enumerate(sample_texts):
    # Classify one text with the fine-tuned pipeline.
    result = classifier(text)

    # The pipeline reports labels as 'LABEL_X'; extract X and map it to a name.
    label_id = int(result[0]['label'].split('_')[-1])
    # Fall back to "Unknown" when the id has no entry in label_map.
    label_text = label_map.get(label_id, "Unknown")

    print(f"Text {i+1} Predicted label: {label_text}, Score: {result[0]['score']:.4f}")

The model 'PeftModelForSequenceClassification' is not supported for text-classification. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'ElectraForSequenceClassification', 'ErnieForSequenceClassification', 'ErnieMForSequenceClassification', 'EsmForSequenceClassification', 'FalconForSequenceClassification', 'FlaubertForSequenceClassification', 'FNetForSequenceClassification', 'FunnelForSequenceClassification', 'GemmaForSequenceClassification'

Text 1 Predicted label: Business, Score: 0.9985
Text 2 Predicted label: Sports, Score: 0.9941
Text 3 Predicted label: World, Score: 0.9962
Text 4 Predicted label: Sci/Tech, Score: 0.9977
Text 5 Predicted label: StackOverFlow, Score: 0.9999
