# ☑️ T5

### 기존

In [None]:
!pip -q install transformers datasets

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_dataset

In [None]:
# Load the multilingual T5 (mT5) checkpoint and its tokenizer; mT5 is used
# instead of the original T5 to get better Korean coverage.
model_name = "google/mt5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Load the train and dev JSON files (train first, then dev). Each call gets a
# single `data_files` path and no explicit split name.
train_dataset, dev_dataset = (
    load_dataset('json', data_files=json_path)
    for json_path in (
        '/content/drive/MyDrive/일상대화요약_데이터/일상대화요약_train_processed.json',
        '/content/drive/MyDrive/일상대화요약_데이터/일상대화요약_dev_processed.json',
    )
)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of dialogues and their reference summaries.

    Args:
        examples: A batch as passed by ``Dataset.map(..., batched=True)``.
            Every entry of ``examples["input"]`` carries a ``"conversation"``
            list whose turns have an ``"utterance"`` string, and
            ``examples["output"]`` holds the target summaries.

    Returns:
        The tokenizer output for the dialogues, extended with a ``"labels"``
        entry holding the summary token ids in which padding is set to -100.
    """
    # Concatenate all utterances of a conversation into one source text.
    inputs = [" ".join(turn["utterance"] for turn in example["conversation"]) for example in examples["input"]]
    # NOTE(review): padding=True pads to the longest row of *this* map batch
    # only, so rows coming from different map batches may differ in length.
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding=True, return_tensors='pt')

    # Tokenize the target summaries. The tokenizer is called directly (as the
    # LoRA preprocessing in this notebook does) rather than through the
    # deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(examples["output"], max_length=128, truncation=True, padding=True, return_tensors='pt')

    # Replace padding ids with -100 so padded positions are ignored by the loss.
    label_ids = labels["input_ids"]
    label_ids[label_ids == tokenizer.pad_token_id] = -100

    model_inputs["labels"] = label_ids
    return model_inputs

# Tokenize both datasets in batched mode (train first, then dev) to produce
# the model-ready inputs.
train_tokenized_dataset, dev_tokenized_dataset = (
    raw_dataset.map(preprocess_function, batched=True)
    for raw_dataset in (train_dataset, dev_dataset)
)

In [None]:
# 훈련
from transformers import Trainer, TrainingArguments

# Training configuration: checkpoints go to ./results, logs to ./logs, and the
# dev set is evaluated once per epoch.
training_args = TrainingArguments(
    # Output locations and reporting cadence.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    # Schedule and batch sizes.
    num_train_epochs=50,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Optimizer-related settings.
    warmup_steps=500,
    weight_decay=0.01,
)

# Both tokenized datasets are indexed with the 'train' key here.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=dev_tokenized_dataset['train'],
    train_dataset=train_tokenized_dataset['train'],
)

trainer.train()

In [None]:
# Save the fine-tuned model and the tokenizer into the same directory so that
# both can later be reloaded from "./t5-summarization".
trainer.save_model("./t5-summarization")
tokenizer.save_pretrained("./t5-summarization")

# Run evaluation with the trainer's eval dataset and print the returned metrics.
eval_results = trainer.evaluate()
print(eval_results)

In [None]:
import json
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

def summarize_conversation(conversation, tokenizer, model, max_length=150, num_sentences=5):
    """Generate a summary for one conversation and keep its first sentences.

    Args:
        conversation: Iterable of turns; each turn is a mapping with an
            ``'utterance'`` string.
        tokenizer: Callable tokenizer that returns a mapping of tensors
            (including ``'input_ids'``) and provides ``decode``.
        model: Model exposing ``to(device)`` and ``generate(...)``.
        max_length: Maximum length passed to ``generate``.
        num_sentences: Maximum number of ``'. '``-separated sentences to keep.

    Returns:
        The truncated summary, terminated by a period, or an empty string when
        the decoded output contains no text.
    """
    conversation_text = " ".join(turn['utterance'] for turn in conversation)

    # Tokenize the joined conversation (truncated/padded to 512 tokens).
    inputs = tokenizer(conversation_text, return_tensors="pt", truncation=True, padding='max_length', max_length=512)

    # Run on the GPU when one is available, otherwise on the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = {key: value.to(device) for key, value in inputs.items()}
    model = model.to(device)

    # The input is padded to a fixed length, so hand the attention mask to
    # `generate` explicitly instead of relying on it being inferred.
    with torch.no_grad():
        summary_ids = model.generate(
            inputs['input_ids'],
            attention_mask=inputs.get('attention_mask'),
            max_length=max_length,
            num_beams=4,
            early_stopping=True,
        )

    # Strip surrounding whitespace first: a trailing space would otherwise
    # create an empty last "sentence" and a dangling " ." in the result.
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True).strip()
    if not generated_summary:
        return ""

    # Keep at most `num_sentences` sentences.
    sentences = generated_summary.split('. ')
    selected_summary = '. '.join(sentences[:num_sentences])

    # Splitting removes the period of every sentence but the last, so restore
    # the final period whenever the kept text does not already end with one.
    if not selected_summary.endswith('.'):
        selected_summary += '.'

    return selected_summary

# Load the test data. The file is opened in a context manager so the handle is
# closed, and decoded explicitly as UTF-8 because it contains Korean text.
with open('/content/drive/MyDrive/일상대화요약_데이터/일상대화요약_test_processed.json', encoding='utf-8') as test_file:
    test_data = json.load(test_file)

# Generate and print a summary for every test conversation.
for item in test_data:
    example_conversation = item['input']['conversation']
    conversation_text = " ".join(utterance['utterance'] for utterance in example_conversation)
    generated_summary = summarize_conversation(example_conversation, tokenizer, model)

    print(f"ID: {item['id']}")
    print(f"Conversation:\n{conversation_text}\n")
    print(f"Generated Summary:\n{generated_summary}\n")
    print("-" * 80)  # Separator for readability

### PEFT(LoRA) 적용

In [None]:
!pip -q install transformers datasets

In [None]:
!pip install peft

In [None]:
from datasets import load_dataset

import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model

In [None]:
# Load the base model and tokenizer.
model_name = "google/mt5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# PEFT configuration (LoRA): rank-8 adapters on the attention modules named
# "q" and "v". The task type is declared so that PEFT wraps the model with its
# sequence-to-sequence specific wrapper instead of the generic one.
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
)

# Wrap the base model with the LoRA adapters.
model = get_peft_model(model, lora_config)

# Training arguments for the LoRA run.
training_args = TrainingArguments(
    # Output locations and reporting cadence.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=30,
    evaluation_strategy="epoch",
    # Schedule and batch sizes.
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Optimizer-related settings.
    warmup_steps=500,
    weight_decay=0.01,
    # NOTE(review): fp16 is enabled for GPU efficiency; confirm the loss stays
    # finite with this checkpoint under half precision.
    fp16=True,
    # Do not let the Trainer drop dataset columns on its own.
    remove_unused_columns=False,
)

In [None]:
# Dataset tokenization with a task prompt
def preprocess_function(examples):
    """Tokenize a batch of prompted dialogues and their reference summaries.

    Args:
        examples: A batch as passed by ``Dataset.map(..., batched=True)``.
            Every entry of ``examples["input"]`` carries a ``"conversation"``
            list whose turns have an ``"utterance"`` string, and
            ``examples["output"]`` holds the target summaries.

    Returns:
        The tokenizer output for the prompted dialogues (padded/truncated to
        1024 tokens) with an added ``"labels"`` list of summary token ids
        (padded/truncated to 128) in which padding is replaced by -100.
    """
    # Join the utterances into one text and prepend the summarization prompt.
    inputs = ["summarize: " + " ".join(turn["utterance"] for turn in example["conversation"]) for example in examples["input"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    # Tokenize the target summaries.
    labels = tokenizer(examples["output"], max_length=128, truncation=True, padding="max_length")

    # The labels are already padded to a fixed length here, so the padding
    # must be masked now: -100 marks positions the loss should ignore.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]

    return model_inputs

# Load the train and validation splits.
dataset = load_dataset('json', data_files={
    'train': '/content/drive/MyDrive/일상대화요약_데이터/일상대화요약_train_processed.json',
    'validation': '/content/drive/MyDrive/일상대화요약_데이터/일상대화요약_dev_processed.json'
})

# Preprocess both splits. The raw columns are dropped here because the
# training arguments set `remove_unused_columns=False`: otherwise the untouched
# string/dict columns would be handed to the data collator, which can only
# batch tokenized fields.
train_tokenized_dataset = dataset['train'].map(
    preprocess_function,
    batched=True,
    remove_columns=dataset['train'].column_names,
)
dev_tokenized_dataset = dataset['validation'].map(
    preprocess_function,
    batched=True,
    remove_columns=dataset['validation'].column_names,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Configure the Trainer with the tokenized train/dev datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=dev_tokenized_dataset,
    data_collator=data_collator  # data collator suited to seq2seq batches
)

# Train the model.
trainer.train()

# Save the model and tokenizer after training, both under "./t5_lora".
# NOTE(review): `model` is the PEFT-wrapped model here; presumably only the
# adapter weights are written — confirm before reloading.
model.save_pretrained("./t5_lora")
tokenizer.save_pretrained("./t5_lora")

In [None]:
# Inspect the first preprocessed training example.
print(train_tokenized_dataset[0])

# ✅ KoBart v2

### 데이터 및 데이터로더 준비

In [None]:
!pip install transformers

In [None]:
# Mount Google Drive at /content/drive; the data files used below are read
# from paths under that directory.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

In [None]:
# The tokenizer and the model weights are loaded from the same KoBART v2
# checkpoint.
kobart_checkpoint = "gogamza/kobart-base-v2"
tokenizer = AutoTokenizer.from_pretrained(kobart_checkpoint)
model = BartForConditionalGeneration.from_pretrained(kobart_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [None]:
# Custom dataset class
class ConversationSummaryDataset(Dataset):
    """Map-style dataset pairing a joined conversation with its summary.

    Each record is expected to provide ``item['input']['conversation']`` (a
    list of turns with an ``'utterance'`` string) and ``item['output']`` (the
    reference summary).

    Args:
        data: Indexable collection of records.
        tokenizer: Callable tokenizer exposing ``pad_token_id``.
        max_length: Length the source text is truncated/padded to.
        max_target_length: Length the summary is truncated/padded to. Defaults
            to ``max_length`` when omitted.
    """

    def __init__(self, data, tokenizer, max_length=512, max_target_length=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_target_length = max_length if max_target_length is None else max_target_length

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def _encode(self, text, max_length):
        """Tokenize ``text`` into fixed-length PyTorch tensors."""
        return self.tokenizer(text, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one record."""
        item = self.data[idx]
        conversation = " ".join(utterance['utterance'] for utterance in item['input']['conversation'])
        summary = item['output']

        # Tokenize the source conversation and the target summary.
        input_encodings = self._encode(conversation, self.max_length)
        output_encodings = self._encode(summary, self.max_target_length)

        # Drop only the leading batch dimension: a bare squeeze() would also
        # collapse the sequence dimension when its length is 1.
        labels = output_encodings['input_ids'].squeeze(0)
        # Padding positions are set to -100 so that the loss ignores them.
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        return {
            'input_ids': input_encodings['input_ids'].squeeze(0),
            'attention_mask': input_encodings['attention_mask'].squeeze(0),
            'labels': labels
        }

def _load_json(path):
    """Read a UTF-8 encoded JSON file and close the handle afterwards."""
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)


# Load the train / dev / test records.
train_data = _load_json('/content/drive/MyDrive/24-summer KUBIG NLP/PROJECT/일상대화요약_데이터/일상대화요약_train.json')
eval_data = _load_json('/content/drive/MyDrive/24-summer KUBIG NLP/PROJECT/일상대화요약_데이터/일상대화요약_dev.json')
test_data = _load_json('/content/drive/MyDrive/24-summer KUBIG NLP/PROJECT/일상대화요약_데이터/일상대화요약_test.json')

# Instantiate the datasets.
train_dataset = ConversationSummaryDataset(train_data, tokenizer)
eval_dataset = ConversationSummaryDataset(eval_data, tokenizer)

# Create a DataLoader over the training set (batches of 4, shuffled).
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

### Train

In [None]:
# Training configuration for KoBART fine-tuning.
training_args = TrainingArguments(
    # Output locations and reporting cadence.
    output_dir="./kobart-summarization",
    logging_dir="./logs",
    logging_steps=10,
    # Evaluate and checkpoint on the same 2000-step cadence, keep at most
    # three checkpoints, and reload the best one when training ends.
    evaluation_strategy="steps",
    eval_steps=2000,
    save_steps=2000,
    save_total_limit=3,
    load_best_model_at_end=True,
    eval_accumulation_steps=2,
    # Schedule and batch sizes.
    num_train_epochs=50,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Optimizer and learning-rate schedule.
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    warmup_steps=500,
    weight_decay=0.01,
    # Mixed-precision training.
    fp16=True,
)



In [None]:
# Early stopping is configured with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Build the Trainer from the model, arguments, datasets and tokenizer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
    callbacks=[early_stopping],
)

In [None]:
# Start fine-tuning with the trainer configured above.
trainer.train()

Step,Training Loss,Validation Loss
2000,2.0592,2.848609
4000,1.3088,3.184103
6000,0.7249,3.41582
8000,0.4774,3.560427


Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=8000, training_loss=1.472987486243248, metrics={'train_runtime': 1136.907, 'train_samples_per_second': 22.253, 'train_steps_per_second': 11.127, 'total_flos': 4877891665920000.0, 'train_loss': 1.472987486243248, 'epoch': 31.620553359683793})

In [None]:
# 제출 모델 저장
trainer.save_model("./kobart-summarization_0822_1")
tokenizer.save_pretrained("./kobart-summarization_0822_1")

Non-default generation parameters: {'forced_eos_token_id': 1}


('./kobart-summarization_0822_1/tokenizer_config.json',
 './kobart-summarization_0822_1/special_tokens_map.json',
 './kobart-summarization_0822_1/tokenizer.json')

### Validation

In [None]:
# Evaluate on the trainer's eval dataset; the returned metrics are displayed
# as the cell output.
trainer.evaluate()

{'eval_loss': 2.848609209060669,
 'eval_runtime': 1.8215,
 'eval_samples_per_second': 55.997,
 'eval_steps_per_second': 27.999,
 'epoch': 31.620553359683793}

### 생성

In [None]:
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast

In [None]:
# Reload the model and tokenizer from the directory they were saved to.
model_path = "./kobart-summarization_0822_1"
model = BartForConditionalGeneration.from_pretrained(model_path)
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [None]:
# Switch the model to evaluation mode before generating summaries.
model.eval()

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(30000, 768, padding_idx=3)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(30000, 768, padding_idx=3)
      (embed_positions): BartLearnedPositionalEmbedding(1028, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): Laye

In [None]:
import json

results = []

# Beam search, length_penalty and n-gram blocking settings.
beam_size = 6
max_length_summary = 128
length_penalty = 0.9
no_repeat_ngram_size = 3

# Destination of the inference results; also reported in the final message.
output_path = '/content/drive/MyDrive/24-summer KUBIG NLP/inference_results_3.json'

# Run generation on the GPU when one is available and keep the model and its
# inputs on the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Process every item of test_data.
for item in test_data:
    # Join the utterances of the conversation into one string.
    conversation = " ".join(utterance['utterance'] for utterance in item['input']['conversation'])

    # Tokenize the input text.
    inputs = tokenizer(conversation, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Generate the summary. Only input_ids and attention_mask are forwarded;
    # any other field returned by the tokenizer is not passed to generate().
    with torch.no_grad():
        summaries = model.generate(
            inputs["input_ids"].to(device),
            attention_mask=inputs["attention_mask"].to(device),
            max_length=max_length_summary,
            num_beams=beam_size,
            no_repeat_ngram_size=no_repeat_ngram_size,
            early_stopping=True,
            length_penalty=length_penalty,
            eos_token_id=tokenizer.eos_token_id
        )

    # Decode the generated sequences.
    decoded_summaries = [tokenizer.decode(summary, skip_special_tokens=True) for summary in summaries]

    # Post-processing: strip surrounding whitespace from the first summary.
    processed_summary = decoded_summaries[0].strip()

    # Build the output JSON structure.
    result = {
        "id": item["id"],  # same ID as in the test data
        "input": item["input"],  # input data copied unchanged
        "subject_keyword": item.get("subject_keyword", ""),  # subject keyword when present
        "output": processed_summary  # processed summary as the output
    }

    # Append to the result list.
    results.append(result)

    # Print each result for optional inspection.
    print(f"ID {item['id']}에 대한 생성된 결과:")
    print(json.dumps(result, ensure_ascii=False, indent=4))
    print("-" * 50)  # separator for readability

# Save the results as a JSON file.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

# Report the path that was actually written (the previous message named a
# different file).
print(f"추론 결과가 '{output_path}'에 저장되었습니다.")