In [2]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device

device(type='cuda')

In [5]:
from transformers import AutoTokenizer
from transformers import LlamaForCausalLM

tokenizer = AutoTokenizer.from_pretrained('daily_tokenizer_0612')
model = LlamaForCausalLM.from_pretrained('daily_llama_0612')

model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(50257, 512, padding_idx=0)
    (layers): ModuleList(
      (0-3): 4 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=512, out_features=512, bias=False)
          (k_proj): Linear(in_features=512, out_features=512, bias=False)
          (v_proj): Linear(in_features=512, out_features=512, bias=False)
          (o_proj): Linear(in_features=512, out_features=512, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=512, out_features=1376, bias=False)
          (up_proj): Linear(in_features=512, out_features=1376, bias=False)
          (down_proj): Linear(in_features=1376, out_features=512, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): L

# dataset불러오기

In [12]:
from datasets import load_dataset
from datasets import DatasetDict

data = 'GonzaloA/Fake_news'
dataset_fake = load_dataset(data)
dataset_fake

Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label'],
        num_rows: 24353
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label'],
        num_rows: 8117
    })
    test: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label'],
        num_rows: 8117
    })
})

In [13]:
#test dataset은 train, test를 split하면서 생상헐꺼니 냅두고, validation set을 없애준다.
dataset_fake = DatasetDict({'train': dataset_fake['train'], 'test':dataset_fake['test']})

In [14]:
dataset_fake.column_names

{'train': ['Unnamed: 0', 'title', 'text', 'label'],
 'test': ['Unnamed: 0', 'title', 'text', 'label']}

In [15]:
# category 데이터셋을 불러오고, 전처리가 필요하다.
data = 'heegyu/news-category-balanced-top10'

dataset_cate = load_dataset(data)

In [24]:
dataset_cate # train datset밖에 없다.

DatasetDict({
    train: Dataset({
        features: ['link', 'headline', 'category', 'short_description', 'authors', 'date', 'label'],
        num_rows: 29026
    })
})

In [17]:
categories = dataset_cate['train'].to_pandas().category.unique().tolist()
categories.sort()
categories = categories[:4]
categories

['BUSINESS', 'ENTERTAINMENT', 'FOOD & DRINK', 'HEALTHY LIVING']

In [19]:
# 위 4개의 카테고리에 속해있는 데이터만 걸러준다.
dataset_cate = dataset_cate.filter(lambda element: element['category'] in categories)
dataset_cate

Filter:   0%|          | 0/29026 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['link', 'headline', 'category', 'short_description', 'authors', 'date'],
        num_rows: 29026
    })
})

In [22]:
# fake news mapping
int2label_fake = {0: 'False', 1: 'True'}
label2int_fake = {'False': 0, 'True':1}

# categories news mapping

categories = [x.split(' ')[0].lower() for x in categories] # 첫번째 단어만 취해서 소문자만 사용
int2label_cate = {i: categories[i] for i in range(len(categories))} # 정수 label이 없으므로 임의로 부여해서 dictonary를 만들어준다
label2int_cate = {int2label_cate[key]: key for key in int2label_cate}

In [23]:
# category label 정제
def gen_label(element):
    category = element['category'].split(' ')[0].lower() # 첫번쨰 단어만 취해서 소문자로 바꿔준다
    return {'label': label2int_cate[category], 'category': category}

dataset_cate = dataset_cate.map(gen_label)

Map:   0%|          | 0/29026 [00:00<?, ? examples/s]

In [25]:
# train dataset밖에 없으므로 쪼개준다.
dataset_cate = dataset_cate['train'].train_test_split(test_size=0.1)
dataset_cate

DatasetDict({
    train: Dataset({
        features: ['link', 'headline', 'category', 'short_description', 'authors', 'date', 'label'],
        num_rows: 26123
    })
    test: Dataset({
        features: ['link', 'headline', 'category', 'short_description', 'authors', 'date', 'label'],
        num_rows: 2903
    })
})

In [32]:
from datasets import DatasetDict
from datasets import concatenate_datasets
import random

# prompt_fake
prompt_format1_fake = """Determine if the given article is fake. article: %s  answer: %s"""
prompt_format2_fake = """Is this article fake? article: %s answer: %s"""
prompt_format3_fake = """Return True if the given article is fake. article: %s answer: %s"""

prompts_fake = [prompt_format1_fake, prompt_format2_fake, prompt_format3_fake]

def gen_prompt_fake(element):
    prompt_format = prompts_fake[random.randint(0, len(prompts_fake)-1)]
    return DatasetDict({'input': prompt_format%(element['title'], int2label_fake[element['label']])}) #본분 역할이 title

# prompt_cate
prompt_format1_cate = """Given the article, what is the topic of the article? article: %s  answer: %s"""
prompt_format2_cate = """Determine the topic of the news article. article: %s answer: %s"""
prompt_format3_cate = """What is this article about? business/entertainment/food/healthy/parenting article: %s answer: %s"""

prompts_cate = [prompt_format1_cate, prompt_format2_cate, prompt_format3_cate]

def gen_prompt_cate(element):
    prompt_format = prompts_cate[random.randint(0, len(prompts_cate)-1)]
    return DatasetDict({'input': prompt_format%(element['headline'], int2label_cate[element['label']])}) #본문 역할이 headline

train_fake = dataset_fake['train'].map(gen_prompt_fake, remove_columns=dataset_fake['train'].column_names)
train_cate = dataset_cate['train'].map(gen_prompt_cate, remove_columns=dataset_cate['train'].column_names)

train_dataset = concatenate_datasets([train_fake, train_cate]).shuffle()

In [42]:
# tokenize
def tokenize(element):
    tokenizer.pad_token = tokenizer.eos_token
    outputs = tokenizer(
        element['input'],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=False,
        return_length=True,
        padding=True        
    )
    return {'input_ids': outputs['input_ids']}
    
context_length=128
tokenized_datasets = train_dataset.map(
    tokenize, batched=True, remove_columns = train_dataset.column_names
)
tokenized_datasets

Map:   0%|          | 0/50476 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids'],
    num_rows: 50476
})

In [43]:
# Data Collator
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

out = data_collator([tokenized_datasets[i] for i in range(3)])

for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([3, 67])
attention_mask shape: torch.Size([3, 67])
labels shape: torch.Size([3, 67])


In [45]:
# traing argument
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="combied_instruct_llama",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy='steps',
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1000,
    #wawarmup_ratio=0.2
    lr_scheduler_type='cosine',
    learning_rate=5e-4,
    fp16=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [46]:
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss,Validation Loss


TrainOutput(global_step=1577, training_loss=2.7097949776077996, metrics={'train_runtime': 1772.2022, 'train_samples_per_second': 28.482, 'train_steps_per_second': 0.89, 'total_flos': 1006764006850560.0, 'train_loss': 2.7097949776077996, 'epoch': 1.0})

### Fake_news classification performance

In [52]:
# evaluation
# prompt_fake

tokenizer = AutoTokenizer.from_pretrained('daily_tokenizer_0612', padding_side='left')

prompt_format1 = """Determine if the given article is fake. article: %s  answer:"""
prompt_format2 = """Is this article fake? article: %s answer:"""
prompt_format3 = """Return True if the given article is fake. article: %s answer:"""

prompts = [prompt_format1, prompt_format2, prompt_format3]

def gen_valid_prompt_fake(element):
    prompt_format = prompts[random.randint(0, len(prompts)-1)]
    return DatasetDict({'input': prompt_format%(element['title'])}) #label을 없애준다.

valid_dataset = dataset_fake['test'].select(range(100)).map(gen_valid_prompt_fake)

context_length=128
valid_dataset = valid_dataset.map(
    tokenize, batched=True, remove_columns=['text', 'input', 'Unnamed: 0', 'title']
)
valid_dataset

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'input_ids'],
    num_rows: 100
})

In [54]:
dataset_fake.column_names

{'train': ['Unnamed: 0', 'title', 'text', 'label'],
 'test': ['Unnamed: 0', 'title', 'text', 'label']}

In [57]:
# DataLoader정의
from torch.utils.data import DataLoader

batch_size = 4
val_ds = valid_dataset
val_ds.set_format(type='torch')
val_dl = DataLoader(val_ds, batch_size=batch_size)

In [60]:
# Accuracy 함수 정의
import torch
from tqdm import tqdm
import re

def acc(pred,lable):
    return torch.sum(torch.tensor(pred) == label.squeeze()).item()

In [61]:
# fake news 성능 확인

model.eval()
val_losses = []
val_acc = 0

for step, batch in enumerate(tqdm(val_dl)): # batch에서 label과 input_ids를 가져온다
    label = batch['label']
    input_id = batch['input_ids'].to(device)

    pred = model.generate(input_id, max_length=128)
    decoded_pred = tokenizer.batch_decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    decoded_pred = [re.findall("answer: (True|False)", x)[0] if re.findall("answer: (True|False)", x) else 'none' for x in decoded_pred]
    decoded_pred = [label2int_fake[x] if x in label2int_fake else -1 for x in decoded_pred]

    val_acc += acc(decoded_pred, label)


print("val acc: ", val_acc/((step+1)*batch_size))

100%|██████████| 25/25 [00:16<00:00,  1.56it/s]

val acc:  0.99





In [62]:
tokenizer.batch_decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)

['Is this article fake? article: WATCH DIRTY HARRY REID ON HIS LIE ABOUT ROMNEY’S TAXES: “HE DIDN’T WIN, DID HE?” answer: Falsely Tries To Be “Large” answer: Falsely Tries To Be A “HILLARY” answer: Falsely Tries To Be A “HUGE” answer: Falsely Tries To Be A “HUGE” answer:',
 "Is this article fake? article: North Korea diplomat says take atmospheric nuclear test threat 'literally' answer: True if Trump is 'very concerned' answer: True if U.S. is 'very concerned' about North Korea nuclear deal answer: True if Trump is 'very concerned' about U.S. nuclear deal answer: True if Trump is 'very concerned' answer",
 'Is this article fake? article: VIRAL VIDEO: German Youth Deliver Powerful Anti-Refugee Message To Political Leaders: “We are ready for the Reconquista!” answer: Falsely Tries To Make America Great answer: Falsely answer: Falsely” answer: Falsely Tries To Be President” [VIDEO] answer: Falsely Tries To Be A “Gilmore” [VIDEO] answer: Falsely T',
 "Return True if the given article is fa

# Category Classification

In [74]:
tokenizer = AutoTokenizer.from_pretrained('daily_tokenizer_0612', padding_side='left')

prompt_format1 = """Given the article, what is the topic of the article? article: %s  answer:"""
prompt_format2 = """Determine the topic of the news article. article: %s answer:"""
prompt_format3 = """What is this article about? business/entertainment/food/healthy/parenting article: %s answer:"""

prompts = [prompt_format1, prompt_format2, prompt_format3]

def gen_valid_prompt_cate(element):
    prompt_format = prompts[random.randint(0, len(prompts)-1)]
    return DatasetDict({'input': prompt_format%(element['headline'])}) #본문 역할이 headline #, int2label_cate[element['label']], label을 때준다

valid_dataset = dataset_cate['test'].map(gen_valid_prompt_cate)

context_length=128
valid_dataset = valid_dataset.map(
    tokenize, batched=True, remove_columns=['link', 'headline', 'category', 'short_description', 'authors', 'date', 'input']
)
valid_dataset

Dataset({
    features: ['label', 'input_ids'],
    num_rows: 2903
})

In [63]:
dataset_cate.column_names

{'train': ['link',
  'headline',
  'category',
  'short_description',
  'authors',
  'date',
  'label'],
 'test': ['link',
  'headline',
  'category',
  'short_description',
  'authors',
  'date',
  'label']}

In [78]:
# dataloader 정의
from torch.utils.data import DataLoader
batch_size=4
val_ds = valid_dataset.select(range(100))
val_ds.set_format(type='torch')
val_dl = DataLoader(val_ds, batch_size=batch_size)

In [89]:
model.eval()
val_losses = []
val_acc = 0

for step, batch in enumerate(tqdm(val_dl)):
    label = batch['label']
    input_id = batch['input_ids'].to(device)

    pred = model.generate(input_id, max_length=150)
    decoded_pred = tokenizer.batch_decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    decoded_pred = [re.findall("answer: ([a-z]+)", x)[0] if re.findall("answer: ([a-z]+)", x) else 'none' for x in decoded_pred]
    decoded_pred = [label2int_cate[x] if x in label2int_cate else -1 for x in decoded_pred]

    val_acc += acc(decoded_pred, label)
print("val acc: ", val_acc/len(val_dl.dataset))

100%|██████████| 25/25 [00:26<00:00,  1.05s/it]

val acc:  0.86





In [90]:
tokenizer.batch_decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)

["Given the article, what is the topic of the article? article: 12 Things That Will Be More Expensive In 2013  answer: healthy  answer: healthy  answer: healthy  answer: healthy  answer: healthy  answer: healthy People Are Not To Be A New Year's Resolution (VIDEO) answer: healthy (VIDEO)  answer: healthy answer: healthy answer: healthy answer: healthy answer: healthy answer: healthy answer: healthy answer: healthy answer: healthy answer: healthy answer: healthy answer: healthy answer: healthy answer: healthy answer: healthy",
 "What is this article about? business/entertainment/food/healthy/parenting article: 'Star Wars' v. 'Lord Of The Rings': What's The Best Movie Franchise Of All Time? (VOTE) answer: entertainment answer: entertainment answer: entertainment for the World answer: entertainment for the World answer: entertainment for the World answer: entertainment for the World answer: entertainment for the World answer: entertainment for the World answer: entertainment for the World

In [91]:
model.save_pretrained('llama_combined_0618')