In [None]:
import os
# Restrict the process to GPU 0 and request blocking CUDA launches.
# Both variables are set before torch is imported in the next cell.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "0",
    "CUDA_LAUNCH_BLOCKING": "1",
})

In [None]:
from unsloth import FastLanguageModel
import torch
# Loader configuration. max_seq_length is reused later when the trainer is built.
max_seq_length = 6000 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# NOTE(review): reference list only; none of the visible cells reads `fourbit_models`.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # 4bit for 405b!
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!

    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
] # More models at https://huggingface.co/unsloth

# Load the base model and its tokenizer; the checkpoint actually used is the
# non-4bit "unsloth/Llama-3.2-3B-Instruct", consistent with load_in_4bit = False.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [None]:
# Attach LoRA adapters to the loaded model.
# NOTE(review): this rebinds `model`, so re-running the cell passes the already
# wrapped model back into get_peft_model -- re-run the loading cell first.
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # LoRA rank. Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    # Attention projections (q/k/v/o) plus the MLP projections (gate/up/down).
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 64, # same value as r
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407, # same seed value as TrainingArguments(seed=...) further down
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [None]:
import json
import pandas as pd
# Label taxonomy; later cells index it as narratives[category][main_narrative].
# The encoding is explicit so the read does not depend on the platform's
# default locale encoding.
with open('../Dataset/labels.json', 'r', encoding='utf-8') as json_file:
    narratives = json.load(json_file)

# Training records (download the data from the task website). The file name
# lists EN/HI/PT/BG/RU, so the article text is expected to contain non-ASCII
# characters -- hence the explicit UTF-8.
with open('../Dataset/combined_EN_HI_PT_BG_RU.json', 'r', encoding='utf-8') as json_file:
    train_df_dict = json.load(json_file)

In [None]:
# See the format of the data. Each record is a dict shaped like this
# (abridged from the first record):
#
#   {'file': 'EN_CC_100013.txt',
#    'narrative': 'CC: Criticism of climate movement',
#    'sub_narrative': 'CC: Criticism of climate movement: Ad hominem attacks on key activists',
#    'narratives_list': ['Criticism of climate movement'],
#    'subnarratives_list': ['Ad hominem attacks on key activists'],
#    'category': 'CC',
#    'file_Content': '<full article text>',
#    'lang': 'EN'}
#
# The record is kept as the cell's last expression so the notebook displays the
# live value; a trailing string literal would be displayed instead of it.
train_df_dict[0]

In [None]:
# Hierarchical Three-Step Prompting (H3Prompt) 

In [None]:
import json
# Explanations keyed by label name; they are interpolated into the step-2 and
# step-3 prompts. The encoding is explicit so the read does not depend on the
# platform's default locale encoding.
with open('../Dataset/sub_narratives_with_explanations.json', 'r', encoding='utf-8') as json_file:
    sub_narratives_with_explanations = json.load(json_file)
with open('../Dataset/main_narratives_with_explanations.json', 'r', encoding='utf-8') as json_file:
    main_narratives_with_explanations = json.load(json_file)

# Step 1: Classify the document into a category
def classify_category(document_text):
    """Build the step-1 prompt that asks for the document's top-level category.

    :param document_text: text of the document to classify
    :return: str: prompt offering "Ukraine-Russia War", "Climate Change" or "Other"
    """
    return f"""
    Given the following document text, classify it into one of the two categories: "Ukraine-Russia War" or "Climate Change". 

    Document Text: 
    {document_text}

    Determine the category that closely or partially fits the document. If neither category applies, return "Other". Return only the output, without any additional explanations or text.
    """

# Step 2: Identify the main narratives
def classify_narratives(document_text, category):
    """Build the step-2 prompt that asks for the document's main narratives.

    :param document_text: text of the document to classify
    :param category: key into the module-level `narratives` mapping
    :return: str: prompt listing every main narrative of the category with its explanation
    """
    # One "- name: explanation" bullet per main narrative of this category.
    bullet_lines = []
    for narrative in narratives[category]:
        bullet_lines.append(f'- {narrative}: {main_narratives_with_explanations[narrative]}')
    narratives_list_with_explanations = "\n".join(bullet_lines)

    return f"""
    The document text given below is related to "{category}". 
    Please classify the document text into the most relevant narratives. Below is a list of narratives along with their explanations:

    {narratives_list_with_explanations}

    Document Text: 
    {document_text}
    
    Return the most relevant narratives as a hash-separated string (eg. Narrative1#Narrative2..). If no specific narrative can be assigned, just return "Other" and nothing else. Return only the output, without any additional explanations or text.
    """

# Step 3: Identify the sub-narratives based on main narratives
def classify_sub_narrative(document_text, category, main_narrative):
    """Build the step-3 prompt that asks for sub-narratives of one main narrative.

    :param document_text: text of the document to classify
    :param category: key into the module-level `narratives` mapping
    :param main_narrative: main narrative whose sub-narratives are offered
    :return: str: the prompt, or the bare label "Other" for the one main
        narrative that is special-cased below
    """
    # No prompt is built for this main narrative; the caller gets the label itself.
    if main_narrative == "Hidden plots by secret schemes of powerful groups":
        return "Other"

    # One "- name: explanation" bullet per sub-narrative of the main narrative.
    bullet_lines = []
    for sub_narrative in narratives[category][main_narrative]:
        bullet_lines.append(f'- {sub_narrative}: {sub_narratives_with_explanations[sub_narrative]}')
    sub_narratives_list_with_explanations = "\n".join(bullet_lines)

    return f"""
    The document text given below is related to "{category}" and its main narrative is: "{main_narrative}".
    Please classify the document text into the most relevant sub-narratives. Below is a list of sub-narratives along with their explanations:

    {sub_narratives_list_with_explanations}

    Document Text:
    {document_text}

    Return the most relevant sub-narratives as a hash-separated string (e.g., Sub-narrative1#Sub-narrative2..). If no specific sub-narrative can be assigned, just return "Other" and nothing else. Return only the output, without any additional explanations or text.
    """

In [None]:
from collections import defaultdict
def generate_prompt(data_point):
    """Build the hierarchical (three-step) training conversations for one document.

    Each step contributes separate two-message conversations (user prompt,
    assistant answer):

    1. category classification -- always produced;
    2. main-narrative classification -- skipped when the category is "Other";
    3. one sub-narrative classification per distinct main narrative -- skipped
       when the first main narrative is "Other", and for any main narrative
       for which classify_sub_narrative returns the label "Other" instead of
       a prompt.

    :param data_point: dict: record with 'file_Content', 'category',
        'narratives_list' and 'subnarratives_list' (the two lists are paired
        by position)
    :return: list: conversations, each a list of two {"role", "content"} dicts
    """
    mapping = {"URW": "Ukraine-Russia War", "CC": "Climate Change", "NONE": "Other"}
    document_text = data_point['file_Content']

    instruction1 = classify_category(document_text)
    category = mapping[data_point["category"]]

    output = [[
        {"role": "user", "content": instruction1},
        {"role": "assistant", "content": category},
    ]]

    # stop if other class found
    if category == "Other":
        return output

    # De-duplicate while keeping first-seen order. Iterating a set of strings
    # can yield a different order on every interpreter run, which made both the
    # "#"-joined target and the order of the step-3 samples non-reproducible.
    main_narratives = list(dict.fromkeys(data_point["narratives_list"]))

    instruction2 = classify_narratives(document_text, category)
    output.append([
        {"role": "user", "content": instruction2},
        {"role": "assistant", "content": "#".join(main_narratives)},
    ])

    # stop if other class found
    if data_point["narratives_list"][0] == "Other":
        return output

    # Group sub-narratives under their main narrative (lists are position-paired).
    m_s = defaultdict(list)
    for m, s in zip(data_point["narratives_list"], data_point['subnarratives_list']):
        m_s[m].append(s)

    for main_narrative in main_narratives:
        instruction3 = classify_sub_narrative(document_text, category, main_narrative)

        # classify_sub_narrative hands back the bare label "Other" (not a prompt)
        # for a main narrative it does not expand; using that as the user turn
        # would create a training sample whose whole prompt is the word "Other".
        if instruction3 == "Other":
            continue

        output.append([
            {"role": "user", "content": instruction3},
            {"role": "assistant", "content": "#".join(m_s[main_narrative])},
        ])

    return output

In [None]:
# Flatten the per-document conversation lists into one list of training samples.
train_data = [
    conversation
    for record in train_df_dict
    for conversation in generate_prompt(record)
]

In [None]:
from datasets import Dataset
# Single-column dataset: each 'prompt' row holds one conversation from train_data.
train_dataset = Dataset.from_dict({'prompt': train_data})

In [None]:
from unsloth.chat_templates import get_chat_template

# Rebind `tokenizer` to the one returned for the "llama-3.1" chat template;
# formatting_prompts_func calls apply_chat_template on this tokenizer.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

def formatting_prompts_func(examples):
    """Render every conversation of a batch into one chat-formatted string.

    :param examples: dict: batch whose 'prompt' entry is a list of conversations
    :return: dict: {"text": [...]} with one rendered string per conversation
    """
    rendered = []
    for conversation in examples['prompt']:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation, tokenize = False, add_generation_prompt = False
            )
        )
    return {"text": rendered}

# Add the rendered 'text' column; batched = True passes whole batches to the function.
train_dataset = train_dataset.map(formatting_prompts_func, batched = True,)

In [None]:
# Spot-check the first sample: its 'prompt' messages and the rendered 'text'.
train_dataset[0]

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Supervised fine-tuning setup: the LoRA-wrapped model is trained on the
# rendered 'text' column of train_dataset.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text", # column added by formatting_prompts_func
    max_seq_length = max_seq_length, # same limit the model was loaded with
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 8,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 8,
        gradient_accumulation_steps = 8, # 8 x 8 = 64 sequences per optimizer step per device
        warmup_steps = 5,
        num_train_epochs = 6, # epoch-based run; the step-based alternative is commented out below
        # max_steps = 4000,
        learning_rate = 2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "lora_model", # same directory the final save cell writes to
        report_to = "none", # Use this for WandB etc
        save_strategy = "epoch"
    ),
)

In [None]:
from unsloth.chat_templates import train_on_responses_only
# Wrap the trainer with Unsloth's train_on_responses_only helper. The two
# markers are the user / assistant turn headers.
# NOTE(review): the markers presumably have to match the headers emitted by the
# "llama-3.1" chat template exactly -- confirm against a rendered sample.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

In [None]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run fine-tuning; the value returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()

In [None]:
# Save the trained model and the tokenizer side by side in "lora_model".
# NOTE(review): this is also TrainingArguments.output_dir, so the per-epoch
# checkpoints presumably live under the same directory -- confirm that is intended.
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")