In [None]:
import pandas as pd
# Location of the preprocessed dataset, relative to the notebook's working directory.
PREPROCESSED_CSV = "dataset/df_data_preprocessed.csv"

# Read the whole CSV into the DataFrame `df`.
df = pd.read_csv(PREPROCESSED_CSV)

In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
# NOTE: %%capture swallows this cell's output, so pip warnings or failures are not shown here.
# Unsloth is installed straight from its GitHub repository; no tag or commit is pinned.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps installs only the packages listed here and skips their dependencies.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [None]:
# (python 3.10/3.11 only)
# NOTE(review): the earlier install cell already requests "xformers<0.0.27"; this command carries
# no version constraint — confirm whether this cell is still needed.
!pip install xformers


In [None]:
# Quiet (-q) install of peft.
# NOTE(review): peft is also listed in the first install cell — confirm whether this cell is redundant.
!pip -q install peft

In [None]:
# Restart the session after running the install cells above, then execute this cell.
# The checkpoint is loaded through Unsloth with 4-bit quantization enabled.
from unsloth import FastLanguageModel
import torch

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# More models at https://huggingface.co/unsloth
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",           # New Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",                # Llama-3 15 trillion tokens model 2x faster!
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",             # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",                  # Gemma 2.2x faster!
]

# --- Loading options -------------------------------------------------------
# Sequence length: 14048 tokens was tried, but T4 and L4 GPUs can't take it,
# so 2048 is used. Choose any! RoPE Scaling is auto supported internally.
max_seq_length = 2048
# None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
dtype = None
# Use 4bit quantization to reduce memory usage. Can be False.
load_in_4bit = True

# Arguments handed to the loader; the non-instruct alternative would be
# "unsloth/llama-3-8b-bnb-4bit".
loader_options = dict(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**loader_options)

In [None]:
# max_seq_length = 2048 # change to 2048 tokens for limitations of GPU. Choose any! We auto support RoPE Scaling internally!

In [None]:
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# LoRA hyper-parameters, collected in one place before the call.
lora_settings = dict(
    r = 16,                                  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = lora_target_modules,
    lora_alpha = 16,
    lora_dropout = 0,                        # Supports any, but = 0 is optimized
    bias = "none",                           # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth",  # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,                      # We support rank stabilized LoRA
    loftq_config = None,                     # And LoftQ
)

# Rebind `model` to the adapter-wrapped model returned by Unsloth.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

Unsloth 2024.10.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
# Quiet (-q) install of the `datasets` package, which the next cell imports.
!pip3 -q install datasets

In [None]:
# Build a Hugging Face Dataset of two-turn conversations from the training rows.
from datasets import Dataset, load_dataset


def _row_to_conversation(row):
    """Turn one row into a [human, gpt] conversation.

    The human turn is the row's 'Abstract', 'Introduction' and 'Conclusion'
    values joined with `+` (no separator is inserted between them); the gpt
    turn is the row's 'GPT_ground_truth_FW' value.
    """
    paper_text = row['Abstract'] + row['Introduction'] + row['Conclusion']
    return [
        {'from': 'human', 'value': paper_text},
        {'from': 'gpt', 'value': row['GPT_ground_truth_FW']},
    ]


# NOTE(review): `df_train` is not defined in any cell shown in this notebook —
# confirm where the train split is created.
formatted_data = []
for _, row in df_train.iterrows():
    formatted_data.append(_row_to_conversation(row))

# One 'conversations' column, then hand the frame over to Hugging Face.
df_conversations = pd.DataFrame({'conversations': formatted_data})
huggingface_dataset = Dataset.from_pandas(df_conversations)



In [None]:
from unsloth.chat_templates import get_chat_template, standardize_sharegpt

# Rebind the tokenizer to one configured with the "llama-3.1" chat template.
tokenizer = get_chat_template(tokenizer, chat_template = "llama-3.1")


def formatting_prompts_func(examples):
    """Render every conversation of a batch into a single string.

    Reads `examples["conversations"]` and returns `{"text": [...]}` holding one
    string per conversation, produced by the tokenizer's chat template with
    tokenization disabled and without a trailing generation prompt.
    """
    rendered = []
    for conversation in examples["conversations"]:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize = False,
                add_generation_prompt = False,
            )
        )
    return {"text": rendered}


# Pass the dataset through standardize_sharegpt, then add the rendered "text" column batch by batch.
huggingface_dataset = standardize_sharegpt(huggingface_dataset)
huggingface_dataset = huggingface_dataset.map(formatting_prompts_func, batched = True)

Standardizing format:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Decide the mixed-precision mode once: bf16 when supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation schedule and bookkeeping for the run.
training_args = TrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    # num_train_epochs = 1, # Set this for 1 full training run.
    max_steps = 60,
    learning_rate = 2e-4,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    report_to = "none", # Use this for WandB etc
)

# Supervised fine-tuning trainer over the rendered "text" column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = huggingface_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = training_args,
)

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
from unsloth.chat_templates import train_on_responses_only

# Header strings that open the user turn and the assistant turn in the rendered text.
USER_TURN_HEADER = "<|start_header_id|>user<|end_header_id|>\n\n"
ASSISTANT_TURN_HEADER = "<|start_header_id|>assistant<|end_header_id|>\n\n"

# Rebind `trainer` to the one returned by train_on_responses_only.
trainer = train_on_responses_only(
    trainer,
    instruction_part = USER_TURN_HEADER,
    response_part = ASSISTANT_TURN_HEADER,
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Start training with the trainer configured above (its arguments set max_steps = 60).
# The value returned by trainer.train() is kept in `trainer_stats`.
trainer_stats = trainer.train()

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
SAVE_DIR = 'llama_fine_tuned_60_steps'
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)


In [None]:
from unsloth.chat_templates import get_chat_template
import torch

# Initialize tokenizer with chat template
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.1",
)
# Put the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Instruction text placed in front of every paper's sections.
prompt_template = """You are an AI trained to analyze scientific research and suggest future directions based on the content of a paper.
Below, you will find sections from a scientific article including the 'Abstract', 'Introduction', 'Conclusion' of a scientific paper.
Based on these details, please generate comprehensive and plausible future work suggestions that could extend the research findings,
address limitations, and propose new avenues for exploration.
Generate a future work based on these texts. Future work should be within 100 words.\n"""

# NOTE(review): the prompt is tokenized as plain text (tokenizer.apply_chat_template
# is not used here) and carries instructions that the training conversations did not
# contain — confirm that this difference between training and inference is intended.

# One generated string per row of df_test, in iteration order.
future_work = []

# NOTE(review): `df_test` is not defined in any cell shown in this notebook —
# confirm where the test split is created.
for _, row in df_test.iterrows():
    # Construct the user prompt using the specific row's content
    prompt = (
        prompt_template +
        f"Abstract: {row['Abstract']}\nIntroduction: {row['Introduction']}\nConclusion: {row['Conclusion']}"
    )

    # Tokenize the prompt and move the tensors to the GPU
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to("cuda")

    # Extract input IDs and attention mask
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Generate the continuation for this prompt
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=128,            # Upper bound on the number of newly generated tokens
        use_cache=True,
        temperature=1.0,               # Sampling temperature (1.0 leaves the distribution unscaled)
        top_p=0.9,                     # Nucleus (top-p) sampling threshold
        eos_token_id=tokenizer.eos_token_id,  # Set an end-of-sequence token
        repetition_penalty=1.2         # Discourage repetitive phrases
    )

    # Decode the first (only) returned sequence in full.
    # NOTE(review): the whole sequence is decoded, so the text presumably still starts
    # with the prompt; the following cell tries to cut it off after the conclusion.
    generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0].strip()
    future_work.append(generated_text)

# Store the generations on df_test — the frame that was iterated and that the
# refinement cell reads 'generated_future_work' from. Writing them to `df`
# fails whenever df and df_test differ in length, and leaves df_test without
# the column.
df_test['generated_future_work'] = future_work



In [None]:
# Function to refine the 'generated_future_work' column
def refine_future_work(row):
    """Cut a row's generated text down to what follows the conclusion's last sentence.

    The anchor is the second-to-last '.'-separated fragment of row['Conclusion']
    (for a conclusion ending in '.', that is its final sentence). The generated
    text is split on '. ' and searched from the end for a piece equal to that
    anchor; when found, only the pieces after it are returned, re-joined with '. '.

    Args:
        row: mapping-like object (e.g. a DataFrame row) providing 'Conclusion'
            and 'generated_future_work'.

    Returns:
        The text after the matched sentence (possibly an empty string when the
        match is the final piece), or row['generated_future_work'] unchanged
        when no anchor can be derived or no match is found.
    """
    conclusion = row['Conclusion']
    generated = row['generated_future_work']

    # Missing or non-text cells (e.g. NaN) cannot be split; leave the value untouched.
    if not isinstance(conclusion, str) or not isinstance(generated, str):
        return generated

    conclusion_parts = conclusion.split('.')
    # Without any '.' there is no second-to-last fragment to anchor on
    # (indexing [-2] would raise IndexError).
    if len(conclusion_parts) < 2:
        return generated
    last_sentence = conclusion_parts[-2].strip() + '.'

    # Split generated_future_work into sentences
    future_work_sentences = generated.split('. ')

    # Walk backwards so the last occurrence of the anchor wins.
    for i in range(len(future_work_sentences) - 1, -1, -1):
        if future_work_sentences[i].strip() + '.' == last_sentence:
            # Keep only the sentences after the match
            return '. '.join(future_work_sentences[i + 1:])

    # No match: keep the original text
    return generated

# Apply the function row by row (axis=1) to df_test and store the results in a new column.
# The function reads each row's 'Conclusion' and 'generated_future_work' values,
# so both columns must already exist on df_test.
df_test['refine_generated_future_work'] = df_test.apply(refine_future_work, axis=1)




In [None]:
# Write df_test to CSV.
# NOTE(review): the file name says "llama2" while the checkpoint loaded earlier is a
# llama-3 model — confirm the intended name.
df_test.to_csv('df_llama2_ft.csv', index=False)  # index=False leaves the row index out of the file