In [1]:
!pip install bitsandbytes accelerate peft trl huggingface_hub


Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting trl
  Downloading trl-0.22.2-py3-none-any.whl.metadata (11 kB)
Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.22.2-py3-none-any.whl (544 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m544.8/544.8 kB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes, trl
Successfully installed bitsandbytes-0.47.0 trl-0.22.2


In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, set_seed
from peft import get_peft_model, LoraConfig, TaskType
import json
import numpy as np
import transformers
import torch

#Set Seed

In [3]:
set_seed(42)

#Model and Tokenizer

In [5]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model_dir = "marketeam/Qwen-Marketing"
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)

model.config.use_cache = False
model.config.pretraining_tp = 1


tokenizer_config.json:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/703 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/32.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

#Format prompt

In [6]:
train_prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a campaign strategy expert with advanced knowledge in marketing, branding, and campaign planning.
Please answer the following campaign-related question, providing structured insights, creative ideas, and strategic recommendations.

### Question:
{}

### Response:
<think>
{}
</think>
{}"""


In [7]:
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    responses = examples["response"]
    texts = []

    for instruction, inp, response in zip(instructions, inputs, responses):
        # Append the EOS token to the response if it's not already there
        if not response.endswith(EOS_TOKEN):
            response += EOS_TOKEN

        # Format prompt using your campaign expert train_prompt_style
        text = train_prompt_style.format(
            instruction,
            inputs,
            response
        )
        texts.append(text)

    return {"text": texts}


#Processing the Dataset

In [8]:
from datasets import load_dataset

dataset = load_dataset(
    "RafaM97/marketing_social_media",
    split="train[0:600]",
    trust_remote_code=True,
)
dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
)
dataset["instruction"][10]


`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'RafaM97/marketing_social_media' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
ERROR:datasets.load:`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'RafaM97/marketing_social_media' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.


README.md: 0.00B [00:00, ?B/s]

marketing_social_media_dataset_v2.json: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/689 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

'Create a social media campaign to increase brand awareness among younger audiences for a sustainable fashion brand.'

In [9]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

#Model Inference

In [10]:
inference_prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a campaign strategy expert with advanced knowledge in marketing, branding, and campaign planning.
Please answer the following campaign-related question, providing structured insights, creative ideas, and strategic recommendations.

### Question:
{}

### Response:
"""


In [None]:
question = dataset[10]['instruction']

inputs = tokenizer(
    [inference_prompt_style.format(question) + tokenizer.eos_token],
    return_tensors="pt"
).to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    eos_token_id=tokenizer.eos_token_id,
    use_cache=True,
)
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(response[0].split("### Response:")[1])





**Step 1: Understanding the Target Audience**  
Younger audiences (Gen Z and Millennials) prioritize sustainability, self-expression, and authenticity. They are active on platforms like Instagram, TikTok, and YouTube, and value transparency, community, and social impact.

**Step 2: Define Campaign Objectives**  
- **Raise awareness** of the brand’s sustainable mission.  
- **Drive engagement** through relatable content.  
- **Foster a community** around sustainability values.  

**Step 3: Leverage Trending Formats**  
- **TikTok Challenges**: Launch a trend like #SustainableStyleChallenge, encouraging users to share their eco-friendly outfits or upcycled looks.  
- **Instagram Reels/Stories**: Use quick, visually appealing content highlighting behind-the-scenes sustainability efforts (e.g., material sourcing, recycling processes).  
- **YouTube Shorts**: Partner with micro-influencers to create tutorials on sustainable fashion hacks (e.g., DIY upcycling, capsule wardrobes).  

**Ste

#Setting up

In [11]:
from peft import LoraConfig, get_peft_model

# LoRA config
peft_config = LoraConfig(
    lora_alpha=16,                           # Scaling factor for LoRA
    lora_dropout=0.05,                       # Add slight dropout for regularization
    r=64,                                    # Rank of the LoRA update matrices
    bias="none",                             # No bias reparameterization
    task_type="CAUSAL_LM",                   # Task type: Causal Language Modeling
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],  # Target modules for LoRA
)

model = get_peft_model(model, peft_config)

In [12]:
from trl import SFTTrainer
from transformers import TrainingArguments
from peft import LoraConfig, get_peft_model


# Training Arguments
training_arguments = TrainingArguments(
    output_dir="output",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    logging_steps=0.2,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=True,
    bf16=False,
    group_by_length=True,
    report_to="none"
)

# Initialize the Trainer
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset,
    peft_config=peft_config,
    data_collator=data_collator,
)



Adding EOS to train dataset:   0%|          | 0/600 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/600 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/600 [00:00<?, ? examples/s]

In [13]:
import torch, gc

gc.collect()                # เก็บ garbage
torch.cuda.empty_cache()    # ล้าง VRAM cache


#TRAIN

In [14]:
import gc, torch
gc.collect()
torch.cuda.empty_cache()
model.config.use_cache = False
trainer.train()


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss
15,1.0441
30,0.1227
45,0.0179
60,0.0148
75,0.0137


TrainOutput(global_step=75, training_loss=0.2426283754905065, metrics={'train_runtime': 328.1065, 'train_samples_per_second': 1.829, 'train_steps_per_second': 0.229, 'total_flos': 2.85437708992512e+16, 'train_loss': 0.2426283754905065, 'epoch': 1.0})

#TEST MODEL AFTER TRAIN


In [16]:
question = dataset[10]['instruction']
inputs = tokenizer(
    [inference_prompt_style.format(question) + tokenizer.eos_token],
    return_tensors="pt"
).to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    eos_token_id=tokenizer.eos_token_id,
    use_cache=True,
)
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(response[0].split("### Response:")[1])




['Company: GreenCycle Clothing\nTarget Audience: Environmentally conscious millennials\nConstraints: Limited budget ($5,000)\nGoals: 20% increase in brand awareness, 15% increase in sales\nWorkflow Stage: Strategy', 'Company: NovaTech Solutions\nTarget Audience: Business owners and IT professionals\nConstraints: Publish 2 articles per week\nGoals: 50% increase in website traffic, 20% increase in lead generation\nWorkflow Stage: Content Creation', 'Company: FitZone\nTarget Audience: Inactive subscribers (6+ months)\nConstraints: Limited email list (5,000 subscribers)\nGoals: 20% re-engagement rate, 10% conversion rate\nWorkflow Stage: Email Campaign', 'Company: GreenThreads, a fashion brand using eco-friendly materials and production methods.\nTarget Audience: Fashion-conscious millennials and Gen Z.\nConstraints: Limited budget, high competition in the sustainable fashion market.\nGoals: Increase brand awareness by 20%, drive sales by 15% within 6 months.\nWorkflow Stage: Strategy',

#LoRa

In [17]:
from peft import PeftModel

merged_model = model.merge_and_unload()   # merg LoRA with base
merged_model.save_pretrained("output/merged_model")
tokenizer.save_pretrained("output/merged_model")




('output/merged_model/tokenizer_config.json',
 'output/merged_model/special_tokens_map.json',
 'output/merged_model/chat_template.jinja',
 'output/merged_model/vocab.json',
 'output/merged_model/merges.txt',
 'output/merged_model/added_tokens.json',
 'output/merged_model/tokenizer.json')

#HG REPO

In [24]:
from transformers import AutoConfig

config = AutoConfig.from_pretrained("marketeam/Qwen-Marketing")
config.save_pretrained("output/merged_model")  # save config ลง folder เดิม


In [26]:
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "komsan/Qwen-Campaign-Concept"



# push model + tokenizer from memory
merged_model.push_to_hub(repo_id, use_auth_token=True)
tokenizer.push_to_hub(repo_id, use_auth_token=True)






Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...vq/model-00001-of-00002.safetensors:   1%|          | 33.5MB / 5.00GB            

  ...vq/model-00002-of-00002.safetensors:   2%|1         | 58.7MB / 3.89GB            

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmppcx3tq6c/tokenizer.json       : 100%|##########| 11.4MB / 11.4MB            

CommitInfo(commit_url='https://huggingface.co/komsan/Qwen-Campaign-Concept/commit/60320369c13fbb7e52f222c10c3d4efb2b38c958', commit_message='Upload tokenizer', commit_description='', oid='60320369c13fbb7e52f222c10c3d4efb2b38c958', pr_url=None, repo_url=RepoUrl('https://huggingface.co/komsan/Qwen-Campaign-Concept', endpoint='https://huggingface.co', repo_type='model', repo_id='komsan/Qwen-Campaign-Concept'), pr_revision=None, pr_num=None)

In [29]:
from google.colab import drive
drive.mount('/content/gdrive')

# copy folder to Drive
import shutil
shutil.copytree("output", "/content/gdrive/MyDrive/output")



Mounted at /content/gdrive


'/content/gdrive/MyDrive/output'