In [1]:
!pip install -q -U \
    pandas \
    datasets \
    triton==2.3.0 \
    transformers==4.41.2 \
    peft==0.11.1 \
    accelerate==0.30.1 \
    bitsandbytes==0.43.1 \
    trl==0.8.6 \
    numpy==1.26.4 \
    requests \
    Pillow \
    tqdm \
    tokenizers>=0.19.1 \
    opencv-python \
    matplotlib

print("\n✅ All libraries installed with the correct, stable versions.")
print("‼️ IMPORTANT: Please restart the session/kernel NOW before running any other code.")


zsh:1: 0.19.1 not found

✅ All libraries installed with the correct, stable versions.
‼️ IMPORTANT: Please restart the session/kernel NOW before running any other code.


In [2]:
pip install -q -U pandas datasets transformers==4.41.2 tokenizers==0.19.1 peft==0.11.1 accelerate==0.30.1 bitsandbytes trl==0.8.6 numpy==1.26.4 requests Pillow tqdm


Note: you may need to restart the kernel to use updated packages.


In [2]:
!pip install tensorboard
# Then run your code as-is


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting tensorboard
  Downloading tensorboard-2.20.0-py3-none-any.whl.metadata (1.8 kB)
Collecting absl-py>=0.4 (from tensorboard)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting grpcio>=1.48.2 (from tensorboard)
  Downloading grpcio-1.76.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.7 kB)
Collecting markdown>=2.6.8 (from tensorboard)
  Downloading markdown-3.9-py3-none-any.whl.metadata (5.1 kB)
Collecting protobuf!=4.24.0,>=3.19.6 (from tensorboard)
  Downloading protobuf-6.33.0-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard)
  Downloading tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl.metadata (1.1 kB)
Collecting werkzeug>=1.0.1 (from tensorboard)
  Downloading werkzeug-3.1.3-py3-none-any.whl.metadata (3.7 kB)
Downloading tensorboard-2.20.0-py3-none-any.whl (5.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 M

In [1]:
import os
import torch
import pandas as pd
import re
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig
from trl import SFTTrainer
import gc

# --- 1. Configuration ---
CSV_PATH = "train_enriched.csv"  # Output from Step 1
MODEL_NAME = "mistralai/Mistral-7B-v0.1"

# --- 2. Load the Enriched Dataset ---
print(f"Loading enriched dataset from: {CSV_PATH}")
try:
    df = pd.read_csv(CSV_PATH)
except FileNotFoundError:
    # Tell the user which step produces the file, then let the error propagate.
    print(f"❌ ERROR: Could not find {CSV_PATH}. Please run Step 1 (VLM processing) first!")
    raise
else:
    print(f"✅ Successfully loaded {len(df)} rows with VLM descriptions.")

    # The rest of the cell depends on the VLM output column being present.
    if 'vlm_description' not in df.columns:
        raise ValueError("❌ ERROR: 'vlm_description' column not found! Please run Step 1 first.")

# --- 3. Preprocess Data ---
# Unparseable timestamps become NaT instead of raising.
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

# Replacement value for missing entries, per column.
missing_value_defaults = {
    'content': '',
    'company': 'unknown',
    'username': 'unknown',
    'vlm_description': 'no media',
}
for column_name, default_value in missing_value_defaults.items():
    df[column_name] = df[column_name].fillna(default_value)

# Rows whose description is one of the "nothing usable" placeholders.
has_placeholder = df['vlm_description'].isin(
    ['no media', 'media could not be processed', 'media could not be downloaded']
)
visual_tweet_count = (~has_placeholder).sum()

print("\n📊 Data Statistics:")
print(f"   • Total tweets: {len(df)}")
print(f"   • Tweets with visual content: {visual_tweet_count}")
print(f"   • Unique companies: {df['company'].nunique()}")
# --- 4. Create Training Format ---
# Description values that mean "there is no usable visual content for this tweet".
# 'nan' covers a missing value that was stringified by str().
_NO_VISUAL_PLACEHOLDERS = frozenset({
    'no media',
    'media could not be processed',
    'media could not be downloaded',
    'nan',
})


def create_mistral_format(row):
    """
    Build one Mistral-style instruction/response training string from a tweet row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            'vlm_description', 'timestamp', 'company', 'username' and 'content'.

    Returns:
        A string of the form "<s>[INST] {instruction} [/INST] {tweet} </s>".
        The instruction names the company and username, gives the posting day and
        hour, and appends the visual description when one is available. The tweet
        text has all whitespace runs collapsed to single spaces.

    Notes:
        A missing or unusable timestamp falls back to 'a weekday' at hour 12.
        Only data-related errors trigger that fallback; KeyboardInterrupt and
        SystemExit propagate so a long `df.apply` can still be interrupted.
    """
    vlm_desc = str(row['vlm_description'])

    # Only mention the image when the description carries real content.
    if vlm_desc in _NO_VISUAL_PLACEHOLDERS:
        visual_context = ""
    else:
        visual_context = f" The image shows: {vlm_desc}"

    # Derive day/hour from the timestamp, keeping the defaults when it is
    # missing (NaT/NaN) or is not a timestamp-like object.
    day_name, hour = 'a weekday', 12
    try:
        timestamp = row['timestamp']
        if pd.notna(timestamp):
            day_name, hour = timestamp.day_name(), timestamp.hour
    except (AttributeError, KeyError, TypeError, ValueError):
        day_name, hour = 'a weekday', 12

    instruction = (
        f"Generate an engaging marketing tweet for {row['company']} "
        f"(username: @{row['username']}). "
        f"Context: It's {day_name} at {hour}:00."
        f"{visual_context}"
    )

    # Collapse newlines/tabs/repeated spaces so each example is a single line.
    output = re.sub(r'\s+', ' ', str(row['content'])).strip()

    # Format in Mistral instruction style
    return f"<s>[INST] {instruction} [/INST] {output} </s>"

# Render every row into its training string.
print("\n🔄 Formatting dataset...")
df['text'] = df.apply(create_mistral_format, axis=1)

# Show the first rendered example as a sanity check.
print("\n--- Example Training Instance ---")
print(df['text'].iat[0])
print(f"\n{'=' * 70}")

# --- 5. Convert to HuggingFace Dataset ---
# Only the rendered text is kept; it is handed over as a plain dict of lists.
print("\n🔄 Converting to HuggingFace Dataset...")
df_subset = df[['text']].reset_index(drop=True)

dataset_dict = dict(text=list(df_subset['text']))
dataset = Dataset.from_dict(dataset_dict)

print(f"✅ Dataset created with {len(dataset)} examples")

# Filter out very long sequences to prevent OOM
def get_length(example):
    """Return {"length": n}, where n is the character count of example["text"]."""
    text = example["text"]
    return dict(length=len(text))

print("\n🔄 Filtering long sequences...")
# Attach a character-length column, then keep only examples under 500 characters.
dataset = dataset.map(get_length)
original_size = len(dataset)
dataset = dataset.filter(lambda example: example["length"] < 500)
removed_count = original_size - len(dataset)
print(f"✅ Dataset prepared: {len(dataset)} examples (filtered {removed_count} long tweets)")

# --- 6. Initialize Tokenizer ---
print(f"\n🔄 Loading tokenizer...")
# Mistral is natively supported by transformers, so there is no need to allow
# execution of repository-supplied code via trust_remote_code.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Pad with <unk> rather than </s>. The default language-modeling collator masks
# every label equal to pad_token_id, so if pad == eos the closing </s> of each
# example is excluded from the loss and the model is never trained to stop.
# Fall back to eos only if the tokenizer defines no unk token.
tokenizer.pad_token = tokenizer.unk_token or tokenizer.eos_token
tokenizer.padding_side = "right"

# --- 7. Quantization Config ---
# 4-bit NF4 weights, float16 compute, double quantization switched off.
quant_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    load_in_4bit=True,
)

# --- 8. Load Base Model ---
print(f"\n🔄 Loading {MODEL_NAME}...")
model_load_options = dict(
    quantization_config=quant_config,
    device_map="auto",
    torch_dtype=torch.float16,
)
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_load_options)

# Config overrides applied before training: cache off, pretraining_tp forced to 1.
for config_field, config_value in (("use_cache", False), ("pretraining_tp", 1)):
    setattr(base_model.config, config_field, config_value)
print("✅ Model loaded successfully!")

# --- 9. LoRA Configuration ---
# Rank-8 adapters on the attention query and value projections only.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
)

# --- 10. Training Arguments ---
# Effective batch size is 1 x 16 = 16 examples per optimizer step. On a single
# device, one epoch over the ~13.9k filtered examples is therefore only ~870
# steps, so the checkpoint interval must be well below that or no checkpoint is
# ever written and an interrupted run loses all progress.
training_args = TrainingArguments(
    output_dir="./mistral-vlm-tweet-generator",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    optim="paged_adamw_8bit",
    save_steps=200,       # several checkpoints per epoch (was 1000 > total steps)
    save_total_limit=2,   # keep only the most recent checkpoints on disk
    logging_steps=50,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    gradient_checkpointing=True,
)

# --- 11. Initialize Trainer ---
print("\n🔄 Initializing trainer...")
trainer = SFTTrainer(
    model=base_model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_args,
    packing=False,
    # Every training string already starts with "<s>" and ends with "</s>".
    # Without this, the tokenizer prepends its own BOS as well and each example
    # begins with a duplicated BOS token.
    dataset_kwargs={"add_special_tokens": False},
)
print("✅ Trainer initialized!")

# --- 12. Clean Memory and Train ---
# Release unreferenced Python objects and cached CUDA blocks before training starts.
gc.collect()
torch.cuda.empty_cache()

banner_rule = "=" * 70
print("\n" + banner_rule)
print("🚀 Starting fine-tuning with VLM-enhanced data...")
print(banner_rule)

trainer.train()

print("\n✅ Fine-tuning complete!")

# --- 13. Save Model ---
# The trained model and the tokenizer are written to the same directory.
output_dir = "./mistral_vlm_final"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"\n💾 Model saved to: {output_dir}")
print("\n🎉 Training complete! Your model is ready to generate tweets.")

Loading enriched dataset from: train_enriched.csv
✅ Successfully loaded 14956 rows with VLM descriptions.

📊 Data Statistics:
   • Total tweets: 14956
   • Tweets with visual content: 10178
   • Unique companies: 194

🔄 Formatting dataset...

--- Example Training Instance ---
<s>[INST] Generate an engaging marketing tweet for tim hortons (username: @TimHortonsPH). Context: It's Saturday at 0:00. The image shows: A marketing tweet for Tim Hortons showing a sandwich and a cup of coffee on a wooden table. [/INST] Spend your weekend morning with a Ham, Egg, and Cheese Wrap paired with a sweet Iced French Vanilla! ☀️ Order yours now via dine-in, takeout, and delivery. #TimHortonsPH <hyperlink> </s>


🔄 Converting to HuggingFace Dataset...
✅ Dataset created with 14956 examples

🔄 Filtering long sequences...


Map:   0%|          | 0/14956 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14956 [00:00<?, ? examples/s]

✅ Dataset prepared: 13945 examples (filtered 1011 long tweets)

🔄 Loading tokenizer...

🔄 Loading mistralai/Mistral-7B-v0.1...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Model loaded successfully!

🔄 Initializing trainer...


Map:   0%|          | 0/13945 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


✅ Trainer initialized!

🚀 Starting fine-tuning with VLM-enhanced data...


  return fn(*args, **kwargs)


Step,Training Loss


KeyboardInterrupt: 

In [None]:
# --- Save Final Model (After Training Completes) ---
# Requires `trainer`, `tokenizer`, `dataset`, `peft_config`, `training_args` and
# `MODEL_NAME` from the training cell to still be alive in the kernel.
import json

print(f"\n💾 Saving final model...")

OUTPUT_DIR = "./mistral_vlm_final"

# Save LoRA adapters and tokenizer
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"✅ Model saved to: {OUTPUT_DIR}")
print("   Files saved:")
# Adapters are written in safetensors format by default, not as a .bin file.
print("   • adapter_model.safetensors (LoRA weights)")
print("   • adapter_config.json")
print("   • tokenizer files")

# Record the run's hyperparameters next to the weights. Values are read from the
# live config objects so the file cannot drift from what was actually trained.
training_info = {
    "base_model": MODEL_NAME,
    "lora_r": peft_config.r,
    "lora_alpha": peft_config.lora_alpha,
    "training_samples": len(dataset),
    "epochs": training_args.num_train_epochs,
    "learning_rate": training_args.learning_rate,
}
with open(f"{OUTPUT_DIR}/training_info.json", "w", encoding="utf-8") as f:
    json.dump(training_info, f, indent=2)

print("\n🎉 Model saving complete!")