In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory,
# one path per line, directory by directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    if full_paths:  # directories without files print nothing
        print("\n".join(full_paths))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/qwen-lola-fine-tune/train.csv
/kaggle/input/qwen-lola-fine-tune/test.csv
/kaggle/input/qwen-lola-fine-tune/calibrate.csv
/kaggle/input/qwen-lola-fine-tune/filtered-ctr-all.csv


In [None]:
import pandas as pd
import json
import os

# --- Configuration for the train split ---
# CSV that prepare_data() reads.
INPUT_FILE = "/kaggle/input/qwen-lola-fine-tune/train.csv"
# Chat-format records written by prepare_data().
OUTPUT_JSON = "finetune_train.json"
# Copy of the input rows with the tracking columns added.
OUTPUT_CSV = "final_train.csv"

# System message placed at the start of every record.
SYSTEM_PROMPT = (
    "You are an editor tasked with choosing the catchier one from several "
    "drafted headlines for the same article. "
    "Catchier means the one that is likely to generate more clicks."
)
# Opening of every user message; the numbered headlines are appended after it.
USER_INSTRUCTION = (
    "You are presented with several headlines. Which one is catchier? "
    "**Return only the number before the headline. "
    "**No explanation is needed. "
    "No need to return the headline, only the number.****\n"
)

def prepare_data(input_file=None, output_json=None, output_csv=None,
                 system_prompt=None, user_instruction=None):
    """Turn a headline A/B-test CSV into chat-style fine-tuning records.

    The CSV is read with the columns ``test_id``, ``headline`` and ``CTR``.
    Rows sharing a ``test_id`` become one record: the user message lists the
    headlines as numbered options (1-based, in file order) and the assistant
    message is the number of the option with the highest CTR (the first one
    on ties).

    Two files are written: ``output_json`` (a list of ``{"messages": [...]}``
    records) and ``output_csv`` (the input rows plus the tracking columns
    ``option_number`` and ``is_best``).

    Rows with a missing headline or a missing CTR are never offered as
    options; they stay in the tracking CSV with ``option_number`` and
    ``is_best`` left at 0. A group without any usable row yields no record.

    If the input file does not exist, a small built-in demonstration dataset
    is used instead (and a message saying so is printed).

    Args:
        input_file: Path of the CSV to read. Defaults to ``INPUT_FILE``.
        output_json: Path of the JSON file to write. Defaults to ``OUTPUT_JSON``.
        output_csv: Path of the tracking CSV to write. Defaults to ``OUTPUT_CSV``.
        system_prompt: System message of every record. Defaults to ``SYSTEM_PROMPT``.
        user_instruction: Text that opens every user message. Defaults to
            ``USER_INSTRUCTION``.

    Returns:
        None. Results are written to ``output_json`` and ``output_csv``.
    """
    # Fall back to the notebook-level configuration so a bare prepare_data()
    # call behaves as before; the globals are only read when an argument is omitted.
    input_file = INPUT_FILE if input_file is None else input_file
    output_json = OUTPUT_JSON if output_json is None else output_json
    output_csv = OUTPUT_CSV if output_csv is None else output_csv
    system_prompt = SYSTEM_PROMPT if system_prompt is None else system_prompt
    user_instruction = USER_INSTRUCTION if user_instruction is None else user_instruction

    # 1. Load the Data
    if not os.path.exists(input_file):
        print(f"Error: {input_file} not found. Please place your CSV file in the same folder.")
        # Creating a dummy file for demonstration purposes if you run this without your file
        print("Creating a dummy CSV for demonstration...")
        data = {
            'test_id': [14366, 14366, 14366, 14332, 14332, 14332],
            'headline': [
                'What They Found', 'A Science Mystery', 'He Sat There',
                'The Selfies', 'Warning: Graphic', 'Sigh... I Want One'
            ],
            'CTR': [0.011, 0.012, 0.005, 0.041, 0.033, 0.035]
        }
        df = pd.DataFrame(data)
    else:
        df = pd.read_csv(input_file)

    # Remember which rows are complete BEFORE the string conversion below:
    # astype(str) turns a missing headline into the literal text "nan", and a
    # NaN CTR makes max() depend on row order, so neither may become an option.
    usable = df['headline'].notna() & df['CTR'].notna()
    df['headline'] = df['headline'].astype(str)

    # 2. Prepare storage for results
    json_dataset = []

    # Tracking columns; 0 means "row was not offered as an option" / "not the winner".
    df['option_number'] = 0
    df['is_best'] = 0

    # 3. Group by test_id and process
    grouped = df.groupby('test_id')

    print(f"Found {len(grouped)} unique test groups. Processing...")

    skipped_groups = 0
    for _test_id, group in grouped:
        candidates = group[usable.loc[group.index]]
        if candidates.empty:
            # No row has both a headline and a CTR, so no winner can be named.
            skipped_groups += 1
            continue

        headlines = candidates['headline'].tolist()
        ctrs = candidates['CTR'].tolist()
        indices = candidates.index.tolist()  # labels of these rows in df

        # Local (0-based) position of the first row holding the highest CTR.
        best_local_index = ctrs.index(max(ctrs))
        # The prompt numbers its options from 1.
        best_option_number = best_local_index + 1

        user_message = user_instruction
        for i, headline in enumerate(headlines):
            option_num = i + 1
            user_message += f"{option_num}. {headline}\n"

            # Mirror the assignment into the tracking columns.
            original_idx = indices[i]
            df.at[original_idx, 'option_number'] = option_num
            if i == best_local_index:
                df.at[original_idx, 'is_best'] = 1

        json_dataset.append({
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": str(best_option_number)}
            ]
        })

    if skipped_groups:
        print(f"Skipped {skipped_groups} test groups with no usable headline/CTR rows.")

    # 4. Save Outputs

    # Save JSON for Unsloth/Fine-tuning
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(json_dataset, f, indent=4)

    # Save processed CSV for your reference
    df.to_csv(output_csv, index=False)

    print(f"Success! Processed {len(df)} rows.")
    print(f"1. Training data saved to: {output_json}")
    print(f"2. Tracking CSV saved to: {output_csv}")
    if json_dataset:  # nothing to sample when the input produced no records
        print("\nSample JSON Entry:")
        print(json.dumps(json_dataset[0], indent=2))

# Run the export with the configuration assigned earlier in this cell.
# Inside a notebook kernel __name__ is "__main__", so this executes whenever
# the cell runs; the guard only matters if the code is imported as a module.
if __name__ == "__main__":
    prepare_data()

In [None]:
# --- Configuration for the calibrate split ---
# CSV that prepare_data() reads.
INPUT_FILE = "/kaggle/input/qwen-lola-fine-tune/calibrate.csv"
# Chat-format records written by prepare_data().
OUTPUT_JSON = "finetune_calibrate.json"
# Copy of the input rows with the tracking columns added.
OUTPUT_CSV = "final_calibrate.csv"

# System message placed at the start of every record.
SYSTEM_PROMPT = (
    "You are an editor tasked with choosing the catchier one from several "
    "drafted headlines for the same article. "
    "Catchier means the one that is likely to generate more clicks."
)
# Opening of every user message; the numbered headlines are appended after it.
USER_INSTRUCTION = (
    "You are presented with several headlines. Which one is catchier? "
    "**Return only the number before the headline. "
    "**No explanation is needed. "
    "No need to return the headline, only the number.****\n"
)

def prepare_data(input_file=None, output_json=None, output_csv=None,
                 system_prompt=None, user_instruction=None):
    """Turn a headline A/B-test CSV into chat-style fine-tuning records.

    The CSV is read with the columns ``test_id``, ``headline`` and ``CTR``.
    Rows sharing a ``test_id`` become one record: the user message lists the
    headlines as numbered options (1-based, in file order) and the assistant
    message is the number of the option with the highest CTR (the first one
    on ties).

    Two files are written: ``output_json`` (a list of ``{"messages": [...]}``
    records) and ``output_csv`` (the input rows plus the tracking columns
    ``option_number`` and ``is_best``).

    Rows with a missing headline or a missing CTR are never offered as
    options; they stay in the tracking CSV with ``option_number`` and
    ``is_best`` left at 0. A group without any usable row yields no record.

    If the input file does not exist, a small built-in demonstration dataset
    is used instead (and a message saying so is printed).

    Args:
        input_file: Path of the CSV to read. Defaults to ``INPUT_FILE``.
        output_json: Path of the JSON file to write. Defaults to ``OUTPUT_JSON``.
        output_csv: Path of the tracking CSV to write. Defaults to ``OUTPUT_CSV``.
        system_prompt: System message of every record. Defaults to ``SYSTEM_PROMPT``.
        user_instruction: Text that opens every user message. Defaults to
            ``USER_INSTRUCTION``.

    Returns:
        None. Results are written to ``output_json`` and ``output_csv``.
    """
    # Fall back to the notebook-level configuration so a bare prepare_data()
    # call behaves as before; the globals are only read when an argument is omitted.
    input_file = INPUT_FILE if input_file is None else input_file
    output_json = OUTPUT_JSON if output_json is None else output_json
    output_csv = OUTPUT_CSV if output_csv is None else output_csv
    system_prompt = SYSTEM_PROMPT if system_prompt is None else system_prompt
    user_instruction = USER_INSTRUCTION if user_instruction is None else user_instruction

    # 1. Load the Data
    if not os.path.exists(input_file):
        print(f"Error: {input_file} not found. Please place your CSV file in the same folder.")
        # Creating a dummy file for demonstration purposes if you run this without your file
        print("Creating a dummy CSV for demonstration...")
        data = {
            'test_id': [14366, 14366, 14366, 14332, 14332, 14332],
            'headline': [
                'What They Found', 'A Science Mystery', 'He Sat There',
                'The Selfies', 'Warning: Graphic', 'Sigh... I Want One'
            ],
            'CTR': [0.011, 0.012, 0.005, 0.041, 0.033, 0.035]
        }
        df = pd.DataFrame(data)
    else:
        df = pd.read_csv(input_file)

    # Remember which rows are complete BEFORE the string conversion below:
    # astype(str) turns a missing headline into the literal text "nan", and a
    # NaN CTR makes max() depend on row order, so neither may become an option.
    usable = df['headline'].notna() & df['CTR'].notna()
    df['headline'] = df['headline'].astype(str)

    # 2. Prepare storage for results
    json_dataset = []

    # Tracking columns; 0 means "row was not offered as an option" / "not the winner".
    df['option_number'] = 0
    df['is_best'] = 0

    # 3. Group by test_id and process
    grouped = df.groupby('test_id')

    print(f"Found {len(grouped)} unique test groups. Processing...")

    skipped_groups = 0
    for _test_id, group in grouped:
        candidates = group[usable.loc[group.index]]
        if candidates.empty:
            # No row has both a headline and a CTR, so no winner can be named.
            skipped_groups += 1
            continue

        headlines = candidates['headline'].tolist()
        ctrs = candidates['CTR'].tolist()
        indices = candidates.index.tolist()  # labels of these rows in df

        # Local (0-based) position of the first row holding the highest CTR.
        best_local_index = ctrs.index(max(ctrs))
        # The prompt numbers its options from 1.
        best_option_number = best_local_index + 1

        user_message = user_instruction
        for i, headline in enumerate(headlines):
            option_num = i + 1
            user_message += f"{option_num}. {headline}\n"

            # Mirror the assignment into the tracking columns.
            original_idx = indices[i]
            df.at[original_idx, 'option_number'] = option_num
            if i == best_local_index:
                df.at[original_idx, 'is_best'] = 1

        json_dataset.append({
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": str(best_option_number)}
            ]
        })

    if skipped_groups:
        print(f"Skipped {skipped_groups} test groups with no usable headline/CTR rows.")

    # 4. Save Outputs

    # Save JSON for Unsloth/Fine-tuning
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(json_dataset, f, indent=4)

    # Save processed CSV for your reference
    df.to_csv(output_csv, index=False)

    print(f"Success! Processed {len(df)} rows.")
    print(f"1. Training data saved to: {output_json}")
    print(f"2. Tracking CSV saved to: {output_csv}")
    if json_dataset:  # nothing to sample when the input produced no records
        print("\nSample JSON Entry:")
        print(json.dumps(json_dataset[0], indent=2))

# Run the export with the calibrate configuration assigned earlier in this cell.
# Inside a notebook kernel __name__ is "__main__", so this executes whenever
# the cell runs; the guard only matters if the code is imported as a module.
if __name__ == "__main__":
    prepare_data()

In [None]:
# --- Configuration for the test split ---
# CSV that prepare_data() reads.
INPUT_FILE = "/kaggle/input/qwen-lola-fine-tune/test.csv"
# Chat-format records written by prepare_data().
OUTPUT_JSON = "finetune_test.json"
# Copy of the input rows with the tracking columns added.
OUTPUT_CSV = "final_test.csv"

# System message placed at the start of every record.
SYSTEM_PROMPT = (
    "You are an editor tasked with choosing the catchier one from several "
    "drafted headlines for the same article. "
    "Catchier means the one that is likely to generate more clicks."
)
# Opening of every user message; the numbered headlines are appended after it.
USER_INSTRUCTION = (
    "You are presented with several headlines. Which one is catchier? "
    "**Return only the number before the headline. "
    "**No explanation is needed. "
    "No need to return the headline, only the number.****\n"
)

def prepare_data(input_file=None, output_json=None, output_csv=None,
                 system_prompt=None, user_instruction=None):
    """Turn a headline A/B-test CSV into chat-style fine-tuning records.

    The CSV is read with the columns ``test_id``, ``headline`` and ``CTR``.
    Rows sharing a ``test_id`` become one record: the user message lists the
    headlines as numbered options (1-based, in file order) and the assistant
    message is the number of the option with the highest CTR (the first one
    on ties).

    Two files are written: ``output_json`` (a list of ``{"messages": [...]}``
    records) and ``output_csv`` (the input rows plus the tracking columns
    ``option_number`` and ``is_best``).

    Rows with a missing headline or a missing CTR are never offered as
    options; they stay in the tracking CSV with ``option_number`` and
    ``is_best`` left at 0. A group without any usable row yields no record.

    If the input file does not exist, a small built-in demonstration dataset
    is used instead (and a message saying so is printed).

    Args:
        input_file: Path of the CSV to read. Defaults to ``INPUT_FILE``.
        output_json: Path of the JSON file to write. Defaults to ``OUTPUT_JSON``.
        output_csv: Path of the tracking CSV to write. Defaults to ``OUTPUT_CSV``.
        system_prompt: System message of every record. Defaults to ``SYSTEM_PROMPT``.
        user_instruction: Text that opens every user message. Defaults to
            ``USER_INSTRUCTION``.

    Returns:
        None. Results are written to ``output_json`` and ``output_csv``.
    """
    # Fall back to the notebook-level configuration so a bare prepare_data()
    # call behaves as before; the globals are only read when an argument is omitted.
    input_file = INPUT_FILE if input_file is None else input_file
    output_json = OUTPUT_JSON if output_json is None else output_json
    output_csv = OUTPUT_CSV if output_csv is None else output_csv
    system_prompt = SYSTEM_PROMPT if system_prompt is None else system_prompt
    user_instruction = USER_INSTRUCTION if user_instruction is None else user_instruction

    # 1. Load the Data
    if not os.path.exists(input_file):
        print(f"Error: {input_file} not found. Please place your CSV file in the same folder.")
        # Creating a dummy file for demonstration purposes if you run this without your file
        print("Creating a dummy CSV for demonstration...")
        data = {
            'test_id': [14366, 14366, 14366, 14332, 14332, 14332],
            'headline': [
                'What They Found', 'A Science Mystery', 'He Sat There',
                'The Selfies', 'Warning: Graphic', 'Sigh... I Want One'
            ],
            'CTR': [0.011, 0.012, 0.005, 0.041, 0.033, 0.035]
        }
        df = pd.DataFrame(data)
    else:
        df = pd.read_csv(input_file)

    # Remember which rows are complete BEFORE the string conversion below:
    # astype(str) turns a missing headline into the literal text "nan", and a
    # NaN CTR makes max() depend on row order, so neither may become an option.
    usable = df['headline'].notna() & df['CTR'].notna()
    df['headline'] = df['headline'].astype(str)

    # 2. Prepare storage for results
    json_dataset = []

    # Tracking columns; 0 means "row was not offered as an option" / "not the winner".
    df['option_number'] = 0
    df['is_best'] = 0

    # 3. Group by test_id and process
    grouped = df.groupby('test_id')

    print(f"Found {len(grouped)} unique test groups. Processing...")

    skipped_groups = 0
    for _test_id, group in grouped:
        candidates = group[usable.loc[group.index]]
        if candidates.empty:
            # No row has both a headline and a CTR, so no winner can be named.
            skipped_groups += 1
            continue

        headlines = candidates['headline'].tolist()
        ctrs = candidates['CTR'].tolist()
        indices = candidates.index.tolist()  # labels of these rows in df

        # Local (0-based) position of the first row holding the highest CTR.
        best_local_index = ctrs.index(max(ctrs))
        # The prompt numbers its options from 1.
        best_option_number = best_local_index + 1

        user_message = user_instruction
        for i, headline in enumerate(headlines):
            option_num = i + 1
            user_message += f"{option_num}. {headline}\n"

            # Mirror the assignment into the tracking columns.
            original_idx = indices[i]
            df.at[original_idx, 'option_number'] = option_num
            if i == best_local_index:
                df.at[original_idx, 'is_best'] = 1

        json_dataset.append({
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": str(best_option_number)}
            ]
        })

    if skipped_groups:
        print(f"Skipped {skipped_groups} test groups with no usable headline/CTR rows.")

    # 4. Save Outputs

    # Save JSON for Unsloth/Fine-tuning
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(json_dataset, f, indent=4)

    # Save processed CSV for your reference
    df.to_csv(output_csv, index=False)

    print(f"Success! Processed {len(df)} rows.")
    print(f"1. Training data saved to: {output_json}")
    print(f"2. Tracking CSV saved to: {output_csv}")
    if json_dataset:  # nothing to sample when the input produced no records
        print("\nSample JSON Entry:")
        print(json.dumps(json_dataset[0], indent=2))

# Run the export with the test configuration assigned earlier in this cell.
# Inside a notebook kernel __name__ is "__main__", so this executes whenever
# the cell runs; the guard only matters if the code is imported as a module.
if __name__ == "__main__":
    prepare_data()

In [None]:
!pip install torch datasets transformers peft bitsandbytes