<a href="https://colab.research.google.com/github/Hearlvein/colab/blob/main/guten_tag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🎯 Fine-Tuning GPT-2 for Formality Translation with Few-Shot Prompting

This notebook guides you through fine-tuning GPT-2 to translate informal text to formal text using few-shot prompting. The process includes:
- Dataset preparation from valentin_dataset.csv
- Few-shot prompt engineering for formality translation
- Model fine-tuning with LoRA
- Interactive formality translation testing

**Task:** Given an informal sentence, generate its formal equivalent using in-context learning.

**Note:** This notebook is designed for execution in Google Colab.

## 🔧 Setup and Installation

In [1]:
# Install necessary packages
!pip install -q transformers datasets peft trl bitsandbytes accelerate
!pip install -q pandas scikit-learn
!pip install -q tf-keras

## 📚 Dataset Preparation and Few-Shot Example Selection

In [2]:
import pandas as pd
import numpy as np
import json
import random
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Tuple
import re

# Load the valentin dataset
dataset_path = "valentin_dataset.csv"
df = pd.read_csv(dataset_path, sep=';')

print(f"Dataset loaded with {len(df)} pairs")
print("Sample data:")
print(df.head())

# Clean and validate the data
def clean_text(text):
    """Clean text by removing extra whitespace and normalizing"""
    if pd.isna(text):
        return ""
    return re.sub(r'\s+', ' ', str(text).strip())

df['formal'] = df['formal'].apply(clean_text)
df['informal'] = df['informal'].apply(clean_text)

# Remove empty or very short entries
df = df[(df['formal'].str.len() > 10) & (df['informal'].str.len() > 10)]
print(f"After cleaning: {len(df)} pairs")

def select_diverse_examples(df: pd.DataFrame, n_examples: int = 5) -> List[Tuple[str, str]]:
    """
    Select diverse examples for few-shot prompting using TF-IDF similarity
    to ensure variety in the selected examples.
    """
    # Use TF-IDF to find diverse examples
    vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
    informal_vectors = vectorizer.fit_transform(df['informal'])
    
    selected_indices = []
    remaining_indices = list(range(len(df)))
    
    # Select first example randomly
    first_idx = random.choice(remaining_indices)
    selected_indices.append(first_idx)
    remaining_indices.remove(first_idx)
    
    # Select remaining examples to maximize diversity
    for _ in range(n_examples - 1):
        if not remaining_indices:
            break
            
        max_min_similarity = -1
        best_idx = None
        
        for idx in remaining_indices:
            # Calculate minimum similarity to already selected examples
            similarities = []
            for selected_idx in selected_indices:
                sim = cosine_similarity(
                    informal_vectors[idx:idx+1], 
                    informal_vectors[selected_idx:selected_idx+1]
                )[0][0]
                similarities.append(sim)
            
            min_similarity = min(similarities)
            if min_similarity > max_min_similarity:
                max_min_similarity = min_similarity
                best_idx = idx
        
        if best_idx is not None:
            selected_indices.append(best_idx)
            remaining_indices.remove(best_idx)
    
    # Return selected examples
    examples = []
    for idx in selected_indices:
        examples.append((df.iloc[idx]['informal'], df.iloc[idx]['formal']))
    
    return examples

# Select diverse examples for few-shot prompting
few_shot_examples = select_diverse_examples(df, n_examples=5)

print("\nSelected few-shot examples:")
for i, (informal, formal) in enumerate(few_shot_examples, 1):
    print(f"\n{i}. Informal: {informal}")
    print(f"   Formal: {formal}")

Dataset loaded with 2000 pairs
Sample data:
                                              formal  \
0  We kindly ask that you the system update will ...   
1  Good morning, I regret the oversight and will ...   
2  We kindly ask that you we have identified a di...   
3  Esteemed colleagues, I regret the oversight an...   
4  I would appreciate it if you could we require ...   

                                            informal  
0  We'd like you to we'll update the system this ...  
1  Morning! My bad, I'll fix it ASAP. Mind sendin...  
2  We'd like you to we found a mistake in the dat...  
3  Hey folks, My bad, I'll fix it ASAP. Let me kn...  
4  I'd be grateful if you we need more info to mo...  
After cleaning: 2000 pairs

Selected few-shot examples:

1. Informal: Heads up that your doc needs more edits. All the best.
   Formal: Kindly note that the document you provided requires further revision. Please accept my best regards.

2. Informal: Heads up that your doc needs more edit

## 🎯 Few-Shot Prompt Engineering

In [6]:
def create_formality_prompt(examples: List[Tuple[str, str]], test_informal: str = None) -> str:
    """
    Create a few-shot prompt for formality translation.
    
    Args:
        examples: List of (informal, formal) pairs for few-shot learning
        test_informal: Optional informal sentence to translate
    
    Returns:
        Formatted prompt string
    """
    prompt = """Task: Translate informal text to formal text while preserving the original meaning.

Examples:
"""
    
    for i, (informal, formal) in enumerate(examples, 1):
        prompt += f"""
Informal: {informal}
Formal: {formal}
"""
    
    if test_informal:
        prompt += f"""
Informal: {test_informal}
Formal:"""
    
    return prompt

def create_training_data_with_prompts(df: pd.DataFrame, few_shot_examples: List[Tuple[str, str]]) -> List[dict]:
    """
    Create training data where each example includes few-shot context.
    """
    training_data = []
    
    # Create a set of few-shot examples to exclude from training
    few_shot_informals = {informal for informal, _ in few_shot_examples}
    
    for _, row in df.iterrows():
        # Skip if this example is used in few-shot prompting
        if row['informal'] in few_shot_informals:
            continue
            
        # Create prompt with few-shot examples
        prompt = create_formality_prompt(few_shot_examples, row['informal'])
        full_text = prompt + " " + row['formal']
        
        training_data.append({
            "text": full_text,
            "informal": row['informal'],
            "formal": row['formal']
        })
    
    return training_data

# Create training data with few-shot prompts
training_data = create_training_data_with_prompts(df, few_shot_examples)
print(f"Created {len(training_data)} training examples")

# Save training data to JSONL
output_file = Path("formality_dataset.jsonl")
with output_file.open("w", encoding="utf-8") as f:
    for item in training_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"Training data saved to {output_file}")

# Show example prompt
sample_prompt = create_formality_prompt(few_shot_examples, "Hey, can you help me out?")
print("\nSample few-shot prompt:")
print(sample_prompt)

Created 1995 training examples
Training data saved to formality_dataset.jsonl

Sample few-shot prompt:
Task: Translate informal text to formal text while preserving the original meaning.

Examples:

Informal: Heads up that your doc needs more edits. All the best.
Formal: Kindly note that the document you provided requires further revision. Please accept my best regards.

Informal: Heads up that your doc needs more edits. Thanks for understanding.
Formal: Kindly note that the document you provided requires further revision. Thank you for your understanding.

Informal: Could you please your doc needs more edits. Thanks for sticking with us.
Formal: It would be appreciated if you could the document you provided requires further revision. We appreciate your continued support.

Informal: Could you please your doc needs more edits. Really appreciate your help.
Formal: It would be appreciated if you could the document you provided requires further revision. Your cooperation is highly valued.


## 🧠 Model Fine-Tuning with LoRA for Formality Translation

In [None]:
import torch
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig

# Load the dataset
print("Loading formality translation dataset...")
with open(output_file, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]
dataset = Dataset.from_list(data)
print(f"Dataset loaded with {len(dataset)} records.")

# Load tokenizer and model
MODEL_NAME = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# Check device availability and configure model loading accordingly
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Try to use quantization if supported, otherwise fall back to regular loading
try:
    if device == "cuda":
        # Load model with 8-bit precision for CUDA
        bnb_config = BitsAndBytesConfig(load_in_8bit=True)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME, 
            quantization_config=bnb_config, 
            device_map="auto"
        )
        model = prepare_model_for_kbit_training(model)
        print("Using 8-bit quantization with bitsandbytes")
    else:
        raise RuntimeError("Not using CUDA, falling back to regular loading")
except Exception as e:
    print(f"Quantization not available ({e}), falling back to regular model loading...")
    # Fallback to regular model loading without quantization
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    model = model.to(device)
    print("Using regular model loading without quantization")

# Configure LoRA for formality translation
lora_config = LoraConfig(
    r=16,  # Slightly higher rank for better formality understanding
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text", "informal", "formal"])

# Training configuration optimized for formality translation
batch_size = 2 if device == "cpu" else 4

# Use SFTConfig compatible with TRL 0.18.2
training_args = SFTConfig(
    output_dir="./formality_translator_model",
    per_device_train_batch_size=batch_size,
    num_train_epochs=3,
    logging_steps=25,
    save_strategy="epoch",
    fp16=(device == "cuda"),  # Only use fp16 with CUDA
    push_to_hub=False,
    report_to="none",
    overwrite_output_dir=True
)

# Initialize trainer compatible with TRL 0.18.2
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Train the model
print("Starting formality translation training...")
trainer.train()

# Save the model
model_path = "./formality_translator_model"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)
print(f"Formality translator model saved to {model_path}")

Loading formality translation dataset...
Dataset loaded with 1995 records.
Using device: cpu
Quantization not available (Not using CUDA, falling back to regular loading), falling back to regular model loading...
Using device: cpu
Quantization not available (Not using CUDA, falling back to regular loading), falling back to regular model loading...
Using regular model loading without quantization
Using regular model loading without quantization


TypeError: SFTTrainer.__init__() got an unexpected keyword argument 'tokenizer'

## ✨ Formality Translation Testing and Evaluation

In [None]:
from transformers import pipeline
import random

# Load the fine-tuned formality translator
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

def translate_to_formal(informal_text: str, few_shot_examples: List[Tuple[str, str]]) -> str:
    """
    Translate informal text to formal using few-shot prompting.
    """
    prompt = create_formality_prompt(few_shot_examples, informal_text)
    
    output = generator(
        prompt,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.1,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id
    )
    
    generated_text = output[0]["generated_text"]
    # Extract only the formal translation (after the last "Formal:")
    formal_part = generated_text.split("Formal:")[-1].strip()
    
    # Clean up the output - take only the first sentence/phrase
    formal_sentences = formal_part.split('\n')[0].split('.')[0]
    return formal_sentences.strip()

# Test with some examples from the dataset
test_examples = df.sample(5, random_state=42)

print("🎯 Formality Translation Results:\n")
print("="*60)

for idx, row in test_examples.iterrows():
    informal_input = row['informal']
    expected_formal = row['formal']
    predicted_formal = translate_to_formal(informal_input, few_shot_examples)
    
    print(f"\nInput (Informal): {informal_input}")
    print(f"Expected (Formal): {expected_formal}")
    print(f"Generated (Formal): {predicted_formal}")
    print("-" * 40)

# Interactive testing function
def interactive_formality_test():
    """
    Interactive function to test formality translation with user input.
    """
    print("\n🔄 Interactive Formality Translation Test")
    print("Enter informal sentences to see their formal translations.")
    print("Type 'quit' to exit.\n")
    
    while True:
        user_input = input("Informal sentence: ").strip()
        
        if user_input.lower() in ['quit', 'exit', 'q']:
            break
            
        if not user_input:
            continue
            
        formal_output = translate_to_formal(user_input, few_shot_examples)
        print(f"Formal translation: {formal_output}\n")

# Example translations
example_informal_sentences = [
    "Hey, what's up?",
    "Can you help me out with this thing?",
    "Thanks a bunch for your help!",
    "I'll get back to you ASAP.",
    "Let me know if you need anything."
]

print("\n📝 Example Translations:")
for informal in example_informal_sentences:
    formal = translate_to_formal(informal, few_shot_examples)
    print(f"• {informal} → {formal}")

# Run interactive test (uncomment to use)
# interactive_formality_test()

## 📊 Evaluation Metrics and Analysis

In [None]:
from sklearn.metrics import accuracy_score
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from collections import Counter
import re

# Download required NLTK data
try:
    nltk.download('punkt', quiet=True)
except:
    pass

def evaluate_formality_translation(test_size: int = 20):
    """
    Evaluate the formality translation model using multiple metrics.
    """
    # Select test examples (different from few-shot examples)
    few_shot_informals = {informal for informal, _ in few_shot_examples}
    test_df = df[~df['informal'].isin(few_shot_informals)].sample(test_size, random_state=42)
    
    predictions = []
    references = []
    
    print("Evaluating formality translation...")
    
    for _, row in test_df.iterrows():
        informal_input = row['informal']
        expected_formal = row['formal']
        predicted_formal = translate_to_formal(informal_input, few_shot_examples)
        
        predictions.append(predicted_formal)
        references.append(expected_formal)
    
    # Calculate BLEU scores
    bleu_scores = []
    smoothie = SmoothingFunction().method4
    
    for pred, ref in zip(predictions, references):
        # Tokenize sentences
        pred_tokens = pred.lower().split()
        ref_tokens = ref.lower().split()
        
        # Calculate BLEU score
        bleu = sentence_bleu([ref_tokens], pred_tokens, smoothing_function=smoothie)
        bleu_scores.append(bleu)
    
    avg_bleu = np.mean(bleu_scores)
    
    # Analyze formality indicators
    formal_indicators = [
        'please', 'kindly', 'would', 'could', 'sincerely', 'respectfully',
        'appreciate', 'grateful', 'thank you', 'regards', 'esteemed'
    ]
    
    informal_indicators = [
        'hey', 'hi', 'thanks', 'gonna', 'wanna', 'yeah', 'ok', 'asap'
    ]
    
    def count_indicators(text, indicators):
        text_lower = text.lower()
        return sum(1 for indicator in indicators if indicator in text_lower)
    
    formal_gains = []
    informal_reductions = []
    
    for pred, informal in zip(predictions, test_df['informal']):
        # Count formal indicators gained
        formal_gain = count_indicators(pred, formal_indicators) - count_indicators(informal, formal_indicators)
        formal_gains.append(max(0, formal_gain))
        
        # Count informal indicators reduced
        informal_reduction = count_indicators(informal, informal_indicators) - count_indicators(pred, informal_indicators)
        informal_reductions.append(max(0, informal_reduction))
    
    print(f"\n📊 Evaluation Results (n={test_size}):")
    print("="*50)
    print(f"Average BLEU Score: {avg_bleu:.3f}")
    print(f"Average Formal Indicators Added: {np.mean(formal_gains):.2f}")
    print(f"Average Informal Indicators Removed: {np.mean(informal_reductions):.2f}")
    
    # Show some example results
    print(f"\n📝 Sample Results:")
    for i in range(min(3, len(predictions))):
        print(f"\nExample {i+1}:")
        print(f"Informal: {test_df.iloc[i]['informal']}")
        print(f"Reference: {test_df.iloc[i]['formal']}")
        print(f"Generated: {predictions[i]}")
        print(f"BLEU: {bleu_scores[i]:.3f}")
    
    return {
        'bleu_scores': bleu_scores,
        'avg_bleu': avg_bleu,
        'formal_gains': formal_gains,
        'informal_reductions': informal_reductions,
        'predictions': predictions,
        'references': references
    }

# Run evaluation
evaluation_results = evaluate_formality_translation(test_size=15)

print(f"\n🎯 Model Performance Summary:")
print(f"The formality translator achieves an average BLEU score of {evaluation_results['avg_bleu']:.3f}")
print(f"Successfully adds formal language indicators and reduces informal ones.")