In [None]:
// Algorithm 1: GRPO (Group Relative Policy Optimization)
// Core RL algorithm used in DeepSeek-R1

function GRPO_Train(base_model, reasoning_dataset, num_iterations):
    // Train a policy with GRPO and return the updated policy.
    //
    // Inputs:
    //   base_model        - starting policy; a frozen copy of it is the KL reference
    //   reasoning_dataset - collection that sample_batch draws questions from
    //   num_iterations    - number of outer training iterations
    // Hyperparameters read from the enclosing scope:
    //   G (responses sampled per question), ε (clip range),
    //   β (weight of the KL penalty), eval_frequency
    policy_model = base_model

    // Frozen snapshot for the KL penalty; a copy is required because
    // policy_model starts out as the same object as base_model
    reference_model = copy(base_model)

    // Keeps the advantage finite when every reward in a group is identical
    STD_FLOOR = 1e-8

    for iteration = 1 to num_iterations:
        batch = sample_batch(reasoning_dataset)

        // Snapshot the policy that generates this batch's samples; it is the
        // denominator of the importance ratio below
        old_policy_model = copy(policy_model)

        for question q in batch:
            // Sample multiple outputs from the snapshot policy
            group_outputs = []
            group_rewards = []

            // Generate G different responses for each question
            for i = 1 to G:
                output = sample_from_policy(old_policy_model, q)
                group_outputs.append(output)

                // Calculate reward (accuracy + format adherence)
                reward = evaluate_reward(output, q)
                group_rewards.append(reward)

            // Normalize rewards within group to get advantages
            mean_reward = mean(group_rewards)
            std_reward = std(group_rewards)

            // With std_reward == 0 every numerator is also 0, so the floor
            // yields advantages of 0 instead of a division by zero
            advantage = [(reward - mean_reward) / (std_reward + STD_FLOOR)
                         for reward in group_rewards]

            // Accumulate the clipped objective over the whole group
            group_loss = 0
            for output, adv in zip(group_outputs, advantage):
                ratio = policy_model(output|q) / old_policy_model(output|q)
                clipped_ratio = clip(ratio, 1-ε, 1+ε)

                objective = min(ratio * adv, clipped_ratio * adv)
                // Apply KL divergence regularization
                group_loss = group_loss + (-objective + β * KL(policy_model, reference_model))

            // One gradient update per group, on the group-averaged loss, so
            // every ratio in the group is computed against the same policy
            update_model_parameters(policy_model, group_loss / G)

        // Evaluate model performance on validation set
        if iteration % eval_frequency == 0:
            evaluate_model(policy_model)

    return policy_model




In [None]:
// Algorithm 2: DeepSeek-R1-Zero Training Pipeline
// Pure RL approach without supervised fine-tuning

function Train_DeepSeek_R1_Zero():
    // RL-only pipeline: start from the base model and repeatedly apply a
    // single GRPO iteration on freshly sampled, template-formatted tasks.
    // Stops early once a periodic benchmark evaluation reports convergence;
    // returns the trained model.

    // Prompt layout applied to every sampled task
    template = "User: {prompt} Assistant: <think>reasoning</think><answer>final_answer</answer>"

    // Starting point is the untuned base model
    model = DeepSeek_V3_Base()

    for iteration = 1 to MAX_ITERATIONS:
        // Draw reasoning tasks (math, code, logic puzzles) and wrap each in the template
        prompts = []
        for task in sample_reasoning_tasks():
            prompts.append(format_with_template(task, template))

        // One GRPO iteration over this round's prompts
        model = GRPO_Train(model, prompts, 1)

        // Benchmarks only run every EVAL_FREQ iterations
        if iteration % EVAL_FREQ != 0:
            continue

        accuracy = evaluate_on_benchmarks(model)
        if converged(accuracy):
            break

    return model




In [None]:
// Algorithm 3: DeepSeek-R1 Multi-Stage Training Pipeline

function Train_DeepSeek_R1():
    // Four-stage pipeline: cold-start SFT, reasoning-oriented RL, rejection
    // sampling followed by SFT on a fresh base model, then RL over mixed tasks.
    // Returns the model produced by the final RL stage.
    // Reads RL_ITERATIONS_1, RL_ITERATIONS_2, reasoning_prompts and
    // reward_model from the enclosing scope.

    // Stage 1: Cold Start with SFT
    base_model = DeepSeek_V3_Base()
    cold_start_data = collect_high_quality_CoT_examples(thousands=True)
    model = supervised_fine_tune(base_model, cold_start_data)

    // Stage 2: Reasoning-Oriented RL
    for iteration = 1 to RL_ITERATIONS_1:
        reasoning_tasks = sample_reasoning_tasks()

        for task in reasoning_tasks:
            // Generate outputs and compute rewards
            outputs = sample_outputs(model, task)

            // One combined reward per output, kept in the same order as
            // outputs so the update sees the reward of every sample
            rewards = []
            for output in outputs:
                accuracy_reward = evaluate_accuracy(output, task)
                language_consistency = measure_language_consistency(output)

                // Combine rewards
                rewards.append(accuracy_reward + language_consistency)

            // Update model with GRPO
            model = update_with_GRPO(model, task, outputs, rewards)

    // Stage 3: Rejection Sampling and SFT
    rl_checkpoint = model

    // Generate and filter reasoning data
    reasoning_data = []
    for prompt in reasoning_prompts:
        candidates = generate_multiple(rl_checkpoint, prompt)
        correct_responses = filter_correct(candidates)
        reasoning_data.extend(correct_responses)

    // Combine with non-reasoning data
    non_reasoning_data = collect_general_data()  // writing, QA, etc.
    combined_data = reasoning_data + non_reasoning_data  // ~800k samples

    // Fine-tune fresh base model
    model = supervised_fine_tune(DeepSeek_V3_Base(), combined_data, epochs=2)

    // Stage 4: RL for All Scenarios
    for iteration = 1 to RL_ITERATIONS_2:
        // Mix of reasoning and general tasks
        tasks = sample_mixed_tasks()

        for task in tasks:
            // Sample once and reuse the same outputs for scoring and for the
            // update, so each reward describes the sample being trained on
            outputs = sample_outputs(model, task)

            if is_reasoning_task(task):
                // Use rule-based rewards
                rewards = [calculate_rule_based_reward(output) for output in outputs]
            else:
                // Use reward model for general tasks
                rewards = [reward_model.evaluate(output) for output in outputs]

            // Update with GRPO
            model = update_with_GRPO(model, task, outputs, rewards)

    return model




In [None]:
// Algorithm 4: Distillation to Smaller Models

function Distill_To_Small_Models(teacher_model):
    // Fine-tune each target base model on responses produced by teacher_model.
    // Returns a map from target model name to its fine-tuned model.
    // Reads diverse_prompts from the enclosing scope.

    // Student base models to fine-tune
    target_models = [
        "Qwen2.5-1.5B", "Qwen2.5-7B", "Qwen2.5-14B", "Qwen2.5-32B",
        "Llama-3.1-8B", "Llama-3.3-70B"
    ]

    // The teacher answers each prompt once; the resulting
    // (prompt, response) pairs are shared by all students
    distillation_data = [(prompt, teacher_model(prompt)) for prompt in diverse_prompts]

    distilled_models = {}

    for model_name in target_models:
        // Load the student and fine-tune it on the teacher's outputs
        distilled_models[model_name] = supervised_fine_tune(
            load_base_model(model_name),
            distillation_data,
            optimizer="AdamW",
            epochs=2
        )

    return distilled_models