From fd06ed1e2f2fb663f6faa17fd7cad58f6ab0c9ad Mon Sep 17 00:00:00 2001
From: alvinouille <alvinalesaint@protonmail.com>
Date: Fri, 11 Jul 2025 13:42:49 +0200
Subject: [PATCH 1/9] feat: add draft starklings ci, ok script
 starklings-evaluate and root script starklings-eval launch

---
 .github/scripts/starklings-evaluate.js | 353 +++++++++++++++++++++++++
 .github/workflows/starklings.yml       | 138 ++++++++++
 script-starklings.sh                   |  25 ++
 3 files changed, 516 insertions(+)
 create mode 100644 .github/scripts/starklings-evaluate.js
 create mode 100644 .github/workflows/starklings.yml
 create mode 100644 script-starklings.sh

diff --git a/.github/scripts/starklings-evaluate.js b/.github/scripts/starklings-evaluate.js
new file mode 100644
index 00000000..93e2961a
--- /dev/null
+++ b/.github/scripts/starklings-evaluate.js
@@ -0,0 +1,353 @@
+const fs = require('fs');
+const { execSync } = require('child_process');
+const path = require('path');
+
+// Configuration de débogage
+const DEBUG = true;
+const SINGLE_EXERCISE = process.env.SINGLE_EXERCISE || null; // ex: "intro1"
+const SAVE_RESPONSES = true;
+
+function log(message) {
+    if (DEBUG) {
+        console.log(`[DEBUG] ${message}`);
+    }
+}
+
+function parseInfoToml(infoPath) {
+    // log(`Parsing info.toml from: ${infoPath}`);
+    
+    if (!fs.existsSync(infoPath)) {
+        throw new Error(`info.toml not found at: ${infoPath}`);
+    }
+    
+    const content = fs.readFileSync(infoPath, 'utf8');
+    // log(`File content length: ${content.length} characters`);
+    
+    const exercises = [];
+    const lines = content.split('\n');
+    let currentExercise = null;
+    let collectingHint = false;
+    let hintLines = [];
+
+    for (let i = 0; i < lines.length; i++) {
+        const line = lines[i];
+        const cleanLine = line.trim();
+        
+        if (cleanLine.startsWith('[[exercises]]')) {
+            if (currentExercise) {
+                if (hintLines.length > 0) {
+                    currentExercise.hint = hintLines.join('\n').replace(/^"""/, '').replace(/"""$/, '');
+                }
+                exercises.push(currentExercise);
+                // log(`Added exercise: ${currentExercise.name}`);
+            }
+            currentExercise = {};
+            collectingHint = false;
+            hintLines = [];
+        } else if (cleanLine.startsWith('hint = """')) {
+            collectingHint = true;
+            hintLines.push(cleanLine.replace('hint = """', '').trim());
+        } else if (collectingHint) {
+            if (cleanLine.endsWith('"""')) {
+                hintLines.push(cleanLine.replace('"""', '').trim());
+                collectingHint = false;
+            } else {
+                hintLines.push(cleanLine);
+            }
+        } else if (cleanLine.startsWith('name = ')) {
+            const match = cleanLine.match(/name = "(.+)"/);
+            if (match) {
+                currentExercise.name = match[1];
+            }
+        } else if (cleanLine.startsWith('path = ')) {
+            const match = cleanLine.match(/path = "(.+)"/);
+            if (match) {
+                currentExercise.path = match[1];
+            }
+        } else if (cleanLine.startsWith('mode = ')) {
+            const match = cleanLine.match(/mode = "(.+)"/);
+            if (match) {
+                currentExercise.mode = match[1];
+            }
+        }
+    }
+    
+    // N'oublie pas le dernier exercice
+    if (currentExercise) {
+        if (hintLines.length > 0) {
+            currentExercise.hint = hintLines.join('\n').replace(/"""$/, '');
+        }
+        exercises.push(currentExercise);
+        // log(`Added final exercise: ${currentExercise.name}`);
+    }
+    
+    // log(`Total exercises parsed: ${exercises.length}`);
+    return exercises;
+}
+
+async function testServerConnection() {
+    log('Testing server connection...');
+
+    try {
+        const response = await fetch('http://localhost:3002/', {
+            method: 'GET',
+            timeout: 5000
+        });
+        
+        if (response.ok) {
+            log('✅ Server connection successful');
+            return true;
+        } else {
+            log(`❌ Server responded with status: ${response.status}`);
+            return false;
+        }
+    } catch (error) {
+        log(`❌ Server connection failed: ${error.message}`);
+        return false;
+    }
+}
+
+async function callCairoCoderAPI(exerciseContent, exercise) {
+    // log(`Calling API for exercise: ${exercise.name}`);
+    
+    const prompt = `You are solving a Cairo programming exercise.
+
+Exercise: ${exercise.name}
+${exercise.hint ? `Hint: ${exercise.hint}` : ''}
+
+Instructions:
+1. Read and understand the exercise requirements
+2. Fix any compilation errors
+3. Remove the "// I AM NOT DONE" comment when complete
+4. Ensure the solution demonstrates the intended concept
+5. The solution must be in the same language as the exercise (Cairo)
+
+Code to fix:
+${exerciseContent}
+
+Please provide only the corrected code, without any additional explanation or markdown formatting.`;
+
+    const requestBody = {
+        model: 'cairo-coder',
+        messages: [{ role: 'user', content: prompt }],
+        stream: false
+    };
+
+    // log(`Request body size: ${JSON.stringify(requestBody).length} characters`);
+
+    try {
+        const response = await fetch('http://localhost:3002/v1/chat/completions', {
+            method: 'POST',
+            headers: { 
+                'Content-Type': 'application/json',
+            },
+            body: JSON.stringify(requestBody),
+            timeout: 60000 // 60 secondes
+        });
+
+        if (!response.ok) {
+            const errorText = await response.text();
+            log(`API Error - Status: ${response.status}, Response: ${errorText}`);
+            throw new Error(`HTTP error! status: ${response.status} - ${errorText}`);
+        }
+
+        const data = await response.json();
+        // log(`API Response received, data structure: ${Object.keys(data).join(', ')}`);
+        
+        // Sauvegarder la réponse complète si demandé
+        if (SAVE_RESPONSES) {
+            const responseFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_response.json`);
+            fs.mkdirSync(path.dirname(responseFile), { recursive: true });
+            fs.writeFileSync(responseFile, JSON.stringify(data, null, 2));
+            // log(`Response saved to: ${responseFile}`);
+        }
+        
+        // Extraire le contenu de la réponse
+        if (data.choices && data.choices[0] && data.choices[0].message) {
+            const rawContent = data.choices[0].message.content;
+            const cleanCode = extractCairoCode(rawContent);
+            // log(`Generated code length: ${cleanCode.length} characters`);
+            // log(`Raw response length: ${rawContent.length} characters`);
+            return cleanCode;
+        } else {
+            log(`Invalid response format: ${JSON.stringify(data)}`);
+            throw new Error('Invalid response format from API');
+        }
+    } catch (error) {
+        log(`API call failed: ${error.message}`);
+        throw error;
+    }
+}
+
+async function testExercise(exercise, starklingsPath) {
+    log(`\n=== Testing exercise: ${exercise.name} ===`);
+    
+    const exercisePath = path.join(starklingsPath, exercise.path);
+    // log(`Exercise path: ${exercisePath}`);
+    
+    if (!fs.existsSync(exercisePath)) {
+        log(`❌ Exercise file not found: ${exercisePath}`);
+        return false;
+    }
+    
+    // Lire le contenu original
+    const originalContent = fs.readFileSync(exercisePath, 'utf8');
+    // log(`Original file size: ${originalContent.length} characters`);
+    
+    // Sauvegarder l'original
+    const backupPath = exercisePath + '.backup';
+    fs.writeFileSync(backupPath, originalContent);
+    // log(`Backup saved to: ${backupPath}`);
+    
+    try {
+        // Appeler l'API
+        const correctedCode = await callCairoCoderAPI(originalContent, exercise);
+        
+        // Sauvegarder la solution
+        fs.writeFileSync(exercisePath, correctedCode);
+        log(`Updated exercise file with generated code`);
+        
+        // Sauvegarder la solution générée pour debug
+        if (SAVE_RESPONSES) {
+            const solutionFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_solution.cairo`);
+            fs.mkdirSync(path.dirname(solutionFile), { recursive: true });
+            fs.writeFileSync(solutionFile, correctedCode);
+            // log(`Solution saved to: ${solutionFile}`);
+        }
+        
+        // Tester la solution
+        try {
+            log(`Running starklings for ${exercise.name}...`);
+            const result = execSync(`cargo run --bin starklings run ${exercise.name} 2>/dev/null`, {
+                cwd: starklingsPath,
+                stdio: 'pipe',
+                timeout: 300000,
+                encoding: 'utf8'
+            });
+            
+            log(`✅ ${exercise.name} - Success`);
+            log(`Starklings output: ${result.substring(0, 200)}...`);
+            return true;
+        } catch (error) {
+            log(`❌ ${exercise.name} - Execution failed`);
+            log(`Error code: ${error.status}`);
+            log(`stdout: ${error.stdout ? error.stdout.substring(0, 500) : 'none'}`);
+            log(`stderr: ${error.stderr ? error.stderr.substring(0, 500) : 'none'}`);
+            
+            // Sauvegarder l'erreur pour debug
+            if (SAVE_RESPONSES) {
+                const errorFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_error.txt`);
+                fs.writeFileSync(errorFile, `Exit code: ${error.status}\n\nSTDOUT:\n${error.stdout}\n\nSTDERR:\n${error.stderr}`);
+                log(`Error details saved to: ${errorFile}`);
+            }
+            
+            return false;
+        }
+    } catch (error) {
+        log(`❌ ${exercise.name} - API call failed: ${error.message}`);
+        return false;
+    } finally {
+        // Restaurer l'original
+        fs.writeFileSync(exercisePath, originalContent);
+        fs.unlinkSync(backupPath);
+        log(`Restored original file and cleaned up backup`);
+    }
+}
+
+function extractCairoCode(generatedResponse) {
+    // Chercher les blocs de code Cairo ou génériques
+    const codeBlockRegex = /```(?:cairo|rust|)?\s*\n([\s\S]*?)\n```/g;
+    const matches = generatedResponse.match(codeBlockRegex);
+    
+    if (matches && matches.length > 0) {
+        // Extraire le contenu du premier bloc de code trouvé
+        const codeBlock = matches[0];
+        const codeContent = codeBlock.replace(/```(?:cairo|rust|)?\s*\n/, '').replace(/\n```$/, '');
+        return codeContent.trim();
+    }
+    
+    // Si pas de bloc de code trouvé, retourner le texte tel quel
+    return generatedResponse.trim();
+}
+
+async function main() {
+    // log('=== Starting Starklings Debug Session ===');
+    
+    const starklingsPath = path.join(process.cwd(), 'starklings');
+    const infoPath = path.join(starklingsPath, 'info.toml');
+    
+    // Vérifications initiales
+    // log(`Working directory: ${process.cwd()}`);
+    // log(`Starklings path: ${starklingsPath}`);
+    // log(`Info.toml path: ${infoPath}`);
+    
+    if (!fs.existsSync(starklingsPath)) {
+        console.error('❌ Starklings directory not found');
+        process.exit(1);
+    }
+    
+    if (!fs.existsSync(infoPath)) {
+        console.error('❌ info.toml not found in starklings directory');
+        process.exit(1);
+    }
+    
+    // Tester la connexion au serveur
+    const serverOk = await testServerConnection();
+    if (!serverOk) {
+        console.error('❌ Server is not accessible');
+        process.exit(1);
+    }
+    
+    // Parser les exercices
+    const exercises = parseInfoToml(infoPath);
+    
+    if (exercises.length === 0) {
+        console.error('❌ No exercises found');
+        process.exit(1);
+    }
+    
+    // Filtrer à un seul exercice si demandé
+    let exercisesToTest = exercises;
+    if (SINGLE_EXERCISE) {
+        exercisesToTest = exercises.filter(ex => ex.name === SINGLE_EXERCISE);
+        if (exercisesToTest.length === 0) {
+            console.error(`❌ Exercise '${SINGLE_EXERCISE}' not found`);
+            console.log('Available exercises:', exercises.map(ex => ex.name).join(', '));
+            process.exit(1);
+        }
+        // log(`Testing single exercise: ${SINGLE_EXERCISE}`);
+    }
+    
+    // Créer le dossier de debug
+    const debugDir = path.join(__dirname, '..', '..', 'debug');
+    fs.mkdirSync(debugDir, { recursive: true });
+    
+    // Tester les exercices
+    let passed = 0;
+    let total = exercisesToTest.length;
+    
+    console.log(`\n🧪 Starting evaluation of ${total} exercises...`);
+    
+    for (const exercise of exercisesToTest) {
+        const success = await testExercise(exercise, starklingsPath);
+        if (success) {
+            passed++;
+        }
+        
+        // Pause entre les exercices pour éviter la surcharge
+        if (exercisesToTest.length > 1) {
+            await new Promise(resolve => setTimeout(resolve, 1000));
+        }
+    }
+    
+    console.log(`\n=== Final Results ===`);
+    console.log(`${passed}/${total} exercises passed (${(passed/total*100).toFixed(1)}%)`);
+    
+    log(`Debug files saved in: ${debugDir}`);
+    // log('=== Debug Session Complete ===');
+}
+
+main().catch(error => {
+    console.error('❌ Fatal error:', error);
+    process.exit(1);
+});
\ No newline at end of file
diff --git a/.github/workflows/starklings.yml b/.github/workflows/starklings.yml
new file mode 100644
index 00000000..baf5f490
--- /dev/null
+++ b/.github/workflows/starklings.yml
@@ -0,0 +1,138 @@
+name: Starklings Benchmark
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+  workflow_dispatch:
+
+jobs:
+  starklings-benchmark:
+    name: Starklings Benchmark
+    runs-on: ubuntu-latest
+    
+    steps:
+      - uses: actions/checkout@v4
+      
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: '20'
+      
+      - name: Setup Rust
+        uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          override: true
+      
+      - name: Install pnpm
+        uses: pnpm/action-setup@v3
+        with:
+          version: 9
+      
+      - name: Install dependencies
+        run: pnpm install
+      
+      - name: Build Cairo Coder
+        run: pnpm build
+      
+      - name: Setup PostgreSQL
+        uses: harmon758/postgresql-action@v1
+        with:
+          postgresql version: '15'
+          postgresql db: 'cairo_coder_test'
+          postgresql user: 'test_user'
+          postgresql password: 'test_password'
+      
+      - name: Install PostgreSQL client and pgvector
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y postgresql-client-15
+          sudo -u postgres psql -c "CREATE EXTENSION IF NOT EXISTS vector;"
+      
+      - name: Setup test configuration
+        run: |
+          mkdir -p packages/agents
+          cat > packages/agents/config.toml << 'EOL'
+          [API_KEYS]
+          OPENAI = "${{ secrets.OPENAI_API_KEY }}"
+          ANTHROPIC = "${{ secrets.ANTHROPIC_API_KEY }}"
+          GEMINI = "${{ secrets.GEMINI_API_KEY }}"
+
+          [VECTOR_DB]
+          POSTGRES_USER = "test_user"
+          POSTGRES_HOST = "localhost"
+          POSTGRES_DB = "cairo_coder_test"
+          POSTGRES_PASSWORD = "test_password"
+          POSTGRES_PORT = "5432"
+
+          [GENERAL]
+          PORT = 3001
+          SIMILARITY_MEASURE = "cosine"
+
+          [PROVIDERS]
+          DEFAULT_CHAT_PROVIDER = "gemini"
+          DEFAULT_CHAT_MODEL = "Gemini Flash 2.5"
+          DEFAULT_FAST_CHAT_PROVIDER = "gemini"
+          DEFAULT_FAST_CHAT_MODEL = "Gemini Flash 2.5"
+          DEFAULT_EMBEDDING_PROVIDER = "openai"
+          DEFAULT_EMBEDDING_MODEL = "Text embedding 3 large"
+
+          [VERSIONS]
+          STARKNET_FOUNDRY = "0.37.0"
+          SCARB = "2.9.2"
+          EOL
+      
+      - name: Create env file
+        run: |
+          cat > .env << 'EOL'
+          POSTGRES_USER=test_user
+          POSTGRES_HOST=localhost
+          POSTGRES_DB=cairo_coder_test
+          POSTGRES_PASSWORD=test_password
+          POSTGRES_PORT=5432
+          EOL
+      
+      - name: Clone Starklings
+        run: |
+          if [ ! -d "starklings" ]; then
+            git clone https://github.com/starknet-edu/starklings.git
+          fi
+      
+      - name: Install Scarb
+        run: |
+          curl --proto '=https' --tlsv1.2 -sSf https://docs.swmansion.com/scarb/install.sh | sh
+          echo "$HOME/.local/bin" >> $GITHUB_PATH
+      
+      - name: Start Cairo Coder (background)
+        run: |
+          pnpm start &
+          # Attendre que le serveur démarre
+          for i in {1..30}; do
+            if curl -s http://localhost:3001/ > /dev/null; then
+              echo "Server is ready"
+              break
+            fi
+            echo "Waiting for server... ($i/30)"
+            sleep 2
+          done
+          
+          # Vérifier si le serveur est vraiment prêt
+          if ! curl -s http://localhost:3001/ > /dev/null; then
+            echo "Server failed to start"
+            exit 1
+          fi
+      
+      - name: Run Starklings Evaluation
+        run: node .github/scripts/starklings-evaluate.js
+        timeout-minutes: 30
+      
+      - name: Upload results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: starklings-results
+          path: |
+            starklings/
+            *.log
\ No newline at end of file
diff --git a/script-starklings.sh b/script-starklings.sh
new file mode 100644
index 00000000..5d9cf978
--- /dev/null
+++ b/script-starklings.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# Vérifier si le serveur répond
+if ! curl -s http://localhost:3002/ > /dev/null 2>&1; then
+    echo "❌ Server failed to start"
+    kill $SERVER_PID 2>/dev/null || true
+    exit 1
+fi
+
+# 8. Lancer le test avec un seul exercice
+echo "🎯 Running single Starklings evaluation..."
+
+# SINGLE_EXERCISE=variables2  node .github/scripts/starklings-evaluate.js
+node .github/scripts/starklings-evaluate.js
+
+# 9. Nettoyer
+echo "🧹 Cleaning up..."
+kill $SERVER_PID 2>/dev/null || true
+
+if command -v docker &> /dev/null; then
+    docker stop cairo-coder-test-db 2>/dev/null || true
+    docker rm cairo-coder-test-db 2>/dev/null || true
+fi
+
+echo "✅ Test completed!"
\ No newline at end of file

From 90ae8e37ac72cc4f0a484f7b31d8c93f9f66362e Mon Sep 17 00:00:00 2001
From: alvinouille <alvinalesaint@protonmail.com>
Date: Mon, 14 Jul 2025 14:40:28 +0200
Subject: [PATCH 2/9] working in sequential

---
 .gitignore           | 4 +++-
 docker-compose.yml   | 2 +-
 script-starklings.sh | 4 ++--
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index c5e154f3..5bc944bf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -47,4 +47,6 @@ packages/**/dist
 .trunk
 !.trunk/trunk.yaml
 !.trunk/configs
-!.trunk/.gitignore
\ No newline at end of file
+!.trunk/.gitignore
+
+starklings/
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index 421f5dcd..8832ee3b 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -19,7 +19,7 @@ services:
       dockerfile: backend.dockerfile
     container_name: 'cairo-coder-backend'
     ports:
-      - 3001:3001
+      - 3002:3001
     depends_on:
       postgres:
         condition: service_started
diff --git a/script-starklings.sh b/script-starklings.sh
index 5d9cf978..1e5c89b5 100644
--- a/script-starklings.sh
+++ b/script-starklings.sh
@@ -10,8 +10,8 @@ fi
 # 8. Lancer le test avec un seul exercice
 echo "🎯 Running single Starklings evaluation..."
 
-# SINGLE_EXERCISE=variables2  node .github/scripts/starklings-evaluate.js
-node .github/scripts/starklings-evaluate.js
+SINGLE_EXERCISE=primitive_types2  node .github/scripts/starklings-evaluate.js
+# node .github/scripts/starklings-evaluate.js
 
 # 9. Nettoyer
 echo "🧹 Cleaning up..."

From a8cb875ebb2a6125a14cd1437da6ec943000806d Mon Sep 17 00:00:00 2001
From: alvinouille <alvinalesaint@protonmail.com>
Date: Mon, 14 Jul 2025 17:35:56 +0200
Subject: [PATCH 3/9] parallelization of starklings evaluation

---
 .github/scripts/starklings-evaluate.js | 186 +++++++++++++++++--------
 .gitignore                             |   3 +-
 script-starklings.sh                   |   4 +-
 3 files changed, 133 insertions(+), 60 deletions(-)

diff --git a/.github/scripts/starklings-evaluate.js b/.github/scripts/starklings-evaluate.js
index 93e2961a..5a45ffd1 100644
--- a/.github/scripts/starklings-evaluate.js
+++ b/.github/scripts/starklings-evaluate.js
@@ -14,17 +14,15 @@ function log(message) {
 }
 
 function parseInfoToml(infoPath) {
-    // log(`Parsing info.toml from: ${infoPath}`);
-    
     if (!fs.existsSync(infoPath)) {
         throw new Error(`info.toml not found at: ${infoPath}`);
     }
     
     const content = fs.readFileSync(infoPath, 'utf8');
-    // log(`File content length: ${content.length} characters`);
-    
-    const exercises = [];
     const lines = content.split('\n');
+    
+    const categories = {};
+    let currentCategory = null;
     let currentExercise = null;
     let collectingHint = false;
     let hintLines = [];
@@ -33,15 +31,23 @@ function parseInfoToml(infoPath) {
         const line = lines[i];
         const cleanLine = line.trim();
         
+        // Détecter les catégories
+        if (cleanLine.startsWith('# ') && !cleanLine.startsWith('##')) {
+            currentCategory = cleanLine.substring(2).trim();
+            categories[currentCategory] = [];
+            continue;
+        }
+        
         if (cleanLine.startsWith('[[exercises]]')) {
             if (currentExercise) {
                 if (hintLines.length > 0) {
                     currentExercise.hint = hintLines.join('\n').replace(/^"""/, '').replace(/"""$/, '');
                 }
-                exercises.push(currentExercise);
-                // log(`Added exercise: ${currentExercise.name}`);
+                if (currentCategory) {
+                    categories[currentCategory].push(currentExercise);
+                }
             }
-            currentExercise = {};
+            currentExercise = { category: currentCategory };
             collectingHint = false;
             hintLines = [];
         } else if (cleanLine.startsWith('hint = """')) {
@@ -77,12 +83,12 @@ function parseInfoToml(infoPath) {
         if (hintLines.length > 0) {
             currentExercise.hint = hintLines.join('\n').replace(/"""$/, '');
         }
-        exercises.push(currentExercise);
-        // log(`Added final exercise: ${currentExercise.name}`);
+        if (currentCategory) {
+            categories[currentCategory].push(currentExercise);
+        }
     }
     
-    // log(`Total exercises parsed: ${exercises.length}`);
-    return exercises;
+    return categories;
 }
 
 async function testServerConnection() {
@@ -227,13 +233,20 @@ async function testExercise(exercise, starklingsPath) {
             
             log(`✅ ${exercise.name} - Success`);
             log(`Starklings output: ${result.substring(0, 200)}...`);
-            return true;
+            return { success: true };
         } catch (error) {
             log(`❌ ${exercise.name} - Execution failed`);
             log(`Error code: ${error.status}`);
             log(`stdout: ${error.stdout ? error.stdout.substring(0, 500) : 'none'}`);
             log(`stderr: ${error.stderr ? error.stderr.substring(0, 500) : 'none'}`);
             
+            // Formater l'erreur pour le rapport
+            const errorDetails = {
+                exitCode: error.status,
+                stdout: error.stdout || '',
+                stderr: error.stderr || ''
+            };
+            
             // Sauvegarder l'erreur pour debug
             if (SAVE_RESPONSES) {
                 const errorFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_error.txt`);
@@ -241,11 +254,11 @@ async function testExercise(exercise, starklingsPath) {
                 log(`Error details saved to: ${errorFile}`);
             }
             
-            return false;
+            return { success: false, error: errorDetails };
         }
     } catch (error) {
         log(`❌ ${exercise.name} - API call failed: ${error.message}`);
-        return false;
+        return { success: false, error: { message: error.message, type: 'API_ERROR' } };
     } finally {
         // Restaurer l'original
         fs.writeFileSync(exercisePath, originalContent);
@@ -254,6 +267,47 @@ async function testExercise(exercise, starklingsPath) {
     }
 }
 
+async function processCategoryWorker(categoryName, exercises, starklingsPath) {
+    const categoryResults = {
+        category: categoryName,
+        exercises: [],
+        passed: 0,
+        total: exercises.length
+    };
+
+    log(`\n[${categoryName}] Starting ${exercises.length} exercises...`);
+
+    for (const exercise of exercises) {
+        const result = await testExercise(exercise, starklingsPath);
+        
+        const exerciseResult = {
+            name: exercise.name,
+            success: result.success
+        };
+
+        // Ajouter les erreurs seulement si échec
+        if (!result.success && result.error) {
+            exerciseResult.error = result.error;
+        }
+
+        categoryResults.exercises.push(exerciseResult);
+        if (result.success) {
+            categoryResults.passed++;
+        }
+
+        log(`[${categoryName}] ${exercise.name}: ${result.success ? '✅' : '❌'}`);
+    }
+
+    categoryResults.successRate = (categoryResults.passed / categoryResults.total * 100).toFixed(1);
+
+    // Sauvegarder le rapport de catégorie
+    const reportPath = path.join(__dirname, '..', '..', 'debug', `${categoryName.toLowerCase().replace(/\s+/g, '_')}_report.json`);
+    fs.writeFileSync(reportPath, JSON.stringify(categoryResults, null, 2));
+
+    log(`[${categoryName}] Completed: ${categoryResults.passed}/${categoryResults.total} (${categoryResults.successRate}%)`);
+    return categoryResults;
+}
+
 function extractCairoCode(generatedResponse) {
     // Chercher les blocs de code Cairo ou génériques
     const codeBlockRegex = /```(?:cairo|rust|)?\s*\n([\s\S]*?)\n```/g;
@@ -271,16 +325,9 @@ function extractCairoCode(generatedResponse) {
 }
 
 async function main() {
-    // log('=== Starting Starklings Debug Session ===');
-    
     const starklingsPath = path.join(process.cwd(), 'starklings');
     const infoPath = path.join(starklingsPath, 'info.toml');
-    
-    // Vérifications initiales
-    // log(`Working directory: ${process.cwd()}`);
-    // log(`Starklings path: ${starklingsPath}`);
-    // log(`Info.toml path: ${infoPath}`);
-    
+
     if (!fs.existsSync(starklingsPath)) {
         console.error('❌ Starklings directory not found');
         process.exit(1);
@@ -298,53 +345,78 @@ async function main() {
         process.exit(1);
     }
     
-    // Parser les exercices
-    const exercises = parseInfoToml(infoPath);
+    // Parser les exercices par catégorie
+    const categories = parseInfoToml(infoPath);
     
-    if (exercises.length === 0) {
-        console.error('❌ No exercises found');
+    if (Object.keys(categories).length === 0) {
+        console.error('❌ No categories found');
         process.exit(1);
     }
-    
-    // Filtrer à un seul exercice si demandé
-    let exercisesToTest = exercises;
+
+    // Filtrer à une seule catégorie si demandé
+    let categoriesToTest = categories;
     if (SINGLE_EXERCISE) {
-        exercisesToTest = exercises.filter(ex => ex.name === SINGLE_EXERCISE);
-        if (exercisesToTest.length === 0) {
+        // Trouver la catégorie contenant l'exercice
+        let foundCategory = null;
+        for (const [categoryName, exercises] of Object.entries(categories)) {
+            if (exercises.some(ex => ex.name === SINGLE_EXERCISE)) {
+                foundCategory = categoryName;
+                break;
+            }
+        }
+        
+        if (!foundCategory) {
             console.error(`❌ Exercise '${SINGLE_EXERCISE}' not found`);
-            console.log('Available exercises:', exercises.map(ex => ex.name).join(', '));
             process.exit(1);
         }
-        // log(`Testing single exercise: ${SINGLE_EXERCISE}`);
+        
+        categoriesToTest = {
+            [foundCategory]: categories[foundCategory].filter(ex => ex.name === SINGLE_EXERCISE)
+        };
+        log(`Testing single exercise: ${SINGLE_EXERCISE} in category: ${foundCategory}`);
     }
-    
+
     // Créer le dossier de debug
     const debugDir = path.join(__dirname, '..', '..', 'debug');
     fs.mkdirSync(debugDir, { recursive: true });
-    
-    // Tester les exercices
-    let passed = 0;
-    let total = exercisesToTest.length;
-    
-    console.log(`\n🧪 Starting evaluation of ${total} exercises...`);
-    
-    for (const exercise of exercisesToTest) {
-        const success = await testExercise(exercise, starklingsPath);
-        if (success) {
-            passed++;
-        }
-        
-        // Pause entre les exercices pour éviter la surcharge
-        if (exercisesToTest.length > 1) {
-            await new Promise(resolve => setTimeout(resolve, 1000));
-        }
-    }
-    
+
+    // Calculer le total d'exercices
+    const totalExercises = Object.values(categoriesToTest).reduce((sum, exercises) => sum + exercises.length, 0);
+    console.log(`\n🧪 Starting evaluation of ${totalExercises} exercises across ${Object.keys(categoriesToTest).length} categories...`);
+
+    // Traiter les catégories en parallèle
+    const startTime = Date.now();
+    const categoryPromises = Object.entries(categoriesToTest).map(([categoryName, exercises]) => 
+        processCategoryWorker(categoryName, exercises, starklingsPath)
+    );
+
+    const categoryResults = await Promise.all(categoryPromises);
+    const endTime = Date.now();
+
+    // Consolider les résultats
+    const totalPassed = categoryResults.reduce((sum, result) => sum + result.passed, 0);
+    const globalResults = {
+        totalExercises: totalExercises,
+        totalPassed: totalPassed,
+        globalSuccessRate: (totalPassed / totalExercises * 100).toFixed(1),
+        categories: categoryResults
+    };
+
+    // Sauvegarder le rapport global
+    const globalReportPath = path.join(debugDir, 'global_report.json');
+    fs.writeFileSync(globalReportPath, JSON.stringify(globalResults, null, 2));
+
     console.log(`\n=== Final Results ===`);
-    console.log(`${passed}/${total} exercises passed (${(passed/total*100).toFixed(1)}%)`);
+    console.log(`${totalPassed}/${totalExercises} exercises passed (${globalResults.globalSuccessRate}%)`);
+    console.log(`Total time: ${(endTime - startTime) / 1000}s`);
+    console.log(`\nCategory breakdown:`);
     
-    log(`Debug files saved in: ${debugDir}`);
-    // log('=== Debug Session Complete ===');
+    categoryResults.forEach(result => {
+        console.log(`  ${result.category}: ${result.passed}/${result.total} (${result.successRate}%)`);
+    });
+
+    log(`Reports saved in: ${debugDir}`);
+    log(`Global report: ${globalReportPath}`);
 }
 
 main().catch(error => {
diff --git a/.gitignore b/.gitignore
index 5bc944bf..4efc92a5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,4 +49,5 @@ packages/**/dist
 !.trunk/configs
 !.trunk/.gitignore
 
-starklings/
\ No newline at end of file
+starklings/
+debug/
\ No newline at end of file
diff --git a/script-starklings.sh b/script-starklings.sh
index 1e5c89b5..0ad3f18a 100644
--- a/script-starklings.sh
+++ b/script-starklings.sh
@@ -10,8 +10,8 @@ fi
 # 8. Lancer le test avec un seul exercice
 echo "🎯 Running single Starklings evaluation..."
 
-SINGLE_EXERCISE=primitive_types2  node .github/scripts/starklings-evaluate.js
-# node .github/scripts/starklings-evaluate.js
+# SINGLE_EXERCISE=primitive_types2  node .github/scripts/starklings-evaluate.js
+node .github/scripts/starklings-evaluate.js
 
 # 9. Nettoyer
 echo "🧹 Cleaning up..."

From 4834388282959820bab1247b8ef31d0f01e4394e Mon Sep 17 00:00:00 2001
From: alvinouille <alvinalesaint@protonmail.com>
Date: Mon, 14 Jul 2025 18:11:06 +0200
Subject: [PATCH 4/9] run the evaluation 10 times and add a consolidated report

---
 .github/scripts/starklings-evaluate.js | 736 ++++++++++++++-----------
 1 file changed, 408 insertions(+), 328 deletions(-)

diff --git a/.github/scripts/starklings-evaluate.js b/.github/scripts/starklings-evaluate.js
index 5a45ffd1..7cd78bdd 100644
--- a/.github/scripts/starklings-evaluate.js
+++ b/.github/scripts/starklings-evaluate.js
@@ -8,114 +8,112 @@ const SINGLE_EXERCISE = process.env.SINGLE_EXERCISE || null; // ex: "intro1"
 const SAVE_RESPONSES = true;
 
 function log(message) {
-    if (DEBUG) {
-        console.log(`[DEBUG] ${message}`);
-    }
+   // Debug-only logger: emits nothing unless the DEBUG flag is truthy.
+   if (!DEBUG) return;
+   console.log(`[DEBUG] ${message}`);
 }
 
 function parseInfoToml(infoPath) {
-    if (!fs.existsSync(infoPath)) {
-        throw new Error(`info.toml not found at: ${infoPath}`);
-    }
-    
-    const content = fs.readFileSync(infoPath, 'utf8');
-    const lines = content.split('\n');
-    
-    const categories = {};
-    let currentCategory = null;
-    let currentExercise = null;
-    let collectingHint = false;
-    let hintLines = [];
-
-    for (let i = 0; i < lines.length; i++) {
-        const line = lines[i];
-        const cleanLine = line.trim();
-        
-        // Détecter les catégories
-        if (cleanLine.startsWith('# ') && !cleanLine.startsWith('##')) {
-            currentCategory = cleanLine.substring(2).trim();
-            categories[currentCategory] = [];
-            continue;
-        }
-        
-        if (cleanLine.startsWith('[[exercises]]')) {
-            if (currentExercise) {
-                if (hintLines.length > 0) {
-                    currentExercise.hint = hintLines.join('\n').replace(/^"""/, '').replace(/"""$/, '');
-                }
-                if (currentCategory) {
-                    categories[currentCategory].push(currentExercise);
-                }
-            }
-            currentExercise = { category: currentCategory };
-            collectingHint = false;
-            hintLines = [];
-        } else if (cleanLine.startsWith('hint = """')) {
-            collectingHint = true;
-            hintLines.push(cleanLine.replace('hint = """', '').trim());
-        } else if (collectingHint) {
-            if (cleanLine.endsWith('"""')) {
-                hintLines.push(cleanLine.replace('"""', '').trim());
-                collectingHint = false;
-            } else {
-                hintLines.push(cleanLine);
-            }
-        } else if (cleanLine.startsWith('name = ')) {
-            const match = cleanLine.match(/name = "(.+)"/);
-            if (match) {
-                currentExercise.name = match[1];
-            }
-        } else if (cleanLine.startsWith('path = ')) {
-            const match = cleanLine.match(/path = "(.+)"/);
-            if (match) {
-                currentExercise.path = match[1];
-            }
-        } else if (cleanLine.startsWith('mode = ')) {
-            const match = cleanLine.match(/mode = "(.+)"/);
-            if (match) {
-                currentExercise.mode = match[1];
-            }
-        }
-    }
-    
-    // N'oublie pas le dernier exercice
-    if (currentExercise) {
-        if (hintLines.length > 0) {
-            currentExercise.hint = hintLines.join('\n').replace(/"""$/, '');
-        }
-        if (currentCategory) {
-            categories[currentCategory].push(currentExercise);
-        }
-    }
-    
-    return categories;
+   // Parses starklings' info.toml into { categoryName: [exercise, ...] }.
+   // A "# Title" comment line opens a category; every [[exercises]] table adds
+   // an exercise carrying its category, name, path, mode and optional hint.
+   // Hint text comes from `hint = """ ... """`, one trimmed line per source line.
+   // Throws if infoPath does not exist.
+   if (!fs.existsSync(infoPath)) {
+       throw new Error(`info.toml not found at: ${infoPath}`);
+   }
+   
+   const content = fs.readFileSync(infoPath, 'utf8');
+   const lines = content.split('\n');
+   
+   const categories = {};
+   let currentCategory = null;
+   let currentExercise = null;
+   let collectingHint = false;
+   let hintLines = [];
+
+   // Files the exercise being built (if any) under its category; exercises
+   // declared before the first category heading are dropped.
+   const flushExercise = () => {
+       if (!currentExercise) {
+           return;
+       }
+       if (hintLines.length > 0) {
+           currentExercise.hint = hintLines.join('\n').replace(/^"""/, '').replace(/"""$/, '');
+       }
+       if (currentCategory) {
+           categories[currentCategory].push(currentExercise);
+       }
+   };
+
+   for (const line of lines) {
+       const cleanLine = line.trim();
+       
+       // Category heading: a "# ..." comment line (but not "## ...")
+       if (cleanLine.startsWith('# ') && !cleanLine.startsWith('##')) {
+           currentCategory = cleanLine.substring(2).trim();
+           categories[currentCategory] = [];
+           continue;
+       }
+       
+       if (cleanLine.startsWith('[[exercises]]')) {
+           // A new table starts: store the previous exercise first
+           flushExercise();
+           currentExercise = { category: currentCategory };
+           collectingHint = false;
+           hintLines = [];
+       } else if (cleanLine.startsWith('hint = """')) {
+           const firstLine = cleanLine.replace('hint = """', '').trim();
+           // A hint closed on its opening line must not swallow the lines after it
+           collectingHint = !firstLine.endsWith('"""');
+           hintLines.push(firstLine);
+       } else if (collectingHint) {
+           if (cleanLine.endsWith('"""')) {
+               // Closing line: keep any text that precedes the terminator
+               hintLines.push(cleanLine.replace('"""', '').trim());
+               collectingHint = false;
+           } else {
+               hintLines.push(cleanLine);
+           }
+       } else if (currentExercise) {
+           // name/path/mode keys; a key found outside any [[exercises]] table
+           // is ignored instead of crashing on a null exercise
+           const field = cleanLine.match(/^(name|path|mode) = "(.+)"/);
+           if (field) {
+               currentExercise[field[1]] = field[2];
+           }
+       }
+   }
+   
+   // Don't forget the last exercise
+   flushExercise();
+   
+   return categories;
 }
 
 async function testServerConnection() {
-    log('Testing server connection...');
-
-    try {
-        const response = await fetch('http://localhost:3002/', {
-            method: 'GET',
-            timeout: 5000
-        });
-        
-        if (response.ok) {
-            log('✅ Server connection successful');
-            return true;
-        } else {
-            log(`❌ Server responded with status: ${response.status}`);
-            return false;
-        }
-    } catch (error) {
-        log(`❌ Server connection failed: ${error.message}`);
-        return false;
-    }
+   // Probes the local API root; resolves to true only on a 2xx response and
+   // to false on any other status, network error or timeout (never throws).
+   log('Testing server connection...');
+   try {
+       const response = await fetch('http://localhost:3002/', {
+           method: 'GET',
+           // fetch() ignores a `timeout` option; an AbortSignal enforces the 5 s limit.
+           signal: AbortSignal.timeout(5000)
+       });
+       if (response.ok) {
+           log('✅ Server connection successful');
+           return true;
+       }
+       log(`❌ Server responded with status: ${response.status}`);
+       return false;
+   } catch (error) {
+       log(`❌ Server connection failed: ${error.message}`);
+       return false;
+   }
 }
 
-async function callCairoCoderAPI(exerciseContent, exercise) {
-    // log(`Calling API for exercise: ${exercise.name}`);
-    
+async function callCairoCoderAPI(exerciseContent, exercise, retries = 3) {
     const prompt = `You are solving a Cairo programming exercise.
 
 Exercise: ${exercise.name}
@@ -139,135 +137,137 @@ Please provide only the corrected code, without any additional explanation or ma
         stream: false
     };
 
-    // log(`Request body size: ${JSON.stringify(requestBody).length} characters`);
-
-    try {
-        const response = await fetch('http://localhost:3002/v1/chat/completions', {
-            method: 'POST',
-            headers: { 
-                'Content-Type': 'application/json',
-            },
-            body: JSON.stringify(requestBody),
-            timeout: 60000 // 60 secondes
-        });
-
-        if (!response.ok) {
-            const errorText = await response.text();
-            log(`API Error - Status: ${response.status}, Response: ${errorText}`);
-            throw new Error(`HTTP error! status: ${response.status} - ${errorText}`);
-        }
-
-        const data = await response.json();
-        // log(`API Response received, data structure: ${Object.keys(data).join(', ')}`);
-        
-        // Sauvegarder la réponse complète si demandé
-        if (SAVE_RESPONSES) {
-            const responseFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_response.json`);
-            fs.mkdirSync(path.dirname(responseFile), { recursive: true });
-            fs.writeFileSync(responseFile, JSON.stringify(data, null, 2));
-            // log(`Response saved to: ${responseFile}`);
-        }
-        
-        // Extraire le contenu de la réponse
-        if (data.choices && data.choices[0] && data.choices[0].message) {
-            const rawContent = data.choices[0].message.content;
-            const cleanCode = extractCairoCode(rawContent);
-            // log(`Generated code length: ${cleanCode.length} characters`);
-            // log(`Raw response length: ${rawContent.length} characters`);
-            return cleanCode;
-        } else {
-            log(`Invalid response format: ${JSON.stringify(data)}`);
-            throw new Error('Invalid response format from API');
-        }
-    } catch (error) {
-        log(`API call failed: ${error.message}`);
-        throw error;
-    }
-}
-
-async function testExercise(exercise, starklingsPath) {
-    log(`\n=== Testing exercise: ${exercise.name} ===`);
-    
-    const exercisePath = path.join(starklingsPath, exercise.path);
-    // log(`Exercise path: ${exercisePath}`);
-    
-    if (!fs.existsSync(exercisePath)) {
-        log(`❌ Exercise file not found: ${exercisePath}`);
-        return false;
-    }
-    
-    // Lire le contenu original
-    const originalContent = fs.readFileSync(exercisePath, 'utf8');
-    // log(`Original file size: ${originalContent.length} characters`);
-    
-    // Sauvegarder l'original
-    const backupPath = exercisePath + '.backup';
-    fs.writeFileSync(backupPath, originalContent);
-    // log(`Backup saved to: ${backupPath}`);
-    
-    try {
-        // Appeler l'API
-        const correctedCode = await callCairoCoderAPI(originalContent, exercise);
-        
-        // Sauvegarder la solution
-        fs.writeFileSync(exercisePath, correctedCode);
-        log(`Updated exercise file with generated code`);
-        
-        // Sauvegarder la solution générée pour debug
-        if (SAVE_RESPONSES) {
-            const solutionFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_solution.cairo`);
-            fs.mkdirSync(path.dirname(solutionFile), { recursive: true });
-            fs.writeFileSync(solutionFile, correctedCode);
-            // log(`Solution saved to: ${solutionFile}`);
-        }
-        
-        // Tester la solution
+    for (let attempt = 1; attempt <= retries; attempt++) {
         try {
-            log(`Running starklings for ${exercise.name}...`);
-            const result = execSync(`cargo run --bin starklings run ${exercise.name} 2>/dev/null`, {
-                cwd: starklingsPath,
-                stdio: 'pipe',
-                timeout: 300000,
-                encoding: 'utf8'
+            log(`API call attempt ${attempt}/${retries} for ${exercise.name}`);
+            
+            const response = await fetch('http://localhost:3002/v1/chat/completions', {
+                method: 'POST',
+                headers: { 
+                    'Content-Type': 'application/json',
+                },
+                body: JSON.stringify(requestBody),
+                timeout: 120000 // 2 minutes au lieu de 60 secondes
             });
+
+            if (!response.ok) {
+                const errorText = await response.text();
+                throw new Error(`HTTP error! status: ${response.status} - ${errorText}`);
+            }
+
+            const data = await response.json();
             
-            log(`✅ ${exercise.name} - Success`);
-            log(`Starklings output: ${result.substring(0, 200)}...`);
-            return { success: true };
-        } catch (error) {
-            log(`❌ ${exercise.name} - Execution failed`);
-            log(`Error code: ${error.status}`);
-            log(`stdout: ${error.stdout ? error.stdout.substring(0, 500) : 'none'}`);
-            log(`stderr: ${error.stderr ? error.stderr.substring(0, 500) : 'none'}`);
+            // Sauvegarder la réponse complète si demandé
+            if (SAVE_RESPONSES) {
+                const responseFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_response.json`);
+                fs.mkdirSync(path.dirname(responseFile), { recursive: true });
+                fs.writeFileSync(responseFile, JSON.stringify(data, null, 2));
+            }
             
-            // Formater l'erreur pour le rapport
-            const errorDetails = {
-                exitCode: error.status,
-                stdout: error.stdout || '',
-                stderr: error.stderr || ''
-            };
+            // Extraire le contenu de la réponse
+            if (data.choices && data.choices[0] && data.choices[0].message) {
+                const rawContent = data.choices[0].message.content;
+                const cleanCode = extractCairoCode(rawContent);
+                log(`✅ API call successful for ${exercise.name}`);
+                return cleanCode;
+            } else {
+                throw new Error('Invalid response format from API');
+            }
             
-            // Sauvegarder l'erreur pour debug
-            if (SAVE_RESPONSES) {
-                const errorFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_error.txt`);
-                fs.writeFileSync(errorFile, `Exit code: ${error.status}\n\nSTDOUT:\n${error.stdout}\n\nSTDERR:\n${error.stderr}`);
-                log(`Error details saved to: ${errorFile}`);
+        } catch (error) {
+            log(`❌ API call failed (attempt ${attempt}/${retries}) for ${exercise.name}: ${error.message}`);
+            
+            if (attempt === retries) {
+                throw error; // Dernier essai, on lance l'erreur
             }
             
-            return { success: false, error: errorDetails };
+            // Attendre de plus en plus longtemps à chaque retry
+            const waitTime = 3000 * attempt; // 3s, 6s, 9s
+            log(`Waiting ${waitTime}ms before retry...`);
+            await new Promise(resolve => setTimeout(resolve, waitTime));
         }
-    } catch (error) {
-        log(`❌ ${exercise.name} - API call failed: ${error.message}`);
-        return { success: false, error: { message: error.message, type: 'API_ERROR' } };
-    } finally {
-        // Restaurer l'original
-        fs.writeFileSync(exercisePath, originalContent);
-        fs.unlinkSync(backupPath);
-        log(`Restored original file and cleaned up backup`);
     }
 }
 
-async function processCategoryWorker(categoryName, exercises, starklingsPath) {
+// Asks the API for a solution to one exercise, writes it over the exercise
+// file, runs `starklings run <name>` on it and restores the original afterwards.
+// Returns { success: true }, or { success: false, error } where error holds the
+// process exit code/stdout/stderr, or a message with type FILE_ERROR/API_ERROR.
+// runNumber is only used to decide whether the debug copies are written.
+async function testExercise(exercise, starklingsPath, runNumber = 1) {
+   log(`\n=== Testing exercise: ${exercise.name} ===`);
+   const exercisePath = path.join(starklingsPath, exercise.path);
+   if (!fs.existsSync(exercisePath)) {
+       log(`❌ Exercise file not found: ${exercisePath}`);
+       return { success: false, error: { message: 'File not found', type: 'FILE_ERROR' } };
+   }
+   
+   // Read the original content
+   const originalContent = fs.readFileSync(exercisePath, 'utf8');
+   // Keep a copy of the original next to the exercise while it is being tested
+   const backupPath = exercisePath + '.backup';
+   fs.writeFileSync(backupPath, originalContent);
+   
+   try {
+       // Call the API
+       const correctedCode = await callCairoCoderAPI(originalContent, exercise);
+       
+       // Overwrite the exercise with the generated solution
+       fs.writeFileSync(exercisePath, correctedCode);
+       log(`Updated exercise file with generated code`);
+       
+       // Sauvegarder les fichiers de debug SEULEMENT pour le dernier run (run 10)
+       if (SAVE_RESPONSES && runNumber === 10) {
+           const solutionFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_solution.cairo`);
+           fs.mkdirSync(path.dirname(solutionFile), { recursive: true });
+           fs.writeFileSync(solutionFile, correctedCode);
+       }
+       
+       // Run the exercise through starklings
+       // NOTE(review): `2>/dev/null` discards the child's stderr, so error.stderr below is presumably always empty — confirm
+       try {
+           log(`Running starklings for ${exercise.name}...`);
+           const result = execSync(`cargo run --bin starklings run ${exercise.name} 2>/dev/null`, {
+               cwd: starklingsPath,
+               stdio: 'pipe',
+               timeout: 300000,
+               encoding: 'utf8'
+           });
+           log(`✅ ${exercise.name} - Success`);
+           log(`Starklings output: ${result.substring(0, 200)}...`);
+           return { success: true };
+       } catch (error) {
+           log(`❌ ${exercise.name} - Execution failed`);
+           log(`Error code: ${error.status}`);
+           log(`stdout: ${error.stdout ? error.stdout.substring(0, 500) : 'none'}`);
+           log(`stderr: ${error.stderr ? error.stderr.substring(0, 500) : 'none'}`);
+           // Shape the error for the report
+           const errorDetails = {
+               exitCode: error.status,
+               stdout: error.stdout || '',
+               stderr: error.stderr || ''
+           };
+           
+           // Sauvegarder les erreurs SEULEMENT pour le dernier run
+           if (SAVE_RESPONSES && runNumber === 10) {
+               const errorFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_error.txt`);
+               fs.writeFileSync(errorFile, `Exit code: ${error.status}\n\nSTDOUT:\n${error.stdout}\n\nSTDERR:\n${error.stderr}`);
+               log(`Error details saved to: ${errorFile}`);
+           }
+           return { success: false, error: errorDetails };
+       }
+   } catch (error) {
+       log(`❌ ${exercise.name} - API call failed: ${error.message}`);
+       return { success: false, error: { message: error.message, type: 'API_ERROR' } };
+   } finally {
+       // Restore the original
+       fs.writeFileSync(exercisePath, originalContent);
+       fs.unlinkSync(backupPath);
+       log(`Restored original file and cleaned up backup`);
+   }
+}
+
+async function processCategoryWorker(categoryName, exercises, starklingsPath, runNumber = 1) {
     const categoryResults = {
         category: categoryName,
         exercises: [],
@@ -278,14 +278,18 @@ async function processCategoryWorker(categoryName, exercises, starklingsPath) {
     log(`\n[${categoryName}] Starting ${exercises.length} exercises...`);
 
     for (const exercise of exercises) {
-        const result = await testExercise(exercise, starklingsPath);
+        // Délai entre chaque exercice pour éviter la surcharge
+        if (categoryResults.exercises.length > 0) {
+            await new Promise(resolve => setTimeout(resolve, 1000)); // 1 seconde
+        }
+        
+        const result = await testExercise(exercise, starklingsPath, runNumber);
         
         const exerciseResult = {
             name: exercise.name,
             success: result.success
         };
 
-        // Ajouter les erreurs seulement si échec
         if (!result.success && result.error) {
             exerciseResult.error = result.error;
         }
@@ -300,8 +304,7 @@ async function processCategoryWorker(categoryName, exercises, starklingsPath) {
 
     categoryResults.successRate = (categoryResults.passed / categoryResults.total * 100).toFixed(1);
 
-    // Sauvegarder le rapport de catégorie
-    const reportPath = path.join(__dirname, '..', '..', 'debug', `${categoryName.toLowerCase().replace(/\s+/g, '_')}_report.json`);
+    const reportPath = path.join(__dirname, '..', '..', 'debug', `${categoryName.toLowerCase().replace(/\s+/g, '_')}_report_run${runNumber}.json`);
     fs.writeFileSync(reportPath, JSON.stringify(categoryResults, null, 2));
 
     log(`[${categoryName}] Completed: ${categoryResults.passed}/${categoryResults.total} (${categoryResults.successRate}%)`);
@@ -309,117 +312,194 @@ async function processCategoryWorker(categoryName, exercises, starklingsPath) {
 }
 
 function extractCairoCode(generatedResponse) {
-    // Chercher les blocs de code Cairo ou génériques
-    const codeBlockRegex = /```(?:cairo|rust|)?\s*\n([\s\S]*?)\n```/g;
-    const matches = generatedResponse.match(codeBlockRegex);
-    
-    if (matches && matches.length > 0) {
-        // Extraire le contenu du premier bloc de code trouvé
-        const codeBlock = matches[0];
-        const codeContent = codeBlock.replace(/```(?:cairo|rust|)?\s*\n/, '').replace(/\n```$/, '');
-        return codeContent.trim();
-    }
-    
-    // Si pas de bloc de code trouvé, retourner le texte tel quel
-    return generatedResponse.trim();
+   // Returns the contents of the first fenced code block (tagged cairo, rust or
+   // untagged) found in the model's reply, trimmed; when the reply has no such
+   // block, the whole reply is returned trimmed.
+   const fencePattern = /```(?:cairo|rust|)?\s*\n([\s\S]*?)\n```/g;
+   const fencedBlocks = generatedResponse.match(fencePattern);
+   if (!fencedBlocks || fencedBlocks.length === 0) {
+       return generatedResponse.trim();
+   }
+   
+   // Strip the opening fence (with its optional tag) and the closing fence
+   const [firstBlock] = fencedBlocks;
+   const withoutOpening = firstBlock.replace(/```(?:cairo|rust|)?\s*\n/, '');
+   return withoutOpening.replace(/\n```$/, '').trim();
 }
 
-async function main() {
-    const starklingsPath = path.join(process.cwd(), 'starklings');
-    const infoPath = path.join(starklingsPath, 'info.toml');
-
-    if (!fs.existsSync(starklingsPath)) {
-        console.error('❌ Starklings directory not found');
-        process.exit(1);
-    }
-    
-    if (!fs.existsSync(infoPath)) {
-        console.error('❌ info.toml not found in starklings directory');
-        process.exit(1);
-    }
-    
-    // Tester la connexion au serveur
-    const serverOk = await testServerConnection();
-    if (!serverOk) {
-        console.error('❌ Server is not accessible');
-        process.exit(1);
-    }
-    
-    // Parser les exercices par catégorie
-    const categories = parseInfoToml(infoPath);
-    
-    if (Object.keys(categories).length === 0) {
-        console.error('❌ No categories found');
-        process.exit(1);
-    }
-
-    // Filtrer à une seule catégorie si demandé
-    let categoriesToTest = categories;
-    if (SINGLE_EXERCISE) {
-        // Trouver la catégorie contenant l'exercice
-        let foundCategory = null;
-        for (const [categoryName, exercises] of Object.entries(categories)) {
-            if (exercises.some(ex => ex.name === SINGLE_EXERCISE)) {
-                foundCategory = categoryName;
-                break;
-            }
-        }
-        
-        if (!foundCategory) {
-            console.error(`❌ Exercise '${SINGLE_EXERCISE}' not found`);
-            process.exit(1);
-        }
-        
-        categoriesToTest = {
-            [foundCategory]: categories[foundCategory].filter(ex => ex.name === SINGLE_EXERCISE)
-        };
-        log(`Testing single exercise: ${SINGLE_EXERCISE} in category: ${foundCategory}`);
-    }
+function generateConsolidatedReport(allResults) {
+   if (allResults.length === 0) {
+       return { error: 'No successful runs' };
+   }
+   
+   const successRates = allResults.map(r => parseFloat(r.globalSuccessRate));
+   const averageSuccessRate = (successRates.reduce((sum, rate) => sum + rate, 0) / successRates.length).toFixed(1);
+   
+   const bestRun = allResults.reduce((best, current) => 
+       parseFloat(current.globalSuccessRate) > parseFloat(best.globalSuccessRate) ? current : best
+   );
+   
+   const worstRun = allResults.reduce((worst, current) => 
+       parseFloat(current.globalSuccessRate) < parseFloat(worst.globalSuccessRate) ? current : worst
+   );
+   
+   // Analyse par catégorie
+   const categoryStats = {};
+   allResults.forEach(run => {
+       run.categories.forEach(category => {
+           if (!categoryStats[category.category]) {
+               categoryStats[category.category] = {
+                   successRates: [],
+                   averageSuccessRate: 0,
+                   bestRate: 0,
+                   worstRate: 100
+               };
+           }
+           
+           const rate = parseFloat(category.successRate);
+           categoryStats[category.category].successRates.push(rate);
+           categoryStats[category.category].bestRate = Math.max(categoryStats[category.category].bestRate, rate);
+           categoryStats[category.category].worstRate = Math.min(categoryStats[category.category].worstRate, rate);
+       });
+   });
+   
+   // Calculer les moyennes par catégorie
+   Object.keys(categoryStats).forEach(category => {
+       const rates = categoryStats[category].successRates;
+       categoryStats[category].averageSuccessRate = (rates.reduce((sum, rate) => sum + rate, 0) / rates.length).toFixed(1);
+   });
+   
+   return {
+       totalRuns: allResults.length,
+       averageSuccessRate: averageSuccessRate,
+       bestRun: bestRun,
+       worstRun: worstRun,
+       categoryStats: categoryStats,
+       allRuns: allResults
+   };
+}
 
-    // Créer le dossier de debug
-    const debugDir = path.join(__dirname, '..', '..', 'debug');
-    fs.mkdirSync(debugDir, { recursive: true });
-
-    // Calculer le total d'exercices
-    const totalExercises = Object.values(categoriesToTest).reduce((sum, exercises) => sum + exercises.length, 0);
-    console.log(`\n🧪 Starting evaluation of ${totalExercises} exercises across ${Object.keys(categoriesToTest).length} categories...`);
-
-    // Traiter les catégories en parallèle
-    const startTime = Date.now();
-    const categoryPromises = Object.entries(categoriesToTest).map(([categoryName, exercises]) => 
-        processCategoryWorker(categoryName, exercises, starklingsPath)
-    );
-
-    const categoryResults = await Promise.all(categoryPromises);
-    const endTime = Date.now();
-
-    // Consolider les résultats
-    const totalPassed = categoryResults.reduce((sum, result) => sum + result.passed, 0);
-    const globalResults = {
-        totalExercises: totalExercises,
-        totalPassed: totalPassed,
-        globalSuccessRate: (totalPassed / totalExercises * 100).toFixed(1),
-        categories: categoryResults
-    };
+async function runSingleTest(runNumber) {
+   const starklingsPath = path.join(process.cwd(), 'starklings');
+   const infoPath = path.join(starklingsPath, 'info.toml');
+   // Performs one evaluation pass (run `runNumber`) over the starklings checkout
+   // in the current working directory: validates the setup, runs every selected
+   // exercise, writes debug/global_report_run<N>.json and returns that report.
+   // Throws when the checkout, info.toml, the server or the exercises are missing.
+   if (!fs.existsSync(starklingsPath)) {
+       throw new Error('Starklings directory not found');
+   }
+   if (!fs.existsSync(infoPath)) {
+       throw new Error('info.toml not found in starklings directory');
+   }
+   
+   // Check that the server is reachable
+   const serverOk = await testServerConnection();
+   if (!serverOk) {
+       throw new Error('Server is not accessible');
+   }
+   
+   // Parse the exercises, grouped by category
+   const categories = parseInfoToml(infoPath);
+   if (Object.keys(categories).length === 0) {
+       throw new Error('No categories found');
+   }
+
+   // Narrow the run down to a single exercise when SINGLE_EXERCISE is set
+   let categoriesToTest = categories;
+   if (SINGLE_EXERCISE) {
+       let foundCategory = null;
+       for (const [categoryName, exercises] of Object.entries(categories)) {
+           if (exercises.some(ex => ex.name === SINGLE_EXERCISE)) {
+               foundCategory = categoryName;
+               break;
+           }
+       }
+       if (!foundCategory) {
+           throw new Error(`Exercise '${SINGLE_EXERCISE}' not found`);
+       }
+       
+       categoriesToTest = {
+           [foundCategory]: categories[foundCategory].filter(ex => ex.name === SINGLE_EXERCISE)
+       };
+       log(`Testing single exercise: ${SINGLE_EXERCISE} in category: ${foundCategory}`);
+   }
+
+   // Create the debug folder
+   const debugDir = path.join(__dirname, '..', '..', 'debug');
+   fs.mkdirSync(debugDir, { recursive: true });
+
+   // Count the exercises; with none, the success rate below would be 0/0 = "NaN"
+   const totalExercises = Object.values(categoriesToTest).reduce((sum, exercises) => sum + exercises.length, 0);
+   if (totalExercises === 0) {
+       throw new Error('No exercises found');
+   }
+   console.log(`\n🧪 [RUN ${runNumber}/10] Starting evaluation of ${totalExercises} exercises across ${Object.keys(categoriesToTest).length} categories...`);
+
+   // Start every category at once; each worker goes through its exercises in order
+   const startTime = Date.now();
+   const categoryPromises = Object.entries(categoriesToTest).map(([categoryName, exercises]) => 
+       processCategoryWorker(categoryName, exercises, starklingsPath, runNumber)
+   );
+   const categoryResults = await Promise.all(categoryPromises);
+   const endTime = Date.now();
+
+   // Consolidate the results
+   const totalPassed = categoryResults.reduce((sum, result) => sum + result.passed, 0);
+   const globalResults = {
+       runNumber: runNumber,
+       timestamp: new Date().toISOString(),
+       totalExercises: totalExercises,
+       totalPassed: totalPassed,
+       globalSuccessRate: (totalPassed / totalExercises * 100).toFixed(1),
+       executionTime: (endTime - startTime) / 1000,
+       categories: categoryResults
+   };
+
+   // Save the global report for this run
+   const globalReportPath = path.join(debugDir, `global_report_run${runNumber}.json`);
+   fs.writeFileSync(globalReportPath, JSON.stringify(globalResults, null, 2));
+   console.log(`[RUN ${runNumber}] ${totalPassed}/${totalExercises} exercises passed (${globalResults.globalSuccessRate}%)`);
+   return globalResults;
+}
 
-    // Sauvegarder le rapport global
-    const globalReportPath = path.join(debugDir, 'global_report.json');
-    fs.writeFileSync(globalReportPath, JSON.stringify(globalResults, null, 2));
-
-    console.log(`\n=== Final Results ===`);
-    console.log(`${totalPassed}/${totalExercises} exercises passed (${globalResults.globalSuccessRate}%)`);
-    console.log(`Total time: ${(endTime - startTime) / 1000}s`);
-    console.log(`\nCategory breakdown:`);
-    
-    categoryResults.forEach(result => {
-        console.log(`  ${result.category}: ${result.passed}/${result.total} (${result.successRate}%)`);
-    });
-
-    log(`Reports saved in: ${debugDir}`);
-    log(`Global report: ${globalReportPath}`);
+// Entry point: runs the evaluation NUM_RUNS times in sequence, then writes and
+// prints a consolidated report. A failed run is logged and skipped; throws when every run failed.
+async function main() {
+   const NUM_RUNS = 10;
+   const allResults = [];
+   console.log(`🚀 Starting ${NUM_RUNS} successive test runs...`);
+   for (let i = 1; i <= NUM_RUNS; i++) {
+       try {
+           const result = await runSingleTest(i);
+           allResults.push(result);
+           // Short pause between runs to avoid overloading the server
+           if (i < NUM_RUNS) {
+               await new Promise(resolve => setTimeout(resolve, 2000));
+           }
+       } catch (error) {
+           console.error(`❌ Run ${i} failed:`, error.message);
+       }
+   }
+   // runSingleTest() can fail before it creates debug/, so make sure it exists
+   const debugDir = path.join(__dirname, '..', '..', 'debug');
+   fs.mkdirSync(debugDir, { recursive: true });
+   const consolidatedReport = generateConsolidatedReport(allResults);
+   const consolidatedReportPath = path.join(debugDir, 'consolidated_report.json');
+   fs.writeFileSync(consolidatedReportPath, JSON.stringify(consolidatedReport, null, 2));
+   // Without a successful run there is no best/worst run to print: fail clearly
+   if (allResults.length === 0) {
+       throw new Error(`All ${NUM_RUNS} runs failed; no summary to print`);
+   }
+   console.log(`\n=== Final Summary (${NUM_RUNS} runs) ===`);
+   console.log(`Average success rate: ${consolidatedReport.averageSuccessRate}%`);
+   console.log(`Best run: ${consolidatedReport.bestRun.globalSuccessRate}% (Run ${consolidatedReport.bestRun.runNumber})`);
+   console.log(`Worst run: ${consolidatedReport.worstRun.globalSuccessRate}% (Run ${consolidatedReport.worstRun.runNumber})`);
+   log(`All reports saved in: ${debugDir}`);
+   log(`Consolidated report: ${consolidatedReportPath}`);
 }
 
 main().catch(error => {
-    console.error('❌ Fatal error:', error);
-    process.exit(1);
+   console.error('❌ Fatal error:', error);
+   process.exit(1);
 });
\ No newline at end of file

From 48f48398abae5d4c5cf488307c93c27b673607f3 Mon Sep 17 00:00:00 2001
From: alvinouille <alvinalesaint@protonmail.com>
Date: Wed, 16 Jul 2025 00:12:38 +0200
Subject: [PATCH 5/9] improve overall report

---
 .github/scripts/starklings-evaluate.js | 116 ++++++++++++++-----------
 1 file changed, 63 insertions(+), 53 deletions(-)

diff --git a/.github/scripts/starklings-evaluate.js b/.github/scripts/starklings-evaluate.js
index 7cd78bdd..2e95036e 100644
--- a/.github/scripts/starklings-evaluate.js
+++ b/.github/scripts/starklings-evaluate.js
@@ -215,7 +215,7 @@ async function testExercise(exercise, starklingsPath, runNumber = 1) {
        log(`Updated exercise file with generated code`);
        
        // Sauvegarder les fichiers de debug SEULEMENT pour le dernier run (run 10)
-       if (SAVE_RESPONSES && runNumber === 10) {
+       if (SAVE_RESPONSES && runNumber === 2) {
            const solutionFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_solution.cairo`);
            fs.mkdirSync(path.dirname(solutionFile), { recursive: true });
            fs.writeFileSync(solutionFile, correctedCode);
@@ -248,7 +248,7 @@ async function testExercise(exercise, starklingsPath, runNumber = 1) {
            };
            
            // Sauvegarder les erreurs SEULEMENT pour le dernier run
-           if (SAVE_RESPONSES && runNumber === 10) {
+           if (SAVE_RESPONSES && runNumber === 2) {
                const errorFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_error.txt`);
                fs.writeFileSync(errorFile, `Exit code: ${error.status}\n\nSTDOUT:\n${error.stdout}\n\nSTDERR:\n${error.stderr}`);
                log(`Error details saved to: ${errorFile}`);
@@ -328,55 +328,65 @@ function extractCairoCode(generatedResponse) {
 }
 
 function generateConsolidatedReport(allResults) {
-   if (allResults.length === 0) {
-       return { error: 'No successful runs' };
-   }
-   
-   const successRates = allResults.map(r => parseFloat(r.globalSuccessRate));
-   const averageSuccessRate = (successRates.reduce((sum, rate) => sum + rate, 0) / successRates.length).toFixed(1);
-   
-   const bestRun = allResults.reduce((best, current) => 
-       parseFloat(current.globalSuccessRate) > parseFloat(best.globalSuccessRate) ? current : best
-   );
-   
-   const worstRun = allResults.reduce((worst, current) => 
-       parseFloat(current.globalSuccessRate) < parseFloat(worst.globalSuccessRate) ? current : worst
-   );
-   
-   // Analyse par catégorie
-   const categoryStats = {};
-   allResults.forEach(run => {
-       run.categories.forEach(category => {
-           if (!categoryStats[category.category]) {
-               categoryStats[category.category] = {
-                   successRates: [],
-                   averageSuccessRate: 0,
-                   bestRate: 0,
-                   worstRate: 100
-               };
-           }
-           
-           const rate = parseFloat(category.successRate);
-           categoryStats[category.category].successRates.push(rate);
-           categoryStats[category.category].bestRate = Math.max(categoryStats[category.category].bestRate, rate);
-           categoryStats[category.category].worstRate = Math.min(categoryStats[category.category].worstRate, rate);
-       });
-   });
-   
-   // Calculer les moyennes par catégorie
-   Object.keys(categoryStats).forEach(category => {
-       const rates = categoryStats[category].successRates;
-       categoryStats[category].averageSuccessRate = (rates.reduce((sum, rate) => sum + rate, 0) / rates.length).toFixed(1);
-   });
-   
-   return {
-       totalRuns: allResults.length,
-       averageSuccessRate: averageSuccessRate,
-       bestRun: bestRun,
-       worstRun: worstRun,
-       categoryStats: categoryStats,
-       allRuns: allResults
-   };
+    if (allResults.length === 0) {
+        return { error: 'No successful runs' };
+    }
+    
+    // Taux de réussite global
+    const successRates = allResults.map(r => parseFloat(r.globalSuccessRate));
+    const averageSuccessRate = (successRates.reduce((sum, rate) => sum + rate, 0) / successRates.length).toFixed(1);
+    
+    // Taux de réussite par catégorie
+    const categoryStats = {};
+    allResults.forEach(run => {
+        run.categories.forEach(category => {
+            if (!categoryStats[category.category]) {
+                categoryStats[category.category] = {
+                    successRates: []
+                };
+            }
+            categoryStats[category.category].successRates.push(parseFloat(category.successRate));
+        });
+    });
+    
+    // Calculer les moyennes par catégorie
+    const categoryAverages = {};
+    Object.keys(categoryStats).forEach(category => {
+        const rates = categoryStats[category].successRates;
+        categoryAverages[category] = (rates.reduce((sum, rate) => sum + rate, 0) / rates.length).toFixed(1) + '%';
+    });
+    
+    // Collecter les erreurs par exercice et par run
+    const exerciseErrors = {};
+    allResults.forEach(run => {
+        run.categories.forEach(category => {
+            category.exercises.forEach(exercise => {
+                if (!exercise.success && exercise.error) {
+                    if (!exerciseErrors[exercise.name]) {
+                        exerciseErrors[exercise.name] = [];
+                    }
+                    
+                    // Ajouter l'erreur avec le numéro de run
+                    exerciseErrors[exercise.name].push({
+                        run: run.runNumber,
+                        type: exercise.error.type || 'COMPILATION_ERROR',
+                        message: exercise.error.message || 'Compilation failed',
+                        stdout: exercise.error.stdout ? exercise.error.stdout.substring(0, 500) : null,
+                        stderr: exercise.error.stderr ? exercise.error.stderr.substring(0, 500) : null
+                    });
+                }
+            });
+        });
+    });
+    
+    return {
+        summary: {
+            totalRuns: allResults.length,
+            globalSuccessRate: averageSuccessRate + '%'
+        },
+        categorySuccessRates: categoryAverages,
+        exerciseErrors: exerciseErrors
+    };
 }
 
 async function runSingleTest(runNumber) {
@@ -431,7 +441,7 @@ async function runSingleTest(runNumber) {
 
    // Calculer le total d'exercices
    const totalExercises = Object.values(categoriesToTest).reduce((sum, exercises) => sum + exercises.length, 0);
-   console.log(`\n🧪 [RUN ${runNumber}/10] Starting evaluation of ${totalExercises} exercises across ${Object.keys(categoriesToTest).length} categories...`);
+   console.log(`\n🧪 [RUN ${runNumber}/2] Starting evaluation of ${totalExercises} exercises across ${Object.keys(categoriesToTest).length} categories...`);
 
    // Traiter les catégories en parallèle
    const startTime = Date.now();
@@ -464,7 +474,7 @@ async function runSingleTest(runNumber) {
 }
 
 async function main() {
-   const NUM_RUNS = 10;
+   const NUM_RUNS = 2;
    const allResults = [];
    
    console.log(`🚀 Starting ${NUM_RUNS} successive test runs...`);

From a9f3f6313256a69252253ebbe36f1f0897259b9d Mon Sep 17 00:00:00 2001
From: alvinouille <alvinalesaint@protonmail.com>
Date: Wed, 16 Jul 2025 00:14:29 +0200
Subject: [PATCH 6/9] fix: remove starklings ci

---
 .github/workflows/starklings.yml | 138 -------------------------------
 1 file changed, 138 deletions(-)
 delete mode 100644 .github/workflows/starklings.yml

diff --git a/.github/workflows/starklings.yml b/.github/workflows/starklings.yml
deleted file mode 100644
index baf5f490..00000000
--- a/.github/workflows/starklings.yml
+++ /dev/null
@@ -1,138 +0,0 @@
-name: Starklings Benchmark
-
-on:
-  push:
-    branches: [main]
-  pull_request:
-    branches: [main]
-  workflow_dispatch:
-
-jobs:
-  starklings-benchmark:
-    name: Starklings Benchmark
-    runs-on: ubuntu-latest
-    
-    steps:
-      - uses: actions/checkout@v4
-      
-      - name: Setup Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: '20'
-      
-      - name: Setup Rust
-        uses: actions-rs/toolchain@v1
-        with:
-          toolchain: stable
-          override: true
-      
-      - name: Install pnpm
-        uses: pnpm/action-setup@v3
-        with:
-          version: 9
-      
-      - name: Install dependencies
-        run: pnpm install
-      
-      - name: Build Cairo Coder
-        run: pnpm build
-      
-      - name: Setup PostgreSQL
-        uses: harmon758/postgresql-action@v1
-        with:
-          postgresql version: '15'
-          postgresql db: 'cairo_coder_test'
-          postgresql user: 'test_user'
-          postgresql password: 'test_password'
-      
-      - name: Install PostgreSQL client and pgvector
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y postgresql-client-15
-          sudo -u postgres psql -c "CREATE EXTENSION IF NOT EXISTS vector;"
-      
-      - name: Setup test configuration
-        run: |
-          mkdir -p packages/agents
-          cat > packages/agents/config.toml << 'EOL'
-          [API_KEYS]
-          OPENAI = "${{ secrets.OPENAI_API_KEY }}"
-          ANTHROPIC = "${{ secrets.ANTHROPIC_API_KEY }}"
-          GEMINI = "${{ secrets.GEMINI_API_KEY }}"
-
-          [VECTOR_DB]
-          POSTGRES_USER = "test_user"
-          POSTGRES_HOST = "localhost"
-          POSTGRES_DB = "cairo_coder_test"
-          POSTGRES_PASSWORD = "test_password"
-          POSTGRES_PORT = "5432"
-
-          [GENERAL]
-          PORT = 3001
-          SIMILARITY_MEASURE = "cosine"
-
-          [PROVIDERS]
-          DEFAULT_CHAT_PROVIDER = "gemini"
-          DEFAULT_CHAT_MODEL = "Gemini Flash 2.5"
-          DEFAULT_FAST_CHAT_PROVIDER = "gemini"
-          DEFAULT_FAST_CHAT_MODEL = "Gemini Flash 2.5"
-          DEFAULT_EMBEDDING_PROVIDER = "openai"
-          DEFAULT_EMBEDDING_MODEL = "Text embedding 3 large"
-
-          [VERSIONS]
-          STARKNET_FOUNDRY = "0.37.0"
-          SCARB = "2.9.2"
-          EOL
-      
-      - name: Create env file
-        run: |
-          cat > .env << 'EOL'
-          POSTGRES_USER=test_user
-          POSTGRES_HOST=localhost
-          POSTGRES_DB=cairo_coder_test
-          POSTGRES_PASSWORD=test_password
-          POSTGRES_PORT=5432
-          EOL
-      
-      - name: Clone Starklings
-        run: |
-          if [ ! -d "starklings" ]; then
-            git clone https://github.com/starknet-edu/starklings.git
-          fi
-      
-      - name: Install Scarb
-        run: |
-          curl --proto '=https' --tlsv1.2 -sSf https://docs.swmansion.com/scarb/install.sh | sh
-          echo "$HOME/.local/bin" >> $GITHUB_PATH
-      
-      - name: Start Cairo Coder (background)
-        run: |
-          pnpm start &
-          # Attendre que le serveur démarre
-          for i in {1..30}; do
-            if curl -s http://localhost:3001/ > /dev/null; then
-              echo "Server is ready"
-              break
-            fi
-            echo "Waiting for server... ($i/30)"
-            sleep 2
-          done
-          
-          # Vérifier si le serveur est vraiment prêt
-          if ! curl -s http://localhost:3001/ > /dev/null; then
-            echo "Server failed to start"
-            exit 1
-          fi
-      
-      - name: Run Starklings Evaluation
-        run: node .github/scripts/starklings-evaluate.js
-        timeout-minutes: 30
-      
-      - name: Upload results
-        if: always()
-        uses: actions/upload-artifact@v4
-        with:
-          name: starklings-results
-          path: |
-            starklings/
-            *.log
\ No newline at end of file

From d9e177a0136dad1dd69abb8123a09e147979fd7a Mon Sep 17 00:00:00 2001
From: alvinouille <alvinalesaint@protonmail.com>
Date: Wed, 16 Jul 2025 20:37:10 +0200
Subject: [PATCH 7/9] improve bash script to install the correct version of the
 starklings repo, and starklings script to improve the consolidated report

---
 .github/scripts/starklings-evaluate.js | 48 ++++++++++++++++++--------
 script-starklings.sh                   | 25 ++++++++++++++
 2 files changed, 58 insertions(+), 15 deletions(-)

diff --git a/.github/scripts/starklings-evaluate.js b/.github/scripts/starklings-evaluate.js
index 2e95036e..b5ac7573 100644
--- a/.github/scripts/starklings-evaluate.js
+++ b/.github/scripts/starklings-evaluate.js
@@ -4,7 +4,7 @@ const path = require('path');
 
 // Configuration de débogage
 const DEBUG = true;
-const SINGLE_EXERCISE = process.env.SINGLE_EXERCISE || null; // ex: "intro1"
+const SINGLE_EXERCISE = process.env.SINGLE_EXERCISE || null; 
 const SAVE_RESPONSES = true;
 
 function log(message) {
@@ -215,7 +215,7 @@ async function testExercise(exercise, starklingsPath, runNumber = 1) {
        log(`Updated exercise file with generated code`);
        
        // Sauvegarder les fichiers de debug SEULEMENT pour le dernier run (run 10)
-       if (SAVE_RESPONSES && runNumber === 2) {
+       if (SAVE_RESPONSES && runNumber === 5) {
            const solutionFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_solution.cairo`);
            fs.mkdirSync(path.dirname(solutionFile), { recursive: true });
            fs.writeFileSync(solutionFile, correctedCode);
@@ -224,7 +224,7 @@ async function testExercise(exercise, starklingsPath, runNumber = 1) {
        // Tester la solution
        try {
            log(`Running starklings for ${exercise.name}...`);
-           const result = execSync(`cargo run --bin starklings run ${exercise.name} 2>/dev/null`, {
+           const result = execSync(`cargo run --bin starklings run ${exercise.name}`, {
                cwd: starklingsPath,
                stdio: 'pipe',
                timeout: 300000,
@@ -248,7 +248,7 @@ async function testExercise(exercise, starklingsPath, runNumber = 1) {
            };
            
            // Sauvegarder les erreurs SEULEMENT pour le dernier run
-           if (SAVE_RESPONSES && runNumber === 2) {
+           if (SAVE_RESPONSES && runNumber === 5) {
                const errorFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_error.txt`);
                fs.writeFileSync(errorFile, `Exit code: ${error.status}\n\nSTDOUT:\n${error.stdout}\n\nSTDERR:\n${error.stderr}`);
                log(`Error details saved to: ${errorFile}`);
@@ -356,18 +356,24 @@ function generateConsolidatedReport(allResults) {
         categoryAverages[category] = (rates.reduce((sum, rate) => sum + rate, 0) / rates.length).toFixed(1) + '%';
     });
     
-    // Collecter les erreurs par exercice et par run
-    const exerciseErrors = {};
+    // Collecter les erreurs par catégorie et par exercice
+    const exerciseErrorsByCategory = {};
     allResults.forEach(run => {
         run.categories.forEach(category => {
             category.exercises.forEach(exercise => {
                 if (!exercise.success && exercise.error) {
-                    if (!exerciseErrors[exercise.name]) {
-                        exerciseErrors[exercise.name] = [];
+                    // Initialiser la catégorie si elle n'existe pas
+                    if (!exerciseErrorsByCategory[category.category]) {
+                        exerciseErrorsByCategory[category.category] = {};
+                    }
+                    
+                    // Initialiser l'exercice si il n'existe pas
+                    if (!exerciseErrorsByCategory[category.category][exercise.name]) {
+                        exerciseErrorsByCategory[category.category][exercise.name] = [];
                     }
                     
                     // Ajouter l'erreur avec le numéro de run
-                    exerciseErrors[exercise.name].push({
+                    exerciseErrorsByCategory[category.category][exercise.name].push({
                         run: run.runNumber,
                         type: exercise.error.type || 'COMPILATION_ERROR',
                         message: exercise.error.message || 'Compilation failed',
@@ -385,7 +391,7 @@ function generateConsolidatedReport(allResults) {
             globalSuccessRate: averageSuccessRate + '%'
         },
         categorySuccessRates: categoryAverages,
-        exerciseErrors: exerciseErrors
+        exerciseErrorsByCategory: exerciseErrorsByCategory
     };
 }
 
@@ -441,7 +447,7 @@ async function runSingleTest(runNumber) {
 
    // Calculer le total d'exercices
    const totalExercises = Object.values(categoriesToTest).reduce((sum, exercises) => sum + exercises.length, 0);
-   console.log(`\n🧪 [RUN ${runNumber}/2] Starting evaluation of ${totalExercises} exercises across ${Object.keys(categoriesToTest).length} categories...`);
+   console.log(`\n🧪 [RUN ${runNumber}/5] Starting evaluation of ${totalExercises} exercises across ${Object.keys(categoriesToTest).length} categories...`);
 
    // Traiter les catégories en parallèle
    const startTime = Date.now();
@@ -474,7 +480,7 @@ async function runSingleTest(runNumber) {
 }
 
 async function main() {
-   const NUM_RUNS = 2;
+   const NUM_RUNS = 1;
    const allResults = [];
    
    console.log(`🚀 Starting ${NUM_RUNS} successive test runs...`);
@@ -501,9 +507,21 @@ async function main() {
    fs.writeFileSync(consolidatedReportPath, JSON.stringify(consolidatedReport, null, 2));
    
    console.log(`\n=== Final Summary (${NUM_RUNS} runs) ===`);
-   console.log(`Average success rate: ${consolidatedReport.averageSuccessRate}%`);
-   console.log(`Best run: ${consolidatedReport.bestRun.globalSuccessRate}% (Run ${consolidatedReport.bestRun.runNumber})`);
-   console.log(`Worst run: ${consolidatedReport.worstRun.globalSuccessRate}% (Run ${consolidatedReport.worstRun.runNumber})`);
+   console.log(`Average success rate: ${consolidatedReport.summary.globalSuccessRate}`);
+   
+   // Calculer le meilleur et pire run pour l'affichage
+   if (allResults.length > 0) {
+       const bestRun = allResults.reduce((best, current) => 
+           parseFloat(current.globalSuccessRate) > parseFloat(best.globalSuccessRate) ? current : best
+       );
+       
+       const worstRun = allResults.reduce((worst, current) => 
+           parseFloat(current.globalSuccessRate) < parseFloat(worst.globalSuccessRate) ? current : worst
+       );
+       
+       console.log(`Best run: ${bestRun.globalSuccessRate}% (Run ${bestRun.runNumber})`);
+       console.log(`Worst run: ${worstRun.globalSuccessRate}% (Run ${worstRun.runNumber})`);
+   }
    
    log(`All reports saved in: ${debugDir}`);
    log(`Consolidated report: ${consolidatedReportPath}`);
diff --git a/script-starklings.sh b/script-starklings.sh
index 0ad3f18a..a5e28e45 100644
--- a/script-starklings.sh
+++ b/script-starklings.sh
@@ -1,5 +1,30 @@
 #!/bin/bash
 
+
+# 1. Nettoyer les éventuels anciens dossiers
+echo "🧹 Cleaning up previous installations..."
+rm -rf starklings
+
+# 2. Cloner le repo starklings
+echo "📦 Cloning starklings repository..."
+git clone https://github.com/shramee/starklings.git
+if [ $? -ne 0 ]; then
+    echo "❌ Failed to clone starklings repository"
+    exit 1
+fi
+
+# 3. Changer vers la branche feat/upgrade-cairo-and-use-scarb
+echo "🔄 Switching to feat/upgrade-cairo-and-use-scarb branch..."
+cd starklings
+git checkout feat/upgrade-cairo-and-use-scarb
+if [ $? -ne 0 ]; then
+    echo "❌ Failed to switch to feat/upgrade-cairo-and-use-scarb branch"
+    exit 1
+fi
+
+# 4. Retourner au dossier parent
+cd ..
+
 # Vérifier si le serveur répond
 if ! curl -s http://localhost:3002/ > /dev/null 2>&1; then
     echo "❌ Server failed to start"

From 40d74e47f083a51d9a6d370be29de5d5069fff0a Mon Sep 17 00:00:00 2001
From: alvinouille <alvinalesaint@protonmail.com>
Date: Wed, 16 Jul 2025 20:41:54 +0200
Subject: [PATCH 8/9] trunk fmt

---
 .github/scripts/starklings-evaluate.js | 1040 +++++++++++++-----------
 1 file changed, 571 insertions(+), 469 deletions(-)

diff --git a/.github/scripts/starklings-evaluate.js b/.github/scripts/starklings-evaluate.js
index b5ac7573..69d73107 100644
--- a/.github/scripts/starklings-evaluate.js
+++ b/.github/scripts/starklings-evaluate.js
@@ -4,117 +4,120 @@ const path = require('path');
 
 // Configuration de débogage
 const DEBUG = true;
-const SINGLE_EXERCISE = process.env.SINGLE_EXERCISE || null; 
+const SINGLE_EXERCISE = process.env.SINGLE_EXERCISE || null;
 const SAVE_RESPONSES = true;
 
 function log(message) {
-   if (DEBUG) {
-       console.log(`[DEBUG] ${message}`);
-   }
+  if (DEBUG) {
+    console.log(`[DEBUG] ${message}`);
+  }
 }
 
 function parseInfoToml(infoPath) {
-   if (!fs.existsSync(infoPath)) {
-       throw new Error(`info.toml not found at: ${infoPath}`);
-   }
-   
-   const content = fs.readFileSync(infoPath, 'utf8');
-   const lines = content.split('\n');
-   
-   const categories = {};
-   let currentCategory = null;
-   let currentExercise = null;
-   let collectingHint = false;
-   let hintLines = [];
-
-   for (let i = 0; i < lines.length; i++) {
-       const line = lines[i];
-       const cleanLine = line.trim();
-       
-       // Détecter les catégories
-       if (cleanLine.startsWith('# ') && !cleanLine.startsWith('##')) {
-           currentCategory = cleanLine.substring(2).trim();
-           categories[currentCategory] = [];
-           continue;
-       }
-       
-       if (cleanLine.startsWith('[[exercises]]')) {
-           if (currentExercise) {
-               if (hintLines.length > 0) {
-                   currentExercise.hint = hintLines.join('\n').replace(/^"""/, '').replace(/"""$/, '');
-               }
-               if (currentCategory) {
-                   categories[currentCategory].push(currentExercise);
-               }
-           }
-           currentExercise = { category: currentCategory };
-           collectingHint = false;
-           hintLines = [];
-       } else if (cleanLine.startsWith('hint = """')) {
-           collectingHint = true;
-           hintLines.push(cleanLine.replace('hint = """', '').trim());
-       } else if (collectingHint) {
-           if (cleanLine.endsWith('"""')) {
-               hintLines.push(cleanLine.replace('"""', '').trim());
-               collectingHint = false;
-           } else {
-               hintLines.push(cleanLine);
-           }
-       } else if (cleanLine.startsWith('name = ')) {
-           const match = cleanLine.match(/name = "(.+)"/);
-           if (match) {
-               currentExercise.name = match[1];
-           }
-       } else if (cleanLine.startsWith('path = ')) {
-           const match = cleanLine.match(/path = "(.+)"/);
-           if (match) {
-               currentExercise.path = match[1];
-           }
-       } else if (cleanLine.startsWith('mode = ')) {
-           const match = cleanLine.match(/mode = "(.+)"/);
-           if (match) {
-               currentExercise.mode = match[1];
-           }
-       }
-   }
-   
-   // N'oublie pas le dernier exercice
-   if (currentExercise) {
-       if (hintLines.length > 0) {
-           currentExercise.hint = hintLines.join('\n').replace(/"""$/, '');
-       }
-       if (currentCategory) {
-           categories[currentCategory].push(currentExercise);
-       }
-   }
-   
-   return categories;
+  if (!fs.existsSync(infoPath)) {
+    throw new Error(`info.toml not found at: ${infoPath}`);
+  }
+
+  const content = fs.readFileSync(infoPath, 'utf8');
+  const lines = content.split('\n');
+
+  const categories = {};
+  let currentCategory = null;
+  let currentExercise = null;
+  let collectingHint = false;
+  let hintLines = [];
+
+  for (let i = 0; i < lines.length; i++) {
+    const line = lines[i];
+    const cleanLine = line.trim();
+
+    // Détecter les catégories
+    if (cleanLine.startsWith('# ') && !cleanLine.startsWith('##')) {
+      currentCategory = cleanLine.substring(2).trim();
+      categories[currentCategory] = [];
+      continue;
+    }
+
+    if (cleanLine.startsWith('[[exercises]]')) {
+      if (currentExercise) {
+        if (hintLines.length > 0) {
+          currentExercise.hint = hintLines
+            .join('\n')
+            .replace(/^"""/, '')
+            .replace(/"""$/, '');
+        }
+        if (currentCategory) {
+          categories[currentCategory].push(currentExercise);
+        }
+      }
+      currentExercise = { category: currentCategory };
+      collectingHint = false;
+      hintLines = [];
+    } else if (cleanLine.startsWith('hint = """')) {
+      collectingHint = true;
+      hintLines.push(cleanLine.replace('hint = """', '').trim());
+    } else if (collectingHint) {
+      if (cleanLine.endsWith('"""')) {
+        hintLines.push(cleanLine.replace('"""', '').trim());
+        collectingHint = false;
+      } else {
+        hintLines.push(cleanLine);
+      }
+    } else if (cleanLine.startsWith('name = ')) {
+      const match = cleanLine.match(/name = "(.+)"/);
+      if (match) {
+        currentExercise.name = match[1];
+      }
+    } else if (cleanLine.startsWith('path = ')) {
+      const match = cleanLine.match(/path = "(.+)"/);
+      if (match) {
+        currentExercise.path = match[1];
+      }
+    } else if (cleanLine.startsWith('mode = ')) {
+      const match = cleanLine.match(/mode = "(.+)"/);
+      if (match) {
+        currentExercise.mode = match[1];
+      }
+    }
+  }
+
+  // N'oublie pas le dernier exercice
+  if (currentExercise) {
+    if (hintLines.length > 0) {
+      currentExercise.hint = hintLines.join('\n').replace(/"""$/, '');
+    }
+    if (currentCategory) {
+      categories[currentCategory].push(currentExercise);
+    }
+  }
+
+  return categories;
 }
 
 async function testServerConnection() {
-   log('Testing server connection...');
-
-   try {
-       const response = await fetch('http://localhost:3002/', {
-           method: 'GET',
-           timeout: 5000
-       });
-       
-       if (response.ok) {
-           log('✅ Server connection successful');
-           return true;
-       } else {
-           log(`❌ Server responded with status: ${response.status}`);
-           return false;
-       }
-   } catch (error) {
-       log(`❌ Server connection failed: ${error.message}`);
-       return false;
-   }
+  log('Testing server connection...');
+
+  try {
+    const response = await fetch('http://localhost:3002/', {
+      method: 'GET',
+      timeout: 5000,
+    });
+
+    if (response.ok) {
+      log('✅ Server connection successful');
+      return true;
+    } else {
+      log(`❌ Server responded with status: ${response.status}`);
+      return false;
+    }
+  } catch (error) {
+    log(`❌ Server connection failed: ${error.message}`);
+    return false;
+  }
 }
 
 async function callCairoCoderAPI(exerciseContent, exercise, retries = 3) {
-    const prompt = `You are solving a Cairo programming exercise.
+  const prompt = `You are solving a Cairo programming exercise.
 
 Exercise: ${exercise.name}
 ${exercise.hint ? `Hint: ${exercise.hint}` : ''}
@@ -131,403 +134,502 @@ ${exerciseContent}
 
 Please provide only the corrected code, without any additional explanation or markdown formatting.`;
 
-    const requestBody = {
-        model: 'cairo-coder',
-        messages: [{ role: 'user', content: prompt }],
-        stream: false
-    };
-
-    for (let attempt = 1; attempt <= retries; attempt++) {
-        try {
-            log(`API call attempt ${attempt}/${retries} for ${exercise.name}`);
-            
-            const response = await fetch('http://localhost:3002/v1/chat/completions', {
-                method: 'POST',
-                headers: { 
-                    'Content-Type': 'application/json',
-                },
-                body: JSON.stringify(requestBody),
-                timeout: 120000 // 2 minutes au lieu de 60 secondes
-            });
-
-            if (!response.ok) {
-                const errorText = await response.text();
-                throw new Error(`HTTP error! status: ${response.status} - ${errorText}`);
-            }
-
-            const data = await response.json();
-            
-            // Sauvegarder la réponse complète si demandé
-            if (SAVE_RESPONSES) {
-                const responseFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_response.json`);
-                fs.mkdirSync(path.dirname(responseFile), { recursive: true });
-                fs.writeFileSync(responseFile, JSON.stringify(data, null, 2));
-            }
-            
-            // Extraire le contenu de la réponse
-            if (data.choices && data.choices[0] && data.choices[0].message) {
-                const rawContent = data.choices[0].message.content;
-                const cleanCode = extractCairoCode(rawContent);
-                log(`✅ API call successful for ${exercise.name}`);
-                return cleanCode;
-            } else {
-                throw new Error('Invalid response format from API');
-            }
-            
-        } catch (error) {
-            log(`❌ API call failed (attempt ${attempt}/${retries}) for ${exercise.name}: ${error.message}`);
-            
-            if (attempt === retries) {
-                throw error; // Dernier essai, on lance l'erreur
-            }
-            
-            // Attendre de plus en plus longtemps à chaque retry
-            const waitTime = 3000 * attempt; // 3s, 6s, 9s
-            log(`Waiting ${waitTime}ms before retry...`);
-            await new Promise(resolve => setTimeout(resolve, waitTime));
-        }
+  const requestBody = {
+    model: 'cairo-coder',
+    messages: [{ role: 'user', content: prompt }],
+    stream: false,
+  };
+
+  for (let attempt = 1; attempt <= retries; attempt++) {
+    try {
+      log(`API call attempt ${attempt}/${retries} for ${exercise.name}`);
+
+      const response = await fetch(
+        'http://localhost:3002/v1/chat/completions',
+        {
+          method: 'POST',
+          headers: {
+            'Content-Type': 'application/json',
+          },
+          body: JSON.stringify(requestBody),
+          signal: AbortSignal.timeout(120000), // 2 min; fetch() has no `timeout` option
+        },
+      );
+
+      if (!response.ok) {
+        const errorText = await response.text();
+        throw new Error(
+          `HTTP error! status: ${response.status} - ${errorText}`,
+        );
+      }
+
+      const data = await response.json();
+
+      // Save the full raw response for debugging when enabled
+      if (SAVE_RESPONSES) {
+        const responseFile = path.join(
+          __dirname,
+          '..',
+          '..',
+          'debug',
+          `${exercise.name}_response.json`,
+        );
+        fs.mkdirSync(path.dirname(responseFile), { recursive: true });
+        fs.writeFileSync(responseFile, JSON.stringify(data, null, 2));
+      }
+
+      // Pull the generated code out of the first choice of the response
+      if (data.choices && data.choices[0] && data.choices[0].message) {
+        const rawContent = data.choices[0].message.content;
+        const cleanCode = extractCairoCode(rawContent);
+        log(`✅ API call successful for ${exercise.name}`);
+        return cleanCode;
+      } else {
+        throw new Error('Invalid response format from API');
+      }
+    } catch (error) {
+      log(
+        `❌ API call failed (attempt ${attempt}/${retries}) for ${exercise.name}: ${error.message}`,
+      );
+
+      if (attempt === retries) {
+        throw error; // Last attempt: propagate the error to the caller
+      }
+
+      // Wait longer after each failed attempt (this also covers request timeouts)
+      const waitTime = 3000 * attempt; // 3s, 6s, ... (grows linearly)
+      log(`Waiting ${waitTime}ms before retry...`);
+      await new Promise((resolve) => setTimeout(resolve, waitTime));
+    }
     }
+  }
 }
 
 async function testExercise(exercise, starklingsPath, runNumber = 1) {
-   log(`\n=== Testing exercise: ${exercise.name} ===`);
-   
-   const exercisePath = path.join(starklingsPath, exercise.path);
-   
-   if (!fs.existsSync(exercisePath)) {
-       log(`❌ Exercise file not found: ${exercisePath}`);
-       return { success: false, error: { message: 'File not found', type: 'FILE_ERROR' } };
-   }
-   
-   // Lire le contenu original
-   const originalContent = fs.readFileSync(exercisePath, 'utf8');
-   
-   // Sauvegarder l'original
-   const backupPath = exercisePath + '.backup';
-   fs.writeFileSync(backupPath, originalContent);
-   
-   try {
-       // Appeler l'API
-       const correctedCode = await callCairoCoderAPI(originalContent, exercise);
-       
-       // Sauvegarder la solution
-       fs.writeFileSync(exercisePath, correctedCode);
-       log(`Updated exercise file with generated code`);
-       
-       // Sauvegarder les fichiers de debug SEULEMENT pour le dernier run (run 10)
-       if (SAVE_RESPONSES && runNumber === 5) {
-           const solutionFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_solution.cairo`);
-           fs.mkdirSync(path.dirname(solutionFile), { recursive: true });
-           fs.writeFileSync(solutionFile, correctedCode);
-       }
-       
-       // Tester la solution
-       try {
-           log(`Running starklings for ${exercise.name}...`);
-           const result = execSync(`cargo run --bin starklings run ${exercise.name}`, {
-               cwd: starklingsPath,
-               stdio: 'pipe',
-               timeout: 300000,
-               encoding: 'utf8'
-           });
-           
-           log(`✅ ${exercise.name} - Success`);
-           log(`Starklings output: ${result.substring(0, 200)}...`);
-           return { success: true };
-       } catch (error) {
-           log(`❌ ${exercise.name} - Execution failed`);
-           log(`Error code: ${error.status}`);
-           log(`stdout: ${error.stdout ? error.stdout.substring(0, 500) : 'none'}`);
-           log(`stderr: ${error.stderr ? error.stderr.substring(0, 500) : 'none'}`);
-           
-           // Formater l'erreur pour le rapport
-           const errorDetails = {
-               exitCode: error.status,
-               stdout: error.stdout || '',
-               stderr: error.stderr || ''
-           };
-           
-           // Sauvegarder les erreurs SEULEMENT pour le dernier run
-           if (SAVE_RESPONSES && runNumber === 5) {
-               const errorFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_error.txt`);
-               fs.writeFileSync(errorFile, `Exit code: ${error.status}\n\nSTDOUT:\n${error.stdout}\n\nSTDERR:\n${error.stderr}`);
-               log(`Error details saved to: ${errorFile}`);
-           }
-           
-           return { success: false, error: errorDetails };
-       }
-   } catch (error) {
-       log(`❌ ${exercise.name} - API call failed: ${error.message}`);
-       return { success: false, error: { message: error.message, type: 'API_ERROR' } };
-   } finally {
-       // Restaurer l'original
-       fs.writeFileSync(exercisePath, originalContent);
-       fs.unlinkSync(backupPath);
-       log(`Restored original file and cleaned up backup`);
-   }
-}
+  log(`\n=== Testing exercise: ${exercise.name} ===`);
 
-async function processCategoryWorker(categoryName, exercises, starklingsPath, runNumber = 1) {
-    const categoryResults = {
-        category: categoryName,
-        exercises: [],
-        passed: 0,
-        total: exercises.length
+  const exercisePath = path.join(starklingsPath, exercise.path);
+
+  if (!fs.existsSync(exercisePath)) {
+    log(`❌ Exercise file not found: ${exercisePath}`);
+    return {
+      success: false,
+      error: { message: 'File not found', type: 'FILE_ERROR' },
     };
+  }
+
+  // Read the original exercise content
+  const originalContent = fs.readFileSync(exercisePath, 'utf8');
+
+  // Keep a copy of the original next to the exercise file
+  const backupPath = exercisePath + '.backup';
+  fs.writeFileSync(backupPath, originalContent);
+
+  try {
+    // Ask the API for a solution
+    const correctedCode = await callCairoCoderAPI(originalContent, exercise);
+
+    // Overwrite the exercise file with the generated solution
+    fs.writeFileSync(exercisePath, correctedCode);
+    log(`Updated exercise file with generated code`);
+
+    // Save the solution for debugging on run 5 only. NOTE(review): the comment used to say run 10 and main() sets NUM_RUNS = 1.
+    if (SAVE_RESPONSES && runNumber === 5) {
+      const solutionFile = path.join(
+        __dirname,
+        '..',
+        '..',
+        'debug',
+        `${exercise.name}_solution.cairo`,
+      );
+      fs.mkdirSync(path.dirname(solutionFile), { recursive: true });
+      fs.writeFileSync(solutionFile, correctedCode);
+    }
 
-    log(`\n[${categoryName}] Starting ${exercises.length} exercises...`);
+    // Run the generated solution through starklings (5-minute timeout)
+    try {
+      log(`Running starklings for ${exercise.name}...`);
+      const result = execSync(
+        `cargo run --bin starklings run ${exercise.name}`,
+        {
+          cwd: starklingsPath,
+          stdio: 'pipe',
+          timeout: 300000,
+          encoding: 'utf8',
+        },
+      );
+
+      log(`✅ ${exercise.name} - Success`);
+      log(`Starklings output: ${result.substring(0, 200)}...`);
+      return { success: true };
+    } catch (error) {
+      log(`❌ ${exercise.name} - Execution failed`);
+      log(`Error code: ${error.status}`);
+      log(`stdout: ${error.stdout ? error.stdout.substring(0, 500) : 'none'}`);
+      log(`stderr: ${error.stderr ? error.stderr.substring(0, 500) : 'none'}`);
+
+      // Shape the failure for the report
+      const errorDetails = {
+        exitCode: error.status,
+        stdout: error.stdout || '',
+        stderr: error.stderr || '',
+      };
+
+      // Save the error output on run 5 only (same hard-coded run number as the solution dump)
+      if (SAVE_RESPONSES && runNumber === 5) {
+        const errorFile = path.join(
+          __dirname,
+          '..',
+          '..',
+          'debug',
+          `${exercise.name}_error.txt`,
+        );
+        fs.writeFileSync(
+          errorFile,
+          `Exit code: ${error.status}\n\nSTDOUT:\n${error.stdout}\n\nSTDERR:\n${error.stderr}`,
+        );
+        log(`Error details saved to: ${errorFile}`);
+      }
+
+      return { success: false, error: errorDetails };
+    }
+  } catch (error) {
+    log(`❌ ${exercise.name} - API call failed: ${error.message}`);
+    return {
+      success: false,
+      error: { message: error.message, type: 'API_ERROR' },
+    };
+  } finally {
+    // Always put the original exercise back and remove the .backup copy
+    fs.writeFileSync(exercisePath, originalContent);
+    fs.unlinkSync(backupPath);
+    log(`Restored original file and cleaned up backup`);
+  }
+}
 
-    for (const exercise of exercises) {
-        // Délai entre chaque exercice pour éviter la surcharge
-        if (categoryResults.exercises.length > 0) {
-            await new Promise(resolve => setTimeout(resolve, 1000)); // 1 seconde
-        }
-        
-        const result = await testExercise(exercise, starklingsPath, runNumber);
-        
-        const exerciseResult = {
-            name: exercise.name,
-            success: result.success
-        };
+async function processCategoryWorker(
+  categoryName,
+  exercises,
+  starklingsPath,
+  runNumber = 1,
+) {
+  const categoryResults = {
+    category: categoryName,
+    exercises: [],
+    passed: 0,
+    total: exercises.length,
+  };
+
+  log(`\n[${categoryName}] Starting ${exercises.length} exercises...`);
+
+  for (const exercise of exercises) {
+    // Pause between exercises (skipped before the first one) to avoid overload
+    if (categoryResults.exercises.length > 0) {
+      await new Promise((resolve) => setTimeout(resolve, 1000)); // 1 second
+    }
 
-        if (!result.success && result.error) {
-            exerciseResult.error = result.error;
-        }
+    const result = await testExercise(exercise, starklingsPath, runNumber);
 
-        categoryResults.exercises.push(exerciseResult);
-        if (result.success) {
-            categoryResults.passed++;
-        }
+    const exerciseResult = {
+      name: exercise.name,
+      success: result.success,
+    };
 
-        log(`[${categoryName}] ${exercise.name}: ${result.success ? '✅' : '❌'}`);
+    if (!result.success && result.error) {
+      exerciseResult.error = result.error;
     }
 
-    categoryResults.successRate = (categoryResults.passed / categoryResults.total * 100).toFixed(1);
-
-    const reportPath = path.join(__dirname, '..', '..', 'debug', `${categoryName.toLowerCase().replace(/\s+/g, '_')}_report_run${runNumber}.json`);
-    fs.writeFileSync(reportPath, JSON.stringify(categoryResults, null, 2));
+    categoryResults.exercises.push(exerciseResult);
+    if (result.success) {
+      categoryResults.passed++;
+    }
 
-    log(`[${categoryName}] Completed: ${categoryResults.passed}/${categoryResults.total} (${categoryResults.successRate}%)`);
-    return categoryResults;
+    log(`[${categoryName}] ${exercise.name}: ${result.success ? '✅' : '❌'}`);
+  }
+
+  categoryResults.successRate = (
+    (categoryResults.passed / categoryResults.total) *
+    100
+  ).toFixed(1);
+
+  const reportPath = path.join(
+    __dirname,
+    '..',
+    '..',
+    'debug',
+    `${categoryName.toLowerCase().replace(/\s+/g, '_')}_report_run${runNumber}.json`,
+  );
+  fs.writeFileSync(reportPath, JSON.stringify(categoryResults, null, 2));
+
+  log(
+    `[${categoryName}] Completed: ${categoryResults.passed}/${categoryResults.total} (${categoryResults.successRate}%)`,
+  );
+  return categoryResults;
 }
 
 function extractCairoCode(generatedResponse) {
-   // Chercher les blocs de code Cairo ou génériques
-   const codeBlockRegex = /```(?:cairo|rust|)?\s*\n([\s\S]*?)\n```/g;
-   const matches = generatedResponse.match(codeBlockRegex);
-   
-   if (matches && matches.length > 0) {
-       // Extraire le contenu du premier bloc de code trouvé
-       const codeBlock = matches[0];
-       const codeContent = codeBlock.replace(/```(?:cairo|rust|)?\s*\n/, '').replace(/\n```$/, '');
-       return codeContent.trim();
-   }
-   
-   // Si pas de bloc de code trouvé, retourner le texte tel quel
-   return generatedResponse.trim();
+  // Look for fenced code blocks tagged cairo or rust, or with no tag
+  const codeBlockRegex = /```(?:cairo|rust|)?\s*\n([\s\S]*?)\n```/g;
+  const matches = generatedResponse.match(codeBlockRegex);
+
+  if (matches && matches.length > 0) {
+    // Take the first block found and strip its opening and closing fences
+    const codeBlock = matches[0];
+    const codeContent = codeBlock
+      .replace(/```(?:cairo|rust|)?\s*\n/, '')
+      .replace(/\n```$/, '');
+    return codeContent.trim();
+  }
+
+  // No fenced block found: return the trimmed text as is
+  return generatedResponse.trim();
 }
 
 function generateConsolidatedReport(allResults) {
-    if (allResults.length === 0) {
-        return { error: 'No successful runs' };
-    }
-    
-    // Taux de réussite global
-    const successRates = allResults.map(r => parseFloat(r.globalSuccessRate));
-    const averageSuccessRate = (successRates.reduce((sum, rate) => sum + rate, 0) / successRates.length).toFixed(1);
-    
-    // Taux de réussite par catégorie
-    const categoryStats = {};
-    allResults.forEach(run => {
-        run.categories.forEach(category => {
-            if (!categoryStats[category.category]) {
-                categoryStats[category.category] = {
-                    successRates: []
-                };
-            }
-            categoryStats[category.category].successRates.push(parseFloat(category.successRate));
-        });
-    });
-    
-    // Calculer les moyennes par catégorie
-    const categoryAverages = {};
-    Object.keys(categoryStats).forEach(category => {
-        const rates = categoryStats[category].successRates;
-        categoryAverages[category] = (rates.reduce((sum, rate) => sum + rate, 0) / rates.length).toFixed(1) + '%';
+  if (allResults.length === 0) {
+    return { error: 'No successful runs' };
+  }
+
+  // Global success rate: mean of the per-run rates, one decimal place
+  const successRates = allResults.map((r) => parseFloat(r.globalSuccessRate));
+  const averageSuccessRate = (
+    successRates.reduce((sum, rate) => sum + rate, 0) / successRates.length
+  ).toFixed(1);
+
+  // Per-category success rates, collected across all runs
+  const categoryStats = {};
+  allResults.forEach((run) => {
+    run.categories.forEach((category) => {
+      if (!categoryStats[category.category]) {
+        categoryStats[category.category] = {
+          successRates: [],
+        };
+      }
+      categoryStats[category.category].successRates.push(
+        parseFloat(category.successRate),
+      );
     });
-    
-    // Collecter les erreurs par catégorie et par exercice
-    const exerciseErrorsByCategory = {};
-    allResults.forEach(run => {
-        run.categories.forEach(category => {
-            category.exercises.forEach(exercise => {
-                if (!exercise.success && exercise.error) {
-                    // Initialiser la catégorie si elle n'existe pas
-                    if (!exerciseErrorsByCategory[category.category]) {
-                        exerciseErrorsByCategory[category.category] = {};
-                    }
-                    
-                    // Initialiser l'exercice si il n'existe pas
-                    if (!exerciseErrorsByCategory[category.category][exercise.name]) {
-                        exerciseErrorsByCategory[category.category][exercise.name] = [];
-                    }
-                    
-                    // Ajouter l'erreur avec le numéro de run
-                    exerciseErrorsByCategory[category.category][exercise.name].push({
-                        run: run.runNumber,
-                        type: exercise.error.type || 'COMPILATION_ERROR',
-                        message: exercise.error.message || 'Compilation failed',
-                        stdout: exercise.error.stdout ? exercise.error.stdout.substring(0, 500) : null,
-                        stderr: exercise.error.stderr ? exercise.error.stderr.substring(0, 500) : null
-                    });
-                }
-            });
-        });
+  });
+
+  // Average the collected rates of each category (formatted as "NN.N%")
+  const categoryAverages = {};
+  Object.keys(categoryStats).forEach((category) => {
+    const rates = categoryStats[category].successRates;
+    categoryAverages[category] =
+      (rates.reduce((sum, rate) => sum + rate, 0) / rates.length).toFixed(1) +
+      '%';
+  });
+
+  // Collect the errors, grouped by category and then by exercise
+  const exerciseErrorsByCategory = {};
+  allResults.forEach((run) => {
+    run.categories.forEach((category) => {
+      category.exercises.forEach((exercise) => {
+        if (!exercise.success && exercise.error) {
+          // Create the category entry if it does not exist yet
+          if (!exerciseErrorsByCategory[category.category]) {
+            exerciseErrorsByCategory[category.category] = {};
+          }
+
+          // Create the exercise entry if it does not exist yet
+          if (!exerciseErrorsByCategory[category.category][exercise.name]) {
+            exerciseErrorsByCategory[category.category][exercise.name] = [];
+          }
+
+          // Record the error with its run number; stdout/stderr are cut to 500 chars
+          exerciseErrorsByCategory[category.category][exercise.name].push({
+            run: run.runNumber,
+            type: exercise.error.type || 'COMPILATION_ERROR',
+            message: exercise.error.message || 'Compilation failed',
+            stdout: exercise.error.stdout
+              ? exercise.error.stdout.substring(0, 500)
+              : null,
+            stderr: exercise.error.stderr
+              ? exercise.error.stderr.substring(0, 500)
+              : null,
+          });
+        }
+      });
     });
-    
-    return {
-        summary: {
-            totalRuns: allResults.length,
-            globalSuccessRate: averageSuccessRate + '%'
-        },
-        categorySuccessRates: categoryAverages,
-        exerciseErrorsByCategory: exerciseErrorsByCategory
-    };
+  });
+
+  return {
+    summary: {
+      totalRuns: allResults.length,
+      globalSuccessRate: averageSuccessRate + '%',
+    },
+    categorySuccessRates: categoryAverages,
+    exerciseErrorsByCategory: exerciseErrorsByCategory,
+  };
 }
 
 async function runSingleTest(runNumber) {
-   const starklingsPath = path.join(process.cwd(), 'starklings');
-   const infoPath = path.join(starklingsPath, 'info.toml');
-
-   if (!fs.existsSync(starklingsPath)) {
-       throw new Error('Starklings directory not found');
-   }
-   
-   if (!fs.existsSync(infoPath)) {
-       throw new Error('info.toml not found in starklings directory');
-   }
-   
-   // Tester la connexion au serveur
-   const serverOk = await testServerConnection();
-   if (!serverOk) {
-       throw new Error('Server is not accessible');
-   }
-   
-   // Parser les exercices par catégorie
-   const categories = parseInfoToml(infoPath);
-   
-   if (Object.keys(categories).length === 0) {
-       throw new Error('No categories found');
-   }
-
-   // Filtrer à une seule catégorie si demandé
-   let categoriesToTest = categories;
-   if (SINGLE_EXERCISE) {
-       let foundCategory = null;
-       for (const [categoryName, exercises] of Object.entries(categories)) {
-           if (exercises.some(ex => ex.name === SINGLE_EXERCISE)) {
-               foundCategory = categoryName;
-               break;
-           }
-       }
-       
-       if (!foundCategory) {
-           throw new Error(`Exercise '${SINGLE_EXERCISE}' not found`);
-       }
-       
-       categoriesToTest = {
-           [foundCategory]: categories[foundCategory].filter(ex => ex.name === SINGLE_EXERCISE)
-       };
-       log(`Testing single exercise: ${SINGLE_EXERCISE} in category: ${foundCategory}`);
-   }
-
-   // Créer le dossier de debug
-   const debugDir = path.join(__dirname, '..', '..', 'debug');
-   fs.mkdirSync(debugDir, { recursive: true });
-
-   // Calculer le total d'exercices
-   const totalExercises = Object.values(categoriesToTest).reduce((sum, exercises) => sum + exercises.length, 0);
-   console.log(`\n🧪 [RUN ${runNumber}/5] Starting evaluation of ${totalExercises} exercises across ${Object.keys(categoriesToTest).length} categories...`);
-
-   // Traiter les catégories en parallèle
-   const startTime = Date.now();
-   const categoryPromises = Object.entries(categoriesToTest).map(([categoryName, exercises]) => 
-       processCategoryWorker(categoryName, exercises, starklingsPath, runNumber)
-   );
-
-   const categoryResults = await Promise.all(categoryPromises);
-   const endTime = Date.now();
-
-   // Consolider les résultats
-   const totalPassed = categoryResults.reduce((sum, result) => sum + result.passed, 0);
-   const globalResults = {
-       runNumber: runNumber,
-       timestamp: new Date().toISOString(),
-       totalExercises: totalExercises,
-       totalPassed: totalPassed,
-       globalSuccessRate: (totalPassed / totalExercises * 100).toFixed(1),
-       executionTime: (endTime - startTime) / 1000,
-       categories: categoryResults
-   };
-
-   // Sauvegarder le rapport global pour ce run
-   const globalReportPath = path.join(debugDir, `global_report_run${runNumber}.json`);
-   fs.writeFileSync(globalReportPath, JSON.stringify(globalResults, null, 2));
-
-   console.log(`[RUN ${runNumber}] ${totalPassed}/${totalExercises} exercises passed (${globalResults.globalSuccessRate}%)`);
-   
-   return globalResults;
+  const starklingsPath = path.join(process.cwd(), 'starklings');
+  const infoPath = path.join(starklingsPath, 'info.toml');
+
+  if (!fs.existsSync(starklingsPath)) {
+    throw new Error('Starklings directory not found');
+  }
+
+  if (!fs.existsSync(infoPath)) {
+    throw new Error('info.toml not found in starklings directory');
+  }
+
+  // Check that the server is reachable before doing any work
+  const serverOk = await testServerConnection();
+  if (!serverOk) {
+    throw new Error('Server is not accessible');
+  }
+
+  // Parse the exercises, grouped by category
+  const categories = parseInfoToml(infoPath);
+
+  if (Object.keys(categories).length === 0) {
+    throw new Error('No categories found');
+  }
+
+  // Restrict to one exercise (and its category) when SINGLE_EXERCISE is set
+  let categoriesToTest = categories;
+  if (SINGLE_EXERCISE) {
+    let foundCategory = null;
+    for (const [categoryName, exercises] of Object.entries(categories)) {
+      if (exercises.some((ex) => ex.name === SINGLE_EXERCISE)) {
+        foundCategory = categoryName;
+        break;
+      }
+    }
+
+    if (!foundCategory) {
+      throw new Error(`Exercise '${SINGLE_EXERCISE}' not found`);
+    }
+
+    categoriesToTest = {
+      [foundCategory]: categories[foundCategory].filter(
+        (ex) => ex.name === SINGLE_EXERCISE,
+      ),
+    };
+    log(
+      `Testing single exercise: ${SINGLE_EXERCISE} in category: ${foundCategory}`,
+    );
+  }
+
+  // Create the debug directory
+  const debugDir = path.join(__dirname, '..', '..', 'debug');
+  fs.mkdirSync(debugDir, { recursive: true });
+
+  // Count the exercises to run. NOTE(review): the "/5" in the banner below is hard-coded.
+  const totalExercises = Object.values(categoriesToTest).reduce(
+    (sum, exercises) => sum + exercises.length,
+    0,
+  );
+  console.log(
+    `\n🧪 [RUN ${runNumber}/5] Starting evaluation of ${totalExercises} exercises across ${Object.keys(categoriesToTest).length} categories...`,
+  );
+
+  // Start one worker per category and wait for all of them
+  const startTime = Date.now();
+  const categoryPromises = Object.entries(categoriesToTest).map(
+    ([categoryName, exercises]) =>
+      processCategoryWorker(categoryName, exercises, starklingsPath, runNumber),
+  );
+
+  const categoryResults = await Promise.all(categoryPromises);
+  const endTime = Date.now();
+
+  // Aggregate the per-category results
+  const totalPassed = categoryResults.reduce(
+    (sum, result) => sum + result.passed,
+    0,
+  );
+  const globalResults = {
+    runNumber: runNumber,
+    timestamp: new Date().toISOString(),
+    totalExercises: totalExercises,
+    totalPassed: totalPassed,
+    globalSuccessRate: ((totalPassed / totalExercises) * 100).toFixed(1),
+    executionTime: (endTime - startTime) / 1000, // seconds
+    categories: categoryResults,
+  };
+
+  // Save the global report for this run
+  const globalReportPath = path.join(
+    debugDir,
+    `global_report_run${runNumber}.json`,
+  );
+  fs.writeFileSync(globalReportPath, JSON.stringify(globalResults, null, 2));
+
+  console.log(
+    `[RUN ${runNumber}] ${totalPassed}/${totalExercises} exercises passed (${globalResults.globalSuccessRate}%)`,
+  );
+
+  return globalResults;
 }
 
 async function main() {
-   const NUM_RUNS = 1;
-   const allResults = [];
-   
-   console.log(`🚀 Starting ${NUM_RUNS} successive test runs...`);
-   
-   for (let i = 1; i <= NUM_RUNS; i++) {
-       try {
-           const result = await runSingleTest(i);
-           allResults.push(result);
-           
-           // Petite pause entre les runs pour éviter la surcharge
-           if (i < NUM_RUNS) {
-               await new Promise(resolve => setTimeout(resolve, 2000));
-           }
-       } catch (error) {
-           console.error(`❌ Run ${i} failed:`, error.message);
-           // Continuer avec les autres runs même si un échoue
-       }
-   }
-   
-   // Générer le rapport consolidé
-   const debugDir = path.join(__dirname, '..', '..', 'debug');
-   const consolidatedReport = generateConsolidatedReport(allResults);
-   const consolidatedReportPath = path.join(debugDir, 'consolidated_report.json');
-   fs.writeFileSync(consolidatedReportPath, JSON.stringify(consolidatedReport, null, 2));
-   
-   console.log(`\n=== Final Summary (${NUM_RUNS} runs) ===`);
-   console.log(`Average success rate: ${consolidatedReport.summary.globalSuccessRate}`);
-   
-   // Calculer le meilleur et pire run pour l'affichage
-   if (allResults.length > 0) {
-       const bestRun = allResults.reduce((best, current) => 
-           parseFloat(current.globalSuccessRate) > parseFloat(best.globalSuccessRate) ? current : best
-       );
-       
-       const worstRun = allResults.reduce((worst, current) => 
-           parseFloat(current.globalSuccessRate) < parseFloat(worst.globalSuccessRate) ? current : worst
-       );
-       
-       console.log(`Best run: ${bestRun.globalSuccessRate}% (Run ${bestRun.runNumber})`);
-       console.log(`Worst run: ${worstRun.globalSuccessRate}% (Run ${worstRun.runNumber})`);
-   }
-   
-   log(`All reports saved in: ${debugDir}`);
-   log(`Consolidated report: ${consolidatedReportPath}`);
+  // Run NUM_RUNS evaluation passes, then write and print a consolidated report.
+  const NUM_RUNS = 1;
+  const allResults = [];
+
+  console.log(`🚀 Starting ${NUM_RUNS} successive test runs...`);
+
+  for (let i = 1; i <= NUM_RUNS; i++) {
+    try {
+      const result = await runSingleTest(i);
+      allResults.push(result);
+
+      // Short pause between runs to avoid overloading the backend
+      if (i < NUM_RUNS) {
+        await new Promise((resolve) => setTimeout(resolve, 2000));
+      }
+    } catch (error) {
+      console.error(`❌ Run ${i} failed:`, error.message);
+      // Keep going: one failed run must not abort the remaining runs
+    }
+  }
+
+  // Write the consolidated report. runSingleTest() can throw before it
+  // creates the debug directory, so make sure the directory exists here.
+  const debugDir = path.join(__dirname, '..', '..', 'debug');
+  fs.mkdirSync(debugDir, { recursive: true });
+  const consolidatedReport = generateConsolidatedReport(allResults);
+  const reportPath = path.join(debugDir, 'consolidated_report.json');
+  fs.writeFileSync(reportPath, JSON.stringify(consolidatedReport, null, 2));
+
+  // With no successful run the report is { error } and has no `summary`:
+  // fail with an explicit message instead of a TypeError on the next lines.
+  if (allResults.length === 0) {
+    throw new Error(`All ${NUM_RUNS} test runs failed`);
+  }
+
+  console.log(`\n=== Final Summary (${NUM_RUNS} runs) ===`);
+  console.log(
+    `Average success rate: ${consolidatedReport.summary.globalSuccessRate}`,
+  );
+
+  // Best and worst runs, for display only (allResults is non-empty here)
+  const bestRun = allResults.reduce((best, current) =>
+    parseFloat(current.globalSuccessRate) > parseFloat(best.globalSuccessRate)
+      ? current
+      : best,
+  );
+
+  const worstRun = allResults.reduce((worst, current) =>
+    parseFloat(current.globalSuccessRate) < parseFloat(worst.globalSuccessRate)
+      ? current
+      : worst,
+  );
+
+  console.log(
+    `Best run: ${bestRun.globalSuccessRate}% (Run ${bestRun.runNumber})`,
+  );
+  console.log(
+    `Worst run: ${worstRun.globalSuccessRate}% (Run ${worstRun.runNumber})`,
+  );
+
+  log(`All reports saved in: ${debugDir}`);
+  log(`Consolidated report: ${reportPath}`);
 }
 
-main().catch(error => {
-   console.error('❌ Fatal error:', error);
-   process.exit(1);
-});
\ No newline at end of file
+main().catch((error) => {
+  console.error('❌ Fatal error:', error);
+  process.exit(1);
+});

From 312897d3cf94d4b74002b0aa402faa4b92c6e299 Mon Sep 17 00:00:00 2001
From: alvinouille <alvinalesaint@protonmail.com>
Date: Wed, 16 Jul 2025 20:45:24 +0200
Subject: [PATCH 9/9] fix: port 3001 backend compose

---
 .github/scripts/starklings-evaluate.js | 4 ++--
 docker-compose.yml                     | 2 +-
 script-starklings.sh                   | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/scripts/starklings-evaluate.js b/.github/scripts/starklings-evaluate.js
index 69d73107..7370215e 100644
--- a/.github/scripts/starklings-evaluate.js
+++ b/.github/scripts/starklings-evaluate.js
@@ -98,7 +98,7 @@ async function testServerConnection() {
   log('Testing server connection...');
 
   try {
-    const response = await fetch('http://localhost:3002/', {
+    const response = await fetch('http://localhost:3001/', {
       method: 'GET',
       timeout: 5000,
     });
@@ -145,7 +145,7 @@ Please provide only the corrected code, without any additional explanation or ma
       log(`API call attempt ${attempt}/${retries} for ${exercise.name}`);
 
       const response = await fetch(
-        'http://localhost:3002/v1/chat/completions',
+        'http://localhost:3001/v1/chat/completions',
         {
           method: 'POST',
           headers: {
diff --git a/docker-compose.yml b/docker-compose.yml
index 8832ee3b..421f5dcd 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -19,7 +19,7 @@ services:
       dockerfile: backend.dockerfile
     container_name: 'cairo-coder-backend'
     ports:
-      - 3002:3001
+      - 3001:3001
     depends_on:
       postgres:
         condition: service_started
diff --git a/script-starklings.sh b/script-starklings.sh
index a5e28e45..2a24d930 100644
--- a/script-starklings.sh
+++ b/script-starklings.sh
@@ -26,7 +26,7 @@ fi
 cd ..
 
 # Vérifier si le serveur répond
-if ! curl -s http://localhost:3002/ > /dev/null 2>&1; then
+if ! curl -s http://localhost:3001/ > /dev/null 2>&1; then
     echo "❌ Server failed to start"
     kill $SERVER_PID 2>/dev/null || true
     exit 1