From fd06ed1e2f2fb663f6faa17fd7cad58f6ab0c9ad Mon Sep 17 00:00:00 2001 From: alvinouille Date: Fri, 11 Jul 2025 13:42:49 +0200 Subject: [PATCH 1/9] feat: add draft starklings ci, ok script starklings-evaluate and root script starklings-eval launch --- .github/scripts/starklings-evaluate.js | 353 +++++++++++++++++++++++++ .github/workflows/starklings.yml | 138 ++++++++++ script-starklings.sh | 25 ++ 3 files changed, 516 insertions(+) create mode 100644 .github/scripts/starklings-evaluate.js create mode 100644 .github/workflows/starklings.yml create mode 100644 script-starklings.sh diff --git a/.github/scripts/starklings-evaluate.js b/.github/scripts/starklings-evaluate.js new file mode 100644 index 00000000..93e2961a --- /dev/null +++ b/.github/scripts/starklings-evaluate.js @@ -0,0 +1,353 @@ +const fs = require('fs'); +const { execSync } = require('child_process'); +const path = require('path'); + +// Configuration de débogage +const DEBUG = true; +const SINGLE_EXERCISE = process.env.SINGLE_EXERCISE || null; // ex: "intro1" +const SAVE_RESPONSES = true; + +function log(message) { + if (DEBUG) { + console.log(`[DEBUG] ${message}`); + } +} + +function parseInfoToml(infoPath) { + // log(`Parsing info.toml from: ${infoPath}`); + + if (!fs.existsSync(infoPath)) { + throw new Error(`info.toml not found at: ${infoPath}`); + } + + const content = fs.readFileSync(infoPath, 'utf8'); + // log(`File content length: ${content.length} characters`); + + const exercises = []; + const lines = content.split('\n'); + let currentExercise = null; + let collectingHint = false; + let hintLines = []; + + for (let i = 0; i < lines.length; i++) { + const line = lines[i]; + const cleanLine = line.trim(); + + if (cleanLine.startsWith('[[exercises]]')) { + if (currentExercise) { + if (hintLines.length > 0) { + currentExercise.hint = hintLines.join('\n').replace(/^"""/, '').replace(/"""$/, ''); + } + exercises.push(currentExercise); + // log(`Added exercise: ${currentExercise.name}`); + } + currentExercise = {}; + collectingHint = false; + hintLines = []; + } else if (cleanLine.startsWith('hint = """')) { + collectingHint = true; + hintLines.push(cleanLine.replace('hint = """', '').trim()); + } else if (collectingHint) { + if (cleanLine.endsWith('"""')) { + hintLines.push(cleanLine.replace('"""', '').trim()); + collectingHint = false; + } else { + hintLines.push(cleanLine); + } + } else if (cleanLine.startsWith('name = ')) { + const match = cleanLine.match(/name = "(.+)"/); + if (match) { + currentExercise.name = match[1]; + } + } else if (cleanLine.startsWith('path = ')) { + const match = cleanLine.match(/path = "(.+)"/); + if (match) { + currentExercise.path = match[1]; + } + } else if (cleanLine.startsWith('mode = ')) { + const match = cleanLine.match(/mode = "(.+)"/); + if (match) { + currentExercise.mode = match[1]; + } + } + } + + // N'oublie pas le dernier exercice + if (currentExercise) { + if (hintLines.length > 0) { + currentExercise.hint = hintLines.join('\n').replace(/"""$/, ''); + } + exercises.push(currentExercise); + // log(`Added final exercise: ${currentExercise.name}`); + } + + // log(`Total exercises parsed: ${exercises.length}`); + return exercises; +} + +async function testServerConnection() { + log('Testing server connection...'); + + try { + const response = await fetch('http://localhost:3002/', { + method: 'GET', + timeout: 5000 + }); + + if (response.ok) { + log('✅ Server connection successful'); + return true; + } else { + log(`❌ Server responded with status: ${response.status}`); + return false; + } + } catch (error) { + log(`❌ Server connection failed: ${error.message}`); + return false; + } +} + +async function callCairoCoderAPI(exerciseContent, exercise) { + // log(`Calling API for exercise: ${exercise.name}`); + + const prompt = `You are solving a Cairo programming exercise. + +Exercise: ${exercise.name} +${exercise.hint ? `Hint: ${exercise.hint}` : ''} + +Instructions: +1. Read and understand the exercise requirements +2. Fix any compilation errors +3. Remove the "// I AM NOT DONE" comment when complete +4. Ensure the solution demonstrates the intended concept +5. The solution must be in the same language as the exercise (Cairo) + +Code to fix: +${exerciseContent} + +Please provide only the corrected code, without any additional explanation or markdown formatting.`; + + const requestBody = { + model: 'cairo-coder', + messages: [{ role: 'user', content: prompt }], + stream: false + }; + + // log(`Request body size: ${JSON.stringify(requestBody).length} characters`); + + try { + const response = await fetch('http://localhost:3002/v1/chat/completions', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify(requestBody), + timeout: 60000 // 60 secondes + }); + + if (!response.ok) { + const errorText = await response.text(); + log(`API Error - Status: ${response.status}, Response: ${errorText}`); + throw new Error(`HTTP error! status: ${response.status} - ${errorText}`); + } + + const data = await response.json(); + // log(`API Response received, data structure: ${Object.keys(data).join(', ')}`); + + // Sauvegarder la réponse complète si demandé + if (SAVE_RESPONSES) { + const responseFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_response.json`); + fs.mkdirSync(path.dirname(responseFile), { recursive: true }); + fs.writeFileSync(responseFile, JSON.stringify(data, null, 2)); + // log(`Response saved to: ${responseFile}`); + } + + // Extraire le contenu de la réponse + if (data.choices && data.choices[0] && data.choices[0].message) { + const rawContent = data.choices[0].message.content; + const cleanCode = extractCairoCode(rawContent); + // log(`Generated code length: ${cleanCode.length} characters`); + // log(`Raw response length: ${rawContent.length} characters`); + return cleanCode; + } else { + log(`Invalid response format: ${JSON.stringify(data)}`); + throw new Error('Invalid response format from API'); + } + } catch (error) { + log(`API call failed: ${error.message}`); + throw error; + } +} + +async function testExercise(exercise, starklingsPath) { + log(`\n=== Testing exercise: ${exercise.name} ===`); + + const exercisePath = path.join(starklingsPath, exercise.path); + // log(`Exercise path: ${exercisePath}`); + + if (!fs.existsSync(exercisePath)) { + log(`❌ Exercise file not found: ${exercisePath}`); + return false; + } + + // Lire le contenu original + const originalContent = fs.readFileSync(exercisePath, 'utf8'); + // log(`Original file size: ${originalContent.length} characters`); + + // Sauvegarder l'original + const backupPath = exercisePath + '.backup'; + fs.writeFileSync(backupPath, originalContent); + // log(`Backup saved to: ${backupPath}`); + + try { + // Appeler l'API + const correctedCode = await callCairoCoderAPI(originalContent, exercise); + + // Sauvegarder la solution + fs.writeFileSync(exercisePath, correctedCode); + log(`Updated exercise file with generated code`); + + // Sauvegarder la solution générée pour debug + if (SAVE_RESPONSES) { + const solutionFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_solution.cairo`); + fs.mkdirSync(path.dirname(solutionFile), { recursive: true }); + fs.writeFileSync(solutionFile, correctedCode); + // log(`Solution saved to: ${solutionFile}`); + } + + // Tester la solution + try { + log(`Running starklings for ${exercise.name}...`); + const result = execSync(`cargo run --bin starklings run ${exercise.name} 2>/dev/null`, { + cwd: starklingsPath, + stdio: 'pipe', + timeout: 300000, + encoding: 'utf8' + }); + + log(`✅ ${exercise.name} - Success`); + log(`Starklings output: ${result.substring(0, 200)}...`); + return true; + } catch (error) { + log(`❌ ${exercise.name} - Execution failed`); + log(`Error code: ${error.status}`); + log(`stdout: ${error.stdout ? error.stdout.substring(0, 500) : 'none'}`); + log(`stderr: ${error.stderr ? error.stderr.substring(0, 500) : 'none'}`); + + // Sauvegarder l'erreur pour debug + if (SAVE_RESPONSES) { + const errorFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_error.txt`); + fs.writeFileSync(errorFile, `Exit code: ${error.status}\n\nSTDOUT:\n${error.stdout}\n\nSTDERR:\n${error.stderr}`); + log(`Error details saved to: ${errorFile}`); + } + + return false; + } + } catch (error) { + log(`❌ ${exercise.name} - API call failed: ${error.message}`); + return false; + } finally { + // Restaurer l'original + fs.writeFileSync(exercisePath, originalContent); + fs.unlinkSync(backupPath); + log(`Restored original file and cleaned up backup`); + } +} + +function extractCairoCode(generatedResponse) { + // Chercher les blocs de code Cairo ou génériques + const codeBlockRegex = /```(?:cairo|rust|)?\s*\n([\s\S]*?)\n```/g; + const matches = generatedResponse.match(codeBlockRegex); + + if (matches && matches.length > 0) { + // Extraire le contenu du premier bloc de code trouvé + const codeBlock = matches[0]; + const codeContent = codeBlock.replace(/```(?:cairo|rust|)?\s*\n/, '').replace(/\n```$/, ''); + return codeContent.trim(); + } + + // Si pas de bloc de code trouvé, retourner le texte tel quel + return generatedResponse.trim(); +} + +async function main() { + // log('=== Starting Starklings Debug Session ==='); + + const starklingsPath = path.join(process.cwd(), 'starklings'); + const infoPath = path.join(starklingsPath, 'info.toml'); + + // Vérifications initiales + // log(`Working directory: ${process.cwd()}`); + // log(`Starklings path: ${starklingsPath}`); + // log(`Info.toml path: ${infoPath}`); + + if (!fs.existsSync(starklingsPath)) { + console.error('❌ Starklings directory not found'); + process.exit(1); + } + + if (!fs.existsSync(infoPath)) { + console.error('❌ info.toml not found in starklings directory'); + process.exit(1); + } + + // Tester la connexion au serveur + const serverOk = await testServerConnection(); + if (!serverOk) { + console.error('❌ Server is not accessible'); + process.exit(1); + } + + // Parser les exercices + const exercises = parseInfoToml(infoPath); + + if (exercises.length === 0) { + console.error('❌ No exercises found'); + process.exit(1); + } + + // Filtrer à un seul exercice si demandé + let exercisesToTest = exercises; + if (SINGLE_EXERCISE) { + exercisesToTest = exercises.filter(ex => ex.name === SINGLE_EXERCISE); + if (exercisesToTest.length === 0) { + console.error(`❌ Exercise '${SINGLE_EXERCISE}' not found`); + console.log('Available exercises:', exercises.map(ex => ex.name).join(', ')); + process.exit(1); + } + // log(`Testing single exercise: ${SINGLE_EXERCISE}`); + } + + // Créer le dossier de debug + const debugDir = path.join(__dirname, '..', '..', 'debug'); + fs.mkdirSync(debugDir, { recursive: true }); + + // Tester les exercices + let passed = 0; + let total = exercisesToTest.length; + + console.log(`\n🧪 Starting evaluation of ${total} exercises...`); + + for (const exercise of exercisesToTest) { + const success = await testExercise(exercise, starklingsPath); + if (success) { + passed++; + } + + // Pause entre les exercices pour éviter la surcharge + if (exercisesToTest.length > 1) { + await new Promise(resolve => setTimeout(resolve, 1000)); + } + } + + console.log(`\n=== Final Results ===`); + console.log(`${passed}/${total} exercises passed (${(passed/total*100).toFixed(1)}%)`); + + log(`Debug files saved in: ${debugDir}`); + // log('=== Debug Session Complete ==='); +} + +main().catch(error => { + console.error('❌ Fatal error:', error); + process.exit(1); +}); \ No newline at end of file diff --git a/.github/workflows/starklings.yml b/.github/workflows/starklings.yml new file mode 100644 index 00000000..baf5f490 --- /dev/null +++ b/.github/workflows/starklings.yml @@ -0,0 +1,138 @@ +name: Starklings Benchmark + +on: + push: + branches: [main] + pull_request: + branches: [main] + workflow_dispatch: + +jobs: + starklings-benchmark: + name: Starklings Benchmark + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + + - name: Setup Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true + + - name: Install pnpm + uses: pnpm/action-setup@v3 + with: + version: 9 + + - name: Install dependencies + run: pnpm install + + - name: Build Cairo Coder + run: pnpm build + + - name: Setup PostgreSQL + uses: harmon758/postgresql-action@v1 + with: + postgresql version: '15' + postgresql db: 'cairo_coder_test' + postgresql user: 'test_user' + postgresql password: 'test_password' + + - name: Install PostgreSQL client and pgvector + run: | + sudo apt-get update + sudo apt-get install -y postgresql-client-15 + sudo -u postgres psql -c "CREATE EXTENSION IF NOT EXISTS vector;" + + - name: Setup test configuration + run: | + mkdir -p packages/agents + cat > packages/agents/config.toml << 'EOL' + [API_KEYS] + OPENAI = "${{ secrets.OPENAI_API_KEY }}" + ANTHROPIC = "${{ secrets.ANTHROPIC_API_KEY }}" + GEMINI = "${{ secrets.GEMINI_API_KEY }}" + + [VECTOR_DB] + POSTGRES_USER = "test_user" + POSTGRES_HOST = "localhost" + POSTGRES_DB = "cairo_coder_test" + POSTGRES_PASSWORD = "test_password" + POSTGRES_PORT = "5432" + + [GENERAL] + PORT = 3001 + SIMILARITY_MEASURE = "cosine" + + [PROVIDERS] + DEFAULT_CHAT_PROVIDER = "gemini" + DEFAULT_CHAT_MODEL = "Gemini Flash 2.5" + DEFAULT_FAST_CHAT_PROVIDER = "gemini" + DEFAULT_FAST_CHAT_MODEL = "Gemini Flash 2.5" + DEFAULT_EMBEDDING_PROVIDER = "openai" + DEFAULT_EMBEDDING_MODEL = "Text embedding 3 large" + + [VERSIONS] + STARKNET_FOUNDRY = "0.37.0" + SCARB = "2.9.2" + EOL + + - name: Create env file + run: | + cat > .env << 'EOL' + POSTGRES_USER=test_user + POSTGRES_HOST=localhost + POSTGRES_DB=cairo_coder_test + POSTGRES_PASSWORD=test_password + POSTGRES_PORT=5432 + EOL + + - name: Clone Starklings + run: | + if [ ! -d "starklings" ]; then + git clone https://github.com/starknet-edu/starklings.git + fi + + - name: Install Scarb + run: | + curl --proto '=https' --tlsv1.2 -sSf https://docs.swmansion.com/scarb/install.sh | sh + echo "$HOME/.local/bin" >> $GITHUB_PATH + + - name: Start Cairo Coder (background) + run: | + pnpm start & + # Attendre que le serveur démarre + for i in {1..30}; do + if curl -s http://localhost:3001/ > /dev/null; then + echo "Server is ready" + break + fi + echo "Waiting for server... ($i/30)" + sleep 2 + done + + # Vérifier si le serveur est vraiment prêt + if ! curl -s http://localhost:3001/ > /dev/null; then + echo "Server failed to start" + exit 1 + fi + + - name: Run Starklings Evaluation + run: node .github/scripts/starklings-evaluate.js + timeout-minutes: 30 + + - name: Upload results + if: always() + uses: actions/upload-artifact@v4 + with: + name: starklings-results + path: | + starklings/ + *.log \ No newline at end of file diff --git a/script-starklings.sh b/script-starklings.sh new file mode 100644 index 00000000..5d9cf978 --- /dev/null +++ b/script-starklings.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Vérifier si le serveur répond +if ! curl -s http://localhost:3002/ > /dev/null 2>&1; then + echo "❌ Server failed to start" + kill $SERVER_PID 2>/dev/null || true + exit 1 +fi + +# 8. Lancer le test avec un seul exercice +echo "🎯 Running single Starklings evaluation..." + +# SINGLE_EXERCISE=variables2 node .github/scripts/starklings-evaluate.js +node .github/scripts/starklings-evaluate.js + +# 9. Nettoyer +echo "🧹 Cleaning up..." +kill $SERVER_PID 2>/dev/null || true + +if command -v docker &> /dev/null; then + docker stop cairo-coder-test-db 2>/dev/null || true + docker rm cairo-coder-test-db 2>/dev/null || true +fi + +echo "✅ Test completed!" \ No newline at end of file From 90ae8e37ac72cc4f0a484f7b31d8c93f9f66362e Mon Sep 17 00:00:00 2001 From: alvinouille Date: Mon, 14 Jul 2025 14:40:28 +0200 Subject: [PATCH 2/9] working in sequential --- .gitignore | 4 +++- docker-compose.yml | 2 +- script-starklings.sh | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index c5e154f3..5bc944bf 100644 --- a/.gitignore +++ b/.gitignore @@ -47,4 +47,6 @@ packages/**/dist .trunk !.trunk/trunk.yaml !.trunk/configs -!.trunk/.gitignore \ No newline at end of file +!.trunk/.gitignore + +starklings/ \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 421f5dcd..8832ee3b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,7 +19,7 @@ services: dockerfile: backend.dockerfile container_name: 'cairo-coder-backend' ports: - - 3001:3001 + - 3002:3001 depends_on: postgres: condition: service_started diff --git a/script-starklings.sh b/script-starklings.sh index 5d9cf978..1e5c89b5 100644 --- a/script-starklings.sh +++ b/script-starklings.sh @@ -10,8 +10,8 @@ fi # 8. Lancer le test avec un seul exercice echo "🎯 Running single Starklings evaluation..." -# SINGLE_EXERCISE=variables2 node .github/scripts/starklings-evaluate.js -node .github/scripts/starklings-evaluate.js +SINGLE_EXERCISE=primitive_types2 node .github/scripts/starklings-evaluate.js +# node .github/scripts/starklings-evaluate.js # 9. Nettoyer echo "🧹 Cleaning up..." From a8cb875ebb2a6125a14cd1437da6ec943000806d Mon Sep 17 00:00:00 2001 From: alvinouille Date: Mon, 14 Jul 2025 17:35:56 +0200 Subject: [PATCH 3/9] parallelization of starklings evaluation --- .github/scripts/starklings-evaluate.js | 186 +++++++++++++++++-------- .gitignore | 3 +- script-starklings.sh | 4 +- 3 files changed, 133 insertions(+), 60 deletions(-) diff --git a/.github/scripts/starklings-evaluate.js b/.github/scripts/starklings-evaluate.js index 93e2961a..5a45ffd1 100644 --- a/.github/scripts/starklings-evaluate.js +++ b/.github/scripts/starklings-evaluate.js @@ -14,17 +14,15 @@ function log(message) { } function parseInfoToml(infoPath) { - // log(`Parsing info.toml from: ${infoPath}`); - if (!fs.existsSync(infoPath)) { throw new Error(`info.toml not found at: ${infoPath}`); } const content = fs.readFileSync(infoPath, 'utf8'); - // log(`File content length: ${content.length} characters`); - - const exercises = []; const lines = content.split('\n'); + + const categories = {}; + let currentCategory = null; let currentExercise = null; let collectingHint = false; let hintLines = []; @@ -33,15 +31,23 @@ function parseInfoToml(infoPath) { const line = lines[i]; const cleanLine = line.trim(); + // Détecter les catégories + if (cleanLine.startsWith('# ') && !cleanLine.startsWith('##')) { + currentCategory = cleanLine.substring(2).trim(); + categories[currentCategory] = []; + continue; + } + if (cleanLine.startsWith('[[exercises]]')) { if (currentExercise) { if (hintLines.length > 0) { currentExercise.hint = hintLines.join('\n').replace(/^"""/, '').replace(/"""$/, ''); } - exercises.push(currentExercise); - // log(`Added exercise: ${currentExercise.name}`); + if (currentCategory) { + categories[currentCategory].push(currentExercise); + } } - currentExercise = {}; + currentExercise = { category: currentCategory }; collectingHint = false; hintLines = []; } else if (cleanLine.startsWith('hint = """')) { @@ -77,12 +83,12 @@ function parseInfoToml(infoPath) { if (hintLines.length > 0) { currentExercise.hint = hintLines.join('\n').replace(/"""$/, ''); } - exercises.push(currentExercise); - // log(`Added final exercise: ${currentExercise.name}`); + if (currentCategory) { + categories[currentCategory].push(currentExercise); + } } - // log(`Total exercises parsed: ${exercises.length}`); - return exercises; + return categories; } async function testServerConnection() { @@ -227,13 +233,20 @@ async function testExercise(exercise, starklingsPath) { log(`✅ ${exercise.name} - Success`); log(`Starklings output: ${result.substring(0, 200)}...`); - return true; + return { success: true }; } catch (error) { log(`❌ ${exercise.name} - Execution failed`); log(`Error code: ${error.status}`); log(`stdout: ${error.stdout ? error.stdout.substring(0, 500) : 'none'}`); log(`stderr: ${error.stderr ? error.stderr.substring(0, 500) : 'none'}`); + // Formater l'erreur pour le rapport + const errorDetails = { + exitCode: error.status, + stdout: error.stdout || '', + stderr: error.stderr || '' + }; + // Sauvegarder l'erreur pour debug if (SAVE_RESPONSES) { const errorFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_error.txt`); @@ -241,11 +254,11 @@ async function testExercise(exercise, starklingsPath) { log(`Error details saved to: ${errorFile}`); } - return false; + return { success: false, error: errorDetails }; } } catch (error) { log(`❌ ${exercise.name} - API call failed: ${error.message}`); - return false; + return { success: false, error: { message: error.message, type: 'API_ERROR' } }; } finally { // Restaurer l'original fs.writeFileSync(exercisePath, originalContent); @@ -254,6 +267,47 @@ async function testExercise(exercise, starklingsPath) { } } +async function processCategoryWorker(categoryName, exercises, starklingsPath) { + const categoryResults = { + category: categoryName, + exercises: [], + passed: 0, + total: exercises.length + }; + + log(`\n[${categoryName}] Starting ${exercises.length} exercises...`); + + for (const exercise of exercises) { + const result = await testExercise(exercise, starklingsPath); + + const exerciseResult = { + name: exercise.name, + success: result.success + }; + + // Ajouter les erreurs seulement si échec + if (!result.success && result.error) { + exerciseResult.error = result.error; + } + + categoryResults.exercises.push(exerciseResult); + if (result.success) { + categoryResults.passed++; + } + + log(`[${categoryName}] ${exercise.name}: ${result.success ? '✅' : '❌'}`); + } + + categoryResults.successRate = (categoryResults.passed / categoryResults.total * 100).toFixed(1); + + // Sauvegarder le rapport de catégorie + const reportPath = path.join(__dirname, '..', '..', 'debug', `${categoryName.toLowerCase().replace(/\s+/g, '_')}_report.json`); + fs.writeFileSync(reportPath, JSON.stringify(categoryResults, null, 2)); + + log(`[${categoryName}] Completed: ${categoryResults.passed}/${categoryResults.total} (${categoryResults.successRate}%)`); + return categoryResults; +} + function extractCairoCode(generatedResponse) { // Chercher les blocs de code Cairo ou génériques const codeBlockRegex = /```(?:cairo|rust|)?\s*\n([\s\S]*?)\n```/g; @@ -271,16 +325,9 @@ function extractCairoCode(generatedResponse) { } async function main() { - // log('=== Starting Starklings Debug Session ==='); - const starklingsPath = path.join(process.cwd(), 'starklings'); const infoPath = path.join(starklingsPath, 'info.toml'); - - // Vérifications initiales - // log(`Working directory: ${process.cwd()}`); - // log(`Starklings path: ${starklingsPath}`); - // log(`Info.toml path: ${infoPath}`); - + if (!fs.existsSync(starklingsPath)) { console.error('❌ Starklings directory not found'); process.exit(1); @@ -298,53 +345,78 @@ async function main() { process.exit(1); } - // Parser les exercices - const exercises = parseInfoToml(infoPath); + // Parser les exercices par catégorie + const categories = parseInfoToml(infoPath); - if (exercises.length === 0) { - console.error('❌ No exercises found'); + if (Object.keys(categories).length === 0) { + console.error('❌ No categories found'); process.exit(1); } - - // Filtrer à un seul exercice si demandé - let exercisesToTest = exercises; + + // Filtrer à une seule catégorie si demandé + let categoriesToTest = categories; if (SINGLE_EXERCISE) { - exercisesToTest = exercises.filter(ex => ex.name === SINGLE_EXERCISE); - if (exercisesToTest.length === 0) { + // Trouver la catégorie contenant l'exercice + let foundCategory = null; + for (const [categoryName, exercises] of Object.entries(categories)) { + if (exercises.some(ex => ex.name === SINGLE_EXERCISE)) { + foundCategory = categoryName; + break; + } + } + + if (!foundCategory) { console.error(`❌ Exercise '${SINGLE_EXERCISE}' not found`); - console.log('Available exercises:', exercises.map(ex => ex.name).join(', ')); process.exit(1); } - // log(`Testing single exercise: ${SINGLE_EXERCISE}`); + + categoriesToTest = { + [foundCategory]: categories[foundCategory].filter(ex => ex.name === SINGLE_EXERCISE) + }; + log(`Testing single exercise: ${SINGLE_EXERCISE} in category: ${foundCategory}`); } - + // Créer le dossier de debug const debugDir = path.join(__dirname, '..', '..', 'debug'); fs.mkdirSync(debugDir, { recursive: true }); - - // Tester les exercices - let passed = 0; - let total = exercisesToTest.length; - - console.log(`\n🧪 Starting evaluation of ${total} exercises...`); - - for (const exercise of exercisesToTest) { - const success = await testExercise(exercise, starklingsPath); - if (success) { - passed++; - } - - // Pause entre les exercices pour éviter la surcharge - if (exercisesToTest.length > 1) { - await new Promise(resolve => setTimeout(resolve, 1000)); - } - } - + + // Calculer le total d'exercices + const totalExercises = Object.values(categoriesToTest).reduce((sum, exercises) => sum + exercises.length, 0); + console.log(`\n🧪 Starting evaluation of ${totalExercises} exercises across ${Object.keys(categoriesToTest).length} categories...`); + + // Traiter les catégories en parallèle + const startTime = Date.now(); + const categoryPromises = Object.entries(categoriesToTest).map(([categoryName, exercises]) => + processCategoryWorker(categoryName, exercises, starklingsPath) + ); + + const categoryResults = await Promise.all(categoryPromises); + const endTime = Date.now(); + + // Consolider les résultats + const totalPassed = categoryResults.reduce((sum, result) => sum + result.passed, 0); + const globalResults = { + totalExercises: totalExercises, + totalPassed: totalPassed, + globalSuccessRate: (totalPassed / totalExercises * 100).toFixed(1), + categories: categoryResults + }; + + // Sauvegarder le rapport global + const globalReportPath = path.join(debugDir, 'global_report.json'); + fs.writeFileSync(globalReportPath, JSON.stringify(globalResults, null, 2)); + console.log(`\n=== Final Results ===`); - console.log(`${passed}/${total} exercises passed (${(passed/total*100).toFixed(1)}%)`); + console.log(`${totalPassed}/${totalExercises} exercises passed (${globalResults.globalSuccessRate}%)`); + console.log(`Total time: ${(endTime - startTime) / 1000}s`); + console.log(`\nCategory breakdown:`); - log(`Debug files saved in: ${debugDir}`); - // log('=== Debug Session Complete ==='); + categoryResults.forEach(result => { + console.log(` ${result.category}: ${result.passed}/${result.total} (${result.successRate}%)`); + }); + + log(`Reports saved in: ${debugDir}`); + log(`Global report: ${globalReportPath}`); } main().catch(error => { diff --git a/.gitignore b/.gitignore index 5bc944bf..4efc92a5 100644 --- a/.gitignore +++ b/.gitignore @@ -49,4 +49,5 @@ packages/**/dist !.trunk/configs !.trunk/.gitignore -starklings/ \ No newline at end of file +starklings/ +debug/ \ No newline at end of file diff --git a/script-starklings.sh b/script-starklings.sh index 1e5c89b5..0ad3f18a 100644 --- a/script-starklings.sh +++ b/script-starklings.sh @@ -10,8 +10,8 @@ fi # 8. Lancer le test avec un seul exercice echo "🎯 Running single Starklings evaluation..." -SINGLE_EXERCISE=primitive_types2 node .github/scripts/starklings-evaluate.js -# node .github/scripts/starklings-evaluate.js +# SINGLE_EXERCISE=primitive_types2 node .github/scripts/starklings-evaluate.js +node .github/scripts/starklings-evaluate.js # 9. Nettoyer echo "🧹 Cleaning up..." From 4834388282959820bab1247b8ef31d0f01e4394e Mon Sep 17 00:00:00 2001 From: alvinouille Date: Mon, 14 Jul 2025 18:11:06 +0200 Subject: [PATCH 4/9] add 10 launch of evaluation and global report --- .github/scripts/starklings-evaluate.js | 736 ++++++++++++++----------- 1 file changed, 408 insertions(+), 328 deletions(-) diff --git a/.github/scripts/starklings-evaluate.js b/.github/scripts/starklings-evaluate.js index 5a45ffd1..7cd78bdd 100644 --- a/.github/scripts/starklings-evaluate.js +++ b/.github/scripts/starklings-evaluate.js @@ -8,114 +8,112 @@ const SINGLE_EXERCISE = process.env.SINGLE_EXERCISE || null; // ex: "intro1" const SAVE_RESPONSES = true; function log(message) { - if (DEBUG) { - console.log(`[DEBUG] ${message}`); - } + if (DEBUG) { + console.log(`[DEBUG] ${message}`); + } } function parseInfoToml(infoPath) { - if (!fs.existsSync(infoPath)) { - throw new Error(`info.toml not found at: ${infoPath}`); - } - - const content = fs.readFileSync(infoPath, 'utf8'); - const lines = content.split('\n'); - - const categories = {}; - let currentCategory = null; - let currentExercise = null; - let collectingHint = false; - let hintLines = []; - - for (let i = 0; i < lines.length; i++) { - const line = lines[i]; - const cleanLine = line.trim(); - - // Détecter les catégories - if (cleanLine.startsWith('# ') && !cleanLine.startsWith('##')) { - currentCategory = cleanLine.substring(2).trim(); - categories[currentCategory] = []; - continue; - } - - if (cleanLine.startsWith('[[exercises]]')) { - if (currentExercise) { - if (hintLines.length > 0) { - currentExercise.hint = hintLines.join('\n').replace(/^"""/, '').replace(/"""$/, ''); - } - if (currentCategory) { - categories[currentCategory].push(currentExercise); - } - } - currentExercise = { category: currentCategory }; - collectingHint = false; - hintLines = []; - } else if (cleanLine.startsWith('hint = """')) { - collectingHint = true; - hintLines.push(cleanLine.replace('hint = """', '').trim()); - } else if (collectingHint) { - if (cleanLine.endsWith('"""')) { - hintLines.push(cleanLine.replace('"""', '').trim()); - collectingHint = false; - } else { - hintLines.push(cleanLine); - } - } else if (cleanLine.startsWith('name = ')) { - const match = cleanLine.match(/name = "(.+)"/); - if (match) { - currentExercise.name = match[1]; - } - } else if (cleanLine.startsWith('path = ')) { - const match = cleanLine.match(/path = "(.+)"/); - if (match) { - currentExercise.path = match[1]; - } - } else if (cleanLine.startsWith('mode = ')) { - const match = cleanLine.match(/mode = "(.+)"/); - if (match) { - currentExercise.mode = match[1]; - } - } - } - - // N'oublie pas le dernier exercice - if (currentExercise) { - if (hintLines.length > 0) { - currentExercise.hint = hintLines.join('\n').replace(/"""$/, ''); - } - if (currentCategory) { - categories[currentCategory].push(currentExercise); - } - } - - return categories; + if (!fs.existsSync(infoPath)) { + throw new Error(`info.toml not found at: ${infoPath}`); + } + + const content = fs.readFileSync(infoPath, 'utf8'); + const lines = content.split('\n'); + + const categories = {}; + let currentCategory = null; + let currentExercise = null; + let collectingHint = false; + let hintLines = []; + + for (let i = 0; i < lines.length; i++) { + const line = lines[i]; + const cleanLine = line.trim(); + + // Détecter les catégories + if (cleanLine.startsWith('# ') && !cleanLine.startsWith('##')) { + currentCategory = cleanLine.substring(2).trim(); + categories[currentCategory] = []; + continue; + } + + if (cleanLine.startsWith('[[exercises]]')) { + if (currentExercise) { + if (hintLines.length > 0) { + currentExercise.hint = hintLines.join('\n').replace(/^"""/, '').replace(/"""$/, ''); + } + if (currentCategory) { + categories[currentCategory].push(currentExercise); + } + } + currentExercise = { category: currentCategory }; + collectingHint = false; + hintLines = []; + } else if (cleanLine.startsWith('hint = """')) { + collectingHint = true; + hintLines.push(cleanLine.replace('hint = """', '').trim()); + } else if (collectingHint) { + if (cleanLine.endsWith('"""')) { + hintLines.push(cleanLine.replace('"""', '').trim()); + collectingHint = false; + } else { + hintLines.push(cleanLine); + } + } else if (cleanLine.startsWith('name = ')) { + const match = cleanLine.match(/name = "(.+)"/); + if (match) { + currentExercise.name = match[1]; + } + } else if (cleanLine.startsWith('path = ')) { + const match = cleanLine.match(/path = "(.+)"/); + if (match) { + currentExercise.path = match[1]; + } + } else if (cleanLine.startsWith('mode = ')) { + const match = cleanLine.match(/mode = "(.+)"/); + if (match) { + currentExercise.mode = match[1]; + } + } + } + + // N'oublie pas le dernier exercice + if (currentExercise) { + if (hintLines.length > 0) { + currentExercise.hint = hintLines.join('\n').replace(/"""$/, ''); + } + if (currentCategory) { + categories[currentCategory].push(currentExercise); + } + } + + return categories; } async function testServerConnection() { - log('Testing server connection...'); - - try { - const response = await fetch('http://localhost:3002/', { - method: 'GET', - timeout: 5000 - }); - - if (response.ok) { - log('✅ Server connection successful'); - return true; - } else { - log(`❌ Server responded with status: ${response.status}`); - return false; - } - } catch (error) { - log(`❌ Server connection failed: ${error.message}`); - return false; - } + log('Testing server connection...'); + + try { + const response = await fetch('http://localhost:3002/', { + method: 'GET', + timeout: 5000 + }); + + if (response.ok) { + log('✅ Server connection successful'); + return true; + } else { + log(`❌ Server responded with status: ${response.status}`); + return false; + } + } catch (error) { + log(`❌ Server connection failed: ${error.message}`); + return false; + } } -async function callCairoCoderAPI(exerciseContent, exercise) { - // log(`Calling API for exercise: ${exercise.name}`); - +async function callCairoCoderAPI(exerciseContent, exercise, retries = 3) { const prompt = `You are solving a Cairo programming exercise. Exercise: ${exercise.name} @@ -139,135 +137,137 @@ Please provide only the corrected code, without any additional explanation or ma stream: false }; - // log(`Request body size: ${JSON.stringify(requestBody).length} characters`); - - try { - const response = await fetch('http://localhost:3002/v1/chat/completions', { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - }, - body: JSON.stringify(requestBody), - timeout: 60000 // 60 secondes - }); - - if (!response.ok) { - const errorText = await response.text(); - log(`API Error - Status: ${response.status}, Response: ${errorText}`); - throw new Error(`HTTP error! status: ${response.status} - ${errorText}`); - } - - const data = await response.json(); - // log(`API Response received, data structure: ${Object.keys(data).join(', ')}`); - - // Sauvegarder la réponse complète si demandé - if (SAVE_RESPONSES) { - const responseFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_response.json`); - fs.mkdirSync(path.dirname(responseFile), { recursive: true }); - fs.writeFileSync(responseFile, JSON.stringify(data, null, 2)); - // log(`Response saved to: ${responseFile}`); - } - - // Extraire le contenu de la réponse - if (data.choices && data.choices[0] && data.choices[0].message) { - const rawContent = data.choices[0].message.content; - const cleanCode = extractCairoCode(rawContent); - // log(`Generated code length: ${cleanCode.length} characters`); - // log(`Raw response length: ${rawContent.length} characters`); - return cleanCode; - } else { - log(`Invalid response format: ${JSON.stringify(data)}`); - throw new Error('Invalid response format from API'); - } - } catch (error) { - log(`API call failed: ${error.message}`); - throw error; - } -} - -async function testExercise(exercise, starklingsPath) { - log(`\n=== Testing exercise: ${exercise.name} ===`); - - const exercisePath = path.join(starklingsPath, exercise.path); - // log(`Exercise path: ${exercisePath}`); - - if (!fs.existsSync(exercisePath)) { - log(`❌ Exercise file not found: ${exercisePath}`); - return false; - } - - // Lire le contenu original - const originalContent = fs.readFileSync(exercisePath, 'utf8'); - // log(`Original file size: ${originalContent.length} characters`); - - // Sauvegarder l'original - const backupPath = exercisePath + '.backup'; - fs.writeFileSync(backupPath, originalContent); - // log(`Backup saved to: ${backupPath}`); - - try { - // Appeler l'API - const correctedCode = await callCairoCoderAPI(originalContent, exercise); - - // Sauvegarder la solution - fs.writeFileSync(exercisePath, correctedCode); - log(`Updated exercise file with generated code`); - - // Sauvegarder la solution générée pour debug - if (SAVE_RESPONSES) { - const solutionFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_solution.cairo`); - fs.mkdirSync(path.dirname(solutionFile), { recursive: true }); - fs.writeFileSync(solutionFile, correctedCode); - // log(`Solution saved to: ${solutionFile}`); - } - - // Tester la solution + for (let attempt = 1; attempt <= retries; attempt++) { try { - log(`Running starklings for ${exercise.name}...`); - const result = execSync(`cargo run --bin starklings run ${exercise.name} 2>/dev/null`, { - cwd: starklingsPath, - stdio: 'pipe', - timeout: 300000, - encoding: 'utf8' + log(`API call attempt ${attempt}/${retries} for ${exercise.name}`); + + const response = await fetch('http://localhost:3002/v1/chat/completions', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify(requestBody), + timeout: 120000 // 2 minutes au lieu de 60 secondes }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error(`HTTP error! status: ${response.status} - ${errorText}`); + } + + const data = await response.json(); - log(`✅ ${exercise.name} - Success`); - log(`Starklings output: ${result.substring(0, 200)}...`); - return { success: true }; - } catch (error) { - log(`❌ ${exercise.name} - Execution failed`); - log(`Error code: ${error.status}`); - log(`stdout: ${error.stdout ? error.stdout.substring(0, 500) : 'none'}`); - log(`stderr: ${error.stderr ? error.stderr.substring(0, 500) : 'none'}`); + // Sauvegarder la réponse complète si demandé + if (SAVE_RESPONSES) { + const responseFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_response.json`); + fs.mkdirSync(path.dirname(responseFile), { recursive: true }); + fs.writeFileSync(responseFile, JSON.stringify(data, null, 2)); + } - // Formater l'erreur pour le rapport - const errorDetails = { - exitCode: error.status, - stdout: error.stdout || '', - stderr: error.stderr || '' - }; + // Extraire le contenu de la réponse + if (data.choices && data.choices[0] && data.choices[0].message) { + const rawContent = data.choices[0].message.content; + const cleanCode = extractCairoCode(rawContent); + log(`✅ API call successful for ${exercise.name}`); + return cleanCode; + } else { + throw new Error('Invalid response format from API'); + } - // Sauvegarder l'erreur pour debug - if (SAVE_RESPONSES) { - const errorFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_error.txt`); - fs.writeFileSync(errorFile, `Exit code: ${error.status}\n\nSTDOUT:\n${error.stdout}\n\nSTDERR:\n${error.stderr}`); - log(`Error details saved to: ${errorFile}`); + } catch (error) { + log(`❌ API call failed (attempt ${attempt}/${retries}) for ${exercise.name}: ${error.message}`); + + if (attempt === retries) { + throw error; // Dernier essai, on lance l'erreur } - return { success: false, error: errorDetails }; + // Attendre de plus en plus longtemps à chaque retry + const waitTime = 3000 * attempt; // 3s, 6s, 9s + log(`Waiting ${waitTime}ms before retry...`); + await new Promise(resolve => setTimeout(resolve, waitTime)); } - } catch (error) { - log(`❌ ${exercise.name} - API call failed: ${error.message}`); - return { success: false, error: { message: error.message, type: 'API_ERROR' } }; - } finally { - // Restaurer l'original - fs.writeFileSync(exercisePath, originalContent); - fs.unlinkSync(backupPath); - log(`Restored original file and cleaned up backup`); } } -async function processCategoryWorker(categoryName, exercises, starklingsPath) { +async function testExercise(exercise, starklingsPath, runNumber = 1) { + log(`\n=== Testing exercise: ${exercise.name} ===`); + + const exercisePath = path.join(starklingsPath, exercise.path); + + if (!fs.existsSync(exercisePath)) { + log(`❌ Exercise file not found: ${exercisePath}`); + return { success: false, error: { message: 'File not found', type: 'FILE_ERROR' } }; + } + + // Lire le contenu original + const originalContent = fs.readFileSync(exercisePath, 'utf8'); + + // Sauvegarder l'original + const backupPath = exercisePath + '.backup'; + fs.writeFileSync(backupPath, originalContent); + + try { + // Appeler l'API + const correctedCode = await callCairoCoderAPI(originalContent, exercise); + + // Sauvegarder la solution + fs.writeFileSync(exercisePath, correctedCode); + log(`Updated exercise file with generated code`); + + // Sauvegarder les fichiers de debug SEULEMENT pour le dernier run (run 10) + if (SAVE_RESPONSES && runNumber === 10) { + const solutionFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_solution.cairo`); + fs.mkdirSync(path.dirname(solutionFile), { recursive: true }); + fs.writeFileSync(solutionFile, correctedCode); + } + + // Tester la solution + try { + log(`Running starklings for ${exercise.name}...`); + const result = execSync(`cargo run --bin starklings run ${exercise.name} 2>/dev/null`, { + cwd: starklingsPath, + stdio: 'pipe', + timeout: 300000, + encoding: 'utf8' + }); + + log(`✅ ${exercise.name} - Success`); + log(`Starklings output: ${result.substring(0, 200)}...`); + return { success: true }; + } catch (error) { + log(`❌ ${exercise.name} - Execution failed`); + log(`Error code: ${error.status}`); + log(`stdout: ${error.stdout ? error.stdout.substring(0, 500) : 'none'}`); + log(`stderr: ${error.stderr ? error.stderr.substring(0, 500) : 'none'}`); + + // Formater l'erreur pour le rapport + const errorDetails = { + exitCode: error.status, + stdout: error.stdout || '', + stderr: error.stderr || '' + }; + + // Sauvegarder les erreurs SEULEMENT pour le dernier run + if (SAVE_RESPONSES && runNumber === 10) { + const errorFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_error.txt`); + fs.writeFileSync(errorFile, `Exit code: ${error.status}\n\nSTDOUT:\n${error.stdout}\n\nSTDERR:\n${error.stderr}`); + log(`Error details saved to: ${errorFile}`); + } + + return { success: false, error: errorDetails }; + } + } catch (error) { + log(`❌ ${exercise.name} - API call failed: ${error.message}`); + return { success: false, error: { message: error.message, type: 'API_ERROR' } }; + } finally { + // Restaurer l'original + fs.writeFileSync(exercisePath, originalContent); + fs.unlinkSync(backupPath); + log(`Restored original file and cleaned up backup`); + } +} + +async function processCategoryWorker(categoryName, exercises, starklingsPath, runNumber = 1) { const categoryResults = { category: categoryName, exercises: [], @@ -278,14 +278,18 @@ async function processCategoryWorker(categoryName, exercises, starklingsPath) { log(`\n[${categoryName}] Starting ${exercises.length} exercises...`); for (const exercise of exercises) { - const result = await testExercise(exercise, starklingsPath); + // Délai entre chaque exercice pour éviter la surcharge + if (categoryResults.exercises.length > 0) { + await new Promise(resolve => setTimeout(resolve, 1000)); // 1 seconde + } + + const result = await testExercise(exercise, starklingsPath, runNumber); const exerciseResult = { name: exercise.name, success: result.success }; - // Ajouter les erreurs seulement si échec if (!result.success && result.error) { exerciseResult.error = result.error; } @@ -300,8 +304,7 @@ async function processCategoryWorker(categoryName, exercises, starklingsPath) { categoryResults.successRate = (categoryResults.passed / categoryResults.total * 100).toFixed(1); - // Sauvegarder le rapport de catégorie - const reportPath = path.join(__dirname, '..', '..', 'debug', `${categoryName.toLowerCase().replace(/\s+/g, '_')}_report.json`); + const reportPath = path.join(__dirname, '..', '..', 'debug', `${categoryName.toLowerCase().replace(/\s+/g, '_')}_report_run${runNumber}.json`); fs.writeFileSync(reportPath, JSON.stringify(categoryResults, null, 2)); log(`[${categoryName}] Completed: ${categoryResults.passed}/${categoryResults.total} (${categoryResults.successRate}%)`); @@ -309,117 +312,194 @@ async function processCategoryWorker(categoryName, exercises, starklingsPath) { } function extractCairoCode(generatedResponse) { - // Chercher les blocs de code Cairo ou génériques - const codeBlockRegex = /```(?:cairo|rust|)?\s*\n([\s\S]*?)\n```/g; - const matches = generatedResponse.match(codeBlockRegex); - - if (matches && matches.length > 0) { - // Extraire le contenu du premier bloc de code trouvé - const codeBlock = matches[0]; - const codeContent = codeBlock.replace(/```(?:cairo|rust|)?\s*\n/, '').replace(/\n```$/, ''); - return codeContent.trim(); - } - - // Si pas de bloc de code trouvé, retourner le texte tel quel - return generatedResponse.trim(); + // Chercher les blocs de code Cairo ou génériques + const codeBlockRegex = /```(?:cairo|rust|)?\s*\n([\s\S]*?)\n```/g; + const matches = generatedResponse.match(codeBlockRegex); + + if (matches && matches.length > 0) { + // Extraire le contenu du premier bloc de code trouvé + const codeBlock = matches[0]; + const codeContent = codeBlock.replace(/```(?:cairo|rust|)?\s*\n/, '').replace(/\n```$/, ''); + return codeContent.trim(); + } + + // Si pas de bloc de code trouvé, retourner le texte tel quel + return generatedResponse.trim(); } -async function main() { - const starklingsPath = path.join(process.cwd(), 'starklings'); - const infoPath = path.join(starklingsPath, 'info.toml'); - - if (!fs.existsSync(starklingsPath)) { - console.error('❌ Starklings directory not found'); - process.exit(1); - } - - if (!fs.existsSync(infoPath)) { - console.error('❌ info.toml not found in starklings directory'); - process.exit(1); - } - - // Tester la connexion au serveur - const serverOk = await testServerConnection(); - if (!serverOk) { - console.error('❌ Server is not accessible'); - process.exit(1); - } - - // Parser les exercices par catégorie - const categories = parseInfoToml(infoPath); - - if (Object.keys(categories).length === 0) { - console.error('❌ No categories found'); - process.exit(1); - } - - // Filtrer à une seule catégorie si demandé - let categoriesToTest = categories; - if (SINGLE_EXERCISE) { - // Trouver la catégorie contenant l'exercice - let foundCategory = null; - for (const [categoryName, exercises] of Object.entries(categories)) { - if (exercises.some(ex => ex.name === SINGLE_EXERCISE)) { - foundCategory = categoryName; - break; - } - } - - if (!foundCategory) { - console.error(`❌ Exercise '${SINGLE_EXERCISE}' not found`); - process.exit(1); - } - - categoriesToTest = { - [foundCategory]: categories[foundCategory].filter(ex => ex.name === SINGLE_EXERCISE) - }; - log(`Testing single exercise: ${SINGLE_EXERCISE} in category: ${foundCategory}`); - } +function generateConsolidatedReport(allResults) { + if (allResults.length === 0) { + return { error: 'No successful runs' }; + } + + const successRates = allResults.map(r => parseFloat(r.globalSuccessRate)); + const averageSuccessRate = (successRates.reduce((sum, rate) => sum + rate, 0) / successRates.length).toFixed(1); + + const bestRun = allResults.reduce((best, current) => + parseFloat(current.globalSuccessRate) > parseFloat(best.globalSuccessRate) ? current : best + ); + + const worstRun = allResults.reduce((worst, current) => + parseFloat(current.globalSuccessRate) < parseFloat(worst.globalSuccessRate) ? current : worst + ); + + // Analyse par catégorie + const categoryStats = {}; + allResults.forEach(run => { + run.categories.forEach(category => { + if (!categoryStats[category.category]) { + categoryStats[category.category] = { + successRates: [], + averageSuccessRate: 0, + bestRate: 0, + worstRate: 100 + }; + } + + const rate = parseFloat(category.successRate); + categoryStats[category.category].successRates.push(rate); + categoryStats[category.category].bestRate = Math.max(categoryStats[category.category].bestRate, rate); + categoryStats[category.category].worstRate = Math.min(categoryStats[category.category].worstRate, rate); + }); + }); + + // Calculer les moyennes par catégorie + Object.keys(categoryStats).forEach(category => { + const rates = categoryStats[category].successRates; + categoryStats[category].averageSuccessRate = (rates.reduce((sum, rate) => sum + rate, 0) / rates.length).toFixed(1); + }); + + return { + totalRuns: allResults.length, + averageSuccessRate: averageSuccessRate, + bestRun: bestRun, + worstRun: worstRun, + categoryStats: categoryStats, + allRuns: allResults + }; +} - // Créer le dossier de debug - const debugDir = path.join(__dirname, '..', '..', 'debug'); - fs.mkdirSync(debugDir, { recursive: true }); - - // Calculer le total d'exercices - const totalExercises = Object.values(categoriesToTest).reduce((sum, exercises) => sum + exercises.length, 0); - console.log(`\n🧪 Starting evaluation of ${totalExercises} exercises across ${Object.keys(categoriesToTest).length} categories...`); - - // Traiter les catégories en parallèle - const startTime = Date.now(); - const categoryPromises = Object.entries(categoriesToTest).map(([categoryName, exercises]) => - processCategoryWorker(categoryName, exercises, starklingsPath) - ); - - const categoryResults = await Promise.all(categoryPromises); - const endTime = Date.now(); - - // Consolider les résultats - const totalPassed = categoryResults.reduce((sum, result) => sum + result.passed, 0); - const globalResults = { - totalExercises: totalExercises, - totalPassed: totalPassed, - globalSuccessRate: (totalPassed / totalExercises * 100).toFixed(1), - categories: categoryResults - }; +async function runSingleTest(runNumber) { + const starklingsPath = path.join(process.cwd(), 'starklings'); + const infoPath = path.join(starklingsPath, 'info.toml'); + + if (!fs.existsSync(starklingsPath)) { + throw new Error('Starklings directory not found'); + } + + if (!fs.existsSync(infoPath)) { + throw new Error('info.toml not found in starklings directory'); + } + + // Tester la connexion au serveur + const serverOk = await testServerConnection(); + if (!serverOk) { + throw new Error('Server is not accessible'); + } + + // Parser les exercices par catégorie + const categories = parseInfoToml(infoPath); + + if (Object.keys(categories).length === 0) { + throw new Error('No categories found'); + } + + // Filtrer à une seule catégorie si demandé + let categoriesToTest = categories; + if (SINGLE_EXERCISE) { + let foundCategory = null; + for (const [categoryName, exercises] of Object.entries(categories)) { + if (exercises.some(ex => ex.name === SINGLE_EXERCISE)) { + foundCategory = categoryName; + break; + } + } + + if (!foundCategory) { + throw new Error(`Exercise '${SINGLE_EXERCISE}' not found`); + } + + categoriesToTest = { + [foundCategory]: categories[foundCategory].filter(ex => ex.name === SINGLE_EXERCISE) + }; + log(`Testing single exercise: ${SINGLE_EXERCISE} in category: ${foundCategory}`); + } + + // Créer le dossier de debug + const debugDir = path.join(__dirname, '..', '..', 'debug'); + fs.mkdirSync(debugDir, { recursive: true }); + + // Calculer le total d'exercices + const totalExercises = Object.values(categoriesToTest).reduce((sum, exercises) => sum + exercises.length, 0); + console.log(`\n🧪 [RUN ${runNumber}/10] Starting evaluation of ${totalExercises} exercises across ${Object.keys(categoriesToTest).length} categories...`); + + // Traiter les catégories en parallèle + const startTime = Date.now(); + const categoryPromises = Object.entries(categoriesToTest).map(([categoryName, exercises]) => + processCategoryWorker(categoryName, exercises, starklingsPath, runNumber) + ); + + const categoryResults = await Promise.all(categoryPromises); + const endTime = Date.now(); + + // Consolider les résultats + const totalPassed = categoryResults.reduce((sum, result) => sum + result.passed, 0); + const globalResults = { + runNumber: runNumber, + timestamp: new Date().toISOString(), + totalExercises: totalExercises, + totalPassed: totalPassed, + globalSuccessRate: (totalPassed / totalExercises * 100).toFixed(1), + executionTime: (endTime - startTime) / 1000, + categories: categoryResults + }; + + // Sauvegarder le rapport global pour ce run + const globalReportPath = path.join(debugDir, `global_report_run${runNumber}.json`); + fs.writeFileSync(globalReportPath, JSON.stringify(globalResults, null, 2)); + + console.log(`[RUN ${runNumber}] ${totalPassed}/${totalExercises} exercises passed (${globalResults.globalSuccessRate}%)`); + + return globalResults; +} - // Sauvegarder le rapport global - const globalReportPath = path.join(debugDir, 'global_report.json'); - fs.writeFileSync(globalReportPath, JSON.stringify(globalResults, null, 2)); - - console.log(`\n=== Final Results ===`); - console.log(`${totalPassed}/${totalExercises} exercises passed (${globalResults.globalSuccessRate}%)`); - console.log(`Total time: ${(endTime - startTime) / 1000}s`); - console.log(`\nCategory breakdown:`); - - categoryResults.forEach(result => { - console.log(` ${result.category}: ${result.passed}/${result.total} (${result.successRate}%)`); - }); - - log(`Reports saved in: ${debugDir}`); - log(`Global report: ${globalReportPath}`); +async function main() { + const NUM_RUNS = 10; + const allResults = []; + + console.log(`🚀 Starting ${NUM_RUNS} successive test runs...`); + + for (let i = 1; i <= NUM_RUNS; i++) { + try { + const result = await runSingleTest(i); + allResults.push(result); + + // Petite pause entre les runs pour éviter la surcharge + if (i < NUM_RUNS) { + await new Promise(resolve => setTimeout(resolve, 2000)); + } + } catch (error) { + console.error(`❌ Run ${i} failed:`, error.message); + // Continuer avec les autres runs même si un échoue + } + } + + // Générer le rapport consolidé + const debugDir = path.join(__dirname, '..', '..', 'debug'); + const consolidatedReport = generateConsolidatedReport(allResults); + const consolidatedReportPath = path.join(debugDir, 'consolidated_report.json'); + fs.writeFileSync(consolidatedReportPath, JSON.stringify(consolidatedReport, null, 2)); + + console.log(`\n=== Final Summary (${NUM_RUNS} runs) ===`); + console.log(`Average success rate: ${consolidatedReport.averageSuccessRate}%`); + console.log(`Best run: ${consolidatedReport.bestRun.globalSuccessRate}% (Run ${consolidatedReport.bestRun.runNumber})`); + console.log(`Worst run: ${consolidatedReport.worstRun.globalSuccessRate}% (Run ${consolidatedReport.worstRun.runNumber})`); + + log(`All reports saved in: ${debugDir}`); + log(`Consolidated report: ${consolidatedReportPath}`); } main().catch(error => { - console.error('❌ Fatal error:', error); - process.exit(1); + console.error('❌ Fatal error:', error); + process.exit(1); }); \ No newline at end of file From 48f48398abae5d4c5cf488307c93c27b673607f3 Mon Sep 17 00:00:00 2001 From: alvinouille Date: Wed, 16 Jul 2025 00:12:38 +0200 Subject: [PATCH 5/9] improve overall report --- .github/scripts/starklings-evaluate.js | 116 ++++++++++++++----------- 1 file changed, 63 insertions(+), 53 deletions(-) diff --git a/.github/scripts/starklings-evaluate.js b/.github/scripts/starklings-evaluate.js index 7cd78bdd..2e95036e 100644 --- a/.github/scripts/starklings-evaluate.js +++ b/.github/scripts/starklings-evaluate.js @@ -215,7 +215,7 @@ async function testExercise(exercise, starklingsPath, runNumber = 1) { log(`Updated exercise file with generated code`); // Sauvegarder les fichiers de debug SEULEMENT pour le dernier run (run 10) - if (SAVE_RESPONSES && runNumber === 10) { + if (SAVE_RESPONSES && runNumber === 2) { const solutionFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_solution.cairo`); fs.mkdirSync(path.dirname(solutionFile), { recursive: true }); fs.writeFileSync(solutionFile, correctedCode); @@ -248,7 +248,7 @@ async function testExercise(exercise, starklingsPath, runNumber = 1) { }; // Sauvegarder les erreurs SEULEMENT pour le dernier run - if (SAVE_RESPONSES && runNumber === 10) { + if (SAVE_RESPONSES && runNumber === 2) { const errorFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_error.txt`); fs.writeFileSync(errorFile, `Exit code: ${error.status}\n\nSTDOUT:\n${error.stdout}\n\nSTDERR:\n${error.stderr}`); log(`Error details saved to: ${errorFile}`); @@ -328,55 +328,65 @@ function extractCairoCode(generatedResponse) { } function generateConsolidatedReport(allResults) { - if (allResults.length === 0) { - return { error: 'No successful runs' }; - } - - const successRates = allResults.map(r => parseFloat(r.globalSuccessRate)); - const averageSuccessRate = (successRates.reduce((sum, rate) => sum + rate, 0) / successRates.length).toFixed(1); - - const bestRun = allResults.reduce((best, current) => - parseFloat(current.globalSuccessRate) > parseFloat(best.globalSuccessRate) ? current : best - ); - - const worstRun = allResults.reduce((worst, current) => - parseFloat(current.globalSuccessRate) < parseFloat(worst.globalSuccessRate) ? current : worst - ); - - // Analyse par catégorie - const categoryStats = {}; - allResults.forEach(run => { - run.categories.forEach(category => { - if (!categoryStats[category.category]) { - categoryStats[category.category] = { - successRates: [], - averageSuccessRate: 0, - bestRate: 0, - worstRate: 100 - }; - } - - const rate = parseFloat(category.successRate); - categoryStats[category.category].successRates.push(rate); - categoryStats[category.category].bestRate = Math.max(categoryStats[category.category].bestRate, rate); - categoryStats[category.category].worstRate = Math.min(categoryStats[category.category].worstRate, rate); - }); - }); - - // Calculer les moyennes par catégorie - Object.keys(categoryStats).forEach(category => { - const rates = categoryStats[category].successRates; - categoryStats[category].averageSuccessRate = (rates.reduce((sum, rate) => sum + rate, 0) / rates.length).toFixed(1); - }); - - return { - totalRuns: allResults.length, - averageSuccessRate: averageSuccessRate, - bestRun: bestRun, - worstRun: worstRun, - categoryStats: categoryStats, - allRuns: allResults - }; + if (allResults.length === 0) { + return { error: 'No successful runs' }; + } + + // Taux de réussite global + const successRates = allResults.map(r => parseFloat(r.globalSuccessRate)); + const averageSuccessRate = (successRates.reduce((sum, rate) => sum + rate, 0) / successRates.length).toFixed(1); + + // Taux de réussite par catégorie + const categoryStats = {}; + allResults.forEach(run => { + run.categories.forEach(category => { + if (!categoryStats[category.category]) { + categoryStats[category.category] = { + successRates: [] + }; + } + categoryStats[category.category].successRates.push(parseFloat(category.successRate)); + }); + }); + + // Calculer les moyennes par catégorie + const categoryAverages = {}; + Object.keys(categoryStats).forEach(category => { + const rates = categoryStats[category].successRates; + categoryAverages[category] = (rates.reduce((sum, rate) => sum + rate, 0) / rates.length).toFixed(1) + '%'; + }); + + // Collecter les erreurs par exercice et par run + const exerciseErrors = {}; + allResults.forEach(run => { + run.categories.forEach(category => { + category.exercises.forEach(exercise => { + if (!exercise.success && exercise.error) { + if (!exerciseErrors[exercise.name]) { + exerciseErrors[exercise.name] = []; + } + + // Ajouter l'erreur avec le numéro de run + exerciseErrors[exercise.name].push({ + run: run.runNumber, + type: exercise.error.type || 'COMPILATION_ERROR', + message: exercise.error.message || 'Compilation failed', + stdout: exercise.error.stdout ? exercise.error.stdout.substring(0, 500) : null, + stderr: exercise.error.stderr ? exercise.error.stderr.substring(0, 500) : null + }); + } + }); + }); + }); + + return { + summary: { + totalRuns: allResults.length, + globalSuccessRate: averageSuccessRate + '%' + }, + categorySuccessRates: categoryAverages, + exerciseErrors: exerciseErrors + }; } async function runSingleTest(runNumber) { @@ -431,7 +441,7 @@ async function runSingleTest(runNumber) { // Calculer le total d'exercices const totalExercises = Object.values(categoriesToTest).reduce((sum, exercises) => sum + exercises.length, 0); - console.log(`\n🧪 [RUN ${runNumber}/10] Starting evaluation of ${totalExercises} exercises across ${Object.keys(categoriesToTest).length} categories...`); + console.log(`\n🧪 [RUN ${runNumber}/2] Starting evaluation of ${totalExercises} exercises across ${Object.keys(categoriesToTest).length} categories...`); // Traiter les catégories en parallèle const startTime = Date.now(); @@ -464,7 +474,7 @@ async function runSingleTest(runNumber) { } async function main() { - const NUM_RUNS = 10; + const NUM_RUNS = 2; const allResults = []; console.log(`🚀 Starting ${NUM_RUNS} successive test runs...`); From a9f3f6313256a69252253ebbe36f1f0897259b9d Mon Sep 17 00:00:00 2001 From: alvinouille Date: Wed, 16 Jul 2025 00:14:29 +0200 Subject: [PATCH 6/9] fix: remove starklings ci --- .github/workflows/starklings.yml | 138 ------------------------------- 1 file changed, 138 deletions(-) delete mode 100644 .github/workflows/starklings.yml diff --git a/.github/workflows/starklings.yml b/.github/workflows/starklings.yml deleted file mode 100644 index baf5f490..00000000 --- a/.github/workflows/starklings.yml +++ /dev/null @@ -1,138 +0,0 @@ -name: Starklings Benchmark - -on: - push: - branches: [main] - pull_request: - branches: [main] - workflow_dispatch: - -jobs: - starklings-benchmark: - name: Starklings Benchmark - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: '20' - - - name: Setup Rust - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - override: true - - - name: Install pnpm - uses: pnpm/action-setup@v3 - with: - version: 9 - - - name: Install dependencies - run: pnpm install - - - name: Build Cairo Coder - run: pnpm build - - - name: Setup PostgreSQL - uses: harmon758/postgresql-action@v1 - with: - postgresql version: '15' - postgresql db: 'cairo_coder_test' - postgresql user: 'test_user' - postgresql password: 'test_password' - - - name: Install PostgreSQL client and pgvector - run: | - sudo apt-get update - sudo apt-get install -y postgresql-client-15 - sudo -u postgres psql -c "CREATE EXTENSION IF NOT EXISTS vector;" - - - name: Setup test configuration - run: | - mkdir -p packages/agents - cat > packages/agents/config.toml << 'EOL' - [API_KEYS] - OPENAI = "${{ secrets.OPENAI_API_KEY }}" - ANTHROPIC = "${{ secrets.ANTHROPIC_API_KEY }}" - GEMINI = "${{ secrets.GEMINI_API_KEY }}" - - [VECTOR_DB] - POSTGRES_USER = "test_user" - POSTGRES_HOST = "localhost" - POSTGRES_DB = "cairo_coder_test" - POSTGRES_PASSWORD = "test_password" - POSTGRES_PORT = "5432" - - [GENERAL] - PORT = 3001 - SIMILARITY_MEASURE = "cosine" - - [PROVIDERS] - DEFAULT_CHAT_PROVIDER = "gemini" - DEFAULT_CHAT_MODEL = "Gemini Flash 2.5" - DEFAULT_FAST_CHAT_PROVIDER = "gemini" - DEFAULT_FAST_CHAT_MODEL = "Gemini Flash 2.5" - DEFAULT_EMBEDDING_PROVIDER = "openai" - DEFAULT_EMBEDDING_MODEL = "Text embedding 3 large" - - [VERSIONS] - STARKNET_FOUNDRY = "0.37.0" - SCARB = "2.9.2" - EOL - - - name: Create env file - run: | - cat > .env << 'EOL' - POSTGRES_USER=test_user - POSTGRES_HOST=localhost - POSTGRES_DB=cairo_coder_test - POSTGRES_PASSWORD=test_password - POSTGRES_PORT=5432 - EOL - - - name: Clone Starklings - run: | - if [ ! -d "starklings" ]; then - git clone https://github.com/starknet-edu/starklings.git - fi - - - name: Install Scarb - run: | - curl --proto '=https' --tlsv1.2 -sSf https://docs.swmansion.com/scarb/install.sh | sh - echo "$HOME/.local/bin" >> $GITHUB_PATH - - - name: Start Cairo Coder (background) - run: | - pnpm start & - # Attendre que le serveur démarre - for i in {1..30}; do - if curl -s http://localhost:3001/ > /dev/null; then - echo "Server is ready" - break - fi - echo "Waiting for server... ($i/30)" - sleep 2 - done - - # Vérifier si le serveur est vraiment prêt - if ! curl -s http://localhost:3001/ > /dev/null; then - echo "Server failed to start" - exit 1 - fi - - - name: Run Starklings Evaluation - run: node .github/scripts/starklings-evaluate.js - timeout-minutes: 30 - - - name: Upload results - if: always() - uses: actions/upload-artifact@v4 - with: - name: starklings-results - path: | - starklings/ - *.log \ No newline at end of file From d9e177a0136dad1dd69abb8123a09e147979fd7a Mon Sep 17 00:00:00 2001 From: alvinouille Date: Wed, 16 Jul 2025 20:37:10 +0200 Subject: [PATCH 7/9] improve bash script to install good version of starkling repo and starling script to improve consolidated report --- .github/scripts/starklings-evaluate.js | 48 ++++++++++++++++++-------- script-starklings.sh | 25 ++++++++++++++ 2 files changed, 58 insertions(+), 15 deletions(-) diff --git a/.github/scripts/starklings-evaluate.js b/.github/scripts/starklings-evaluate.js index 2e95036e..b5ac7573 100644 --- a/.github/scripts/starklings-evaluate.js +++ b/.github/scripts/starklings-evaluate.js @@ -4,7 +4,7 @@ const path = require('path'); // Configuration de débogage const DEBUG = true; -const SINGLE_EXERCISE = process.env.SINGLE_EXERCISE || null; // ex: "intro1" +const SINGLE_EXERCISE = process.env.SINGLE_EXERCISE || null; const SAVE_RESPONSES = true; function log(message) { @@ -215,7 +215,7 @@ async function testExercise(exercise, starklingsPath, runNumber = 1) { log(`Updated exercise file with generated code`); // Sauvegarder les fichiers de debug SEULEMENT pour le dernier run (run 10) - if (SAVE_RESPONSES && runNumber === 2) { + if (SAVE_RESPONSES && runNumber === 5) { const solutionFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_solution.cairo`); fs.mkdirSync(path.dirname(solutionFile), { recursive: true }); fs.writeFileSync(solutionFile, correctedCode); @@ -224,7 +224,7 @@ async function testExercise(exercise, starklingsPath, runNumber = 1) { // Tester la solution try { log(`Running starklings for ${exercise.name}...`); - const result = execSync(`cargo run --bin starklings run ${exercise.name} 2>/dev/null`, { + const result = execSync(`cargo run --bin starklings run ${exercise.name}`, { cwd: starklingsPath, stdio: 'pipe', timeout: 300000, @@ -248,7 +248,7 @@ async function testExercise(exercise, starklingsPath, runNumber = 1) { }; // Sauvegarder les erreurs SEULEMENT pour le dernier run - if (SAVE_RESPONSES && runNumber === 2) { + if (SAVE_RESPONSES && runNumber === 5) { const errorFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_error.txt`); fs.writeFileSync(errorFile, `Exit code: ${error.status}\n\nSTDOUT:\n${error.stdout}\n\nSTDERR:\n${error.stderr}`); log(`Error details saved to: ${errorFile}`); @@ -356,18 +356,24 @@ function generateConsolidatedReport(allResults) { categoryAverages[category] = (rates.reduce((sum, rate) => sum + rate, 0) / rates.length).toFixed(1) + '%'; }); - // Collecter les erreurs par exercice et par run - const exerciseErrors = {}; + // Collecter les erreurs par catégorie et par exercice + const exerciseErrorsByCategory = {}; allResults.forEach(run => { run.categories.forEach(category => { category.exercises.forEach(exercise => { if (!exercise.success && exercise.error) { - if (!exerciseErrors[exercise.name]) { - exerciseErrors[exercise.name] = []; + // Initialiser la catégorie si elle n'existe pas + if (!exerciseErrorsByCategory[category.category]) { + exerciseErrorsByCategory[category.category] = {}; + } + + // Initialiser l'exercice si il n'existe pas + if (!exerciseErrorsByCategory[category.category][exercise.name]) { + exerciseErrorsByCategory[category.category][exercise.name] = []; } // Ajouter l'erreur avec le numéro de run - exerciseErrors[exercise.name].push({ + exerciseErrorsByCategory[category.category][exercise.name].push({ run: run.runNumber, type: exercise.error.type || 'COMPILATION_ERROR', message: exercise.error.message || 'Compilation failed', @@ -385,7 +391,7 @@ function generateConsolidatedReport(allResults) { globalSuccessRate: averageSuccessRate + '%' }, categorySuccessRates: categoryAverages, - exerciseErrors: exerciseErrors + exerciseErrorsByCategory: exerciseErrorsByCategory }; } @@ -441,7 +447,7 @@ async function runSingleTest(runNumber) { // Calculer le total d'exercices const totalExercises = Object.values(categoriesToTest).reduce((sum, exercises) => sum + exercises.length, 0); - console.log(`\n🧪 [RUN ${runNumber}/2] Starting evaluation of ${totalExercises} exercises across ${Object.keys(categoriesToTest).length} categories...`); + console.log(`\n🧪 [RUN ${runNumber}/5] Starting evaluation of ${totalExercises} exercises across ${Object.keys(categoriesToTest).length} categories...`); // Traiter les catégories en parallèle const startTime = Date.now(); @@ -474,7 +480,7 @@ async function runSingleTest(runNumber) { } async function main() { - const NUM_RUNS = 2; + const NUM_RUNS = 1; const allResults = []; console.log(`🚀 Starting ${NUM_RUNS} successive test runs...`); @@ -501,9 +507,21 @@ async function main() { fs.writeFileSync(consolidatedReportPath, JSON.stringify(consolidatedReport, null, 2)); console.log(`\n=== Final Summary (${NUM_RUNS} runs) ===`); - console.log(`Average success rate: ${consolidatedReport.averageSuccessRate}%`); - console.log(`Best run: ${consolidatedReport.bestRun.globalSuccessRate}% (Run ${consolidatedReport.bestRun.runNumber})`); - console.log(`Worst run: ${consolidatedReport.worstRun.globalSuccessRate}% (Run ${consolidatedReport.worstRun.runNumber})`); + console.log(`Average success rate: ${consolidatedReport.summary.globalSuccessRate}`); + + // Calculer le meilleur et pire run pour l'affichage + if (allResults.length > 0) { + const bestRun = allResults.reduce((best, current) => + parseFloat(current.globalSuccessRate) > parseFloat(best.globalSuccessRate) ? current : best + ); + + const worstRun = allResults.reduce((worst, current) => + parseFloat(current.globalSuccessRate) < parseFloat(worst.globalSuccessRate) ? current : worst + ); + + console.log(`Best run: ${bestRun.globalSuccessRate}% (Run ${bestRun.runNumber})`); + console.log(`Worst run: ${worstRun.globalSuccessRate}% (Run ${worstRun.runNumber})`); + } log(`All reports saved in: ${debugDir}`); log(`Consolidated report: ${consolidatedReportPath}`); diff --git a/script-starklings.sh b/script-starklings.sh index 0ad3f18a..a5e28e45 100644 --- a/script-starklings.sh +++ b/script-starklings.sh @@ -1,5 +1,30 @@ #!/bin/bash + +# 1. Nettoyer les éventuels anciens dossiers +echo "🧹 Cleaning up previous installations..." +rm -rf starklings + +# 2. Cloner le repo starklings +echo "📦 Cloning starklings repository..." +git clone https://github.com/shramee/starklings.git +if [ $? -ne 0 ]; then + echo "❌ Failed to clone starklings repository" + exit 1 +fi + +# 3. Changer vers la branche feat/upgrade-cairo-and-use-scarb +echo "🔄 Switching to feat/upgrade-cairo-and-use-scarb branch..." +cd starklings +git checkout feat/upgrade-cairo-and-use-scarb +if [ $? -ne 0 ]; then + echo "❌ Failed to switch to feat/upgrade-cairo-and-use-scarb branch" + exit 1 +fi + +# 4. Retourner au dossier parent +cd .. + # Vérifier si le serveur répond if ! curl -s http://localhost:3002/ > /dev/null 2>&1; then echo "❌ Server failed to start" From 40d74e47f083a51d9a6d370be29de5d5069fff0a Mon Sep 17 00:00:00 2001 From: alvinouille Date: Wed, 16 Jul 2025 20:41:54 +0200 Subject: [PATCH 8/9] trunk fmt --- .github/scripts/starklings-evaluate.js | 1040 +++++++++++++----------- 1 file changed, 571 insertions(+), 469 deletions(-) diff --git a/.github/scripts/starklings-evaluate.js b/.github/scripts/starklings-evaluate.js index b5ac7573..69d73107 100644 --- a/.github/scripts/starklings-evaluate.js +++ b/.github/scripts/starklings-evaluate.js @@ -4,117 +4,120 @@ const path = require('path'); // Configuration de débogage const DEBUG = true; -const SINGLE_EXERCISE = process.env.SINGLE_EXERCISE || null; +const SINGLE_EXERCISE = process.env.SINGLE_EXERCISE || null; const SAVE_RESPONSES = true; function log(message) { - if (DEBUG) { - console.log(`[DEBUG] ${message}`); - } + if (DEBUG) { + console.log(`[DEBUG] ${message}`); + } } function parseInfoToml(infoPath) { - if (!fs.existsSync(infoPath)) { - throw new Error(`info.toml not found at: ${infoPath}`); - } - - const content = fs.readFileSync(infoPath, 'utf8'); - const lines = content.split('\n'); - - const categories = {}; - let currentCategory = null; - let currentExercise = null; - let collectingHint = false; - let hintLines = []; - - for (let i = 0; i < lines.length; i++) { - const line = lines[i]; - const cleanLine = line.trim(); - - // Détecter les catégories - if (cleanLine.startsWith('# ') && !cleanLine.startsWith('##')) { - currentCategory = cleanLine.substring(2).trim(); - categories[currentCategory] = []; - continue; - } - - if (cleanLine.startsWith('[[exercises]]')) { - if (currentExercise) { - if (hintLines.length > 0) { - currentExercise.hint = hintLines.join('\n').replace(/^"""/, '').replace(/"""$/, ''); - } - if (currentCategory) { - categories[currentCategory].push(currentExercise); - } - } - currentExercise = { category: currentCategory }; - collectingHint = false; - hintLines = []; - } else if (cleanLine.startsWith('hint = """')) { - collectingHint = true; - hintLines.push(cleanLine.replace('hint = """', '').trim()); - } else if (collectingHint) { - if (cleanLine.endsWith('"""')) { - hintLines.push(cleanLine.replace('"""', '').trim()); - collectingHint = false; - } else { - hintLines.push(cleanLine); - } - } else if (cleanLine.startsWith('name = ')) { - const match = cleanLine.match(/name = "(.+)"/); - if (match) { - currentExercise.name = match[1]; - } - } else if (cleanLine.startsWith('path = ')) { - const match = cleanLine.match(/path = "(.+)"/); - if (match) { - currentExercise.path = match[1]; - } - } else if (cleanLine.startsWith('mode = ')) { - const match = cleanLine.match(/mode = "(.+)"/); - if (match) { - currentExercise.mode = match[1]; - } - } - } - - // N'oublie pas le dernier exercice - if (currentExercise) { - if (hintLines.length > 0) { - currentExercise.hint = hintLines.join('\n').replace(/"""$/, ''); - } - if (currentCategory) { - categories[currentCategory].push(currentExercise); - } - } - - return categories; + if (!fs.existsSync(infoPath)) { + throw new Error(`info.toml not found at: ${infoPath}`); + } + + const content = fs.readFileSync(infoPath, 'utf8'); + const lines = content.split('\n'); + + const categories = {}; + let currentCategory = null; + let currentExercise = null; + let collectingHint = false; + let hintLines = []; + + for (let i = 0; i < lines.length; i++) { + const line = lines[i]; + const cleanLine = line.trim(); + + // Détecter les catégories + if (cleanLine.startsWith('# ') && !cleanLine.startsWith('##')) { + currentCategory = cleanLine.substring(2).trim(); + categories[currentCategory] = []; + continue; + } + + if (cleanLine.startsWith('[[exercises]]')) { + if (currentExercise) { + if (hintLines.length > 0) { + currentExercise.hint = hintLines + .join('\n') + .replace(/^"""/, '') + .replace(/"""$/, ''); + } + if (currentCategory) { + categories[currentCategory].push(currentExercise); + } + } + currentExercise = { category: currentCategory }; + collectingHint = false; + hintLines = []; + } else if (cleanLine.startsWith('hint = """')) { + collectingHint = true; + hintLines.push(cleanLine.replace('hint = """', '').trim()); + } else if (collectingHint) { + if (cleanLine.endsWith('"""')) { + hintLines.push(cleanLine.replace('"""', '').trim()); + collectingHint = false; + } else { + hintLines.push(cleanLine); + } + } else if (cleanLine.startsWith('name = ')) { + const match = cleanLine.match(/name = "(.+)"/); + if (match) { + currentExercise.name = match[1]; + } + } else if (cleanLine.startsWith('path = ')) { + const match = cleanLine.match(/path = "(.+)"/); + if (match) { + currentExercise.path = match[1]; + } + } else if (cleanLine.startsWith('mode = ')) { + const match = cleanLine.match(/mode = "(.+)"/); + if (match) { + currentExercise.mode = match[1]; + } + } + } + + // N'oublie pas le dernier exercice + if (currentExercise) { + if (hintLines.length > 0) { + currentExercise.hint = hintLines.join('\n').replace(/"""$/, ''); + } + if (currentCategory) { + categories[currentCategory].push(currentExercise); + } + } + + return categories; } async function testServerConnection() { - log('Testing server connection...'); - - try { - const response = await fetch('http://localhost:3002/', { - method: 'GET', - timeout: 5000 - }); - - if (response.ok) { - log('✅ Server connection successful'); - return true; - } else { - log(`❌ Server responded with status: ${response.status}`); - return false; - } - } catch (error) { - log(`❌ Server connection failed: ${error.message}`); - return false; - } + log('Testing server connection...'); + + try { + const response = await fetch('http://localhost:3002/', { + method: 'GET', + timeout: 5000, + }); + + if (response.ok) { + log('✅ Server connection successful'); + return true; + } else { + log(`❌ Server responded with status: ${response.status}`); + return false; + } + } catch (error) { + log(`❌ Server connection failed: ${error.message}`); + return false; + } } async function callCairoCoderAPI(exerciseContent, exercise, retries = 3) { - const prompt = `You are solving a Cairo programming exercise. + const prompt = `You are solving a Cairo programming exercise. Exercise: ${exercise.name} ${exercise.hint ? `Hint: ${exercise.hint}` : ''} @@ -131,403 +134,502 @@ ${exerciseContent} Please provide only the corrected code, without any additional explanation or markdown formatting.`; - const requestBody = { - model: 'cairo-coder', - messages: [{ role: 'user', content: prompt }], - stream: false - }; - - for (let attempt = 1; attempt <= retries; attempt++) { - try { - log(`API call attempt ${attempt}/${retries} for ${exercise.name}`); - - const response = await fetch('http://localhost:3002/v1/chat/completions', { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - }, - body: JSON.stringify(requestBody), - timeout: 120000 // 2 minutes au lieu de 60 secondes - }); - - if (!response.ok) { - const errorText = await response.text(); - throw new Error(`HTTP error! status: ${response.status} - ${errorText}`); - } - - const data = await response.json(); - - // Sauvegarder la réponse complète si demandé - if (SAVE_RESPONSES) { - const responseFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_response.json`); - fs.mkdirSync(path.dirname(responseFile), { recursive: true }); - fs.writeFileSync(responseFile, JSON.stringify(data, null, 2)); - } - - // Extraire le contenu de la réponse - if (data.choices && data.choices[0] && data.choices[0].message) { - const rawContent = data.choices[0].message.content; - const cleanCode = extractCairoCode(rawContent); - log(`✅ API call successful for ${exercise.name}`); - return cleanCode; - } else { - throw new Error('Invalid response format from API'); - } - - } catch (error) { - log(`❌ API call failed (attempt ${attempt}/${retries}) for ${exercise.name}: ${error.message}`); - - if (attempt === retries) { - throw error; // Dernier essai, on lance l'erreur - } - - // Attendre de plus en plus longtemps à chaque retry - const waitTime = 3000 * attempt; // 3s, 6s, 9s - log(`Waiting ${waitTime}ms before retry...`); - await new Promise(resolve => setTimeout(resolve, waitTime)); - } + const requestBody = { + model: 'cairo-coder', + messages: [{ role: 'user', content: prompt }], + stream: false, + }; + + for (let attempt = 1; attempt <= retries; attempt++) { + try { + log(`API call attempt ${attempt}/${retries} for ${exercise.name}`); + + const response = await fetch( + 'http://localhost:3002/v1/chat/completions', + { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify(requestBody), + timeout: 120000, // 2 minutes au lieu de 60 secondes + }, + ); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error( + `HTTP error! status: ${response.status} - ${errorText}`, + ); + } + + const data = await response.json(); + + // Sauvegarder la réponse complète si demandé + if (SAVE_RESPONSES) { + const responseFile = path.join( + __dirname, + '..', + '..', + 'debug', + `${exercise.name}_response.json`, + ); + fs.mkdirSync(path.dirname(responseFile), { recursive: true }); + fs.writeFileSync(responseFile, JSON.stringify(data, null, 2)); + } + + // Extraire le contenu de la réponse + if (data.choices && data.choices[0] && data.choices[0].message) { + const rawContent = data.choices[0].message.content; + const cleanCode = extractCairoCode(rawContent); + log(`✅ API call successful for ${exercise.name}`); + return cleanCode; + } else { + throw new Error('Invalid response format from API'); + } + } catch (error) { + log( + `❌ API call failed (attempt ${attempt}/${retries}) for ${exercise.name}: ${error.message}`, + ); + + if (attempt === retries) { + throw error; // Dernier essai, on lance l'erreur + } + + // Attendre de plus en plus longtemps à chaque retry + const waitTime = 3000 * attempt; // 3s, 6s, 9s + log(`Waiting ${waitTime}ms before retry...`); + await new Promise((resolve) => setTimeout(resolve, waitTime)); } + } } async function testExercise(exercise, starklingsPath, runNumber = 1) { - log(`\n=== Testing exercise: ${exercise.name} ===`); - - const exercisePath = path.join(starklingsPath, exercise.path); - - if (!fs.existsSync(exercisePath)) { - log(`❌ Exercise file not found: ${exercisePath}`); - return { success: false, error: { message: 'File not found', type: 'FILE_ERROR' } }; - } - - // Lire le contenu original - const originalContent = fs.readFileSync(exercisePath, 'utf8'); - - // Sauvegarder l'original - const backupPath = exercisePath + '.backup'; - fs.writeFileSync(backupPath, originalContent); - - try { - // Appeler l'API - const correctedCode = await callCairoCoderAPI(originalContent, exercise); - - // Sauvegarder la solution - fs.writeFileSync(exercisePath, correctedCode); - log(`Updated exercise file with generated code`); - - // Sauvegarder les fichiers de debug SEULEMENT pour le dernier run (run 10) - if (SAVE_RESPONSES && runNumber === 5) { - const solutionFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_solution.cairo`); - fs.mkdirSync(path.dirname(solutionFile), { recursive: true }); - fs.writeFileSync(solutionFile, correctedCode); - } - - // Tester la solution - try { - log(`Running starklings for ${exercise.name}...`); - const result = execSync(`cargo run --bin starklings run ${exercise.name}`, { - cwd: starklingsPath, - stdio: 'pipe', - timeout: 300000, - encoding: 'utf8' - }); - - log(`✅ ${exercise.name} - Success`); - log(`Starklings output: ${result.substring(0, 200)}...`); - return { success: true }; - } catch (error) { - log(`❌ ${exercise.name} - Execution failed`); - log(`Error code: ${error.status}`); - log(`stdout: ${error.stdout ? error.stdout.substring(0, 500) : 'none'}`); - log(`stderr: ${error.stderr ? error.stderr.substring(0, 500) : 'none'}`); - - // Formater l'erreur pour le rapport - const errorDetails = { - exitCode: error.status, - stdout: error.stdout || '', - stderr: error.stderr || '' - }; - - // Sauvegarder les erreurs SEULEMENT pour le dernier run - if (SAVE_RESPONSES && runNumber === 5) { - const errorFile = path.join(__dirname, '..', '..', 'debug', `${exercise.name}_error.txt`); - fs.writeFileSync(errorFile, `Exit code: ${error.status}\n\nSTDOUT:\n${error.stdout}\n\nSTDERR:\n${error.stderr}`); - log(`Error details saved to: ${errorFile}`); - } - - return { success: false, error: errorDetails }; - } - } catch (error) { - log(`❌ ${exercise.name} - API call failed: ${error.message}`); - return { success: false, error: { message: error.message, type: 'API_ERROR' } }; - } finally { - // Restaurer l'original - fs.writeFileSync(exercisePath, originalContent); - fs.unlinkSync(backupPath); - log(`Restored original file and cleaned up backup`); - } -} + log(`\n=== Testing exercise: ${exercise.name} ===`); -async function processCategoryWorker(categoryName, exercises, starklingsPath, runNumber = 1) { - const categoryResults = { - category: categoryName, - exercises: [], - passed: 0, - total: exercises.length + const exercisePath = path.join(starklingsPath, exercise.path); + + if (!fs.existsSync(exercisePath)) { + log(`❌ Exercise file not found: ${exercisePath}`); + return { + success: false, + error: { message: 'File not found', type: 'FILE_ERROR' }, }; + } + + // Lire le contenu original + const originalContent = fs.readFileSync(exercisePath, 'utf8'); + + // Sauvegarder l'original + const backupPath = exercisePath + '.backup'; + fs.writeFileSync(backupPath, originalContent); + + try { + // Appeler l'API + const correctedCode = await callCairoCoderAPI(originalContent, exercise); + + // Sauvegarder la solution + fs.writeFileSync(exercisePath, correctedCode); + log(`Updated exercise file with generated code`); + + // Sauvegarder les fichiers de debug SEULEMENT pour le dernier run (run 10) + if (SAVE_RESPONSES && runNumber === 5) { + const solutionFile = path.join( + __dirname, + '..', + '..', + 'debug', + `${exercise.name}_solution.cairo`, + ); + fs.mkdirSync(path.dirname(solutionFile), { recursive: true }); + fs.writeFileSync(solutionFile, correctedCode); + } - log(`\n[${categoryName}] Starting ${exercises.length} exercises...`); + // Tester la solution + try { + log(`Running starklings for ${exercise.name}...`); + const result = execSync( + `cargo run --bin starklings run ${exercise.name}`, + { + cwd: starklingsPath, + stdio: 'pipe', + timeout: 300000, + encoding: 'utf8', + }, + ); + + log(`✅ ${exercise.name} - Success`); + log(`Starklings output: ${result.substring(0, 200)}...`); + return { success: true }; + } catch (error) { + log(`❌ ${exercise.name} - Execution failed`); + log(`Error code: ${error.status}`); + log(`stdout: ${error.stdout ? error.stdout.substring(0, 500) : 'none'}`); + log(`stderr: ${error.stderr ? error.stderr.substring(0, 500) : 'none'}`); + + // Formater l'erreur pour le rapport + const errorDetails = { + exitCode: error.status, + stdout: error.stdout || '', + stderr: error.stderr || '', + }; + + // Sauvegarder les erreurs SEULEMENT pour le dernier run + if (SAVE_RESPONSES && runNumber === 5) { + const errorFile = path.join( + __dirname, + '..', + '..', + 'debug', + `${exercise.name}_error.txt`, + ); + fs.writeFileSync( + errorFile, + `Exit code: ${error.status}\n\nSTDOUT:\n${error.stdout}\n\nSTDERR:\n${error.stderr}`, + ); + log(`Error details saved to: ${errorFile}`); + } + + return { success: false, error: errorDetails }; + } + } catch (error) { + log(`❌ ${exercise.name} - API call failed: ${error.message}`); + return { + success: false, + error: { message: error.message, type: 'API_ERROR' }, + }; + } finally { + // Restaurer l'original + fs.writeFileSync(exercisePath, originalContent); + fs.unlinkSync(backupPath); + log(`Restored original file and cleaned up backup`); + } +} - for (const exercise of exercises) { - // Délai entre chaque exercice pour éviter la surcharge - if (categoryResults.exercises.length > 0) { - await new Promise(resolve => setTimeout(resolve, 1000)); // 1 seconde - } - - const result = await testExercise(exercise, starklingsPath, runNumber); - - const exerciseResult = { - name: exercise.name, - success: result.success - }; +async function processCategoryWorker( + categoryName, + exercises, + starklingsPath, + runNumber = 1, +) { + const categoryResults = { + category: categoryName, + exercises: [], + passed: 0, + total: exercises.length, + }; + + log(`\n[${categoryName}] Starting ${exercises.length} exercises...`); + + for (const exercise of exercises) { + // Délai entre chaque exercice pour éviter la surcharge + if (categoryResults.exercises.length > 0) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // 1 seconde + } - if (!result.success && result.error) { - exerciseResult.error = result.error; - } + const result = await testExercise(exercise, starklingsPath, runNumber); - categoryResults.exercises.push(exerciseResult); - if (result.success) { - categoryResults.passed++; - } + const exerciseResult = { + name: exercise.name, + success: result.success, + }; - log(`[${categoryName}] ${exercise.name}: ${result.success ? '✅' : '❌'}`); + if (!result.success && result.error) { + exerciseResult.error = result.error; } - categoryResults.successRate = (categoryResults.passed / categoryResults.total * 100).toFixed(1); - - const reportPath = path.join(__dirname, '..', '..', 'debug', `${categoryName.toLowerCase().replace(/\s+/g, '_')}_report_run${runNumber}.json`); - fs.writeFileSync(reportPath, JSON.stringify(categoryResults, null, 2)); + categoryResults.exercises.push(exerciseResult); + if (result.success) { + categoryResults.passed++; + } - log(`[${categoryName}] Completed: ${categoryResults.passed}/${categoryResults.total} (${categoryResults.successRate}%)`); - return categoryResults; + log(`[${categoryName}] ${exercise.name}: ${result.success ? '✅' : '❌'}`); + } + + categoryResults.successRate = ( + (categoryResults.passed / categoryResults.total) * + 100 + ).toFixed(1); + + const reportPath = path.join( + __dirname, + '..', + '..', + 'debug', + `${categoryName.toLowerCase().replace(/\s+/g, '_')}_report_run${runNumber}.json`, + ); + fs.writeFileSync(reportPath, JSON.stringify(categoryResults, null, 2)); + + log( + `[${categoryName}] Completed: ${categoryResults.passed}/${categoryResults.total} (${categoryResults.successRate}%)`, + ); + return categoryResults; } function extractCairoCode(generatedResponse) { - // Chercher les blocs de code Cairo ou génériques - const codeBlockRegex = /```(?:cairo|rust|)?\s*\n([\s\S]*?)\n```/g; - const matches = generatedResponse.match(codeBlockRegex); - - if (matches && matches.length > 0) { - // Extraire le contenu du premier bloc de code trouvé - const codeBlock = matches[0]; - const codeContent = codeBlock.replace(/```(?:cairo|rust|)?\s*\n/, '').replace(/\n```$/, ''); - return codeContent.trim(); - } - - // Si pas de bloc de code trouvé, retourner le texte tel quel - return generatedResponse.trim(); + // Chercher les blocs de code Cairo ou génériques + const codeBlockRegex = /```(?:cairo|rust|)?\s*\n([\s\S]*?)\n```/g; + const matches = generatedResponse.match(codeBlockRegex); + + if (matches && matches.length > 0) { + // Extraire le contenu du premier bloc de code trouvé + const codeBlock = matches[0]; + const codeContent = codeBlock + .replace(/```(?:cairo|rust|)?\s*\n/, '') + .replace(/\n```$/, ''); + return codeContent.trim(); + } + + // Si pas de bloc de code trouvé, retourner le texte tel quel + return generatedResponse.trim(); } function generateConsolidatedReport(allResults) { - if (allResults.length === 0) { - return { error: 'No successful runs' }; - } - - // Taux de réussite global - const successRates = allResults.map(r => parseFloat(r.globalSuccessRate)); - const averageSuccessRate = (successRates.reduce((sum, rate) => sum + rate, 0) / successRates.length).toFixed(1); - - // Taux de réussite par catégorie - const categoryStats = {}; - allResults.forEach(run => { - run.categories.forEach(category => { - if (!categoryStats[category.category]) { - categoryStats[category.category] = { - successRates: [] - }; - } - categoryStats[category.category].successRates.push(parseFloat(category.successRate)); - }); - }); - - // Calculer les moyennes par catégorie - const categoryAverages = {}; - Object.keys(categoryStats).forEach(category => { - const rates = categoryStats[category].successRates; - categoryAverages[category] = (rates.reduce((sum, rate) => sum + rate, 0) / rates.length).toFixed(1) + '%'; + if (allResults.length === 0) { + return { error: 'No successful runs' }; + } + + // Taux de réussite global + const successRates = allResults.map((r) => parseFloat(r.globalSuccessRate)); + const averageSuccessRate = ( + successRates.reduce((sum, rate) => sum + rate, 0) / successRates.length + ).toFixed(1); + + // Taux de réussite par catégorie + const categoryStats = {}; + allResults.forEach((run) => { + run.categories.forEach((category) => { + if (!categoryStats[category.category]) { + categoryStats[category.category] = { + successRates: [], + }; + } + categoryStats[category.category].successRates.push( + parseFloat(category.successRate), + ); }); - - // Collecter les erreurs par catégorie et par exercice - const exerciseErrorsByCategory = {}; - allResults.forEach(run => { - run.categories.forEach(category => { - category.exercises.forEach(exercise => { - if (!exercise.success && exercise.error) { - // Initialiser la catégorie si elle n'existe pas - if (!exerciseErrorsByCategory[category.category]) { - exerciseErrorsByCategory[category.category] = {}; - } - - // Initialiser l'exercice si il n'existe pas - if (!exerciseErrorsByCategory[category.category][exercise.name]) { - exerciseErrorsByCategory[category.category][exercise.name] = []; - } - - // Ajouter l'erreur avec le numéro de run - exerciseErrorsByCategory[category.category][exercise.name].push({ - run: run.runNumber, - type: exercise.error.type || 'COMPILATION_ERROR', - message: exercise.error.message || 'Compilation failed', - stdout: exercise.error.stdout ? exercise.error.stdout.substring(0, 500) : null, - stderr: exercise.error.stderr ? exercise.error.stderr.substring(0, 500) : null - }); - } - }); - }); + }); + + // Calculer les moyennes par catégorie + const categoryAverages = {}; + Object.keys(categoryStats).forEach((category) => { + const rates = categoryStats[category].successRates; + categoryAverages[category] = + (rates.reduce((sum, rate) => sum + rate, 0) / rates.length).toFixed(1) + + '%'; + }); + + // Collecter les erreurs par catégorie et par exercice + const exerciseErrorsByCategory = {}; + allResults.forEach((run) => { + run.categories.forEach((category) => { + category.exercises.forEach((exercise) => { + if (!exercise.success && exercise.error) { + // Initialiser la catégorie si elle n'existe pas + if (!exerciseErrorsByCategory[category.category]) { + exerciseErrorsByCategory[category.category] = {}; + } + + // Initialiser l'exercice si il n'existe pas + if (!exerciseErrorsByCategory[category.category][exercise.name]) { + exerciseErrorsByCategory[category.category][exercise.name] = []; + } + + // Ajouter l'erreur avec le numéro de run + exerciseErrorsByCategory[category.category][exercise.name].push({ + run: run.runNumber, + type: exercise.error.type || 'COMPILATION_ERROR', + message: exercise.error.message || 'Compilation failed', + stdout: exercise.error.stdout + ? exercise.error.stdout.substring(0, 500) + : null, + stderr: exercise.error.stderr + ? exercise.error.stderr.substring(0, 500) + : null, + }); + } + }); }); - - return { - summary: { - totalRuns: allResults.length, - globalSuccessRate: averageSuccessRate + '%' - }, - categorySuccessRates: categoryAverages, - exerciseErrorsByCategory: exerciseErrorsByCategory - }; + }); + + return { + summary: { + totalRuns: allResults.length, + globalSuccessRate: averageSuccessRate + '%', + }, + categorySuccessRates: categoryAverages, + exerciseErrorsByCategory: exerciseErrorsByCategory, + }; } async function runSingleTest(runNumber) { - const starklingsPath = path.join(process.cwd(), 'starklings'); - const infoPath = path.join(starklingsPath, 'info.toml'); - - if (!fs.existsSync(starklingsPath)) { - throw new Error('Starklings directory not found'); - } - - if (!fs.existsSync(infoPath)) { - throw new Error('info.toml not found in starklings directory'); - } - - // Tester la connexion au serveur - const serverOk = await testServerConnection(); - if (!serverOk) { - throw new Error('Server is not accessible'); - } - - // Parser les exercices par catégorie - const categories = parseInfoToml(infoPath); - - if (Object.keys(categories).length === 0) { - throw new Error('No categories found'); - } - - // Filtrer à une seule catégorie si demandé - let categoriesToTest = categories; - if (SINGLE_EXERCISE) { - let foundCategory = null; - for (const [categoryName, exercises] of Object.entries(categories)) { - if (exercises.some(ex => ex.name === SINGLE_EXERCISE)) { - foundCategory = categoryName; - break; - } - } - - if (!foundCategory) { - throw new Error(`Exercise '${SINGLE_EXERCISE}' not found`); - } - - categoriesToTest = { - [foundCategory]: categories[foundCategory].filter(ex => ex.name === SINGLE_EXERCISE) - }; - log(`Testing single exercise: ${SINGLE_EXERCISE} in category: ${foundCategory}`); - } - - // Créer le dossier de debug - const debugDir = path.join(__dirname, '..', '..', 'debug'); - fs.mkdirSync(debugDir, { recursive: true }); - - // Calculer le total d'exercices - const totalExercises = Object.values(categoriesToTest).reduce((sum, exercises) => sum + exercises.length, 0); - console.log(`\n🧪 [RUN ${runNumber}/5] Starting evaluation of ${totalExercises} exercises across ${Object.keys(categoriesToTest).length} categories...`); - - // Traiter les catégories en parallèle - const startTime = Date.now(); - const categoryPromises = Object.entries(categoriesToTest).map(([categoryName, exercises]) => - processCategoryWorker(categoryName, exercises, starklingsPath, runNumber) - ); - - const categoryResults = await Promise.all(categoryPromises); - const endTime = Date.now(); - - // Consolider les résultats - const totalPassed = categoryResults.reduce((sum, result) => sum + result.passed, 0); - const globalResults = { - runNumber: runNumber, - timestamp: new Date().toISOString(), - totalExercises: totalExercises, - totalPassed: totalPassed, - globalSuccessRate: (totalPassed / totalExercises * 100).toFixed(1), - executionTime: (endTime - startTime) / 1000, - categories: categoryResults - }; - - // Sauvegarder le rapport global pour ce run - const globalReportPath = path.join(debugDir, `global_report_run${runNumber}.json`); - fs.writeFileSync(globalReportPath, JSON.stringify(globalResults, null, 2)); - - console.log(`[RUN ${runNumber}] ${totalPassed}/${totalExercises} exercises passed (${globalResults.globalSuccessRate}%)`); - - return globalResults; + const starklingsPath = path.join(process.cwd(), 'starklings'); + const infoPath = path.join(starklingsPath, 'info.toml'); + + if (!fs.existsSync(starklingsPath)) { + throw new Error('Starklings directory not found'); + } + + if (!fs.existsSync(infoPath)) { + throw new Error('info.toml not found in starklings directory'); + } + + // Tester la connexion au serveur + const serverOk = await testServerConnection(); + if (!serverOk) { + throw new Error('Server is not accessible'); + } + + // Parser les exercices par catégorie + const categories = parseInfoToml(infoPath); + + if (Object.keys(categories).length === 0) { + throw new Error('No categories found'); + } + + // Filtrer à une seule catégorie si demandé + let categoriesToTest = categories; + if (SINGLE_EXERCISE) { + let foundCategory = null; + for (const [categoryName, exercises] of Object.entries(categories)) { + if (exercises.some((ex) => ex.name === SINGLE_EXERCISE)) { + foundCategory = categoryName; + break; + } + } + + if (!foundCategory) { + throw new Error(`Exercise '${SINGLE_EXERCISE}' not found`); + } + + categoriesToTest = { + [foundCategory]: categories[foundCategory].filter( + (ex) => ex.name === SINGLE_EXERCISE, + ), + }; + log( + `Testing single exercise: ${SINGLE_EXERCISE} in category: ${foundCategory}`, + ); + } + + // Créer le dossier de debug + const debugDir = path.join(__dirname, '..', '..', 'debug'); + fs.mkdirSync(debugDir, { recursive: true }); + + // Calculer le total d'exercices + const totalExercises = Object.values(categoriesToTest).reduce( + (sum, exercises) => sum + exercises.length, + 0, + ); + console.log( + `\n🧪 [RUN ${runNumber}/5] Starting evaluation of ${totalExercises} exercises across ${Object.keys(categoriesToTest).length} categories...`, + ); + + // Traiter les catégories en parallèle + const startTime = Date.now(); + const categoryPromises = Object.entries(categoriesToTest).map( + ([categoryName, exercises]) => + processCategoryWorker(categoryName, exercises, starklingsPath, runNumber), + ); + + const categoryResults = await Promise.all(categoryPromises); + const endTime = Date.now(); + + // Consolider les résultats + const totalPassed = categoryResults.reduce( + (sum, result) => sum + result.passed, + 0, + ); + const globalResults = { + runNumber: runNumber, + timestamp: new Date().toISOString(), + totalExercises: totalExercises, + totalPassed: totalPassed, + globalSuccessRate: ((totalPassed / totalExercises) * 100).toFixed(1), + executionTime: (endTime - startTime) / 1000, + categories: categoryResults, + }; + + // Sauvegarder le rapport global pour ce run + const globalReportPath = path.join( + debugDir, + `global_report_run${runNumber}.json`, + ); + fs.writeFileSync(globalReportPath, JSON.stringify(globalResults, null, 2)); + + console.log( + `[RUN ${runNumber}] ${totalPassed}/${totalExercises} exercises passed (${globalResults.globalSuccessRate}%)`, + ); + + return globalResults; } async function main() { - const NUM_RUNS = 1; - const allResults = []; - - console.log(`🚀 Starting ${NUM_RUNS} successive test runs...`); - - for (let i = 1; i <= NUM_RUNS; i++) { - try { - const result = await runSingleTest(i); - allResults.push(result); - - // Petite pause entre les runs pour éviter la surcharge - if (i < NUM_RUNS) { - await new Promise(resolve => setTimeout(resolve, 2000)); - } - } catch (error) { - console.error(`❌ Run ${i} failed:`, error.message); - // Continuer avec les autres runs même si un échoue - } - } - - // Générer le rapport consolidé - const debugDir = path.join(__dirname, '..', '..', 'debug'); - const consolidatedReport = generateConsolidatedReport(allResults); - const consolidatedReportPath = path.join(debugDir, 'consolidated_report.json'); - fs.writeFileSync(consolidatedReportPath, JSON.stringify(consolidatedReport, null, 2)); - - console.log(`\n=== Final Summary (${NUM_RUNS} runs) ===`); - console.log(`Average success rate: ${consolidatedReport.summary.globalSuccessRate}`); - - // Calculer le meilleur et pire run pour l'affichage - if (allResults.length > 0) { - const bestRun = allResults.reduce((best, current) => - parseFloat(current.globalSuccessRate) > parseFloat(best.globalSuccessRate) ? current : best - ); - - const worstRun = allResults.reduce((worst, current) => - parseFloat(current.globalSuccessRate) < parseFloat(worst.globalSuccessRate) ? current : worst - ); - - console.log(`Best run: ${bestRun.globalSuccessRate}% (Run ${bestRun.runNumber})`); - console.log(`Worst run: ${worstRun.globalSuccessRate}% (Run ${worstRun.runNumber})`); - } - - log(`All reports saved in: ${debugDir}`); - log(`Consolidated report: ${consolidatedReportPath}`); + const NUM_RUNS = 1; + const allResults = []; + + console.log(`🚀 Starting ${NUM_RUNS} successive test runs...`); + + for (let i = 1; i <= NUM_RUNS; i++) { + try { + const result = await runSingleTest(i); + allResults.push(result); + + // Petite pause entre les runs pour éviter la surcharge + if (i < NUM_RUNS) { + await new Promise((resolve) => setTimeout(resolve, 2000)); + } + } catch (error) { + console.error(`❌ Run ${i} failed:`, error.message); + // Continuer avec les autres runs même si un échoue + } + } + + // Générer le rapport consolidé + const debugDir = path.join(__dirname, '..', '..', 'debug'); + const consolidatedReport = generateConsolidatedReport(allResults); + const consolidatedReportPath = path.join( + debugDir, + 'consolidated_report.json', + ); + fs.writeFileSync( + consolidatedReportPath, + JSON.stringify(consolidatedReport, null, 2), + ); + + console.log(`\n=== Final Summary (${NUM_RUNS} runs) ===`); + console.log( + `Average success rate: ${consolidatedReport.summary.globalSuccessRate}`, + ); + + // Calculer le meilleur et pire run pour l'affichage + if (allResults.length > 0) { + const bestRun = allResults.reduce((best, current) => + parseFloat(current.globalSuccessRate) > parseFloat(best.globalSuccessRate) + ? current + : best, + ); + + const worstRun = allResults.reduce((worst, current) => + parseFloat(current.globalSuccessRate) < + parseFloat(worst.globalSuccessRate) + ? current + : worst, + ); + + console.log( + `Best run: ${bestRun.globalSuccessRate}% (Run ${bestRun.runNumber})`, + ); + console.log( + `Worst run: ${worstRun.globalSuccessRate}% (Run ${worstRun.runNumber})`, + ); + } + + log(`All reports saved in: ${debugDir}`); + log(`Consolidated report: ${consolidatedReportPath}`); } -main().catch(error => { - console.error('❌ Fatal error:', error); - process.exit(1); -}); \ No newline at end of file +main().catch((error) => { + console.error('❌ Fatal error:', error); + process.exit(1); +}); From 312897d3cf94d4b74002b0aa402faa4b92c6e299 Mon Sep 17 00:00:00 2001 From: alvinouille Date: Wed, 16 Jul 2025 20:45:24 +0200 Subject: [PATCH 9/9] fix: port 3001 backend compose --- .github/scripts/starklings-evaluate.js | 4 ++-- docker-compose.yml | 2 +- script-starklings.sh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/scripts/starklings-evaluate.js b/.github/scripts/starklings-evaluate.js index 69d73107..7370215e 100644 --- a/.github/scripts/starklings-evaluate.js +++ b/.github/scripts/starklings-evaluate.js @@ -98,7 +98,7 @@ async function testServerConnection() { log('Testing server connection...'); try { - const response = await fetch('http://localhost:3002/', { + const response = await fetch('http://localhost:3001/', { method: 'GET', timeout: 5000, }); @@ -145,7 +145,7 @@ Please provide only the corrected code, without any additional explanation or ma log(`API call attempt ${attempt}/${retries} for ${exercise.name}`); const response = await fetch( - 'http://localhost:3002/v1/chat/completions', + 'http://localhost:3001/v1/chat/completions', { method: 'POST', headers: { diff --git a/docker-compose.yml b/docker-compose.yml index 8832ee3b..421f5dcd 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,7 +19,7 @@ services: dockerfile: backend.dockerfile container_name: 'cairo-coder-backend' ports: - - 3002:3001 + - 3001:3001 depends_on: postgres: condition: service_started diff --git a/script-starklings.sh b/script-starklings.sh index a5e28e45..2a24d930 100644 --- a/script-starklings.sh +++ b/script-starklings.sh @@ -26,7 +26,7 @@ fi cd .. # Vérifier si le serveur répond -if ! curl -s http://localhost:3002/ > /dev/null 2>&1; then +if ! curl -s http://localhost:3001/ > /dev/null 2>&1; then echo "❌ Server failed to start" kill $SERVER_PID 2>/dev/null || true exit 1