(TURN ON "RUN IN SEPARATE PROCESS")

# Random Sample of Top Projects with Generated Tags

This Kotlin Jupyter notebook connects to the database, takes the top projects by stars, randomly samples a few of them, and for each sampled project:
1. Fetches project metadata (name, stars, existing tags), the plain description from project_index, and the repository description
2. Fetches the repository's minimized README (if available)
3. Selects search tags from the allowed tag rules (loaded from YAML) using the OpenAI API and a prompt loaded from a file, leveraging the descriptions + README
4. Prints the context and tags for quick evaluation

Notes:
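- The prompt is read from an external file whose path is given by env var PROMPT_FILE_PATH; if it is unset, a default relative path is used.
- The tag rules are read from a YAML file whose path is given by env var TAG_RULES_PATH; if it is unset, a default relative path is used.
- The API key is read from env var OPENAI_API_KEY; if it is missing, mock tag indices are returned instead of calling the API.
- The DB connection is configured via DB_URL, DB_USERNAME, DB_PASSWORD.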


In [2]:
@file:DependsOn("org.postgresql:postgresql:42.7.1")
@file:DependsOn("com.squareup.okhttp3:okhttp:4.12.0")
@file:DependsOn("com.fasterxml.jackson.module:jackson-module-kotlin:2.15.2")


## Setup

Import libraries and define data classes.


In [3]:
import java.sql.Connection
import java.sql.DriverManager
import java.sql.ResultSet
import kotlin.random.Random
import java.io.File
import okhttp3.MediaType.Companion.toMediaType
import okhttp3.OkHttpClient
import okhttp3.Request
import okhttp3.RequestBody.Companion.toRequestBody
import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
import com.fasterxml.jackson.module.kotlin.readValue

// Data classes

/**
 * One project row as loaded by `getTopProjectsByStars`.
 *
 * @property id value of `project_index.project_id`.
 * @property name value of `project_index.name`.
 * @property stars value of `project_index.stars`.
 * @property plainDescription value of `project_index.plain_description`; may be null.
 * @property repoDescription value of `scm_repo.description`; may be null.
 * @property minimizedReadme value of `scm_repo.minimized_readme`; may be null.
 * @property existingTags entries of the `project_index.tags` array, converted to strings.
 */
data class Project(
    val id: Int,
    val name: String,
    val stars: Int,
    val plainDescription: String?,
    val repoDescription: String?,
    val minimizedReadme: String?,
    val existingTags: List<String>
)

// OpenAI request/response types (matching the description notebook)

/**
 * Request body that `sendTagRequest` serializes with Jackson and POSTs to `OPENAI_API_URL`.
 *
 * Property names are written exactly as declared, so they double as the JSON keys.
 *
 * @property model model name to run the completion with.
 * @property messages conversation messages, in order.
 * @property response_format optional structured-output specification.
 */
data class OpenAiRequest(
    val model: String,
    val messages: List<OpenAiMessage>,
    val response_format: ResponseFormat? = null,
)

/**
 * Value of the `response_format` field of [OpenAiRequest].
 *
 * @property type format kind; `sendTagRequest` sets it to `"json_schema"`.
 * @property json_schema schema description accompanying the `"json_schema"` type.
 */
data class ResponseFormat(
    val type: String,
    val json_schema: JsonSchema? = null
)

/**
 * Named JSON schema carried inside [ResponseFormat].
 *
 * @property name identifier of the schema (`"tag_indices"` in `sendTagRequest`).
 * @property schema the JSON Schema itself, expressed as nested maps and lists.
 * @property strict strictness flag sent along with the schema; defaults to `true`.
 */
data class JsonSchema(
    val name: String,
    val schema: Map<String, Any>,
    val strict: Boolean = true
)

/**
 * Holder for a reasoning-effort setting.
 *
 * Not referenced by any other declaration in the visible part of this notebook.
 * NOTE(review): presumably intended for a `reasoning` request field — confirm or remove.
 *
 * @property effort effort level as a plain string.
 */
data class Reasoning(
    val effort: String
)

/**
 * A chat message, used both for outgoing request messages and for the message of a
 * response choice.
 *
 * @property role author of the message; `sendTagRequest` sends `"system"` and `"user"`.
 * @property content message text; for responses, `sendTagRequest` parses it as JSON.
 * @property refusal optional field; not read by the visible code.
 * @property annotations optional field; not read by the visible code.
 * @property citations optional field; not read by the visible code.
 */
data class OpenAiMessage(
    val role: String,
    val content: String,
    val refusal: String? = null,
    val annotations: List<OpenAiMessageAnnotation>? = null,
    val citations: List<Map<String, String>>? = null
)

/**
 * Annotation entry attached to an [OpenAiMessage].
 *
 * Every property is optional and none of them is read by the visible code; the class
 * only gives the `annotations` list a concrete element type.
 */
data class OpenAiMessageAnnotation(
    val text: String? = null,
    val start_index: Int? = null,
    val end_index: Int? = null,
    val type: String? = null,
    val metadata: Map<String, Any>? = null
)

/**
 * Subset of the completion response that `sendTagRequest` deserializes.
 *
 * @property id response identifier; may be null.
 * @property choices returned choices; `sendTagRequest` reads only the first one.
 */
data class OpenAiResponse(
    val id: String?,
    val choices: List<OpenAiChoice>
)

/**
 * One entry of [OpenAiResponse.choices].
 *
 * @property message the returned message; its `content` is what `sendTagRequest` parses.
 * @property index position of the choice in the response.
 * @property logprobs optional field; not read by the visible code.
 * @property finish_reason optional field; not read by the visible code.
 */
data class OpenAiChoice(
    val message: OpenAiMessage,
    val index: Int,
    val logprobs: Any? = null,
    val finish_reason: String? = null
)

// Endpoint that sendTagRequest POSTs the request to.
val OPENAI_API_URL = "https://api.openai.com/v1/chat/completions"
// Read from the environment; when it is missing or blank, sendTagRequest returns mock indices.
val OPENAI_API_KEY = System.getenv("OPENAI_API_KEY")
// Model name placed in every request built by sendTagRequest.
val OPENAI_MODEL = "gpt-5-mini"


## Load tag rules from YAML


In [4]:
@file:DependsOn("com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.15.2")
import com.fasterxml.jackson.dataformat.yaml.YAMLFactory
import com.fasterxml.jackson.databind.ObjectMapper

/**
 * One entry of the `tag_rules` list in the tag rules YAML file.
 *
 * Property names are snake_case because they mirror the YAML keys.
 *
 * @property name the tag itself; this is the value reported as a generated tag.
 * @property definition description of when the tag applies.
 * @property positive_cues hints that speak for the tag.
 * @property hard_negatives hints that speak against the tag.
 * @property synonyms alternative wordings of the tag.
 * @property tag_synonyms alternative tag spellings; loaded but not sent in the prompt.
 */
data class TagRule(
    val name: String,
    val definition: String?,
    val positive_cues: List<String>?,
    val hard_negatives: List<String>?,
    val synonyms: List<String>?,
    val tag_synonyms: List<String>?
)

/**
 * Root object of the tag rules YAML document.
 *
 * @property tag_rules all rules, in file order; their positions become the tag indices
 * offered to the model.
 */
data class TagRulesWrapper(
    val tag_rules: List<TagRule>
)

// Location of the tag rules YAML; TAG_RULES_PATH overrides the relative default.
val tagRulesPath = System.getenv("TAG_RULES_PATH") ?: "../integrations/ai/src/main/resources/ai/prompts/tag_rules.yaml"
val tagRulesFile = File(tagRulesPath)
// Registers whatever Jackson modules are on the classpath
// (presumably needed so the Kotlin data classes can be constructed — confirm).
val yamlMapper = ObjectMapper(YAMLFactory())
yamlMapper.findAndRegisterModules()
val tagRules = yamlMapper.readValue(tagRulesFile, TagRulesWrapper::class.java).tag_rules
println("Loaded tag rules: ${tagRules.size}")
// firstOrNull(): an empty rules file is tolerated downstream, so this sanity print must not throw.
println("First rule: ${tagRules.firstOrNull() ?: "<none>"}")


Loaded tag rules: 181
First rule: TagRule(name=aes, definition=Apply when the project implements or directly uses AES encryption primitives., positive_cues=[AES, AES-GCM, AES-CBC], hard_negatives=[mentions AES but no crypto code/APIs present], synonyms=[advanced encryption standard, aes encryption], tag_synonyms=[advanced-encryption-standard, aes-encryption])


## Load prompt from file

Set the env var PROMPT_FILE_PATH to the prompt file's path; if it is unset, the default path in the cell below is used.


In [6]:
// Prompt template location; PROMPT_FILE_PATH overrides the relative default.
val promptPath = System.getenv("PROMPT_FILE_PATH") ?: "../integrations/ai/src/main/resources/ai/prompts/project-tags.md"
val promptFile = File(promptPath)
// Echo the trimmed prompt once so a wrong or missing file is noticed before any API call.
promptFile.readText().trim().let { promptText ->
    println("Prompt: $promptText\n\n\n")
}

java.io.FileNotFoundException: project_tags.md (No such file or directory)

## Helper Functions


In [6]:
/**
 * Structured-output payload expected in the assistant message content:
 * a JSON object with a single `indices` array.
 *
 * @property indices positions of the selected tags in the numbered allowed-tags list.
 */
data class TagSelection(val indices: List<Int>)

/**
 * Sends one chat-completion request that asks the model to pick tags and returns the
 * indices it selected.
 *
 * The request demands structured output matching `{"indices": [int, ...]}`. The
 * assistant message content is parsed as [TagSelection]; if that fails, the content is
 * re-read as a generic JSON tree and only integral entries of its `indices` array are
 * kept. If that fails as well, an empty list is returned.
 *
 * When `OPENAI_API_KEY` is missing or blank, no request is made and the mock result
 * `[0, 1, 2]` is returned.
 *
 * @param systemPrompt optional system message; skipped when null or blank.
 * @param query content of the user message.
 * @param log when true, prints the request body, the response body and parse failures.
 * @return the selected indices, in the order returned by the model.
 * @throws RuntimeException when the response has no body, the HTTP status is not 2xx,
 * or the response contains no choices.
 */
fun sendTagRequest(systemPrompt: String?, query: String, log: Boolean = false): List<Int> {
    if (OPENAI_API_KEY.isNullOrBlank()) {
        // Fallback mock response if key missing
        if (log) println("OPENAI_API_KEY is not set. Returning mock indices.")
        return listOf(0, 1, 2)
    }

    val messages = buildList<OpenAiMessage> {
        if (!systemPrompt.isNullOrBlank()) {
            add(OpenAiMessage(role = "system", content = systemPrompt))
        }
        add(OpenAiMessage(role = "user", content = query))
    }

    val schema: Map<String, Any> = mapOf(
        "type" to "object",
        "properties" to mapOf(
            "indices" to mapOf(
                "type" to "array",
                "items" to mapOf("type" to "integer"),
                "description" to "Indices of selected tags from the ALLOWED TAGS list"
            )
        ),
        "required" to listOf("indices"),
        "additionalProperties" to false
    )

    val request = OpenAiRequest(
        model = OPENAI_MODEL,
        messages = messages,
        response_format = ResponseFormat(
            type = "json_schema",
            json_schema = JsonSchema(
                name = "tag_indices",
                schema = schema,
                strict = true
            )
        ),
    )

    // Unknown response fields are ignored so that only the modelled subset has to match.
    val objectMapper = jacksonObjectMapper()
    objectMapper.configure(com.fasterxml.jackson.databind.DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
    val requestJson = objectMapper.writeValueAsString(request)

    val client = OkHttpClient.Builder()
        .connectTimeout(java.time.Duration.ofMinutes(1))
        .readTimeout(java.time.Duration.ofMinutes(1))
        .writeTimeout(java.time.Duration.ofMinutes(1))
        .build()
    val mediaType = "application/json; charset=utf-8".toMediaType()
    val requestBody = requestJson.toRequestBody(mediaType)

    if (log) println("Request body: $requestJson\n\n\n")

    val httpRequest = Request.Builder()
        .url(OPENAI_API_URL)
        .addHeader("Authorization", "Bearer $OPENAI_API_KEY")
        .post(requestBody)
        .build()

    // use { } closes the response on every path, including the error ones below.
    val responseBody = client.newCall(httpRequest).execute().use { response ->
        val body = response.body?.string() ?: throw RuntimeException("Empty response body")
        if (log) println("Response body: $body\n\n\n")
        if (!response.isSuccessful) {
            // Fail with the HTTP status instead of an opaque JSON mapping error further down.
            throw RuntimeException("OpenAI request failed with HTTP ${response.code}: ${body.take(500)}")
        }
        body
    }

    val content = objectMapper.readValue<OpenAiResponse>(responseBody)
        .choices.firstOrNull()?.message?.content
        ?: throw RuntimeException("OpenAI response contains no choices")

    // Expect content to be a JSON object per schema
    return try {
        objectMapper.readValue<TagSelection>(content).indices
    } catch (e: Exception) {
        if (log) println("Failed to parse structured output, trying to parse indices from raw: ${e.message}")
        // Fallback: read the `indices` array leniently from a generic JSON tree.
        try {
            val arr = objectMapper.readTree(content)?.get("indices")
            if (arr != null && arr.isArray) {
                // asInt() yields 0 for non-numeric nodes, which would silently select tag 0;
                // keep only entries that really are integers.
                arr.filter { it.isIntegralNumber && it.canConvertToInt() }.map { it.asInt() }
            } else {
                emptyList()
            }
        } catch (_: Exception) {
            emptyList()
        }
    }
}

/**
 * Extracts integers from free-form text.
 *
 * Input that starts with `[` (after trimming) is first read as a JSON list of ints; if
 * that fails, the surrounding brackets are stripped and the text is tokenized. Any other
 * input is tokenized directly. Tokens are separated by commas, newlines or semicolons,
 * and tokens that are not valid integers are dropped.
 *
 * @param raw text to scan.
 * @return the integers found, in order of appearance.
 */
fun parseIndices(raw: String): List<Int> {
    // Lenient tokenizer shared by both non-JSON paths.
    fun tokenize(text: String): List<Int> =
        text.split(',', '\n', ';').mapNotNull { token -> token.trim().toIntOrNull() }

    val trimmed = raw.trim()
    if (!trimmed.startsWith("[")) {
        return tokenize(trimmed)
    }

    return try {
        jacksonObjectMapper().readValue<List<Int>>(trimmed).mapNotNull { it }
    } catch (e: Exception) {
        // fallback: split and parse ints
        tokenize(trimmed.trim('[', ']'))
    }
}


In [7]:
/**
 * Loads the most-starred projects that already have at least one tag.
 *
 * Joins `project_index` with `project` and `scm_repo`, so that each row carries the
 * repository description and minimized README next to the index data.
 *
 * @param connection open JDBC connection; it is not closed here.
 * @param limit maximum number of rows to return.
 * @return projects ordered by stars, highest first.
 */
fun getTopProjectsByStars(connection: Connection, limit: Int): List<Project> {
    val query = """
        SELECT pi.project_id, pi.name, pi.stars, pi.plain_description, repo.description AS repo_description, repo.minimized_readme AS repo_minimized_readme, pi.tags
        FROM project_index pi
        JOIN project p ON p.id = pi.project_id
        JOIN scm_repo repo ON repo.id = p.scm_repo_id
        WHERE pi.tags IS NOT NULL AND array_length(pi.tags, 1) > 0
        ORDER BY pi.stars DESC
        LIMIT ?
    """
    return connection.prepareStatement(query).use { stmt ->
        stmt.setInt(1, limit)
        stmt.executeQuery().use { rows ->
            // Advance the cursor once per element; the sequence is drained by toList()
            // while the result set is still open.
            generateSequence { rows.takeIf { it.next() } }
                .map { row ->
                    Project(
                        id = row.getInt("project_id"),
                        name = row.getString("name"),
                        stars = row.getInt("stars"),
                        plainDescription = row.getString("plain_description"),
                        repoDescription = row.getString("repo_description"),
                        minimizedReadme = row.getString("repo_minimized_readme"),
                        existingTags = (row.getArray("tags")?.array as? Array<*>)?.mapNotNull { it?.toString() } ?: emptyList()
                    )
                }
                .toList()
        }
    }
}


## Tag Generation Logic

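We craft the user content from the project name, its description from project_index.plain_description, the repository description, and the repository's minimized README (each included only if available; the README is truncated to 4000 characters). The system prompt is the prompt template followed by the numbered list of allowed tags. Prompt formation is fully handled inside generateTagsForProject.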


In [8]:
/**
 * Asks the model which of the allowed tags fit [project] and returns their names.
 *
 * The user message holds the project name plus whichever of plain description,
 * repository description and README (first 4000 characters) are present. The system
 * message is the prompt template followed by one numbered object per tag rule. Indices
 * returned by `sendTagRequest` are de-duplicated and mapped back to rule names; indices
 * outside the rule list are ignored.
 *
 * @param project project to tag.
 * @param tagRules allowed tags; a rule's position in the list is its index in the prompt.
 * @return names of the selected tags, or an empty list when [tagRules] is empty.
 * @throws IllegalStateException when the project has no description, no repository
 * description and no README.
 */
fun generateTagsForProject(project: Project, tagRules: List<TagRule>): List<String> {
    // The template is read before anything else, so a missing prompt file fails first.
    val template = promptFile.readText()
    val plainText = project.plainDescription?.trim()
    val repoText = project.repoDescription?.trim()
    val readmeText = project.minimizedReadme?.trim()

    // Nothing to base a decision on: signal the caller to skip this project.
    if (listOf(plainText, repoText, readmeText).all { it.isNullOrBlank() }) {
        throw IllegalStateException("Skip: no description, no repo description and no README available")
    }

    // Cap the README so the prompt stays bounded.
    val readmeForPrompt = readmeText?.take(4000)
    val userContent = buildString {
        appendLine("Project name: ${project.name}")
        if (!plainText.isNullOrBlank()) appendLine("Description: $plainText")
        if (!repoText.isNullOrBlank()) appendLine("Repository description: $repoText")
        if (!readmeForPrompt.isNullOrBlank()) appendLine("README: $readmeForPrompt")
    }.trim()

    val allowedTags = tagRules.map { rule -> rule.name }
    if (allowedTags.isEmpty()) return emptyList()

    // Renders an optional list as "[a, b, c]"; null becomes "[]".
    fun bracketed(items: List<String>?): String =
        items.orEmpty().joinToString(", ", prefix = "[", postfix = "]")

    // One numbered object per rule, in the shape the prompt template expects.
    val allowedObjects = tagRules.mapIndexed { index, rule ->
        val definition = (rule.definition ?: "").replace("\"", "\\\"")
        "${index}:{name:\"${rule.name}\", definition:\"${definition}\", positive_cues:${bracketed(rule.positive_cues)}, hard_negatives:${bracketed(rule.hard_negatives)}, synonyms:${bracketed(rule.synonyms)}}"
    }.joinToString("\n")

    val systemPrompt = template.trim() + "\n\nALLOWED TAGS (NUMBERED OBJECTS)\n" + allowedObjects

    return sendTagRequest(systemPrompt, userContent, log = false)
        .distinct()
        .mapNotNull { index -> allowedTags.getOrNull(index) }
}


## Main Execution

Fetch top 50 projects by stars from DB, randomly pick 5, and generate tags for them using tag rules from YAML.


In [9]:
// Connection settings are taken from the environment.
val jdbcUrl = System.getenv("DB_URL")
val username = System.getenv("DB_USERNAME")
val password = System.getenv("DB_PASSWORD")
// Load the PostgreSQL driver class explicitly before asking DriverManager for a connection.
Class.forName("org.postgresql.Driver")

DriverManager.getConnection(jdbcUrl, username, password).use { connection ->
    val top50 = getTopProjectsByStars(connection, 50)
    val sample5 = top50.shuffled().take(5)
    println("Loaded tag rules: ${tagRules.size}. Will process ${sample5.size} projects out of ${top50.size} top by stars.")

    for (project in sample5) {
        // Context first, so the generated tags can be judged against it.
        println("\n=== Project: ${project.name} (⭐ ${project.stars}) ===")
        println("Plain description: ${project.plainDescription ?: "<none>"}")
        println("Repo description: ${project.repoDescription ?: "<none>"}")
        println("README chars: ${project.minimizedReadme?.length ?: 0}")
        println("Original tags (${project.existingTags.size}): ${project.existingTags.joinToString(", ")}")

        // A failure for one project is reported and must not stop the remaining ones.
        val outcome = try {
            "Generated tags: ${generateTagsForProject(project, tagRules).joinToString(", ")}"
        } catch (e: Exception) {
            "Tag generation skipped/failed: ${e.message}"
        }
        println(outcome)
    }
}


Loaded tag rules: 181. Will process 5 projects out of 50 top by stars.

=== Project: Calendar (⭐ 5371) ===
Plain description: Highly customizable calendar library supporting single, multiple, and range date selection modes. Features include horizontal/vertical scrolling, heatmap calendar, custom views, and flexible date boundaries.
Repo description: A highly customizable calendar view and compose library for Android and Kotlin Multiplatform.
README chars: 1000
Original tags (3): Time zone, Scheduling, Date-Time
Generated tags: calendar, compose-multiplatform, compose, android-ui, design-system, ui

=== Project: okio (⭐ 8951) ===
Plain description: Simplifies accessing, storing, and processing data by enhancing `java.io` and `java.nio` capabilities. Originated from OkHttp, ensuring robust performance for various data handling needs.
Repo description: A modern I/O library for Android, Java, and Kotlin Multiplatform.
README chars: 327
Original tags (2): Performance Optimization, File
Gene