Skip to content

Commit

Permalink
fauna scrape setup
Browse files Browse the repository at this point in the history
  • Loading branch information
ManApart committed Nov 12, 2023
1 parent 20dc4fb commit 07891c2
Show file tree
Hide file tree
Showing 7 changed files with 134 additions and 64 deletions.
66 changes: 23 additions & 43 deletions src/commonMain/kotlin/GameData.kt
Original file line number Diff line number Diff line change
@@ -1,28 +1,6 @@
import kotlinx.serialization.Serializable
import kotlinx.serialization.Transient


/**
 * Serializable record of the wiki fields kept for a single planet.
 *
 * Every property has a default (empty string or empty list), so an instance
 * can be created without supplying any argument.
 */
@Serializable
data class PlanetWikiData(
    val name: String = "",
    val type: String = "",
    val temperature: String = "",
    val atmosphere: String = "",
    val magnetosphere: String = "",
    val fauna: String = "",
    val flora: String = "",
    val water: String = "",
    val resources: List<String> = emptyList(),
    val traits: List<String> = emptyList(),
)

/**
 * Serializable record of the wiki fields kept for a single mission.
 *
 * All properties are defaulted: [name] and [id] to the empty string and
 * [type] to [MissionType.OTHER].
 */
@Serializable
data class MissionWikiData(
val name: String = "",
val id: String = "",
val type: MissionType = MissionType.OTHER,
)

enum class MissionType {
MAIN,
NEW_ATLANTIS,
Expand All @@ -37,30 +15,30 @@ enum class MissionType {
RYUJIN_INDUSTRIES,
MISSION_BOARD,
RADIANT,
OTHER
}
OTHER;

fun MissionType.isMisc(): Boolean {
return this in listOf(MissionType.COMPANION, MissionType.MISSION_BOARD, MissionType.RADIANT)
}
fun isMisc(): Boolean {
return this in listOf(COMPANION, MISSION_BOARD, RADIANT)
}

fun MissionType.isFaction(): Boolean {
return this in listOf(
MissionType.CRIMSON_FLEET,
MissionType.UNITED_COLONIES,
MissionType.RYUJIN_INDUSTRIES,
MissionType.FREESTAR_RANGERS
)
}
fun isFaction(): Boolean {
return this in listOf(
CRIMSON_FLEET,
UNITED_COLONIES,
RYUJIN_INDUSTRIES,
FREESTAR_RANGERS
)
}

fun MissionType.isCity(): Boolean {
return this in listOf(
MissionType.AKILA,
MissionType.CYDONIA,
MissionType.NEW_ATLANTIS,
MissionType.NEON,
MissionType.CITY_OTHER
)
fun isCity(): Boolean {
return this in listOf(
AKILA,
CYDONIA,
NEW_ATLANTIS,
NEON,
CITY_OTHER
)
}
}

fun String.toMissionType(): MissionType {
Expand Down Expand Up @@ -95,6 +73,7 @@ data class Quest(
) {
@Transient
val uniqueId = "$name-$id-$instance-$batIndex"

@Transient
val latestState = stages.maxBy { it.id }.state
}
Expand All @@ -108,3 +87,4 @@ data class QuestStage(
val state: QuestStageState
)

/** Temperament values a creature can be tagged with; [UNKNOWN] is the last constant. */
enum class Temperament {
    PEACEFUL,
    SKITTISH,
    WARY,
    DEFENSIVE,
    TERRITORIAL,
    FEARLESS,
    AGGRESSIVE,
    UNKNOWN
}
34 changes: 34 additions & 0 deletions src/commonMain/kotlin/WikiData.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import kotlinx.serialization.Serializable

/**
 * Serializable record of the wiki fields kept for a single planet.
 *
 * Every property has a default (empty string or empty list), so an instance
 * can be created without supplying any argument.
 */
@Serializable
data class PlanetWikiData(
    val name: String = "",
    val type: String = "",
    val temperature: String = "",
    val atmosphere: String = "",
    val magnetosphere: String = "",
    val fauna: String = "",
    val flora: String = "",
    val water: String = "",
    val resources: List<String> = emptyList(),
    val traits: List<String> = emptyList(),
)

/**
 * Serializable record of the wiki fields kept for a single mission.
 *
 * All properties are defaulted: [name] and [id] to the empty string and
 * [type] to [MissionType.OTHER].
 */
@Serializable
data class MissionWikiData(
val name: String = "",
val id: String = "",
val type: MissionType = MissionType.OTHER,
)

/**
 * Serializable record of the wiki fields kept for a single creature.
 *
 * Every property is defaulted, matching the other wiki data classes in this
 * file. [resource] previously had no default, which made it the only key
 * that had to be present when decoding and prevented building an empty
 * instance; it now defaults to the empty string.
 *
 * @property temperament defaults to [Temperament.UNKNOWN].
 * @property other free-form key/value pairs; empty by default.
 */
@Serializable
data class FaunaWikiData(
    val name: String = "",
    val temperament: Temperament = Temperament.UNKNOWN,
    val planets: List<String> = listOf(),
    val biomes: List<String> = listOf(),
    val resource: String = "",
    val drops: List<String> = listOf(),
    val abilities: List<String> = listOf(),
    val other: Map<String, String> = mapOf(),
)
3 changes: 0 additions & 3 deletions src/jsMain/kotlin/views/QuestView.kt
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,6 @@ import Quest
import QuestStageState
import el
import inMemoryStorage
import isCity
import isFaction
import isMisc
import kotlinx.browser.window
import kotlinx.dom.addClass
import kotlinx.dom.removeClass
Expand Down
2 changes: 2 additions & 0 deletions src/jsMain/resources/fauna-wiki-data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[
]
56 changes: 56 additions & 0 deletions src/jvmMain/kotlin/wikiScraper/FaunaWikiScraper.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package wikiScraper

import FaunaWikiData
import MissionWikiData
import jsonMapper
import kotlinx.serialization.decodeFromString
import kotlinx.serialization.encodeToString
import org.jsoup.Jsoup
import toMissionType
import java.io.File

private const val onlyOne = true
private const val start = 0
private const val limit = 0
private const val chunkSize = 100

/**
 * Entry point of the fauna scraper.
 *
 * Makes sure the url list file exists and is filled, loads any fauna already
 * stored in the output json (keyed by name), scrapes the selected urls in
 * chunks, overwrites entries that share a name with a freshly scraped one,
 * and writes the merged values back to the output file.
 */
fun main() {
    val faunaUrlFile = File("raw-data/fauna-pages.txt")
    if (!faunaUrlFile.exists()) faunaUrlFile.writeText("")
    fetchFaunaPagesIfEmpty(faunaUrlFile)

    val output = File("src/jsMain/resources/fauna-wiki-data.json")
    val previous = if (output.exists()) {
        jsonMapper.decodeFromString<List<FaunaWikiData>>(output.readText())
    } else {
        listOf()
    }
    val existing = previous.associateBy { it.name }.toMutableMap()

    println("Reading Fauna")
    val allUrls = faunaUrlFile.readLines()
    println("Found a total of ${allUrls.size} urls")

    // onlyOne wins over start; limit only applies when it is positive.
    val fromStart = if (onlyOne) allUrls.take(1) else allUrls.drop(start)
    val selected = if (limit > 0) fromStart.take(limit) else fromStart
    println("Crawling ${selected.size} urls")

    // Scrape everything first, then merge, so the map is only touched once all fetches succeeded.
    val scraped = mutableListOf<FaunaWikiData>()
    for (chunk in selected.chunked(chunkSize)) {
        println("Processing next $chunkSize, starting with ${chunk.first()}")
        for (url in chunk) {
            scraped += fetchAndParseFauna(url)
        }
    }
    for (fauna in scraped) {
        existing[fauna.name] = fauna
    }

    output.writeText(jsonMapper.encodeToString(existing.values))
}

/**
 * Fills [fauna] with creature page urls when the file currently has no lines.
 *
 * The urls come from crawling the creature category page; duplicates are
 * dropped and the remainder is written one url per line.
 */
private fun fetchFaunaPagesIfEmpty(fauna: File) {
    // Nothing to do when a previous run already stored urls.
    if (fauna.readLines().isNotEmpty()) return

    val crawled = crawl("https://starfieldwiki.net/wiki/Category:Starfield-Creatures-All", onlyOne)
    val uniqueUrls = crawled.toSet()
    fauna.writeText(uniqueUrls.joinToString("\n"))
}

/**
 * Downloads the wiki page at [url].
 *
 * Parsing is not implemented yet, so the returned list is always empty.
 * The earlier draft selected ".wikitable" twice into two unused locals;
 * those identical, unused lookups were dropped.
 */
private fun fetchAndParseFauna(url: String): List<FaunaWikiData> {
    // The fetch is kept so connection failures still surface while the parser is a stub.
    Jsoup.connect(url).get()
    // TODO: build FaunaWikiData entries from the page's ".wikitable" elements.
    return listOf()
}
19 changes: 1 addition & 18 deletions src/jvmMain/kotlin/wikiScraper/MissionWikiScraper.kt
Original file line number Diff line number Diff line change
Expand Up @@ -50,29 +50,12 @@ private fun fetchMissionPagesIfEmpty(missions: File) {
"https://starfieldwiki.net/wiki/Starfield:Freestar_Rangers_Missions",
"https://starfieldwiki.net/wiki/Starfield:Ryujin_Industries_Missions",
"https://starfieldwiki.net/wiki/Starfield:UC_Vanguard_Missions",
).flatMap { crawl(it) }.toSet()
).flatMap { crawl(it, onlyOne) }.toSet()

missions.writeText(urls.joinToString("\n"))
}
}

/**
 * Collects wiki links from the page at [baseUrl].
 *
 * Links are taken from anchors inside `li` elements, made absolute when they
 * start with "/", and kept only when they point below
 * `https://starfieldwiki.net/wiki/Starfield:`. Unless `onlyOne` is set, every
 * kept link containing "Category" and the page's "next page" link are crawled
 * as well and their results appended.
 */
private fun crawl(baseUrl: String): List<String> {
    val pageUrl = when {
        baseUrl.startsWith("/") -> "https://starfieldwiki.net$baseUrl"
        else -> baseUrl
    }
    println("Crawling $pageUrl")
    val document = Jsoup.connect(pageUrl).get()

    val found = mutableListOf<String>()
    for (item in document.select("li")) {
        for (anchor in item.select("a")) {
            val href = anchor.attr("href") ?: continue
            val absolute = if (href.startsWith("/")) "https://starfieldwiki.net$href" else href
            if (absolute.startsWith("https://starfieldwiki.net/wiki/Starfield:")) {
                found += absolute
            }
        }
    }

    val nextHref = document.select("a").firstOrNull { it.text() == "next page" }?.attr("href")
    val nextPage = nextHref?.let { "https://starfieldwiki.net$it" }
    val followUps = if (onlyOne) {
        listOf()
    } else {
        found.filter { it.contains("Category") } + listOfNotNull(nextPage)
    }

    return found + followUps.flatMap { crawl(it) }
}

private fun fetchAndParseMission(url: String): MissionWikiData? {
val page = Jsoup.connect(url).get()
val name = page.select("#firstHeading").firstOrNull()?.text()?.replace("Starfield:", "")
Expand Down
18 changes: 18 additions & 0 deletions src/jvmMain/kotlin/wikiScraper/Tools.kt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package wikiScraper

import org.jsoup.Jsoup
import java.net.URL
import java.net.URLConnection
import java.nio.charset.StandardCharsets
Expand All @@ -26,3 +27,20 @@ fun getPage(url: String, headers: Map<String, String> = mapOf()): String? {
return if (scanner.hasNext()) scanner.next() else null.also { println("Unable to fetch $url") }
}
}

/**
 * Collects wiki links starting from the page at [baseUrl].
 *
 * Links are taken from anchors inside `li` elements, made absolute when they
 * start with "/", and kept only when they point below
 * `https://starfieldwiki.net/wiki/Starfield:`.
 *
 * @param baseUrl absolute url, or a path starting with "/" that is resolved
 *   against `https://starfieldwiki.net`.
 * @param onlyOne when true only the first page is read; when false every kept
 *   link containing "Category" and the page's "next page" link are followed
 *   recursively.
 * @return the links of every page that was read, in visiting order; the list
 *   may contain duplicates when the same link appears more than once.
 */
fun crawl(baseUrl: String, onlyOne: Boolean): List<String> =
    crawlUnvisited(baseUrl, onlyOne, mutableSetOf())

/**
 * Recursive worker for [crawl]. [visited] remembers every page already
 * fetched so a link cycle cannot recurse forever and no page is downloaded twice.
 */
private fun crawlUnvisited(baseUrl: String, onlyOne: Boolean, visited: MutableSet<String>): List<String> {
    val cleanBase = if (baseUrl.startsWith("/")) "https://starfieldwiki.net$baseUrl" else baseUrl
    // add() returns false when the url was seen before: skip it instead of fetching again.
    if (!visited.add(cleanBase)) return emptyList()

    println("Crawling $cleanBase")
    val page = Jsoup.connect(cleanBase).get()
    val urls = page.select("li")
        .flatMap { li ->
            li.select("a").mapNotNull { it.attr("href") }
        }
        .map { if (it.startsWith("/")) "https://starfieldwiki.net$it" else it }
        .filter { it.startsWith("https://starfieldwiki.net/wiki/Starfield:") }

    val nextUrl = page.select("a")
        .firstOrNull { it.text() == "next page" }
        ?.attr("href")
        ?.let { "https://starfieldwiki.net$it" }
    val nextUrls = if (onlyOne) listOf() else urls.filter { it.contains("Category") } + listOfNotNull(nextUrl)

    return urls + nextUrls.flatMap { crawlUnvisited(it, onlyOne, visited) }
}

0 comments on commit 07891c2

Please sign in to comment.