<a href="https://colab.research.google.com/github/NicolasChagnet/pokemon-team-optimization/blob/main/generate_improved_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generate improved dataset

In [1]:
%load_ext autoreload
%autoreload 2

import functools
import sys
import urllib.parse

sys.path.insert(0, '.')  # must run before `from src import config` below

import numpy as np
import pandas as pd
import pokebase
import requests
from bs4 import BeautifulSoup

from src import config

In this notebook, we first load the [original dataset](https://www.kaggle.com/datasets/rounakbanik/pokemon) downloaded from Kaggle. We then improve this dataset and save it for further use.

## Starters, pseudo-legendaries and fossils

In [2]:
# Load the Kaggle CSV, discard the columns that are not used afterwards, flag the
# special Pokemon categories and fix the name of the fighting-type column.
df = (
    pd.read_csv(config.path_data / "pokemon.csv")
    .drop(
        columns=[
            "abilities",
            "attack",
            "base_egg_steps",
            "base_happiness",
            "capture_rate",
            "classfication",
            "defense",
            "experience_growth",
            "height_m",
            "hp",
            "japanese_name",
            "percentage_male",
            "sp_attack",
            "sp_defense",
            "speed",
            "weight_kg",
        ]
    )
    # Lowercase first: the membership tests below compare against the name lists
    # in `config` (presumably stored in lowercase -- verify against src/config).
    .assign(name=lambda frame: frame["name"].str.lower())
    # Each flag is stored as a 0/1 integer column.
    .assign(
        is_starter=lambda frame: frame["name"].isin(config.list_starters_final_stage).astype(int),
        is_pseudo_legendary=lambda frame: frame["name"].isin(config.list_pseudo_legendaries).astype(int),
        is_fossil=lambda frame: frame["name"].isin(config.list_fossils).astype(int),
    )
    .rename(columns={"against_fight": "against_fighting"})
)

In [3]:
def get_type_main(pkmn):
    """Return a copy of `pkmn` whose `type1`/`type2` entries come from pokebase.

    Parameters
    ----------
    pkmn : mapping-like with a ``"name"`` entry and a ``copy()`` method
        (e.g. a DataFrame row). The input itself is not modified.

    Returns
    -------
    The copy, with ``"type1"`` set to the first listed type and ``"type2"`` set
    to the second one, or to NaN unless exactly two types are listed.
    """
    updated = pkmn.copy()
    api_entry = pokebase.pokemon(updated["name"])
    updated["type1"] = api_entry.types[0].type.name
    if len(api_entry.types) == 2:
        updated["type2"] = api_entry.types[1].type.name
    else:
        updated["type2"] = np.nan
    return updated

In [4]:
# Persist the flagged Kaggle dataset (row index omitted from the file).
# NOTE(review): `get_type_main` is defined above but is not applied to `df` in the
# visible cells before saving -- confirm this is intended.
df.to_csv(config.path_data / "pokemon_starter_pl_fossil.csv", index=False)

## Remake datasets

The dataset available from Kaggle is missing some crucial information:
- it does not account for which Pokemon are available in each game,
- it does not account for types that were introduced or modified later in the series.

To fix this, we remake new datasets containing the type information and base total for each game version as well as a "national" pokedex version which contains all Pokemon with gen IX types. The type data is obtained from the PokeAPI while the Pokemon data is scraped from [pokemondb](https://pokemondb.net).

In [5]:
# Taken from https://pokebase.readthedocs.io/en/latest/examples/index.html#making-a-type-chart
def type_multiplayer(attack, defense):
    """Return the damage multiplier of an `attack`-type move hitting a `defense`-type target.

    Parameters
    ----------
    attack : str
        Name of the attacking type.
    defense : str
        Name of the defending type; this is the type queried through pokebase.

    Returns
    -------
    float
        0.0, 0.5 or 2.0 when `attack` appears in the matching damage relation of
        the defending type, 1.0 otherwise.
    """
    # Get API data for the defending type: its "*_damage_from" relations list the
    # attacking types, which are matched by name.
    relations = pokebase.type_(defense).damage_relations

    # Checked in this order; the first relation containing `attack` wins.
    relation_multipliers = (
        ("no_damage_from", 0.0),
        ("half_damage_from", 0.5),
        ("double_damage_from", 2.0),
    )
    for relation_name, multiplier in relation_multipliers:
        if attack in [t.name for t in getattr(relations, relation_name)]:
            return multiplier
    return 1.0


# Load the cached type matrix. Uncomment the block below to recompute it through
# the API instead -- SLOW. Per that construction (attack-by-defense, then
# transposed), rows are the defending type and columns the attacking type.
types_mat = pd.read_csv(config.path_data / "attack_defense_types_mat.csv", index_col=0)
# types_mat = pd.DataFrame(
#     [[type_multiplayer(type1, type2) for type2 in config.list_types] for type1 in config.list_types],
#     columns=config.list_types,
#     index=config.list_types,
# ).transpose()
# types_mat.to_csv(config.path_data / "attack_defense_types_mat.csv")

# Matrix for the generations before the fairy type existed.
# See https://pokemondb.net/type for changes pre-gen VI
types_mat_prefairy = types_mat.copy(deep=True).drop(columns=["fairy"], index=["fairy"])
# Steel row: ghost and dark columns are halved in this variant.
types_mat_prefairy.loc["steel", "ghost"] = 0.5
types_mat_prefairy.loc["steel", "dark"] = 0.5
# types_mat_prefairy.to_csv(config.path_data / "attack_defense_types_mat_gen2to5.csv")

# Matrix for the first generation: no dark or steel types, plus four overrides.
types_mat_rby = types_mat_prefairy.copy(deep=True).drop(columns=["dark", "steel"], index=["dark", "steel"])
types_mat_rby.loc["psychic", "ghost"] = 0  # psychic row, ghost column
types_mat_rby.loc["bug", "poison"] = 2  # bug row, poison column
types_mat_rby.loc["poison", "bug"] = 2  # poison row, bug column
types_mat_rby.loc["fire", "ice"] = 1  # fire row, ice column
# types_mat_rby.to_csv(config.path_data / "attack_defense_types_mat_gen1.csv")

In [6]:
# Root URL that the scraping helpers below join with relative "pokedex/..." paths.
BASE_DOMAIN = "https://pokemondb.net"


# Finds the generation a Pokemon was introduced in using its national pokedex ID
def get_gen(x):
    """Return the 1-based position of the first entry of `config.cutoffs` that is >= `x`.

    Parameters
    ----------
    x : int
        National Pokedex ID.

    Raises
    ------
    IndexError
        If `x` is greater than every entry of `config.cutoffs`.
    """
    for generation, cutoff in enumerate(config.cutoffs, start=1):
        if cutoff >= x:
            return generation
    raise IndexError("list index out of range")


# Gets general information on all Pokemon (base total, generation) using the national Pokedex
@functools.lru_cache(maxsize=1)
def _download_stats_table():
    """Download and parse the national Pokedex table, memoizing the result.

    The page is requested at most once until this definition is re-executed or
    `_download_stats_table.cache_clear()` is called. A failed download raises
    and is therefore not cached.
    """
    # Bound the request: without a timeout `requests` can wait indefinitely on a
    # stalled connection.
    pageStats = requests.get(urllib.parse.urljoin(BASE_DOMAIN, "pokedex/all"), timeout=30)
    # Stop on an HTTP error instead of trying to parse an error page.
    pageStats.raise_for_status()
    soupStats = BeautifulSoup(pageStats.content, "html.parser")
    tbody = soupStats.select_one("tbody")
    if tbody is None:
        raise ValueError("No <tbody> element found in the national Pokedex page; the page layout may have changed.")
    trs = tbody.find_all("tr", recursive=True)
    results = pd.DataFrame(
        [
            {
                "name": tr.select_one("a.ent-name").text.lower(),
                "base_total": tr.select_one("td.cell-total").text,
                "gen": get_gen(int(tr.select_one("span.infocard-cell-data").text)),
                "img": tr.select_one("img.icon-pkmn")["src"],
            }
            for tr in trs
        ]
    )
    # Several rows can share a name; only the first one is kept.
    return results.drop_duplicates(subset="name", keep="first")


def get_stats_table():
    """Return one row per Pokemon name from the national Pokedex page.

    Returns
    -------
    pandas.DataFrame
        Columns ``name`` (lowercased link text), ``base_total`` (cell text, left
        as a string), ``gen`` (from `get_gen` applied to the national number)
        and ``img`` (``src`` attribute of the icon).

    Raises
    ------
    requests.RequestException
        If the request times out or the server answers with an HTTP error status.
    ValueError
        If the page contains no ``<tbody>`` element.
    """
    # Hand out a copy so that callers cannot alter the memoized table.
    return _download_stats_table().copy()


# Gets all the Pokemon names and types available in a specific version group, completed with
# the base total, generation and icon taken from the national pokedex table (get_stats_table).
def get_names_pokedex(url):
    """Scrape the Pokedex page at ``pokedex/<url>`` and return one row per listed Pokemon.

    Parameters
    ----------
    url : str
        Path appended to ``"pokedex/"`` under `BASE_DOMAIN`.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``base_total``, ``generation``, ``img``, ``type1`` and
        ``type2`` (NaN when the Pokemon lists a single type).

    Raises
    ------
    requests.RequestException
        If the request times out or the server answers with an HTTP error status.
    IndexError
        If a listed Pokemon is absent from the national table, or lists no type.
    """
    # Bound the request and stop on HTTP errors rather than parsing an error page.
    page = requests.get(urllib.parse.urljoin(BASE_DOMAIN, "pokedex/" + url), timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    # Index the national table by name once, so that each Pokemon costs one label
    # lookup instead of several full scans of the name column. Duplicates are
    # dropped (first kept) so that every lookup yields a scalar.
    stats_by_name = get_stats_table().drop_duplicates(subset="name", keep="first").set_index("name")

    records = []
    for card in soup.find_all("span", class_="infocard-lg-data"):
        name = card.select_one("a.ent-name").text.lower()
        types = [link.text.lower() for link in card.find_all("a", class_="itype")]
        if name not in stats_by_name.index:
            # Same exception type as the former positional lookup, with a usable message.
            raise IndexError(f"Pokemon {name!r} from 'pokedex/{url}' is missing from the national Pokedex table")
        records.append(
            {
                "name": name,
                "base_total": stats_by_name.at[name, "base_total"],
                "generation": stats_by_name.at[name, "gen"],
                "img": stats_by_name.at[name, "img"],
                "type1": types[0],
                "type2": np.nan if len(types) <= 1 else types[1],
            }
        )
    return pd.DataFrame(records)


# Returns the list of damage multiplier coefficients for a given Pokemon's types and a type matrix
def get_type_pkmn(type_1, type_2, type_mat):
    """Combine the rows of `type_mat` for a Pokemon's type(s) into ``against_*`` multipliers.

    Parameters
    ----------
    type_1 : str
        First type; must be a row label of `type_mat`.
    type_2 : str or NaN
        Second type, or a missing value for single-type Pokemon.
    type_mat : pandas.DataFrame
        Type matrix indexed by type name.

    Returns
    -------
    pandas.Series
        Element-wise product of the selected row(s); labels found in
        `config.list_types` are renamed to ``against_<label>``.
    """
    # A missing second type contributes a neutral factor of 1.
    second_factor = 1 if pd.isna(type_2) else type_mat.loc[type_2]
    combined = type_mat.loc[type_1] * second_factor

    label_map = {}
    for typev in config.list_types:
        label_map[typev] = f"against_{typev}"
    return combined.rename(label_map)


# Builds the full dataset for a version group including information about fossils, legendaries, etc...
def build_dataset(url, gen):
    """Assemble the dataset of one Pokedex page: scraped rows, category flags and type multipliers.

    Parameters
    ----------
    url : str
        Pokedex path forwarded to `get_names_pokedex`.
    gen : int
        Generation selecting the type matrix file: 1 uses the gen-1 file, 2 to 5
        the gen2to5 file, anything else the default file.

    Returns
    -------
    pandas.DataFrame
        The scraped columns, the 0/1 columns ``is_starter``, ``is_legendary``,
        ``is_pseudo_legendary`` and ``is_fossil``, then the columns produced by
        `get_type_pkmn` for each row.
    """
    # Pick the type matrix file for this generation, then read it once.
    if gen == 1:
        matrix_file = "attack_defense_types_mat_gen1.csv"
    elif 2 <= gen < 6:
        matrix_file = "attack_defense_types_mat_gen2to5.csv"
    else:
        matrix_file = "attack_defense_types_mat.csv"
    type_mat = pd.read_csv(config.path_data / matrix_file, index_col=0)

    pkms = get_names_pokedex(url)
    types = pkms.apply(lambda row: get_type_pkmn(row["type1"], row["type2"], type_mat), axis=1)

    # Membership flags, stored as 0/1 integers, in this column order.
    flag_sources = (
        ("is_starter", config.list_starters_final_stage),
        ("is_legendary", config.list_legendaries),
        ("is_pseudo_legendary", config.list_pseudo_legendaries),
        ("is_fossil", config.list_fossils),
    )
    for column, names in flag_sources:
        pkms[column] = pkms["name"].isin(names).astype(int)

    return pd.concat([pkms, types], axis=1)

In [7]:
# RUN TO RECOMPUTE THE DATASETS
# Each entry of config.list_games supplies the keyword arguments of build_dataset;
# the result is deduplicated by name and written to that entry's file location.
for gen_id, details in config.list_games.items():
    df = build_dataset(**details).drop_duplicates(subset=["name"])
    df.to_csv(config.get_file_loc(gen_id), index=False)

In [10]:
# Start from the "nat" dataset and add one boolean `is_<key>` column per other
# entry of config.list_games, true when the Pokemon's name appears in that
# entry's saved dataset.
pkmn_nat = pd.read_csv(config.get_file_loc("nat"))
for k in config.list_games.keys():
    if k != "nat":
        pkmn_k = pd.read_csv(config.get_file_loc(k))
        pkmn_nat[f"is_{k}"] = pkmn_nat["name"].isin(pkmn_k["name"])

pkmn_nat.to_csv(config.path_data / "pokemon_full.csv", index=False)