# Predicting Pokemon Types from Images and Statistics

Scott Ratchford, (c) 2025

See `LICENSE.txt` for license information.

## Setup

### Constants and Parameters

In [1]:
# Set paths to data files and directories

import os
import pandas as pd
import numpy as np

CWD = os.getcwd()

# Input paths for data
POKEDEX_PATH = os.path.join(CWD, "data", "pokemon_images", "pokedex.csv")   # Modified .csv file from "Pokemon with Stats and Images"
PKMN_STATS_PATH = os.path.join(CWD, "data", "pokemon_stats.csv")            # .csv file from "Pokemon Pokedex"

# Output paths for created and modified data
PKMN_IMG_COLORS_PATH = os.path.join(CWD, "pokemon_colors.csv")              # Pokemon color data output

RNG_SEED = 151

### Import Datasets

In [2]:
from sklearn.preprocessing import LabelEncoder

# Import in-game statistics dataset
pkmn_stats = pd.read_csv(POKEDEX_PATH, encoding="utf-8")
size_before = pkmn_stats.shape[0]

# Drop ignored rows
pkmn_stats = pkmn_stats[pkmn_stats["Ignore"] == False]
print(f"Dropped {size_before - pkmn_stats.shape[0]} ignored Pokemon.")

print(f"PKMN stats rows: {pkmn_stats.shape[0]}")
print(f"PKMN stats columns: {list(pkmn_stats.columns)}")

# Set Name to lowercase
pkmn_stats["Name"] = pkmn_stats["Name"].apply(lambda x: x.lower())

# Encode type strings (to ints)
type_label_encoder = LabelEncoder()
pkmn_stats["Type 1"] = type_label_encoder.fit_transform(pkmn_stats["Type 1"])
pkmn_stats["Type 1"] = pkmn_stats["Type 1"].astype(dtype=int)
pkmn_stats["Type 2"] = type_label_encoder.fit_transform(pkmn_stats["Type 2"])
pkmn_stats["Type 2"] = pkmn_stats["Type 2"].astype(dtype=int)

# Change NaN values to "None"
pkmn_stats["Type 2"] = pkmn_stats["Type 2"].apply(lambda x: x if pd.notna(x) else "None")

# Import image colors dataset
pkmn_color_df = pd.read_csv(PKMN_IMG_COLORS_PATH, encoding="utf-8", index_col=0)
print(f"PKMN color rows: {pkmn_color_df.shape[0]}")
print(f"PKMN colors columns: {list(pkmn_color_df.columns)}")

# Add Number and Train columns to pkmn_stats
pkmn_stats = pd.merge(pkmn_stats, pkmn_color_df[["Name", "Number", "Train"]], on=["Name", ], how="inner")

Dropped 232 ignored Pokemon.
PKMN stats rows: 983
PKMN stats columns: ['Index', 'Name', 'Type 1', 'Type 2', 'Total', 'HP', 'Attack', 'Defense', 'SP. Atk.', 'SP. Def', 'Speed', 'Ignore', 'Filename']
PKMN color rows: 959
PKMN colors columns: ['Name', 'Type 1', 'Type 2', 'Filename', 'Number', 'Train', 'black', 'blue', 'pink', 'green', 'purple', 'red', 'white', 'yellow', 'orange']


In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Drop columns that provide too much or unneeded information about the Pokemon
drop_stats_cols = ["Name", "Ignore", "Filename", "Number", "Train", ]

pkmn_stats_train = pkmn_stats[pkmn_stats["Train"] == True]
pkmn_stats_test = pkmn_stats[pkmn_stats["Train"] == False]

X_stats_train = pkmn_stats_train.drop(labels=(drop_stats_cols + ["Type 1", "Type 2", ]), axis=1)
X_stats_test = pkmn_stats_test.drop(labels=(drop_stats_cols + ["Type 1", "Type 2", ]), axis=1)
y_stats_train = pkmn_stats_train[["Type 1", "Type 2", ]].copy()
y_stats_test = pkmn_stats_test[["Type 1", "Type 2", ]].copy()

print(f"Split Pokemon statistics into {X_stats_train.shape[0]} training and {X_stats_test.shape[0]} testing.")

Split Pokemon statistics into 767 training and 192 testing.


## Helper Functions

In [13]:
def either_correct(pred: tuple[str, str], true: tuple[str, str]) -> bool:
    for p in pred:
        if p == "None":
            continue
        if p in true:
            return True
    
    return False

def all_correct(pred: tuple[str, str], true: tuple[str, str]) -> bool:
    if len(pred) != 2 or len(true) != 2:
        raise ValueError("Length of pred and true must each be 2.")
    pred_set = set(pred)
    if "None" in pred:
        pred_set.remove("None")
        
    return pred_set.intersection(set(true)) == set(pred)

def either_accuracy_score(pred: pd.DataFrame, true: pd.DataFrame) -> float:
    df = pd.merge(pred, true, how="left", left_index=True, right_index=True)
    print(df.columns)
    df["either"] = df.apply(lambda x: either_correct((x["Type 1"], x["Type 2"]), (x["Type 1 Pred"], x["Type 2 Pred"])), axis=1)

    return df["either"].mean()

def both_accuracy_score(pred: pd.DataFrame, true: pd.DataFrame) -> float:
    df = pd.merge(pred, true, how="left", left_index=True, right_index=True)
    df["both"] = df.apply(lambda x: all_correct((x["Type 1"], x["Type 2"]), (x["Type 1 Pred"], x["Type 2 Pred"])), axis=1)

    return df["both"].mean()

## Multi-Label Type Classification Based on In-Game Statistics

These models use in-game statistics to predict the values of `Type 1` and `Type 2`.

### K-Nearest Neighbors

In [None]:
# Create the KNN model for in-game statistics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier

# hyperparameters to try
knn_stats_hyperparameters = {
    "n_neighbors": range(3, 11),
    "leaf_size": range(10, 41),
    "p": range(1, 4),
    "weights": ("distance", "uniform", ),
    "algorithm": ("kd_tree", "ball_tree", ),
}

knn_stats_model = MultiOutputClassifier(GridSearchCV(KNeighborsClassifier(), knn_stats_hyperparameters, n_jobs=None), n_jobs=None)

knn_stats_model = knn_stats_model.fit(X_stats_train, y_stats_train)

In [14]:
knn_stats_pred = pd.DataFrame(knn_stats_model.predict(X_stats_test), columns=["Type 1 Pred", "Type 2 Pred"])

acc_either = either_accuracy_score(knn_stats_pred, y_stats_test)
acc_both = both_accuracy_score(knn_stats_pred, y_stats_test)

print(f"Either type accuracy: {acc_either}")
print(f"Both type accuracy: {acc_both}")

Index(['Type 1 Pred', 'Type 2 Pred', 'Type 1', 'Type 2'], dtype='object')
Either type accuracy: 0.08333333333333333
Both type accuracy: 0.0


## Multi-Label Type Classification Based on Image Colors

### Load Image Colors with Type 1 and Type 2

In [7]:
pkmn_color_df = pd.read_csv(PKMN_IMG_COLORS_PATH, sep=",", encoding="utf-8", index_col=0)

pkmn_color_train_df = pkmn_color_df[pkmn_color_df["Train"] == True]
pkmn_color_test_df = pkmn_color_df[pkmn_color_df["Train"] == False]

print(f"Color data for {pkmn_color_train_df.shape[0]} testing Pokemon loaded.")
print(f"Color data for {pkmn_color_test_df.shape[0]} testing Pokemon loaded.")

Color data for 767 testing Pokemon loaded.
Color data for 192 testing Pokemon loaded.


In [8]:
from sklearn.preprocessing import StandardScaler

drop_color_cols = []
# Drop columns that provide too much information about the Pokemon
drop_color_cols.extend(["Name", "Number", "Filename", ])
# Drop other columns
drop_color_cols.extend(["Train", ])
# Drop columns containing target information
drop_color_cols.extend(["Type 1", "Type 2", ])

X_colors_train = pkmn_color_train_df.drop(labels=drop_color_cols, axis=1)
X_colors_test = pkmn_color_test_df.drop(labels=drop_color_cols, axis=1)
y_colors_train = pkmn_color_train_df[["Type 1", "Type 2", ]].copy()
y_colors_test = pkmn_color_test_df[["Type 1", "Type 2", ]].copy()

# Change NaN values to "None"
y_colors_train["Type 2"] = y_colors_train["Type 2"].apply(lambda x: x if pd.notna(x) else "None")
y_colors_test["Type 2"] = y_colors_test["Type 2"].apply(lambda x: x if pd.notna(x) else "None")

# Scale color data
color_cols = [
    'black', 'blue', 'pink', 'green', 'purple', 'red', 'white', 'yellow', 'orange',
]
scaler = StandardScaler()
X_colors_train[color_cols] = scaler.fit_transform(X_colors_train[color_cols])
X_colors_test[color_cols] = scaler.fit_transform(X_colors_test[color_cols])

### Random Forest with Multilabel Classification

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier

# hyperparameters to try
rf_multi_hyperparameters = {
    "criterion": ['gini', ],    # 'entropy', 'log_loss'
    "n_estimators": range(100, 500, 100),
    "min_samples_split": range(2, 8, 2),
    "max_depth": list(range(10, 25, 5)) + [None, ],
    "class_weight": ["balanced", "balanced_subsample", ],
    "random_state": (RNG_SEED, ),
}

rf_colors_multi_model = MultiOutputClassifier(GridSearchCV(RandomForestClassifier(), rf_multi_hyperparameters, n_jobs=None), n_jobs=None)

rf_colors_multi_model = rf_colors_multi_model.fit(X_colors_train, y_colors_train)

In [15]:
rf_color_multi_pred = pd.DataFrame(rf_colors_multi_model.predict(X_colors_test), columns=["Type 1 Pred", "Type 2 Pred"])

acc_either = either_accuracy_score(rf_color_multi_pred, y_colors_test)
acc_both = both_accuracy_score(rf_color_multi_pred, y_colors_test)

print(f"Either type accuracy: {acc_either}")
print(f"Both type accuracy: {acc_both}")

Index(['Type 1 Pred', 'Type 2 Pred', 'Type 1', 'Type 2'], dtype='object')
Either type accuracy: 0.2760416666666667
Both type accuracy: 0.015625


### Multi-Layer Perceptron Neural Network with Multilabel Classification

In [11]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier

# hyperparameters to try
mlp_multi_hyperparameters = {
    "activation": ('relu', ),
    "solver": ('adam', ),
    "learning_rate": ('constant', 'invscaling', ),
    "max_iter": range(4000, 4500, 500),
    "n_iter_no_change": range(4, 12, 2),
    "random_state": (RNG_SEED, ),
}

mlp_colors_multi_model = MultiOutputClassifier(GridSearchCV(MLPClassifier(), mlp_multi_hyperparameters, n_jobs=None), n_jobs=None)
mlp_colors_multi_model = mlp_colors_multi_model.fit(X_colors_train, y_colors_train)

In [16]:
mlp_color_multi_pred = pd.DataFrame(mlp_colors_multi_model.predict(X_colors_test), columns=["Type 1 Pred", "Type 2 Pred"])

acc_either = either_accuracy_score(mlp_color_multi_pred, y_colors_test)
acc_both = both_accuracy_score(mlp_color_multi_pred, y_colors_test)

print(f"Either type accuracy: {acc_either}")
print(f"Both type accuracy: {acc_both}")

Index(['Type 1 Pred', 'Type 2 Pred', 'Type 1', 'Type 2'], dtype='object')
Either type accuracy: 0.2708333333333333
Both type accuracy: 0.026041666666666668


In [None]:
test = pd.read_csv(POKEDEX_PATH, encoding="utf-8")
ignored = test[test["Ignore"] == True]["Name"].tolist()
# ignored.to_clipboard(True, sep=", ")
print(ignored)

AttributeError: 'list' object has no attribute 'to_clipboard'