In [597]:
import csv
import numpy as np
import pandas as pd

from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [598]:
raw_dataframe = pd.read_csv("pokemon_raw.csv", delimiter=",")

Check for columns that are missing data. With the exception of type2 missing, if a value is missing from a useful column then the row should be discarded.

In [599]:
raw_dataframe.isnull().sum()

abilities              0
against_bug            0
against_dark           0
against_dragon         0
against_electric       0
against_fairy          0
against_fight          0
against_fire           0
against_flying         0
against_ghost          0
against_grass          0
against_ground         0
against_ice            0
against_normal         0
against_poison         0
against_psychic        0
against_rock           0
against_steel          0
against_water          0
attack                 0
base_egg_steps         0
base_happiness         0
base_total             0
capture_rate           0
classification         0
defense                0
experience_growth      0
height_m              20
hp                     0
japanese_name          0
name                   0
percentage_male       98
pokedex_number         0
sp_attack              0
sp_defense             0
speed                  0
type1                  0
type2                384
weight_kg             20
generation             0


Remove any rows/columns that lack enough data to be useful

In [None]:
raw_dataframe = raw_dataframe.drop(["percentage_male","base_egg_steps","base_happiness","base_total","classification"], axis=1)

KeyError: "['base_egg_steps'] not found in axis"

Find if we can rely on pokedex_number to be a primary key for the table, by checking to see if any of the values are duplicated.

In [None]:
for index, row in enumerate(raw_dataframe['pokedex_number'].duplicated()):
    if row == True:
        print(f"Duplicate found {index}.")

Obtain a clean list of all unique types from both type columns, while also exluding empty columns. This also allows to check for spelling mistakes in the outputted list.

In [None]:
types_dataframe = raw_dataframe[['pokedex_number', 'type1', 'type2']];

type1_uniques = types_dataframe["type1"].unique().tolist()
type2_uniques = types_dataframe["type2"].unique().tolist()

unique_types = type1_uniques + type2_uniques
unique_types = list(set(unique_types))

for index, entry in enumerate(unique_types):
    if type(entry) == float:
        del unique_types[index]

print(unique_types)

['ground', 'fire', 'dragon', 'grass', 'psychic', 'fighting', 'poison', 'rock', 'steel', 'water', 'fairy', 'flying', 'ice', 'ghost', 'normal', 'dark', 'electric', 'bug']


Check to see if every entry has at least one type by checking for null values in each column.

In [None]:
types_dataframe.isnull().sum()

pokedex_number      0
type1               0
type2             384
dtype: int64

Split the two type columns into boolean identifiers so that the data can be more easily parsed into models, add these new columns to raw_dataframe rather than to the type_dataframe slice.

In [None]:
for pokemon_type in unique_types:
    raw_dataframe[f"type_{pokemon_type.lower()}"] = False

for index, row in raw_dataframe.iterrows():
    if row['type1'] in unique_types:
        raw_dataframe.loc[index, f"type_{row['type1'].lower()}"] = True

    if type(row['type2']) == float or row['type2'] not in unique_types: continue

    raw_dataframe.loc[index, f"type_{row['type2'].lower()}"] = True

raw_dataframe = raw_dataframe.drop('type1', axis=1)
raw_dataframe = raw_dataframe.drop('type2', axis=1)

print(raw_dataframe['type_ground'].values)
print(raw_dataframe.columns)


[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False  True  True False False  True False False  True False False
 False False False False False False False False False False False False
 False  True  True False False False False False False False False False
 False False False False False False False False False False False False
 False  True  True  True False False False False False False False False
 False False False False False False False False False False  True False
 False False False False False False False  True  True False False False
 False False  True  True False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False Fa

Check Abilities column for missing data.

In [None]:
abilities_dataframe = raw_dataframe[['pokedex_number', 'abilities']]
abilities_dataframe.isnull().sum()

pokedex_number    0
abilities         0
dtype: int64

Accquire a list of all unique abilities to both make categorisation easier and check for spelling errors.

In [None]:
unique_abilites = set()

for index, row in abilities_dataframe.iterrows():
    ability_string = abilities_dataframe['abilities'][index]
    ability_string = ability_string.replace("[", "").replace("]", "").replace("'", "").replace(" ", "")
    ability_list = ability_string.split(",")

    unique_abilites = unique_abilites.union(ability_list)


print(unique_abilites)



Split abilities lists into categroric data.

In [None]:
for pokemon_ability in unique_abilites:
    for index, character in enumerate(pokemon_ability):
        if character.isupper() and index != 0:
            pokemon_ability = pokemon_ability[:index] + "_" + pokemon_ability[index:]
        
    raw_dataframe[f"ability_{pokemon_ability.lower()}"] = False

for index, row in raw_dataframe.iterrows():
    ability_string = raw_dataframe['abilities'][index]
    ability_string = ability_string.replace("[", "").replace("]", "").replace("'", "").replace(" ", "")
    ability_list = ability_string.split(",")

    for pokemon_ability in ability_list:
        raw_dataframe.loc[index, f"ability_{pokemon_ability.lower()}"] = True
        
raw_dataframe =  raw_dataframe.drop("abilities", axis=1)
    
print(raw_dataframe["ability_adaptability"].values)
print(raw_dataframe.columns)

[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
  True False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False Fa