# Part 2: Data Loading and Preprocessing

## Make Necessary Imports

In [575]:
import pandas as pd 
import numpy as np
import re

## Load the Pokemon JSON File

In [576]:
df = pd.read_json("pokemon_data.json")

In [577]:
df.head()

Unnamed: 0,bulbasaur,ivysaur,venusaur,charmander,charmeleon,charizard,squirtle
name,bulbasaur,ivysaur,venusaur,charmander,charmeleon,charizard,squirtle
national_no,0001,0002,0003,0004,0005,0006,0007
types,"[grass, poison]","[grass, poison]","[grass, poison, grass, poison]",[fire],[fire],"[fire, flying, fire, dragon, fire, flying]",[water]
species,Seed Pokémon,Seed Pokémon,Seed Pokémon,Lizard Pokémon,Flame Pokémon,Flame Pokémon,Tiny Turtle Pokémon
height,0.7 m (2′04″),1.0 m (3′03″),2.0 m (6′07″),0.6 m (2′00″),1.1 m (3′07″),1.7 m (5′07″),0.5 m (1′08″)


## Dataframe Transposition

In [578]:
df = df.T

In [579]:
# Reset the Index after transposing the df 
df.reset_index(drop=True, inplace=True)

In [580]:
df.head()

Unnamed: 0,name,national_no,types,species,height,weight,abilities,ev_yield,catch_rate,base_friendship,...,egg_cycles,hp,attack,defense,sp_atk,sp_def,speed,total,evo_path,pokemon_img_url
0,bulbasaur,1,"[grass, poison]",Seed Pokémon,0.7 m (2′04″),6.9 kg (15.2 lbs),"Overgrow, Chlorophyll",1 Sp. Atk,"45 (5.9% with PokéBall, full HP)",50 (normal),...,"20(4,884–5,140 steps)",45,49,49,65,65,45,318,"[Bulbasaur, Ivysaur, Venusaur]",https://img.pokemondb.net/artwork/bulbasaur.jpg
1,ivysaur,2,"[grass, poison]",Seed Pokémon,1.0 m (3′03″),13.0 kg (28.7 lbs),"Overgrow, Chlorophyll","1 Sp. Atk, 1 Sp. Def","45 (5.9% with PokéBall, full HP)",50 (normal),...,"20(4,884–5,140 steps)",60,62,63,80,80,60,405,"[Bulbasaur, Ivysaur, Venusaur]",https://img.pokemondb.net/artwork/ivysaur.jpg
2,venusaur,3,"[grass, poison, grass, poison]",Seed Pokémon,2.0 m (6′07″),100.0 kg (220.5 lbs),"Overgrow, Chlorophyll","2 Sp. Atk, 1 Sp. Def","45 (5.9% with PokéBall, full HP)",50 (normal),...,"20(4,884–5,140 steps)",80,82,83,100,100,80,525,"[Bulbasaur, Ivysaur, Venusaur]",https://img.pokemondb.net/artwork/venusaur.jpg
3,charmander,4,[fire],Lizard Pokémon,0.6 m (2′00″),8.5 kg (18.7 lbs),"Blaze, Solar Power",1 Speed,"45 (5.9% with PokéBall, full HP)",50 (normal),...,"20(4,884–5,140 steps)",39,52,43,60,50,65,309,"[Charmander, Charmeleon, Charizard]",https://img.pokemondb.net/artwork/charmander.jpg
4,charmeleon,5,[fire],Flame Pokémon,1.1 m (3′07″),19.0 kg (41.9 lbs),"Blaze, Solar Power","1 Sp. Atk, 1 Speed","45 (5.9% with PokéBall, full HP)",50 (normal),...,"20(4,884–5,140 steps)",58,64,58,80,65,80,405,"[Charmander, Charmeleon, Charizard]",https://img.pokemondb.net/artwork/charmeleon.jpg


In [581]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   name             7 non-null      object
 1   national_no      7 non-null      object
 2   types            7 non-null      object
 3   species          7 non-null      object
 4   height           7 non-null      object
 5   weight           7 non-null      object
 6   abilities        7 non-null      object
 7   ev_yield         7 non-null      object
 8   catch_rate       7 non-null      object
 9   base_friendship  7 non-null      object
 10  base_exp         7 non-null      object
 11  growth_rate      7 non-null      object
 12  egg_groups       7 non-null      object
 13  gender           7 non-null      object
 14  egg_cycles       7 non-null      object
 15  hp               7 non-null      object
 16  attack           7 non-null      object
 17  defense          7 non-null      object

In [582]:
df.describe

<bound method NDFrame.describe of          name national_no                                       types  \
0   bulbasaur        0001                             [grass, poison]   
1     ivysaur        0002                             [grass, poison]   
2    venusaur        0003              [grass, poison, grass, poison]   
3  charmander        0004                                      [fire]   
4  charmeleon        0005                                      [fire]   
5   charizard        0006  [fire, flying, fire, dragon, fire, flying]   
6    squirtle        0007                                     [water]   

               species         height                weight  \
0         Seed Pokémon  0.7 m (2′04″)     6.9 kg (15.2 lbs)   
1         Seed Pokémon  1.0 m (3′03″)    13.0 kg (28.7 lbs)   
2         Seed Pokémon  2.0 m (6′07″)  100.0 kg (220.5 lbs)   
3       Lizard Pokémon  0.6 m (2′00″)     8.5 kg (18.7 lbs)   
4        Flame Pokémon  1.1 m (3′07″)    19.0 kg (41.9 lbs)   
5  

In [583]:
df.size

168

In [584]:
df.columns

Index(['name', 'national_no', 'types', 'species', 'height', 'weight',
       'abilities', 'ev_yield', 'catch_rate', 'base_friendship', 'base_exp',
       'growth_rate', 'egg_groups', 'gender', 'egg_cycles', 'hp', 'attack',
       'defense', 'sp_atk', 'sp_def', 'speed', 'total', 'evo_path',
       'pokemon_img_url'],
      dtype='object')

## Data Storage

### Save DataFrame as a .pkl file

In [585]:
df.to_pickle('data/pokemon_data.pkl')

## Data Type Conversion Simple

In [586]:
transform_string_to_int = ['national_no', 'hp', 'attack', 'defense', 'sp_atk', 'sp_def', 'speed', 'total']
df[transform_string_to_int] = df[transform_string_to_int].apply(pd.to_numeric, errors='coerce').astype('Int64')

In [587]:
print(df.columns)

Index(['name', 'national_no', 'types', 'species', 'height', 'weight',
       'abilities', 'ev_yield', 'catch_rate', 'base_friendship', 'base_exp',
       'growth_rate', 'egg_groups', 'gender', 'egg_cycles', 'hp', 'attack',
       'defense', 'sp_atk', 'sp_def', 'speed', 'total', 'evo_path',
       'pokemon_img_url'],
      dtype='object')


## Data Type Conversion

In [588]:
# Extract and convert numeric part for height and weight
df['height_m'] = df['height'].str.extract(r'(\d+\.?\d*)')[0]  
df['weight_kg'] = df['weight'].str.extract(r'(\d+\.?\d*)')[0]

In [589]:
# Print extracted values to verify
print("Extracted height values:")
print(df['height_m'].head())
print("\nExtracted weight values:")
print(df['weight_kg'].head())

Extracted height values:
0    0.7
1    1.0
2    2.0
3    0.6
4    1.1
Name: height_m, dtype: object

Extracted weight values:
0      6.9
1     13.0
2    100.0
3      8.5
4     19.0
Name: weight_kg, dtype: object


### Convert to numeric and check for NaN values

In [590]:
df['height_m'] = pd.to_numeric(df['height_m'], errors='coerce') 
df['weight_kg'] = pd.to_numeric(df['weight_kg'], errors='coerce')

In [591]:
# Print converted values to verify
print("Extracted height values:")
print(df['height_m'].head())
print("\nExtracted weight values:")
print(df['weight_kg'].head())

Extracted height values:
0    0.7
1    1.0
2    2.0
3    0.6
4    1.1
Name: height_m, dtype: float64

Extracted weight values:
0      6.9
1     13.0
2    100.0
3      8.5
4     19.0
Name: weight_kg, dtype: float64


### Convert attributes catch_rate, gender, egg_cycles into numbers and strip non numerical characters

In [592]:
df['catch_rate'] = df['catch_rate'].str.extract(r'(\d+)')[0].astype(float)
df['egg_cycles'] = pd.to_numeric(df['egg_cycles'].str.extract(r'(\d+)')[0], errors='coerce')

### Distill base_friendship attribute to its essential numeric value 

In [593]:
df['base_friendship'] = df['base_friendship'].str.extract(r'(\d+)').astype(float)
df['base_exp'] = df['base_exp'].astype(int)

## Verify NaN values

In [594]:
print(df.isna().sum())

name               0
national_no        0
types              0
species            0
height             0
weight             0
abilities          0
ev_yield           0
catch_rate         0
base_friendship    0
base_exp           0
growth_rate        0
egg_groups         0
gender             0
egg_cycles         0
hp                 0
attack             0
defense            0
sp_atk             0
sp_def             0
speed              0
total              0
evo_path           0
pokemon_img_url    0
height_m           0
weight_kg          0
dtype: int64


In [595]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             7 non-null      object 
 1   national_no      7 non-null      Int64  
 2   types            7 non-null      object 
 3   species          7 non-null      object 
 4   height           7 non-null      object 
 5   weight           7 non-null      object 
 6   abilities        7 non-null      object 
 7   ev_yield         7 non-null      object 
 8   catch_rate       7 non-null      float64
 9   base_friendship  7 non-null      float64
 10  base_exp         7 non-null      int32  
 11  growth_rate      7 non-null      object 
 12  egg_groups       7 non-null      object 
 13  gender           7 non-null      object 
 14  egg_cycles       7 non-null      int64  
 15  hp               7 non-null      Int64  
 16  attack           7 non-null      Int64  
 17  defense          7 n

## Column Splitting - Types

### Drop types, height and weight  column post-split

In [596]:
df.drop(columns=['types', 'height', 'weight'], inplace=True)

In [597]:
df.columns

Index(['name', 'national_no', 'species', 'abilities', 'ev_yield', 'catch_rate',
       'base_friendship', 'base_exp', 'growth_rate', 'egg_groups', 'gender',
       'egg_cycles', 'hp', 'attack', 'defense', 'sp_atk', 'sp_def', 'speed',
       'total', 'evo_path', 'pokemon_img_url', 'height_m', 'weight_kg'],
      dtype='object')

### Split gender field into male_percentage and female_percentage and convert to numeric values

In [598]:
# Extract gender distribution
df['male_percentage'] = df['gender'].str.extract(r'(\d+\.\d+)% male')[0].astype(float)
df['female_percentage'] = df['gender'].str.extract(r'(\d+\.\d+)% female')[0].astype(float)

In [599]:
df['male_percentage'].fillna(0, inplace=True)
df['female_percentage'].fillna(0, inplace=True)
print(df[['gender', 'male_percentage', 'female_percentage']].head())

                     gender  male_percentage  female_percentage
0  87.5% male, 12.5% female             87.5               12.5
1  87.5% male, 12.5% female             87.5               12.5
2  87.5% male, 12.5% female             87.5               12.5
3  87.5% male, 12.5% female             87.5               12.5
4  87.5% male, 12.5% female             87.5               12.5


In [600]:
df.columns

Index(['name', 'national_no', 'species', 'abilities', 'ev_yield', 'catch_rate',
       'base_friendship', 'base_exp', 'growth_rate', 'egg_groups', 'gender',
       'egg_cycles', 'hp', 'attack', 'defense', 'sp_atk', 'sp_def', 'speed',
       'total', 'evo_path', 'pokemon_img_url', 'height_m', 'weight_kg',
       'male_percentage', 'female_percentage'],
      dtype='object')

### Drop gender column as it was replaced with male_percentage and female_percentage

In [603]:
df.drop(columns=['gender'], inplace=True)

KeyError: "['gender'] not found in axis"

## Generation Identification

In [604]:
df = df.sort_values(by='national_no')

### Add a new column called gen to the DataFrame, indicating the generation in which the Pokémon first appeared

In [605]:
# Define generation boundaries
bins = [0, 151, 251, 386, 493, 649, 721, 809, 898]
labels = [1, 2, 3, 4, 5, 6, 7, 8]

# Create the generation column
df['gen'] = pd.cut(df['national_no'], bins=bins, labels=labels, right=True)
print(df[['name', 'national_no', 'gen']])

         name  national_no gen
0   bulbasaur            1   1
1     ivysaur            2   1
2    venusaur            3   1
3  charmander            4   1
4  charmeleon            5   1
5   charizard            6   1
6    squirtle            7   1


## Column Splitting - Abilities

## Separate the abilities column into primary_ability and secondary_ability

In [609]:
df.columns

Index(['name', 'national_no', 'species', 'abilities', 'ev_yield', 'catch_rate',
       'base_friendship', 'base_exp', 'growth_rate', 'egg_groups',
       'egg_cycles', 'hp', 'attack', 'defense', 'sp_atk', 'sp_def', 'speed',
       'total', 'evo_path', 'pokemon_img_url', 'height_m', 'weight_kg',
       'male_percentage', 'female_percentage', 'gen'],
      dtype='object')

In [610]:
abilities_split = df['abilities'].str.split(', ', expand=True)
df['primary_ability'] = abilities_split[0]
df['secondary_ability'] = abilities_split[1]

## Handle secondary slot if only one ability is present 

In [611]:
df['secondary_ability'].fillna('None', inplace=True)

In [612]:
print(df[['name', 'national_no', 'primary_ability', 'secondary_ability']])

         name  national_no primary_ability secondary_ability
0   bulbasaur            1        Overgrow       Chlorophyll
1     ivysaur            2        Overgrow       Chlorophyll
2    venusaur            3        Overgrow       Chlorophyll
3  charmander            4           Blaze       Solar Power
4  charmeleon            5           Blaze       Solar Power
5   charizard            6           Blaze       Solar Power
6    squirtle            7         Torrent         Rain Dish


### Drop abilities column after replacting it with primary_ability and secondary_ability

In [613]:
df.drop(columns=['abilities'], inplace=True)

In [614]:
df.columns

Index(['name', 'national_no', 'species', 'ev_yield', 'catch_rate',
       'base_friendship', 'base_exp', 'growth_rate', 'egg_groups',
       'egg_cycles', 'hp', 'attack', 'defense', 'sp_atk', 'sp_def', 'speed',
       'total', 'evo_path', 'pokemon_img_url', 'height_m', 'weight_kg',
       'male_percentage', 'female_percentage', 'gen', 'primary_ability',
       'secondary_ability'],
      dtype='object')

## Data Cleansing

### Store the cleaned DataFrame to a new pickle file for persistent storage

In [615]:
df.to_pickle('data/final_cleaned_pokemon_data.pkl')

## Evaluation

### Use .info() method to check the datatype for different attribute columns.

In [616]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   name               7 non-null      object  
 1   national_no        7 non-null      Int64   
 2   species            7 non-null      object  
 3   ev_yield           7 non-null      object  
 4   catch_rate         7 non-null      float64 
 5   base_friendship    7 non-null      float64 
 6   base_exp           7 non-null      int32   
 7   growth_rate        7 non-null      object  
 8   egg_groups         7 non-null      object  
 9   egg_cycles         7 non-null      int64   
 10  hp                 7 non-null      Int64   
 11  attack             7 non-null      Int64   
 12  defense            7 non-null      Int64   
 13  sp_atk             7 non-null      Int64   
 14  sp_def             7 non-null      Int64   
 15  speed              7 non-null      Int64   
 16  total       

### Use the .unique() method on categorical columns to identify and rectify any typographical errors within the attributes

In [619]:
# Define all categorical columns
categorical_columns = ['species', 'growth_rate', 'egg_groups', 'evo_path', 'primary_ability', 'secondary_ability']
# categorical_columns += ['moves']

In [620]:
# Handle colums that cause "TypeError: unhashable type: 'list'" error
df['egg_groups'] = df['egg_groups'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
df['evo_path'] = df['evo_path'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
#df['moves'] = df['moves'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

In [574]:
print("Unique Values")
for column in categorical_columns:
    unique_value = df[column].unique()
    print(f"'{column}': {unique_value}\n")

Unique Values
'species': ['Seed Pokémon' 'Lizard Pokémon' 'Flame Pokémon' 'Tiny Turtle Pokémon'
 'Turtle Pokémon' 'Shellfish Pokémon' 'Worm Pokémon' 'Cocoon Pokémon'
 'Butterfly Pokémon' 'Hairy Bug Pokémon' 'Poison Bee Pokémon'
 'Tiny Bird Pokémon' 'Bird Pokémon' 'Mouse Pokémon' 'Beak Pokémon'
 'Snake Pokémon' 'Cobra Pokémon' 'Poison Pin Pokémon' 'Drill Pokémon'
 'Fairy Pokémon' 'Fox Pokémon']

'growth_rate': ['Medium Slow' 'Medium Fast' 'Fast']

'egg_groups': ['Grass, Monster' 'Dragon, Monster' 'Monster, Water 1' 'Bug' 'Flying'
 'Field' 'Dragon, Field' 'Fairy, Field' 'Field, Monster' 'Undiscovered'
 'Fairy']

'evo_path': ['Bulbasaur, Ivysaur, Venusaur' 'Charmander, Charmeleon, Charizard'
 'Squirtle, Wartortle, Blastoise' 'Caterpie, Metapod, Butterfree'
 'Weedle, Kakuna, Beedrill' 'Pidgey, Pidgeotto, Pidgeot'
 'Rattata, Raticate, Rattata, Raticate' 'Spearow, Fearow' 'Ekans, Arbok'
 'Pichu, Pikachu, Raichu, Raichu'
 'Sandshrew, Sandslash, Sandshrew, Sandslash'
 'Nidoran♀, Nidorina, Nido