# Data Cleaning

In [267]:
# Import packages
import pandas as pd
import numpy as np
import seaborn as sns

## EXP Types

	•	Several columns had an incorrect object data type due to the presence of the string '-', which was used to represent missing values.
	•	All instances of '-' were replaced with 0.
	•	After replacing the missing values, the affected columns were converted to the correct int64 data type.

In [102]:
# Load the raw experience table. The first two lines of the file are skipped
# and explicit column names are supplied; 'Level' becomes the row index.
growth_groups = ['Erratic', 'Fast', 'Medium_Fast', 'Medium_Slow', 'Slow', 'Fluctuating']
exp_type_columns = (
    [f'{group}_total_xp' for group in growth_groups]
    + ['Level']
    + [f'{group}_next_lvl' for group in growth_groups]
)
exp_types_df = pd.read_csv(
    '../data/raw/exp_types.csv',
    skiprows=2,
    names=exp_type_columns,
    index_col='Level',
)
                         

In [104]:
exp_types_df.head()

Unnamed: 0_level_0,Erratic_total_xp,Fast_total_xp,Medium_Fast_total_xp,Medium_Slow_total_xp,Slow_total_xp,Fluctuating_total_xp,Erratic_next_lvl,Fast_next_lvl,Medium_Fast_next_lvl,Medium_Slow_next_lvl,Slow_next_lvl,Fluctuating_next_lvl
Level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,0,0,0,0,0,15,6,8,9,10,4
2,15,6,8,9,10,4,37,15,19,48,23,9
3,52,21,27,57,33,13,70,30,37,39,47,19
4,122,51,64,96,80,32,115,49,61,39,76,33
5,237,100,125,135,156,65,169,72,91,44,114,47


In [26]:
exp_types_df.tail()

Unnamed: 0_level_0,Erratic_total_xp,Fast_total_xp,Medium_Fast_total_xp,Medium_Slow_total_xp,Slow_total_xp,Fluctuating_total_xp,Erratic_next_lvl,Fast_next_lvl,Medium_Fast_next_lvl,Medium_Slow_next_lvl,Slow_next_lvl,Fluctuating_next_lvl
Level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
96,560922,707788,884736,932903,1105920,1415577,10411,22350,27937,30729,34921,44699
97,571333,730138,912673,963632,1140841,1460276,12206,22815,28519,31398,35649,64455
98,583539,752953,941192,995030,1176490,1524731,8343,23286,29107,32073,36383,47153
99,591882,776239,970299,1027103,1212873,1571884,8118,23761,29701,32757,37127,68116
100,600000,800000,1000000,1059860,1250000,1640000,-,-,-,-,-,-


In [106]:
exp_types_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 1 to 100
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Erratic_total_xp      100 non-null    int64 
 1   Fast_total_xp         100 non-null    int64 
 2   Medium_Fast_total_xp  100 non-null    int64 
 3   Medium_Slow_total_xp  100 non-null    int64 
 4   Slow_total_xp         100 non-null    int64 
 5   Fluctuating_total_xp  100 non-null    int64 
 6   Erratic_next_lvl      100 non-null    object
 7   Fast_next_lvl         100 non-null    object
 8   Medium_Fast_next_lvl  100 non-null    object
 9   Medium_Slow_next_lvl  100 non-null    object
 10  Slow_next_lvl         100 non-null    object
 11  Fluctuating_next_lvl  100 non-null    object
dtypes: int64(6), object(6)
memory usage: 10.2+ KB


In [108]:
# The '*_next_lvl' columns load as object dtype because the raw file uses the
# string '-' as a placeholder (visible in the level-100 row of the tail above).
cols_to_convert = [
    'Erratic_next_lvl', 'Fast_next_lvl', 'Medium_Fast_next_lvl',
    'Medium_Slow_next_lvl', 'Slow_next_lvl', 'Fluctuating_next_lvl'
]
# Swap the placeholder for 0, then cast with an explicit width: plain `int`
# follows the platform's default integer size (32-bit on some Windows/NumPy
# builds), whereas 'int64' yields the documented dtype everywhere.
exp_types_df[cols_to_convert] = (
    exp_types_df[cols_to_convert].replace('-', 0).astype('int64')
)
exp_types_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 1 to 100
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   Erratic_total_xp      100 non-null    int64
 1   Fast_total_xp         100 non-null    int64
 2   Medium_Fast_total_xp  100 non-null    int64
 3   Medium_Slow_total_xp  100 non-null    int64
 4   Slow_total_xp         100 non-null    int64
 5   Fluctuating_total_xp  100 non-null    int64
 6   Erratic_next_lvl      100 non-null    int64
 7   Fast_next_lvl         100 non-null    int64
 8   Medium_Fast_next_lvl  100 non-null    int64
 9   Medium_Slow_next_lvl  100 non-null    int64
 10  Slow_next_lvl         100 non-null    int64
 11  Fluctuating_next_lvl  100 non-null    int64
dtypes: int64(12)
memory usage: 10.2 KB


## Trainers

	•	Identified two columns, number.of.pokemon and total_EXP, that contained invalid 0 values.
	•	To correct number.of.pokemon, the count of non-null Pokémon names across the pokemon1 through pokemon6 columns was used.
	•	To correct total_EXP, the sum of the exp_pokemon1 through exp_pokemon6 columns was calculated. However, some entries had all experience values as 0, indicating invalid data.
	•	A total of 8 rows (less than 1% of the dataset) had invalid experience values and were dropped.
	•	Additionally, inconsistencies were found between Pokémon and their corresponding level columns. In cases where a pokemon{i} existed but level_pokemon{i} was missing:
	•	For pokemon2 through pokemon6, the missing level was filled using the rounded average of the levels already available for the same trainer.
	•	If level_pokemon1 was missing, the missing value was filled using the rounded average level_pokemon1 across trainers within the same gym.section.
	•	If no average could be computed in either case, a default level of 5 was used.

In [118]:
trainers_df = pd.read_csv('../data/raw/trainers.csv', index_col=0)
trainers_df.head()

Unnamed: 0,trainer.name,number.of.pokemon,exp_pokemon1,exp_pokemon2,exp_pokemon3,exp_pokemon4,exp_pokemon5,exp_pokemon6,x,pokemon1,...,pokemon6,level_pokemon1,level_pokemon2,level_pokemon3,level_pokemon4,level_pokemon5,level_pokemon6,max.level,gym.section,total_EXP
1,Youngster Tristan,1,60,0,0,0,0,0,1,Starly,...,,5.0,,,,,,5.0,0,60
2,Youngster Logan,1,65,0,0,0,0,0,2,Burmy,...,,5.0,,,,,,5.0,0,65
3,Lass Natalie,1,62,0,0,0,0,0,3,Bidoof,...,,5.0,,,,,,5.0,0,62
4,Youngster Michael,2,81,69,0,0,0,0,4,Kricketot,...,,7.0,6.0,,,,,7.0,0,150
5,Camper,1,61,0,0,0,0,0,5,Rattata,...,,5.0,,,,,,5.0,0,61


In [82]:
trainers_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 927 entries, 1 to 927
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   trainer.name       927 non-null    object 
 1   number.of.pokemon  927 non-null    int64  
 2   exp_pokemon1       927 non-null    int64  
 3   exp_pokemon2       927 non-null    int64  
 4   exp_pokemon3       927 non-null    int64  
 5   exp_pokemon4       927 non-null    int64  
 6   exp_pokemon5       927 non-null    int64  
 7   exp_pokemon6       927 non-null    int64  
 8   x                  927 non-null    int64  
 9   pokemon1           927 non-null    object 
 10  pokemon2           527 non-null    object 
 11  pokemon3           270 non-null    object 
 12  pokemon4           81 non-null     object 
 13  pokemon5           50 non-null     object 
 14  pokemon6           23 non-null     object 
 15  level_pokemon1     922 non-null    float64
 16  level_pokemon2     523 non-null

In [84]:
trainers_df.describe()

Unnamed: 0,number.of.pokemon,exp_pokemon1,exp_pokemon2,exp_pokemon3,exp_pokemon4,exp_pokemon5,exp_pokemon6,x,level_pokemon1,level_pokemon2,level_pokemon3,level_pokemon4,level_pokemon5,level_pokemon6,max.level,gym.section,total_EXP
count,927.0,927.0,927.0,927.0,927.0,927.0,927.0,927.0,922.0,523.0,254.0,79.0,29.0,12.0,927.0,927.0,927.0
mean,1.743258,852.724919,715.385113,417.587918,161.554477,57.559871,23.612729,464.0,30.812364,39.900574,44.031496,48.936709,53.965517,45.25,-inf,4.234088,2228.425027
std,1.135951,774.879835,870.259022,811.523978,588.779316,348.690182,224.890033,267.746148,19.132844,15.853184,15.256664,16.69895,18.022769,13.328882,,3.137628,2702.401093
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,3.0,3.0,4.0,14.0,16.0,-inf,0.0,0.0
25%,1.0,113.0,0.0,0.0,0.0,0.0,0.0,232.5,13.0,27.0,32.0,36.0,40.0,44.0,13.0,1.0,205.5
50%,2.0,619.0,313.0,0.0,0.0,0.0,0.0,464.0,32.0,41.0,44.0,51.0,54.0,51.0,32.0,4.0,1336.0
75%,2.0,1431.0,1367.0,426.0,0.0,0.0,0.0,695.5,45.0,54.0,57.0,63.0,66.0,53.75,46.0,8.0,3126.5
max,6.0,2985.0,3453.0,3583.0,3453.0,3378.0,2520.0,927.0,81.0,79.0,81.0,79.0,83.0,56.0,83.0,8.0,16182.0


In [88]:
trainers_df['number.of.pokemon'].unique()

array([1, 2, 3, 6, 4, 0, 5])

In [128]:
trainers_df[trainers_df['number.of.pokemon']==0]

Unnamed: 0,trainer.name,number.of.pokemon,exp_pokemon1,exp_pokemon2,exp_pokemon3,exp_pokemon4,exp_pokemon5,exp_pokemon6,x,pokemon1,...,pokemon6,level_pokemon1,level_pokemon2,level_pokemon3,level_pokemon4,level_pokemon5,level_pokemon6,max.level,gym.section,total_EXP
82,Rich Boy Jason,0,827,0,0,0,0,0,82,Prinplup,...,,27.0,,,,,,27.0,3,827
83,Lady Melissa,0,393,0,0,0,0,0,83,Cherubi,...,,27.0,,,,,,27.0,3,393
84,Gentleman Jeremy,0,619,0,0,0,0,0,84,Chatot,...,,27.0,,,,,,27.0,3,619
85,Socialite Reina,0,879,0,0,0,0,0,85,Roselia,...,,27.0,,,,,,27.0,3,879
99,Scientist Stefano,0,932,0,0,0,0,0,99,Kadabra,...,,30.0,,,,,,30.0,4,932
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
892,Rich Boy Liam,0,0,0,0,0,0,0,892,Blissey,...,,,,,,,,-inf,0,0
893,Lady Celeste,0,0,0,0,0,0,0,893,Blissey,...,,,,,,,,-inf,0,0
913,Galactic Boss Cyrus,0,962,1246,0,0,0,0,913,Sneasel,...,,34.0,34.0,,,,,34.0,5,2208
921,Leader Volkner,0,2328,2364,0,0,0,0,921,Luxray,...,,56.0,56.0,,,,,56.0,8,4692


In [130]:
# number.of.pokemon is 0 for 88 entries, so rebuild it as the number of
# non-null Pokémon names in each row.
pokemon_cols = [f'pokemon{slot}' for slot in range(1, 7)]
trainers_df['number.of.pokemon'] = trainers_df[pokemon_cols].count(axis=1)

# total_EXP is 0 for 8 entries, so rebuild it as the sum of the individual
# Pokémon EXP columns.
exp_cols = [f'exp_pokemon{slot}' for slot in range(1, 7)]
trainers_df['total_EXP'] = trainers_df[exp_cols].sum(axis='columns')

trainers_df.describe()

Unnamed: 0,number.of.pokemon,exp_pokemon1,exp_pokemon2,exp_pokemon3,exp_pokemon4,exp_pokemon5,exp_pokemon6,x,level_pokemon1,level_pokemon2,level_pokemon3,level_pokemon4,level_pokemon5,level_pokemon6,max.level,gym.section,total_EXP
count,927.0,927.0,927.0,927.0,927.0,927.0,927.0,927.0,922.0,523.0,254.0,79.0,29.0,12.0,927.0,927.0,927.0
mean,2.02589,852.724919,715.385113,417.587918,161.554477,57.559871,23.612729,464.0,30.812364,39.900574,44.031496,48.936709,53.965517,45.25,-inf,4.234088,2228.425027
std,1.195464,774.879835,870.259022,811.523978,588.779316,348.690182,224.890033,267.746148,19.132844,15.853184,15.256664,16.69895,18.022769,13.328882,,3.137628,2702.401093
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,3.0,3.0,4.0,14.0,16.0,-inf,0.0,0.0
25%,1.0,113.0,0.0,0.0,0.0,0.0,0.0,232.5,13.0,27.0,32.0,36.0,40.0,44.0,13.0,1.0,205.5
50%,2.0,619.0,313.0,0.0,0.0,0.0,0.0,464.0,32.0,41.0,44.0,51.0,54.0,51.0,32.0,4.0,1336.0
75%,3.0,1431.0,1367.0,426.0,0.0,0.0,0.0,695.5,45.0,54.0,57.0,63.0,66.0,53.75,46.0,8.0,3126.5
max,6.0,2985.0,3453.0,3583.0,3453.0,3378.0,2520.0,927.0,81.0,79.0,81.0,79.0,83.0,56.0,83.0,8.0,16182.0


In [136]:
# Recomputing total_EXP did not repair the 8 entries that were 0, so they are
# dropped from the dataset (less than 1% of all entries).
# .copy() makes the result an independent frame: later cells assign into it,
# and writing into a filtered view triggers SettingWithCopyWarning.
trainers_df = trainers_df[trainers_df['total_EXP'] > 0].copy()
trainers_df.describe()

Unnamed: 0,number.of.pokemon,exp_pokemon1,exp_pokemon2,exp_pokemon3,exp_pokemon4,exp_pokemon5,exp_pokemon6,x,level_pokemon1,level_pokemon2,level_pokemon3,level_pokemon4,level_pokemon5,level_pokemon6,max.level,gym.section,total_EXP
count,919.0,919.0,919.0,919.0,919.0,919.0,919.0,919.0,919.0,522.0,253.0,79.0,29.0,12.0,919.0,919.0,919.0
mean,2.02938,860.147987,721.612622,421.223069,162.960827,58.060936,23.818281,462.355822,30.855277,39.931034,44.110672,48.936709,53.965517,45.25,31.554951,4.264418,2247.823721
std,1.197404,774.131416,871.465449,814.111462,591.145215,350.164649,225.85697,267.767191,19.143471,15.853063,15.234529,16.69895,18.022769,13.328882,19.560892,3.131947,2706.097148
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,3.0,3.0,4.0,14.0,16.0,3.0,0.0,34.0
25%,1.0,130.0,0.0,0.0,0.0,0.0,0.0,230.5,13.0,27.0,32.0,36.0,40.0,44.0,13.0,1.0,232.0
50%,2.0,621.0,334.0,0.0,0.0,0.0,0.0,461.0,32.0,41.0,44.0,51.0,54.0,51.0,33.0,5.0,1369.0
75%,3.0,1439.0,1372.0,466.0,0.0,0.0,0.0,694.5,45.0,54.0,57.0,63.0,66.0,53.75,47.0,8.0,3169.0
max,6.0,2985.0,3453.0,3583.0,3453.0,3378.0,2520.0,927.0,81.0,79.0,81.0,79.0,83.0,56.0,83.0,8.0,16182.0


In [213]:
# Inspect a trainer whose pokemon2 is named but whose level_pokemon2 is missing
mask2 = trainers_df['level_pokemon2'].isna() & trainers_df['pokemon2'].notna()
trainers_df.loc[mask2].iloc[1]

trainer.name         Commander Mars
number.of.pokemon                 2
exp_pokemon1                    174
exp_pokemon2                      0
exp_pokemon3                      0
exp_pokemon4                      0
exp_pokemon5                      0
exp_pokemon6                      0
x                               295
pokemon1                      Zubat
pokemon2                    Purugly
pokemon3                        NaN
pokemon4                        NaN
pokemon5                        NaN
pokemon6                        NaN
level_pokemon1                 15.0
level_pokemon2                  NaN
level_pokemon3                  NaN
level_pokemon4                  NaN
level_pokemon5                  NaN
level_pokemon6                  NaN
max.level                      15.0
gym.section                       1
total_EXP                       174
Name: 295, dtype: object

In [229]:
# Inspect a trainer whose pokemon3 is named but whose level_pokemon3 is missing
mask3 = trainers_df['level_pokemon3'].isna() & trainers_df['pokemon3'].notna()
mask3.sum()  # not displayed: only the cell's last expression is shown
trainers_df.loc[mask3].iloc[1]

trainer.name         Leader Gardenia
number.of.pokemon                  3
exp_pokemon1                     274
exp_pokemon2                     570
exp_pokemon3                       0
exp_pokemon4                       0
exp_pokemon5                       0
exp_pokemon6                       0
x                                315
pokemon1                     Turtwig
pokemon2                     Cherrim
pokemon3                    Roserade
pokemon4                         NaN
pokemon5                         NaN
pokemon6                         NaN
level_pokemon1                  20.0
level_pokemon2                  20.0
level_pokemon3                   NaN
level_pokemon4                   NaN
level_pokemon5                   NaN
level_pokemon6                   NaN
max.level                       20.0
gym.section                        2
total_EXP                        844
Name: 315, dtype: object

In [231]:
# Inspect a trainer whose pokemon4 is named but whose level_pokemon4 is missing
mask4 = trainers_df['level_pokemon4'].isna() & trainers_df['pokemon4'].notna()
mask4.sum()  # not displayed: only the cell's last expression is shown
trainers_df.loc[mask4].iloc[1]

trainer.name         Leader Volkner
number.of.pokemon                 4
exp_pokemon1                   1942
exp_pokemon2                   1203
exp_pokemon3                   1995
exp_pokemon4                      0
exp_pokemon5                      0
exp_pokemon6                      0
x                               320
pokemon1                    Jolteon
pokemon2                     Raichu
pokemon3                     Luxray
pokemon4                 Electivire
pokemon5                        NaN
pokemon6                        NaN
level_pokemon1                 46.0
level_pokemon2                 46.0
level_pokemon3                 48.0
level_pokemon4                  NaN
level_pokemon5                  NaN
level_pokemon6                  NaN
max.level                      48.0
gym.section                       8
total_EXP                      5140
Name: 320, dtype: object

In [233]:
# Inspect a trainer whose pokemon5 is named but whose level_pokemon5 is missing
mask5 = trainers_df['level_pokemon5'].isna() & trainers_df['pokemon5'].notna()
mask5.sum()  # not displayed: only the cell's last expression is shown
trainers_df.loc[mask5].iloc[1]

trainer.name         Elite Four Bertha
number.of.pokemon                    5
exp_pokemon1                      1693
exp_pokemon2                      2181
exp_pokemon3                      2206
exp_pokemon4                      1972
exp_pokemon5                         0
exp_pokemon6                         0
x                                  262
pokemon1                      Whiscash
pokemon2                       Gliscor
pokemon3                     Hippowdon
pokemon4                         Golem
pokemon5                     Rhyperior
pokemon6                           NaN
level_pokemon1                    50.0
level_pokemon2                    53.0
level_pokemon3                    52.0
level_pokemon4                    52.0
level_pokemon5                     NaN
level_pokemon6                     NaN
max.level                         53.0
gym.section                          8
total_EXP                         8052
Name: 262, dtype: object

In [235]:
# Inspect a trainer whose pokemon6 is named but whose level_pokemon6 is missing
mask6 = trainers_df['level_pokemon6'].isna() & trainers_df['pokemon6'].notna()
mask6.sum()  # not displayed: only the cell's last expression is shown
trainers_df.loc[mask6].iloc[1]

trainer.name             Rival
number.of.pokemon            6
exp_pokemon1              2248
exp_pokemon2              2250
exp_pokemon3              2614
exp_pokemon4              2427
exp_pokemon5              2079
exp_pokemon6                 0
x                          837
pokemon1             Staraptor
pokemon2              Floatzel
pokemon3             Heracross
pokemon4              Rapidash
pokemon5               Snorlax
pokemon6              Torterra
level_pokemon1            61.0
level_pokemon2            59.0
level_pokemon3            61.0
level_pokemon4            59.0
level_pokemon5            63.0
level_pokemon6             NaN
max.level                 63.0
gym.section                  8
total_EXP                11618
Name: 837, dtype: object

In [237]:
def fix_missing_levels(df, default_level=5):
    """Fill in levels for Pokémon that are named but have no recorded level.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'gym.section' plus the columns 'pokemon1'..'pokemon6'
        and 'level_pokemon1'..'level_pokemon6'.
    default_level : int, optional
        Level used when no average can be computed (default 5).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the gaps filled. The input frame is not modified.

    Notes
    -----
    * Slot 1 is filled with the rounded mean of the observed
      ``level_pokemon1`` values of trainers in the same ``gym.section``.
      The mean is taken before anything is filled, so the result does not
      depend on row order.
    * Slots 2-6 are filled, in that order, with the rounded mean of the levels
      already present in the same row (including values filled for earlier
      slots).
    * Levels belonging to slots without a Pokémon are left as NaN.
    """
    level_cols = [f'level_pokemon{slot}' for slot in range(1, 7)]

    # Work on a copy so the caller's frame (possibly a filtered view of a
    # larger frame) is never written to.
    fixed = df.copy()

    # Slot 1: there may be no other level in the row, so use the gym section.
    lead_missing = fixed['pokemon1'].notna() & fixed['level_pokemon1'].isna()
    if lead_missing.any():
        section_mean = fixed.groupby('gym.section')['level_pokemon1'].transform('mean')
        # to_numpy() assigns positionally, which stays safe with repeated index labels.
        fixed.loc[lead_missing, 'level_pokemon1'] = (
            section_mean[lead_missing].round().fillna(default_level).to_numpy()
        )

    # Slots 2-6: average whatever levels the same trainer already has.
    for slot in range(2, 7):
        level_col = f'level_pokemon{slot}'
        missing = fixed[f'pokemon{slot}'].notna() & fixed[level_col].isna()
        if not missing.any():
            continue
        row_mean = fixed.loc[missing, level_cols].mean(axis=1)  # NaNs are skipped
        fixed.loc[missing, level_col] = (
            row_mean.round().fillna(default_level).to_numpy()
        )

    return fixed

# Apply it
trainers_df = fix_missing_levels(trainers_df)

In [239]:
trainers_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 919 entries, 1 to 927
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   trainer.name       919 non-null    object 
 1   number.of.pokemon  919 non-null    int64  
 2   exp_pokemon1       919 non-null    int64  
 3   exp_pokemon2       919 non-null    int64  
 4   exp_pokemon3       919 non-null    int64  
 5   exp_pokemon4       919 non-null    int64  
 6   exp_pokemon5       919 non-null    int64  
 7   exp_pokemon6       919 non-null    int64  
 8   x                  919 non-null    int64  
 9   pokemon1           919 non-null    object 
 10  pokemon2           524 non-null    object 
 11  pokemon3           268 non-null    object 
 12  pokemon4           81 non-null     object 
 13  pokemon5           50 non-null     object 
 14  pokemon6           23 non-null     object 
 15  level_pokemon1     919 non-null    float64
 16  level_pokemon2     524 non-null

In [263]:
# Remove fully duplicated trainer rows. The name is rebound instead of using
# inplace=True, so a frame produced by earlier filtering is never mutated.
trainers_df = trainers_df.drop_duplicates()
trainers_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 919 entries, 1 to 927
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   trainer.name       919 non-null    object 
 1   number.of.pokemon  919 non-null    int64  
 2   exp_pokemon1       919 non-null    int64  
 3   exp_pokemon2       919 non-null    int64  
 4   exp_pokemon3       919 non-null    int64  
 5   exp_pokemon4       919 non-null    int64  
 6   exp_pokemon5       919 non-null    int64  
 7   exp_pokemon6       919 non-null    int64  
 8   x                  919 non-null    int64  
 9   pokemon1           919 non-null    object 
 10  pokemon2           524 non-null    object 
 11  pokemon3           268 non-null    object 
 12  pokemon4           81 non-null     object 
 13  pokemon5           50 non-null     object 
 14  pokemon6           23 non-null     object 
 15  level_pokemon1     919 non-null    float64
 16  level_pokemon2     524 non-null

## Pokemon Summary

	•	The capture_rate column had an unexpected object data type due to one malformed entry: '30 (Meteorite)255 (Core)'. This value was cleaned by extracting the core value (255) and converting the entire column to int.
	•	The columns height_m, percentage_male, and weight_kg contained null values and were dropped, as they were not relevant to the planned analysis.
	•	The type2 column also contained a significant number of null values. Since the absence of a secondary type is valid, these nulls were replaced with the string 'None'.

In [152]:
poke_sum_df = pd.read_csv('../data/raw/pokemon_summary.csv', header=0, index_col='pokedex_number')
poke_sum_df.head()

Unnamed: 0_level_0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,name,percentage_male,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
pokedex_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,Bulbasaur,88.1,65,65,45,grass,poison,6.9,1,0
2,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,Ivysaur,88.1,80,80,60,grass,poison,13.0,1,0
3,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,Venusaur,88.1,122,120,80,grass,poison,100.0,1,0
4,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,Charmander,88.1,60,50,65,fire,,8.5,1,0
5,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,Charmeleon,88.1,80,65,80,fire,,19.0,1,0


In [154]:
poke_sum_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 801 entries, 1 to 801
Data columns (total 40 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   abilities          801 non-null    object 
 1   against_bug        801 non-null    float64
 2   against_dark       801 non-null    float64
 3   against_dragon     801 non-null    float64
 4   against_electric   801 non-null    float64
 5   against_fairy      801 non-null    float64
 6   against_fight      801 non-null    float64
 7   against_fire       801 non-null    float64
 8   against_flying     801 non-null    float64
 9   against_ghost      801 non-null    float64
 10  against_grass      801 non-null    float64
 11  against_ground     801 non-null    float64
 12  against_ice        801 non-null    float64
 13  against_normal     801 non-null    float64
 14  against_poison     801 non-null    float64
 15  against_psychic    801 non-null    float64
 16  against_rock       801 non-null

In [148]:
poke_sum_df.describe()

Unnamed: 0,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,against_grass,...,experience_growth,height_m,hp,percentage_male,sp_attack,sp_defense,speed,weight_kg,generation,is_legendary
count,801.0,801.0,801.0,801.0,801.0,801.0,801.0,801.0,801.0,801.0,...,801.0,781.0,801.0,703.0,801.0,801.0,801.0,781.0,801.0,801.0
mean,0.996255,1.057116,0.968789,1.07397,1.068976,1.065543,1.135456,1.192884,0.985019,1.03402,...,1054996.0,1.163892,68.958801,55.155761,71.305868,70.911361,66.334582,61.378105,3.690387,0.087391
std,0.597248,0.438142,0.353058,0.654962,0.522167,0.717251,0.691853,0.604488,0.558256,0.788896,...,160255.8,1.080326,26.576015,20.261623,32.353826,27.942501,28.907662,109.354766,1.93042,0.282583
min,0.25,0.25,0.0,0.0,0.25,0.0,0.25,0.25,0.0,0.25,...,600000.0,0.1,1.0,0.0,10.0,20.0,5.0,0.1,1.0,0.0
25%,0.5,1.0,1.0,0.5,1.0,0.5,0.5,1.0,1.0,0.5,...,1000000.0,0.6,50.0,50.0,45.0,50.0,45.0,9.0,2.0,0.0
50%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1000000.0,1.0,65.0,50.0,65.0,66.0,65.0,27.3,4.0,0.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,...,1059860.0,1.5,80.0,50.0,91.0,90.0,85.0,64.8,5.0,0.0
max,4.0,4.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,1640000.0,14.5,255.0,100.0,194.0,230.0,180.0,999.9,7.0,1.0


In [158]:
# Drop columns with missing data that are not relevant to the analysis.
# errors='ignore' keeps the cell re-runnable: an in-place drop raises KeyError
# the second time it runs because the columns are already gone.
poke_sum_df = poke_sum_df.drop(
    columns=['height_m', 'percentage_male', 'weight_kg'], errors='ignore'
)

# A missing secondary type is valid, so mark it with the string 'None'
poke_sum_df['type2'] = poke_sum_df['type2'].fillna('None')

poke_sum_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 801 entries, 1 to 801
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   abilities          801 non-null    object 
 1   against_bug        801 non-null    float64
 2   against_dark       801 non-null    float64
 3   against_dragon     801 non-null    float64
 4   against_electric   801 non-null    float64
 5   against_fairy      801 non-null    float64
 6   against_fight      801 non-null    float64
 7   against_fire       801 non-null    float64
 8   against_flying     801 non-null    float64
 9   against_ghost      801 non-null    float64
 10  against_grass      801 non-null    float64
 11  against_ground     801 non-null    float64
 12  against_ice        801 non-null    float64
 13  against_normal     801 non-null    float64
 14  against_poison     801 non-null    float64
 15  against_psychic    801 non-null    float64
 16  against_rock       801 non-null

In [161]:
poke_sum_df['capture_rate'].unique()

array(['45', '255', '120', '127', '90', '190', '75', '235', '150', '25',
       '170', '50', '200', '100', '180', '60', '225', '30', '35', '3',
       '65', '70', '125', '205', '155', '145', '130', '140', '15', '220',
       '160', '80', '55', '30 (Meteorite)255 (Core)'], dtype=object)

In [169]:
# One entry is malformed: '30 (Meteorite)255 (Core)'. Keep the Core value (255).
poke_sum_df.loc[poke_sum_df['capture_rate'] == '30 (Meteorite)255 (Core)', 'capture_rate'] = 255
# Cast with an explicit width: plain `int` follows the platform's default
# integer size, which is 32-bit on some Windows/NumPy builds.
poke_sum_df['capture_rate'] = poke_sum_df['capture_rate'].astype('int64')

poke_sum_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 801 entries, 1 to 801
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   abilities          801 non-null    object 
 1   against_bug        801 non-null    float64
 2   against_dark       801 non-null    float64
 3   against_dragon     801 non-null    float64
 4   against_electric   801 non-null    float64
 5   against_fairy      801 non-null    float64
 6   against_fight      801 non-null    float64
 7   against_fire       801 non-null    float64
 8   against_flying     801 non-null    float64
 9   against_ghost      801 non-null    float64
 10  against_grass      801 non-null    float64
 11  against_ground     801 non-null    float64
 12  against_ice        801 non-null    float64
 13  against_normal     801 non-null    float64
 14  against_poison     801 non-null    float64
 15  against_psychic    801 non-null    float64
 16  against_rock       801 non-null

In [261]:
# Remove fully duplicated rows; rebinding the name avoids in-place mutation.
poke_sum_df = poke_sum_df.drop_duplicates()
poke_sum_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 801 entries, 1 to 801
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   abilities          801 non-null    object 
 1   against_bug        801 non-null    float64
 2   against_dark       801 non-null    float64
 3   against_dragon     801 non-null    float64
 4   against_electric   801 non-null    float64
 5   against_fairy      801 non-null    float64
 6   against_fight      801 non-null    float64
 7   against_fire       801 non-null    float64
 8   against_flying     801 non-null    float64
 9   against_ghost      801 non-null    float64
 10  against_grass      801 non-null    float64
 11  against_ground     801 non-null    float64
 12  against_ice        801 non-null    float64
 13  against_normal     801 non-null    float64
 14  against_poison     801 non-null    float64
 15  against_psychic    801 non-null    float64
 16  against_rock       801 non-null

## Main Pokemon Dataset

	•	Dropped the columns Additional.Criteria and Location as they were not relevant to the analysis.
	•	Filled missing values in the type2 column with "None" to reflect Pokémon that have only one type.
	•	Retained NaN values in:
	•	Level, to indicate Pokémon that do not evolve (i.e., are fully evolved).
	•	Average.Level, to indicate Pokémon that cannot be found in the wild (only obtained through evolution or other means).
	•	Dropped 8 duplicate entries to ensure uniqueness and data integrity.
	•	Confirmed all remaining columns have appropriate data types for analysis.

In [253]:
main_pkmn_df = pd.read_csv('../data/raw/main_pkmn_dataset.csv', index_col='index', header=0)
main_pkmn_df.head()

Unnamed: 0_level_0,pokedex_number,name,type1,type2,Gym.Section,attack,base_total,defense,experience_growth,hp,...,against_ghost,against_grass,against_ground,against_ice,against_normal,against_poison,against_psychic,against_rock,against_steel,against_water
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,387,Turtwig,grass,,0,68,318,64,1059860,55,...,1.0,0.5,0.5,2.0,1.0,2.0,1.0,1.0,1.0,0.5
1,388,Grotle,grass,,0,89,405,85,1059860,75,...,1.0,0.5,0.5,2.0,1.0,2.0,1.0,1.0,1.0,0.5
2,389,Torterra,grass,ground,0,109,525,105,1059860,95,...,1.0,1.0,0.5,4.0,1.0,1.0,1.0,0.5,1.0,1.0
3,390,Chimchar,fire,,0,58,309,44,1059860,44,...,1.0,0.5,2.0,0.5,1.0,1.0,1.0,2.0,0.5,2.0
4,391,Monferno,fire,fighting,0,78,405,52,1059860,64,...,1.0,0.5,2.0,0.5,1.0,1.0,2.0,1.0,0.5,2.0


In [255]:
main_pkmn_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 218 entries, 0 to 209
Data columns (total 36 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   pokedex_number       218 non-null    int64  
 1   name                 218 non-null    object 
 2   type1                218 non-null    object 
 3   type2                106 non-null    object 
 4   Gym.Section          218 non-null    int64  
 5   attack               218 non-null    int64  
 6   base_total           218 non-null    int64  
 7   defense              218 non-null    int64  
 8   experience_growth    218 non-null    int64  
 9   hp                   218 non-null    int64  
 10  sp_attack            218 non-null    int64  
 11  sp_defense           218 non-null    int64  
 12  speed                218 non-null    int64  
 13  generation           218 non-null    int64  
 14  is_legendary         218 non-null    int64  
 15  Level                65 non-null     float64


In [257]:
# Drop columns that are not relevant to the analysis. errors='ignore' keeps the
# cell re-runnable: an in-place drop raises KeyError once the columns are gone.
main_pkmn_df = main_pkmn_df.drop(columns=['Additional.Criteria', 'Location'], errors='ignore')

In [259]:
# Remove fully duplicated rows (the comparison uses column values only, not the
# index); rebinding the name avoids in-place mutation.
main_pkmn_df = main_pkmn_df.drop_duplicates()
main_pkmn_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 210 entries, 0 to 209
Data columns (total 34 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pokedex_number     210 non-null    int64  
 1   name               210 non-null    object 
 2   type1              210 non-null    object 
 3   type2              102 non-null    object 
 4   Gym.Section        210 non-null    int64  
 5   attack             210 non-null    int64  
 6   base_total         210 non-null    int64  
 7   defense            210 non-null    int64  
 8   experience_growth  210 non-null    int64  
 9   hp                 210 non-null    int64  
 10  sp_attack          210 non-null    int64  
 11  sp_defense         210 non-null    int64  
 12  speed              210 non-null    int64  
 13  generation         210 non-null    int64  
 14  is_legendary       210 non-null    int64  
 15  Level              59 non-null     float64
 16  Average.Level      132 non-null

In [271]:
# A missing secondary type is valid, so mark it with the string 'None'.
main_pkmn_df['type2'] = main_pkmn_df['type2'].fillna('None')
# NaN in 'Level' and 'Average.Level' is kept on purpose. Filling NaN with NaN
# would change nothing, so these columns are left untouched.

main_pkmn_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 210 entries, 0 to 209
Data columns (total 34 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pokedex_number     210 non-null    int64  
 1   name               210 non-null    object 
 2   type1              210 non-null    object 
 3   type2              210 non-null    object 
 4   Gym.Section        210 non-null    int64  
 5   attack             210 non-null    int64  
 6   base_total         210 non-null    int64  
 7   defense            210 non-null    int64  
 8   experience_growth  210 non-null    int64  
 9   hp                 210 non-null    int64  
 10  sp_attack          210 non-null    int64  
 11  sp_defense         210 non-null    int64  
 12  speed              210 non-null    int64  
 13  generation         210 non-null    int64  
 14  is_legendary       210 non-null    int64  
 15  Level              59 non-null     float64
 16  Average.Level      132 non-null