# Data Cleaning

In [4]:
# Import packages
import pandas as pd
import seaborn as sns

## EXP Types

Cleaning 'exp_types.csv'. Several columns were found to have an object dtype because the string '-' was used to represent a null value. These values were replaced with 0 and the columns' dtypes were changed to int64.

In [102]:
# Build the column labels for exp_types.csv from the growth-rate names:
# six cumulative-XP columns, then 'Level', then six XP-to-next-level columns.
growth_rates = ['Erratic', 'Fast', 'Medium_Fast', 'Medium_Slow', 'Slow', 'Fluctuating']
exp_types_columns = (
    [f'{rate}_total_xp' for rate in growth_rates]
    + ['Level']
    + [f'{rate}_next_lvl' for rate in growth_rates]
)

# Load the raw CSV with our own labels and index the rows by 'Level'.
# The first two rows of the file are skipped (presumably its original
# header rows - confirm against the raw file).
exp_types_df = pd.read_csv(
    '../data/raw/exp_types.csv',
    skiprows=2,
    names=exp_types_columns,
    index_col='Level',
)
                         

In [104]:
# Preview the first rows to confirm the column labels line up with the data
exp_types_df.head()

Unnamed: 0_level_0,Erratic_total_xp,Fast_total_xp,Medium_Fast_total_xp,Medium_Slow_total_xp,Slow_total_xp,Fluctuating_total_xp,Erratic_next_lvl,Fast_next_lvl,Medium_Fast_next_lvl,Medium_Slow_next_lvl,Slow_next_lvl,Fluctuating_next_lvl
Level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,0,0,0,0,0,15,6,8,9,10,4
2,15,6,8,9,10,4,37,15,19,48,23,9
3,52,21,27,57,33,13,70,30,37,39,47,19
4,122,51,64,96,80,32,115,49,61,39,76,33
5,237,100,125,135,156,65,169,72,91,44,114,47


In [26]:
# Inspect the last rows; the final level shows '-' in the *_next_lvl columns
exp_types_df.tail()

Unnamed: 0_level_0,Erratic_total_xp,Fast_total_xp,Medium_Fast_total_xp,Medium_Slow_total_xp,Slow_total_xp,Fluctuating_total_xp,Erratic_next_lvl,Fast_next_lvl,Medium_Fast_next_lvl,Medium_Slow_next_lvl,Slow_next_lvl,Fluctuating_next_lvl
Level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
96,560922,707788,884736,932903,1105920,1415577,10411,22350,27937,30729,34921,44699
97,571333,730138,912673,963632,1140841,1460276,12206,22815,28519,31398,35649,64455
98,583539,752953,941192,995030,1176490,1524731,8343,23286,29107,32073,36383,47153
99,591882,776239,970299,1027103,1212873,1571884,8118,23761,29701,32757,37127,68116
100,600000,800000,1000000,1059860,1250000,1640000,-,-,-,-,-,-


In [106]:
# Check each column's dtype and non-null count before cleaning
exp_types_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 1 to 100
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Erratic_total_xp      100 non-null    int64 
 1   Fast_total_xp         100 non-null    int64 
 2   Medium_Fast_total_xp  100 non-null    int64 
 3   Medium_Slow_total_xp  100 non-null    int64 
 4   Slow_total_xp         100 non-null    int64 
 5   Fluctuating_total_xp  100 non-null    int64 
 6   Erratic_next_lvl      100 non-null    object
 7   Fast_next_lvl         100 non-null    object
 8   Medium_Fast_next_lvl  100 non-null    object
 9   Medium_Slow_next_lvl  100 non-null    object
 10  Slow_next_lvl         100 non-null    object
 11  Fluctuating_next_lvl  100 non-null    object
dtypes: int64(6), object(6)
memory usage: 10.2+ KB


In [108]:
# The *_next_lvl columns load as object dtype because the level-100 row holds
# the string '-' instead of a number (see the tail() output above).
cols_to_convert = [
    'Erratic_next_lvl', 'Fast_next_lvl', 'Medium_Fast_next_lvl',
    'Medium_Slow_next_lvl', 'Slow_next_lvl', 'Fluctuating_next_lvl'
]

# Swap the '-' placeholder for 0, then cast to an explicit 64-bit integer.
# 'int64' is spelled out instead of the builtin `int` because the builtin maps
# to the platform's default integer, which is not 64-bit on every platform /
# NumPy version; the explicit name keeps the result int64 everywhere.
exp_types_df[cols_to_convert] = (
    exp_types_df[cols_to_convert]
    .replace('-', 0)
    .astype('int64')
)

# Confirm that every column is now int64
exp_types_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 1 to 100
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   Erratic_total_xp      100 non-null    int64
 1   Fast_total_xp         100 non-null    int64
 2   Medium_Fast_total_xp  100 non-null    int64
 3   Medium_Slow_total_xp  100 non-null    int64
 4   Slow_total_xp         100 non-null    int64
 5   Fluctuating_total_xp  100 non-null    int64
 6   Erratic_next_lvl      100 non-null    int64
 7   Fast_next_lvl         100 non-null    int64
 8   Medium_Fast_next_lvl  100 non-null    int64
 9   Medium_Slow_next_lvl  100 non-null    int64
 10  Slow_next_lvl         100 non-null    int64
 11  Fluctuating_next_lvl  100 non-null    int64
dtypes: int64(12)
memory usage: 10.2 KB


## Trainers


In [76]:
# Read the trainers table, using the file's first column as the row index
trainers_df = pd.read_csv(
    '../data/raw/trainers.csv',
    index_col=0,
)

# Quick look at the first rows to sanity-check the load
trainers_df.head()

Unnamed: 0,trainer.name,number.of.pokemon,exp_pokemon1,exp_pokemon2,exp_pokemon3,exp_pokemon4,exp_pokemon5,exp_pokemon6,x,pokemon1,...,pokemon6,level_pokemon1,level_pokemon2,level_pokemon3,level_pokemon4,level_pokemon5,level_pokemon6,max.level,gym.section,total_EXP
1,Youngster Tristan,1,60,0,0,0,0,0,1,Starly,...,,5.0,,,,,,5.0,0,60
2,Youngster Logan,1,65,0,0,0,0,0,2,Burmy,...,,5.0,,,,,,5.0,0,65
3,Lass Natalie,1,62,0,0,0,0,0,3,Bidoof,...,,5.0,,,,,,5.0,0,62
4,Youngster Michael,2,81,69,0,0,0,0,4,Kricketot,...,,7.0,6.0,,,,,7.0,0,150
5,Camper,1,61,0,0,0,0,0,5,Rattata,...,,5.0,,,,,,5.0,0,61


In [82]:
# Check dtypes and non-null counts; the pokemon2..6 and level_* columns are sparsely populated
trainers_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 927 entries, 1 to 927
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   trainer.name       927 non-null    object 
 1   number.of.pokemon  927 non-null    int64  
 2   exp_pokemon1       927 non-null    int64  
 3   exp_pokemon2       927 non-null    int64  
 4   exp_pokemon3       927 non-null    int64  
 5   exp_pokemon4       927 non-null    int64  
 6   exp_pokemon5       927 non-null    int64  
 7   exp_pokemon6       927 non-null    int64  
 8   x                  927 non-null    int64  
 9   pokemon1           927 non-null    object 
 10  pokemon2           527 non-null    object 
 11  pokemon3           270 non-null    object 
 12  pokemon4           81 non-null     object 
 13  pokemon5           50 non-null     object 
 14  pokemon6           23 non-null     object 
 15  level_pokemon1     922 non-null    float64
 16  level_pokemon2     523 non-null

In [84]:
# Summary statistics for the numeric columns.
# NOTE(review): in the output below, max.level reports -inf for its mean and min,
# so that column contains at least one -inf value that needs handling.
trainers_df.describe()

Unnamed: 0,number.of.pokemon,exp_pokemon1,exp_pokemon2,exp_pokemon3,exp_pokemon4,exp_pokemon5,exp_pokemon6,x,level_pokemon1,level_pokemon2,level_pokemon3,level_pokemon4,level_pokemon5,level_pokemon6,max.level,gym.section,total_EXP
count,927.0,927.0,927.0,927.0,927.0,927.0,927.0,927.0,922.0,523.0,254.0,79.0,29.0,12.0,927.0,927.0,927.0
mean,1.743258,852.724919,715.385113,417.587918,161.554477,57.559871,23.612729,464.0,30.812364,39.900574,44.031496,48.936709,53.965517,45.25,-inf,4.234088,2228.425027
std,1.135951,774.879835,870.259022,811.523978,588.779316,348.690182,224.890033,267.746148,19.132844,15.853184,15.256664,16.69895,18.022769,13.328882,,3.137628,2702.401093
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,3.0,3.0,4.0,14.0,16.0,-inf,0.0,0.0
25%,1.0,113.0,0.0,0.0,0.0,0.0,0.0,232.5,13.0,27.0,32.0,36.0,40.0,44.0,13.0,1.0,205.5
50%,2.0,619.0,313.0,0.0,0.0,0.0,0.0,464.0,32.0,41.0,44.0,51.0,54.0,51.0,32.0,4.0,1336.0
75%,2.0,1431.0,1367.0,426.0,0.0,0.0,0.0,695.5,45.0,54.0,57.0,63.0,66.0,53.75,46.0,8.0,3126.5
max,6.0,2985.0,3453.0,3583.0,3453.0,3378.0,2520.0,927.0,81.0,79.0,81.0,79.0,83.0,56.0,83.0,8.0,16182.0


In [88]:
# List the distinct values of number.of.pokemon; the output below includes 0
trainers_df['number.of.pokemon'].unique()

array([1, 2, 3, 6, 4, 0, 5])

In [90]:
# Show the rows whose number.of.pokemon is 0.
# NOTE(review): several of these rows still list a pokemon1 and non-zero exp
# values in the output below, so the count column looks unreliable - confirm.
trainers_df.loc[trainers_df['number.of.pokemon'].eq(0)]

Unnamed: 0,trainer.name,number.of.pokemon,exp_pokemon1,exp_pokemon2,exp_pokemon3,exp_pokemon4,exp_pokemon5,exp_pokemon6,x,pokemon1,...,pokemon6,level_pokemon1,level_pokemon2,level_pokemon3,level_pokemon4,level_pokemon5,level_pokemon6,max.level,gym.section,total_EXP
82,Rich Boy Jason,0,827,0,0,0,0,0,82,Prinplup,...,,27.0,,,,,,27.0,3,827
83,Lady Melissa,0,393,0,0,0,0,0,83,Cherubi,...,,27.0,,,,,,27.0,3,393
84,Gentleman Jeremy,0,619,0,0,0,0,0,84,Chatot,...,,27.0,,,,,,27.0,3,619
85,Socialite Reina,0,879,0,0,0,0,0,85,Roselia,...,,27.0,,,,,,27.0,3,879
99,Scientist Stefano,0,932,0,0,0,0,0,99,Kadabra,...,,30.0,,,,,,30.0,4,932
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
892,Rich Boy Liam,0,0,0,0,0,0,0,892,Blissey,...,,,,,,,,-inf,0,0
893,Lady Celeste,0,0,0,0,0,0,0,893,Blissey,...,,,,,,,,-inf,0,0
913,Galactic Boss Cyrus,0,962,1246,0,0,0,0,913,Sneasel,...,,34.0,34.0,,,,,34.0,5,2208
921,Leader Volkner,0,2328,2364,0,0,0,0,921,Luxray,...,,56.0,56.0,,,,,56.0,8,4692
