# Python Pandas Tutorial (Part 9) : Cleaning Data - Casting Datatypes and Handling Missing Values

In [32]:
import pandas as pd
import numpy as np

people = {
    'first': ['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'], 
    'last': ['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'], 
    'email': ['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com', None, np.nan, 'Anonymous@email.com', 'NA'],
    'age': ['33', '55', '63', '36', None, None, 'Missing']
}

In [33]:
df = pd.DataFrame(people)
df.describe

<bound method NDFrame.describe of    first     last                    email      age
0  Corey  Schafer  CoreyMSchafer@gmail.com       33
1   Jane      Doe        JaneDoe@email.com       55
2   John      Doe        JohnDoe@email.com       63
3  Chris  Schafer                     None       36
4    NaN      NaN                      NaN     None
5   None      NaN      Anonymous@email.com     None
6     NA  Missing                       NA  Missing>

In [34]:
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


## We have some missing values <br>
#### Best way to handle missing data is to remove it

In [35]:
# lets drop the na 
df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


In [36]:
#by defult
df.dropna(axis = 'index' , how = 'any')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


In [37]:
df.dropna(axis = "index" , how = "all")

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


### Subset argument


In [38]:
df.dropna(axis="index" , how = "all", subset = ["last", "email"] )

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


## Replace the missing values with <code>.replace()</code>

In [39]:
df.replace("NA" , np.nan, inplace = True)
df.replace("Missing", np.nan, inplace=True)

In [40]:
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


#### Check if it has na with <code>.isna()</code>

In [41]:
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


### If we want to fill in the NA values <code>.fillna()</code>

In [43]:
# this is usually good for numerical data.
df.fillna("0")

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,0,36
4,0,0,0,0
5,0,0,Anonymous@email.com,0
6,0,0,0,0


## Casting data types

In [45]:
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [47]:
type(np.nan)

float

In [50]:
df['age'] = df['age'].astype(float)

In [52]:
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [53]:
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [54]:
df.mean()

  df.mean()


age    46.75
dtype: float64

In [58]:
pokemon_df = pd.read_csv("Pokemon.csv")

In [61]:
pokemon_df.head(3)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False


In [62]:
pokemon_df["Attack"].mean()

79.00125

In [65]:
pokemon_df["Total"].mean()

435.1025

In [66]:
pokemon_df["Total"].unique()

array([318, 405, 525, 625, 309, 534, 634, 314, 530, 630, 195, 205, 395,
       495, 251, 349, 479, 579, 253, 413, 262, 442, 288, 438, 320, 485,
       300, 450, 275, 365, 505, 273, 323, 483, 299, 270, 435, 245, 455,
       490, 285, 305, 265, 290, 440, 500, 350, 555, 385, 510, 310, 400,
       590, 390, 335, 515, 410, 315, 325, 465, 352, 460, 475, 600, 328,
       330, 480, 520, 425, 340, 345, 295, 200, 540, 640, 535, 355, 615,
       580, 420, 680, 780, 215, 415, 250, 218, 210, 470, 280, 610, 360,
       180, 430, 336, 380, 700, 635, 220, 240, 198, 278, 518, 618, 269,
       414, 670, 266, 456, 236, 237, 474, 190, 375, 260, 575, 302, 467,
       560, 458, 468, 308, 565, 770, 194, 384, 263, 363, 523, 224, 424,
       244, 482, 348, 498, 452, 329, 411, 454, 334, 494, 594, 545, 720,
       528, 418, 255, 370, 281, 446, 316, 292, 487, 264, 358, 488, 497,
       313, 508, 445, 294, 509, 351, 519, 461, 303, 401, 567, 473, 428,
       464, 319, 472, 489, 471, 484, 550, 660, 307, 409, 423, 38

In [68]:
# if you have some strange unique values, conver it with
# .replace("something", "to something", inplace = True)