In [15]:
import pandas as pd
import numpy as np

In [16]:
# Sample records that spell "missing" in several ways: np.nan, None,
# and the placeholder strings 'N/A' / 'Missing'.
people = dict(
    first=['Corey', 'Jane', 'Chris', np.nan, None, 'N/A'],
    last=['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan],
    email=[
        'coreyshafer@gmail.com',
        'janedoe@email.com',
        'chrisdoe@email.com',
        None,
        np.nan,
        'N/A',
    ],
    age=['33', '55', '63', None, None, 'Missing'],
)

In [17]:
# Build the frame, then normalise the placeholder strings 'N/A' and 'Missing'
# to NaN so that dropna()/isna()/fillna() treat them as missing values.
# A single non-mutating replace() keeps this cell idempotent when re-run.
df = pd.DataFrame(people).replace(['N/A', 'Missing'], np.nan)

In [18]:
# Display the frame after the 'N/A' and 'Missing' placeholders were replaced with NaN.
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,coreyshafer@gmail.com,33.0
1,Jane,Doe,janedoe@email.com,55.0
2,Chris,Doe,chrisdoe@email.com,63.0
3,,Schafer,,
4,,,,
5,,,,


In [1]:
# df.dtypes shows the data type of every column.
# df['first'].dtype shows the data type of the column 'first' (use brackets: df.first can resolve to the DataFrame.first method instead of the column).
# Columns consisting entirely of strings do not get their own type; they are given the object dtype instead.
# Use astype() to convert a column from one type to another wherever such a conversion makes sense.

NameError: name 'df' is not defined

In [None]:
# Entries with missing values are given the value NaN, short for "Not a Number".
# For technical reasons, these NaN values are always of the float64 dtype.
# To select NaN entries, use pd.isnull() (or its companion, pd.notnull()).

In [19]:
df.dropna() # drop every row that has at least one missing value (None and NaN both count)

Unnamed: 0,first,last,email,age
0,Corey,Schafer,coreyshafer@gmail.com,33
1,Jane,Doe,janedoe@email.com,55
2,Chris,Doe,chrisdoe@email.com,63


In [20]:
df.dropna(axis='index',how='any') # same as the dropna() defaults: drop a row (axis='index') if any value in it is missing

Unnamed: 0,first,last,email,age
0,Corey,Schafer,coreyshafer@gmail.com,33
1,Jane,Doe,janedoe@email.com,55
2,Chris,Doe,chrisdoe@email.com,63


In [21]:
df.dropna(axis='index',how='all') # drop a row only if every value in it is missing

Unnamed: 0,first,last,email,age
0,Corey,Schafer,coreyshafer@gmail.com,33.0
1,Jane,Doe,janedoe@email.com,55.0
2,Chris,Doe,chrisdoe@email.com,63.0
3,,Schafer,,


In [22]:
df.dropna(axis='columns',how='any') # drop every column that contains a missing value; all four do here, so only the index remains

0
1
2
3
4
5


In [23]:
# Drop rows based on missing values in one specific column.
df.dropna(axis='index',how='any',subset=['email']) # with a single column in subset, how='any' and how='all' give the same result

Unnamed: 0,first,last,email,age
0,Corey,Schafer,coreyshafer@gmail.com,33
1,Jane,Doe,janedoe@email.com,55
2,Chris,Doe,chrisdoe@email.com,63


In [24]:
df.dropna(axis='index',how='all', subset=['last','email']) # drop a row only if both 'last' and 'email' are missing

Unnamed: 0,first,last,email,age
0,Corey,Schafer,coreyshafer@gmail.com,33.0
1,Jane,Doe,janedoe@email.com,55.0
2,Chris,Doe,chrisdoe@email.com,63.0
3,,Schafer,,


In [25]:
# Boolean mask with the same shape as df: True wherever a value is missing (None or NaN).
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,True,False,True,True
4,True,True,True,True
5,True,True,True,True


In [26]:
# Show a copy with every missing value replaced by the string 'MISSING'; df itself is not modified.
df.fillna('MISSING')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,coreyshafer@gmail.com,33
1,Jane,Doe,janedoe@email.com,55
2,Chris,Doe,chrisdoe@email.com,63
3,MISSING,Schafer,MISSING,MISSING
4,MISSING,MISSING,MISSING,MISSING
5,MISSING,MISSING,MISSING,MISSING


In [27]:
# Show a copy with every missing value replaced by 0; df itself is not modified.
df.fillna(0)

Unnamed: 0,first,last,email,age
0,Corey,Schafer,coreyshafer@gmail.com,33
1,Jane,Doe,janedoe@email.com,55
2,Chris,Doe,chrisdoe@email.com,63
3,0,Schafer,0,0
4,0,0,0,0
5,0,0,0,0


In [None]:
# Other ways to fill or substitute missing values. Only the last expression
# of a cell is displayed, so run these one at a time to inspect each result.

# Forward fill: copy the previous valid value down into each gap.
df.ffill()
# Backward fill: copy the next valid value up into each gap.
df.bfill()
# Replace NaN values in the DataFrame with the sentinel value -99.
df.replace(to_replace=np.nan, value=-99)
# Linear interpolation needs numeric data, and 'age' is stored as strings
# (object dtype), so convert it to float before interpolating.
df['age'].astype(float).interpolate(method='linear', limit_direction='forward')

In [29]:
# Every column is object dtype, including 'age', whose numbers were entered as strings.
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [33]:
# 'age' holds strings (object dtype), so cast to float before averaging;
# the missing entries become NaN and are left out of the mean.
df['age'].astype(float).mean()

50.333333333333336

In [None]:
# Declare custom missing-value markers when reading from a CSV file.
# Template only: 'xxxxx.csv' is a placeholder path, and running this cell rebinds df.
na_vals=['NA','Missing'] 
df=pd.read_csv('xxxxx.csv',na_values=na_vals)

In [34]:
df['email'].unique() # list the distinct values to spot unexpected ones (None and NaN show up as separate entries)

array(['coreyshafer@gmail.com', 'janedoe@email.com', 'chrisdoe@email.com',
       None, nan], dtype=object)

In [None]:
# Use replace() to substitute an unexpected value once it has been spotted.
# Assign the result back to the column: calling replace(..., inplace=True) on
# df['email'] is chained assignment and may leave df itself unchanged.
# Both string arguments below are placeholders for the real values.
df['email'] = df['email'].replace('unexpected values', 'wanted values')