In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample records containing several kinds of "missing" markers: np.nan, None,
# and the placeholder strings "NA" / "Missing". Ages are stored as strings.
people = {
    "first": ["Jigyasa", "Varun", "Chirag", "Chris", np.nan, None, "NA"],
    "last": ["Sachdeva", "Maheshwari", "Ahluwalia", "Shaffer", np.nan, np.nan, "Missing"],
    "email": [
        "jigyasa.sachdeva24@gmail.com",
        "vm@gmail.com",
        "ca@gmail.com",
        None,
        np.nan,
        "NA",
        "NA",
    ],
    "age": ["33", "32", "35", "63", None, None, "Missing"],
}

In [3]:
# Build a DataFrame from the dict: one column per key, one row per list position
df = pd.DataFrame(data=people)
df

Unnamed: 0,first,last,email,age
0,Jigyasa,Sachdeva,jigyasa.sachdeva24@gmail.com,33
1,Varun,Maheshwari,vm@gmail.com,32
2,Chirag,Ahluwalia,ca@gmail.com,35
3,Chris,Shaffer,,63
4,,,,
5,,,,
6,,Missing,,Missing


In [4]:
# Drop every row that contains at least one null value
df.dropna()
# Rows holding None or np.nan are removed; the placeholder strings "NA" and
# "Missing" are ordinary strings, so row 6 is kept

Unnamed: 0,first,last,email,age
0,Jigyasa,Sachdeva,jigyasa.sachdeva24@gmail.com,33
1,Varun,Maheshwari,vm@gmail.com,32
2,Chirag,Ahluwalia,ca@gmail.com,35
6,,Missing,,Missing


In [5]:
# Same call as the no-argument form, with the default arguments spelled out
df.dropna(axis="index", how="any")
# axis: "index" drops rows that contain missing values; "columns" drops columns instead
# how: the criterion for dropping -- "any" drops a row as soon as one value is missing

Unnamed: 0,first,last,email,age
0,Jigyasa,Sachdeva,jigyasa.sachdeva24@gmail.com,33
1,Varun,Maheshwari,vm@gmail.com,32
2,Chirag,Ahluwalia,ca@gmail.com,35
6,,Missing,,Missing


In [6]:
# how="all": a row is dropped only when every one of its values is missing
df.dropna(axis="index", how="all")

Unnamed: 0,first,last,email,age
0,Jigyasa,Sachdeva,jigyasa.sachdeva24@gmail.com,33
1,Varun,Maheshwari,vm@gmail.com,32
2,Chirag,Ahluwalia,ca@gmail.com,35
3,Chris,Shaffer,,63
5,,,,
6,,Missing,,Missing


In [7]:
# Drop rows based on missing values in specific columns only
# subset=["email"]: a row is dropped when its email is missing
df.dropna(axis="index", how="any", subset=["email"])
# With a single column in subset, how="any" and how="all" give the same result

Unnamed: 0,first,last,email,age
0,Jigyasa,Sachdeva,jigyasa.sachdeva24@gmail.com,33
1,Varun,Maheshwari,vm@gmail.com,32
2,Chirag,Ahluwalia,ca@gmail.com,35
5,,,,
6,,Missing,,Missing


In [8]:
# Keep a row when at least one of last / email is present; drop it otherwise
# how="all" over the subset: a row is dropped only if every listed column is missing
df.dropna(axis="index", how="all", subset=["email", "last"])
# The result is a new DataFrame; pass inplace=True (or assign the result back)
# to make the change persist in df

Unnamed: 0,first,last,email,age
0,Jigyasa,Sachdeva,jigyasa.sachdeva24@gmail.com,33
1,Varun,Maheshwari,vm@gmail.com,32
2,Chirag,Ahluwalia,ca@gmail.com,35
3,Chris,Shaffer,,63
5,,,,
6,,Missing,,Missing


In [9]:
# Treat the custom placeholders as missing values:
# turn the strings "NA" and "Missing" into NaN with a single replace call.
# The result is assigned back to df instead of using inplace=True, which
# follows the usual pandas style and avoids two separate passes over the frame.
df = df.replace(["NA", "Missing"], np.nan)
df

Unnamed: 0,first,last,email,age
0,Jigyasa,Sachdeva,jigyasa.sachdeva24@gmail.com,33.0
1,Varun,Maheshwari,vm@gmail.com,32.0
2,Chirag,Ahluwalia,ca@gmail.com,35.0
3,Chris,Shaffer,,63.0
4,,,,
5,,,,
6,,,,


In [10]:
# Same call as before the placeholders were replaced, but the result differs:
# rows 4, 5 and 6 now have NaN in both email and last, so they are dropped
df.dropna(axis="index", how="all", subset=["email", "last"])

Unnamed: 0,first,last,email,age
0,Jigyasa,Sachdeva,jigyasa.sachdeva24@gmail.com,33
1,Varun,Maheshwari,vm@gmail.com,32
2,Chirag,Ahluwalia,ca@gmail.com,35
3,Chris,Shaffer,,63


In [11]:
# Boolean mask of the frame: True wherever a value is missing
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,True,True
6,True,True,True,True


In [12]:
# Fill every missing value with the string "0" -- only do this when such a
# placeholder actually makes sense for the data
df.fillna(value="0")

Unnamed: 0,first,last,email,age
0,Jigyasa,Sachdeva,jigyasa.sachdeva24@gmail.com,33
1,Varun,Maheshwari,vm@gmail.com,32
2,Chirag,Ahluwalia,ca@gmail.com,35
3,Chris,Shaffer,0,63
4,0,0,0,0
5,0,0,0,0
6,0,0,0,0


In [13]:
df.dtypes
# Every column, including age, has dtype object (the ages are stored as strings)
# Averaging the ages would not work yet because the values are strings:
#     df['age'].mean()

first    object
last     object
email    object
age      object
dtype: object

In [14]:
# np.nan is a float, which determines the dtype a column with missing values can be cast to
type(np.nan)

float

In [15]:
# Cast the age column from strings to numbers.
# float is used rather than int because NaN is itself a float, so a plain
# integer column cannot hold the missing entries
df["age"] = df["age"].astype(float)

# age now reports float64
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [16]:
# Mean of the non-missing ages: (33 + 32 + 35 + 63) / 4 = 40.75; NaN entries are skipped
df['age'].mean()

40.75

In [17]:
## df.astype(<dtype>) converts every column of the DataFrame to the given type