In [43]:
import numpy as np
import pandas as pd

# Toy records that mix several kinds of "missing" entries: real np.nan / None
# values plus the placeholder strings 'NA' and 'Missing'. Ages are kept as strings.
people = dict(
    first=['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'],
    last=['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'],
    email=[
        'CoreyMSchafer@gmail.com',
        'JaneDoe@email.com',
        'JohnDoe@email.com',
        None,
        np.nan,
        'Anonymous@email.com',
        'NA',
    ],
    age=['33', '55', '63', '36', None, None, 'Missing'],
)

In [44]:
# Build the frame and title-case the column labels ('first' -> 'First', ...)
# in a single chained expression.
df = pd.DataFrame(people).rename(columns=str.title)

In [46]:
# Turn the placeholder strings into real missing values (NaN) in a single pass.
# The result is assigned back instead of using inplace=True, so no existing
# object is mutated and the call stays chainable.
# None needs no replacement of its own: pandas already counts it as missing,
# exactly like np.nan.
df = df.replace(['NA', 'Missing'], np.nan)

In [48]:
# 🎯 What's the difference between None and np.nan?
# ✅ Are both counted as missing by isnull() (and therefore by isnull().mean())? YES, they are.

df

Unnamed: 0,First,Last,Email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [50]:
# Percentage (%) of missing values in each column
# 🧠 df.isnull().mean() gives the fraction of nulls per column; scale it to percent

df.isnull().mean().mul(100).round(4)

First    42.8571
Last     42.8571
Email    42.8571
Age      42.8571
dtype: float64

In [55]:
# Flag every cell of the DataFrame that holds a missing (NA) value
# 🧠 df.isna() -> boolean frame of the same shape, True where the value is missing

df.isna()

Unnamed: 0,First,Last,Email,Age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [18]:
# Drop every row (axis='index') that has at least one missing value (how='any')
# 🧠 df.dropna()

df.dropna(axis='index', how='any')

Unnamed: 0,First,Last,Email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


In [57]:
# Fill every missing value with one placeholder (a string such as 'MISSING' or a number such as 0)
# 🧠 df.fillna() -- the result is only displayed here, not assigned back to df

df.fillna('MISSING_Input')

Unnamed: 0,First,Last,Email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,MISSING_Input,36
4,MISSING_Input,MISSING_Input,MISSING_Input,MISSING_Input
5,MISSING_Input,MISSING_Input,Anonymous@email.com,MISSING_Input
6,MISSING_Input,MISSING_Input,MISSING_Input,MISSING_Input


In [20]:
# Drop every column (axis='columns') that has at least one missing value (how='any').
# Every column of this frame contains a gap, so only the index is left.
df.dropna(axis='columns', how='any')

0
1
2
3
4
5
6


In [21]:
df.dropna(axis='index', how='all')  # Drop a row ONLY if every value in that row is missing

Unnamed: 0,First,Last,Email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [22]:
# Drop a column ONLY if every value in that column is missing (how='all').
# No column of this frame is entirely empty, so all four columns are kept.
df.dropna(axis='columns', how='all')

Unnamed: 0,First,Last,Email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


First    28.5714
Last     28.5714
Email    28.5714
Age      28.5714
dtype: float64

In [27]:
# Drop rows (not columns) based on specific columns only:
# a row is removed when its 'Email' value is missing; other columns are ignored

df.dropna(axis='index', how='any', subset=['Email'])

Unnamed: 0,First,Last,Email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [42]:
df.dropna(axis='index', how='all', subset=['Email', 'Last'])  # Drop a row ONLY if BOTH 'Email' and 'Last' are missing

Unnamed: 0,First,Last,Email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing
