## Data cleaning

In [1]:
import pandas as pd
import numpy as np

In [2]:
people = {
    'first':['corey', 'jane', 'jhon', 'chris', np.nan, None, 'NA'],
    'last':['shafer', 'doe', 'doe', 'shafer', np.nan, np.nan, 'missing'],
    'email':['coreyms@gmail.com', 'janeDoe@email.com', 'jhon@yahoo.com', None, np.nan,'anonymous@yahoo.com', 'missing'],
    'age':['33', '55', '63', '36', None, None, 'missing']
}

In [3]:
df = pd.DataFrame(people)


In [4]:
df

Unnamed: 0,first,last,email,age
0,corey,shafer,coreyms@gmail.com,33
1,jane,doe,janeDoe@email.com,55
2,jhon,doe,jhon@yahoo.com,63
3,chris,shafer,,36
4,,,,
5,,,anonymous@yahoo.com,
6,,missing,missing,missing


In [6]:
df.replace('NA', np.nan, inplace=True)
df.replace('missing', np.nan, inplace=True)

In [7]:
df

Unnamed: 0,first,last,email,age
0,corey,shafer,coreyms@gmail.com,33.0
1,jane,doe,janeDoe@email.com,55.0
2,jhon,doe,jhon@yahoo.com,63.0
3,chris,shafer,,36.0
4,,,,
5,,,anonymous@yahoo.com,
6,,,,


In [8]:
df.dropna()

Unnamed: 0,first,last,email,age
0,corey,shafer,coreyms@gmail.com,33
1,jane,doe,janeDoe@email.com,55
2,jhon,doe,jhon@yahoo.com,63


In [9]:
df.dropna(axis='index', how="any")

Unnamed: 0,first,last,email,age
0,corey,shafer,coreyms@gmail.com,33
1,jane,doe,janeDoe@email.com,55
2,jhon,doe,jhon@yahoo.com,63


In [10]:
df.dropna(axis='columns', how='any')

0
1
2
3
4
5
6


In [5]:
df.dropna(axis='columns', how='all')

Unnamed: 0,first,last,email,age
0,corey,shafer,coreyms@gmail.com,33
1,jane,doe,janeDoe@email.com,55
2,jhon,doe,jhon@yahoo.com,63
3,chris,shafer,,36
4,,,,
5,,,anonymous@yahoo.com,
6,,missing,missing,missing


In [6]:
df.dropna(axis='index', how='any', subset=["email", 'last'])

Unnamed: 0,first,last,email,age
0,corey,shafer,coreyms@gmail.com,33
1,jane,doe,janeDoe@email.com,55
2,jhon,doe,jhon@yahoo.com,63
6,,missing,missing,missing


In [7]:
df.dropna(axis='index', how='all', subset=["email", 'last'])

Unnamed: 0,first,last,email,age
0,corey,shafer,coreyms@gmail.com,33
1,jane,doe,janeDoe@email.com,55
2,jhon,doe,jhon@yahoo.com,63
3,chris,shafer,,36
5,,,anonymous@yahoo.com,
6,,missing,missing,missing


In [8]:
df.isna() # find how many na values

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,False,False,False,False


In [16]:
df.isna().sum()

first    3
last     3
email    3
age      3
dtype: int64

In [11]:
df.fillna("missing") # filling value in na

Unnamed: 0,first,last,email,age
0,corey,shafer,coreyms@gmail.com,33
1,jane,doe,janeDoe@email.com,55
2,jhon,doe,jhon@yahoo.com,63
3,chris,shafer,missing,36
4,missing,missing,missing,missing
5,missing,missing,anonymous@yahoo.com,missing
6,,missing,missing,missing


In [12]:
df["age"] = df['age'].astype(float)

ValueError: could not convert string to float: 'missing'

In [19]:
df

Unnamed: 0,first,last,email,age
0,corey,shafer,coreyms@gmail.com,33.0
1,jane,doe,janeDoe@email.com,55.0
2,jhon,doe,jhon@yahoo.com,63.0
3,chris,shafer,,36.0
4,,,,
5,,,anonymous@yahoo.com,
6,,,,


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   first   4 non-null      object 
 1   last    4 non-null      object 
 2   email   4 non-null      object 
 3   age     4 non-null      float64
dtypes: float64(1), object(3)
memory usage: 352.0+ bytes
