### CHAPTER 7
# Data Cleaning and Preparation

In [192]:
import pandas as pd
import numpy as np

In [193]:
# A small float Series in which the third entry is missing (NaN).
float_data = pd.Series(
    data=[1.2, -3.5, np.nan, 0],
)
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [194]:
# Boolean mask: True where a value is missing (`isna` is the same method as `isnull`).
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [195]:
# Both np.nan and Python's None are used here as missing-value markers.
string_data = pd.Series(
    data=["aardvark", np.nan, None, "avocado"],
)
string_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object

In [196]:
# Boolean mask: True where a value is missing (`isna` is the same method as `isnull`).
string_data.isna()

0    False
1     True
2     True
3    False
dtype: bool

In [197]:
# With an explicit float dtype, the None entry is stored as NaN.
float_data = pd.Series(
    data=[1, 2, None],
    dtype=np.float64,
)
float_data

0    1.0
1    2.0
2    NaN
dtype: float64

In [198]:
# Boolean mask: True where a value is missing (`isna` is the same method as `isnull`).
float_data.isna()

0    False
1    False
2     True
dtype: bool

In [199]:
# Series with two missing entries; dropna() returns a copy without them
# and keeps the original index labels of the surviving values.
data = pd.Series(
    data=[1, np.nan, 3.5, np.nan, 7],
)
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [200]:
# Same result as dropna(): select only the entries whose mask value is True.
data.loc[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [201]:
# Four rows covering the interesting cases: no NaN, partly NaN, all NaN,
# and NaN only in the first column.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3.0],
    ]
)
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [202]:
# Drop every row that contains at least one NaN (the defaults, spelled out).
data.dropna(axis="index", how="any")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [203]:
# Drop only the rows in which every value is NaN.
data.dropna(axis="index", how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [204]:
# Append a column labelled 4 that is entirely NaN (modifies `data` in place).
data.loc[:, 4] = np.nan
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [205]:
# Drop only the columns in which every value is NaN.
data.dropna(axis=1, how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [206]:
# Draw from a locally seeded Generator so that "Restart & Run All" rebuilds
# exactly the same 7x3 frame on every run; the seed value itself is arbitrary.
rng = np.random.default_rng(seed=12345)
df = pd.DataFrame(rng.standard_normal((7, 3)))
df

Unnamed: 0,0,1,2
0,0.814056,0.042235,0.759533
1,1.417799,-0.62743,0.932203
2,0.703296,-0.700158,0.233801
3,-0.713472,0.598446,0.119701
4,-0.07785,1.087527,-0.135161
5,-0.344773,-0.333332,-0.205075
6,-0.789541,2.220655,-0.755379


In [207]:
# Blank out (in place) the first four rows of the second column and the
# first two rows of the third column; positions are zero-based.
df.iloc[0:4, 1] = np.nan
df.iloc[0:2, 2] = np.nan
df

Unnamed: 0,0,1,2
0,0.814056,,
1,1.417799,,
2,0.703296,,0.233801
3,-0.713472,,0.119701
4,-0.07785,1.087527,-0.135161
5,-0.344773,-0.333332,-0.205075
6,-0.789541,2.220655,-0.755379


In [208]:
# Keep only the rows that have no missing value at all.
df.dropna(axis="index", how="any")

Unnamed: 0,0,1,2
4,-0.07785,1.087527,-0.135161
5,-0.344773,-0.333332,-0.205075
6,-0.789541,2.220655,-0.755379


In [209]:
# Keep a row only if it holds at least two non-NaN values.
df.dropna(axis="index", thresh=2)

Unnamed: 0,0,1,2
2,0.703296,,0.233801
3,-0.713472,,0.119701
4,-0.07785,1.087527,-0.135161
5,-0.344773,-0.333332,-0.205075
6,-0.789541,2.220655,-0.755379


### Filling In Missing Data

In [210]:
# Replace every NaN with the constant 0; returns a new frame.
df.fillna(value=0)

Unnamed: 0,0,1,2
0,0.814056,0.0,0.0
1,1.417799,0.0,0.0
2,0.703296,0.0,0.233801
3,-0.713472,0.0,0.119701
4,-0.07785,1.087527,-0.135161
5,-0.344773,-0.333332,-0.205075
6,-0.789541,2.220655,-0.755379


In [211]:
# Per-column fill values: NaN in column 1 becomes 0.5, NaN in column 2 becomes 0.
df.fillna(value={1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.814056,0.5,0.0
1,1.417799,0.5,0.0
2,0.703296,0.5,0.233801
3,-0.713472,0.5,0.119701
4,-0.07785,1.087527,-0.135161
5,-0.344773,-0.333332,-0.205075
6,-0.789541,2.220655,-0.755379


In [212]:
# Draw from a locally seeded Generator so that "Restart & Run All" rebuilds
# exactly the same 6x3 frame on every run; the seed value itself is arbitrary.
rng = np.random.default_rng(seed=54321)
df = pd.DataFrame(rng.standard_normal((6, 3)))

# Leave trailing gaps for the forward-fill examples: column 1 is missing from
# row 2 onward, column 2 from row 4 onward (zero-based positions).
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan
df

Unnamed: 0,0,1,2
0,-0.511662,0.150656,0.069131
1,-0.101381,1.847011,0.102307
2,-1.23628,,-0.462342
3,0.560866,,-0.245528
4,0.727958,,
5,-1.048169,,


In [213]:
# Forward fill: propagate the last valid value in each column down into the
# NaN rows below it. `DataFrame.ffill()` is the supported spelling; passing
# `method=` to `fillna` is deprecated and emits a FutureWarning.
df.ffill()


Unnamed: 0,0,1,2
0,-0.511662,0.150656,0.069131
1,-0.101381,1.847011,0.102307
2,-1.23628,1.847011,-0.462342
3,0.560866,1.847011,-0.245528
4,0.727958,1.847011,-0.245528
5,-1.048169,1.847011,-0.245528


In [214]:
# Forward fill, but carry each valid value down over at most 2 consecutive
# NaN rows; any further NaN in the same run stays missing.
# `DataFrame.ffill()` is the supported spelling; passing `method=` to
# `fillna` is deprecated and emits a FutureWarning.
df.ffill(limit=2)


Unnamed: 0,0,1,2
0,-0.511662,0.150656,0.069131
1,-0.101381,1.847011,0.102307
2,-1.23628,1.847011,-0.462342
3,0.560866,1.847011,-0.245528
4,0.727958,,-0.245528
5,-1.048169,,-0.245528


In [215]:
# Float Series with two missing entries, used for the mean-fill example.
data = pd.Series(
    data=[1.0, np.nan, 3.5, np.nan, 7],
)
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [216]:
# Replace each NaN with the mean of the Series; mean() skips NaN by default,
# so it is computed from the non-missing values only.
data.fillna(value=data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64