### CHAPTER 7
# Data Cleaning and Preparation

In [217]:
import pandas as pd
import numpy as np

In [218]:
# A float Series containing one missing entry (NaN) at position 2.
float_data = pd.Series(data=[1.2, -3.5, np.nan, 0])
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [219]:
float_data.isna()  # Boolean mask: True where the value is missing (same check as isnull)

0    False
1    False
2     True
3    False
dtype: bool

In [220]:
# A Series of strings mixing two kinds of missing marker: np.nan and None.
string_data = pd.Series(data=["aardvark", np.nan, None, "avocado"])
string_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object

In [221]:
string_data.isna()  # Both np.nan and None are reported as missing

0    False
1     True
2     True
3    False
dtype: bool

In [222]:
# Rebinds float_data: with an explicit float64 dtype, None is stored as NaN.
float_data = pd.Series(data=[1, 2, None], dtype=np.float64)
float_data

0    1.0
1    2.0
2    NaN
dtype: float64

In [223]:
float_data.isna()  # The None passed in above shows up as a missing value

0    False
1    False
2     True
dtype: bool

In [224]:
# Series with NaN at positions 1 and 3; dropna() returns only the non-missing
# entries and keeps their original index labels.
data = pd.Series(data=[1, np.nan, 3.5, np.nan, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [225]:
data.loc[data.notna()]  # Boolean-mask selection: same result as data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [226]:
# 4x3 frame: row 0 is complete, rows 1 and 3 are partially missing,
# and row 2 is entirely NaN.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3.0],
    ]
)
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [227]:
data.dropna(axis="index", how="any")  # Drop every row that contains at least one NaN

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [228]:
data.dropna(axis="index", how="all")  # Drop only the rows in which every value is NaN

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [229]:
# Add a column labelled 4 that is entirely NaN (this mutates `data` in place).
data.loc[:, 4] = np.nan
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [230]:
data.dropna(axis=1, how="all")  # Drop only the columns in which every value is NaN

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [231]:
# 7x3 frame of standard-normal draws.
# NOTE: the global NumPy RNG is not seeded here, so the values differ between runs.
df = pd.DataFrame(data=np.random.standard_normal(size=(7, 3)))
df

Unnamed: 0,0,1,2
0,-0.016797,-0.517389,0.615454
1,0.89777,0.664381,-1.004279
2,-0.808611,0.013866,0.059678
3,1.420469,0.605489,0.417605
4,-0.302169,0.011447,-2.319039
5,1.535266,-0.243608,-0.140582
6,0.37279,0.83538,0.218853


In [232]:
# Blank out parts of the frame in place: the first two rows of column 2 and
# the first four rows of column 1 (the two assignments are independent).
df.iloc[0:2, 2] = np.nan
df.iloc[0:4, 1] = np.nan
df

Unnamed: 0,0,1,2
0,-0.016797,,
1,0.89777,,
2,-0.808611,,0.059678
3,1.420469,,0.417605
4,-0.302169,0.011447,-2.319039
5,1.535266,-0.243608,-0.140582
6,0.37279,0.83538,0.218853


In [233]:
df.dropna(how="any")  # Keep only the rows without any NaN

Unnamed: 0,0,1,2
4,-0.302169,0.011447,-2.319039
5,1.535266,-0.243608,-0.140582
6,0.37279,0.83538,0.218853


In [234]:
df.dropna(axis="index", thresh=2)  # A row survives only if it has at least 2 non-NaN values

Unnamed: 0,0,1,2
2,-0.808611,,0.059678
3,1.420469,,0.417605
4,-0.302169,0.011447,-2.319039
5,1.535266,-0.243608,-0.140582
6,0.37279,0.83538,0.218853


### Filling In Missing Data

In [235]:
df.fillna(value=0)  # Replace every NaN with 0 (returns a new frame)

Unnamed: 0,0,1,2
0,-0.016797,0.0,0.0
1,0.89777,0.0,0.0
2,-0.808611,0.0,0.059678
3,1.420469,0.0,0.417605
4,-0.302169,0.011447,-2.319039
5,1.535266,-0.243608,-0.140582
6,0.37279,0.83538,0.218853


In [236]:
# Per-column fill values: NaN in column 1 becomes 0.5, NaN in column 2 becomes 0.
df.fillna(value={1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-0.016797,0.5,0.0
1,0.89777,0.5,0.0
2,-0.808611,0.5,0.059678
3,1.420469,0.5,0.417605
4,-0.302169,0.011447,-2.319039
5,1.535266,-0.243608,-0.140582
6,0.37279,0.83538,0.218853


In [237]:
# Rebinds df to a fresh 6x3 frame of standard-normal draws (unseeded, so the
# values differ between runs), then blanks the tail of columns 1 and 2 in place.
df = pd.DataFrame(data=np.random.standard_normal(size=(6, 3)))
df.iloc[4:, 2] = np.nan  # last two rows of column 2
df.iloc[2:, 1] = np.nan  # last four rows of column 1
df

Unnamed: 0,0,1,2
0,-0.506779,-0.771091,0.746665
1,-1.332065,0.033029,0.495725
2,-1.608497,,0.608341
3,-0.85072,,-0.452997
4,-0.488704,,
5,0.425391,,


In [238]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated and triggers a FutureWarning.
df.ffill()


Unnamed: 0,0,1,2
0,-0.506779,-0.771091,0.746665
1,-1.332065,0.033029,0.495725
2,-1.608497,0.033029,0.608341
3,-0.85072,0.033029,-0.452997
4,-0.488704,0.033029,-0.452997
5,0.425391,0.033029,-0.452997


In [239]:
# Forward fill, but propagate each valid value across at most 2 consecutive NaNs;
# any further NaNs in the run stay missing.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated and triggers a FutureWarning.
df.ffill(limit=2)


Unnamed: 0,0,1,2
0,-0.506779,-0.771091,0.746665
1,-1.332065,0.033029,0.495725
2,-1.608497,0.033029,0.608341
3,-0.85072,0.033029,-0.452997
4,-0.488704,,-0.452997
5,0.425391,,-0.452997


In [240]:
# Rebinds data to a float Series with NaN at positions 1 and 3.
data = pd.Series(data=[1.0, np.nan, 3.5, np.nan, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [241]:
data.fillna(value=data.mean())  # Impute each NaN with the mean of the non-missing values

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## 7.2 Data Transformation

### Removing Duplicates

In [242]:
# Small frame whose last row (index 6) repeats the row before it.
data = pd.DataFrame(
    {
        "k1": ["one", "two", "one", "two", "one", "two", "two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [243]:
data.duplicated(keep="first")  # True for a row that repeats an earlier row exactly

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [244]:
data.drop_duplicates(keep="first")  # Keep the first occurrence of each distinct row

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [245]:
# drop_duplicates() above returned a new DataFrame; `data` itself still holds all 7 rows.
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [247]:
# Add a column v1 holding 0..6, one value per row (this mutates `data` in place).
data["v1"] = range(0, 7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [248]:
# Compare rows on column k1 only; the first row seen for each k1 value is kept.
data.drop_duplicates(subset=["k1"], keep="first")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [249]:
# Compare rows on (k1, k2); keep="last" retains the final occurrence of each
# duplicated pair, so row 6 is kept and row 5 is dropped.
data.drop_duplicates(subset=["k1", "k2"], keep="last")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6
