# Chapter 7

# 7  Data Cleaning and Preparation

# 7.1 Handling Missing Data


In [4]:
import pandas as pd
import numpy as np

In [5]:
# For data with float64 dtype, pandas uses the floating-point value NaN (Not a Number) to represent missing data.
# We call this a sentinel value: when present, it indicates a missing (or null) value:

float_data = pd.Series([1.2, -3.5, np.nan, 0])
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [6]:
# The isna method gives us a Boolean Series with True where values are null:
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

#### Missing data is referred to as NA, which stands for *not available*. In statistics applications, NA data may either be data that does not exist or data that exists but was not observed.

#### When cleaning up data for analysis, it is often important to do analysis on the missing data itself to identify data collection problems or potential biases in the data caused by missing data.

In [7]:
# The built-in Python None value is also treated as NA:
string_data = pd.Series(["aardvark", np.nan, None, "avocado"])
string_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object

In [8]:
# Both np.nan and None are reported as missing:
string_data.isna()

0    False
1     True
2     True
3    False
dtype: bool

In [9]:
# With an explicit float64 dtype, None is stored as NaN:
float_data = pd.Series([1, 2, None], dtype='float64')
float_data

0    1.0
1    2.0
2    NaN
dtype: float64

In [10]:
float_data.isna()

0    False
1    False
2     True
dtype: bool

In [11]:
# Table 7.1: NA handling object methods
#
# | Method | Description |
# |--------|-------------|
# | dropna | Filter axis labels based on whether values for each label have missing data, with varying thresholds for how much missing data to tolerate. |
# | fillna | Fill in missing data with some value or using an interpolation method such as "ffill" or "bfill". |
# | isna   | Return Boolean values indicating which values are missing/NA. |
# | notna  | Negation of isna; returns True for non-NA values and False for NA values. |

## Filtering Out Missing Data

In [13]:
# There are a few ways to filter out missing data. While you always have the option to do it by hand using pandas.isna and Boolean indexing, dropna can be helpful.

data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [14]:
# This is the same thing as doing:
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [15]:
# A DataFrame offers several ways to remove missing data: rows or columns can be
# dropped when they are entirely NA, or as soon as they contain any NA at all.
# By default, dropna drops every row that contains a missing value.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3.0],
    ]
)
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [16]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [17]:
# Passing how="all" will drop only rows that are all NA:

data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [18]:
# To drop columns in the same way, pass axis="columns":

data[4] = np.nan
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [19]:
data.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [20]:
# Suppose you want to keep only rows that hold at least a certain number of
# non-missing observations. You can indicate this with the thresh argument of
# dropna: thresh=N keeps a row when it has N or more non-NA values.

# Seed NumPy's global generator so that this draw (and later np.random draws,
# when the notebook is run top to bottom) is reproducible.
np.random.seed(12345)

df = pd.DataFrame(np.random.standard_normal((7, 3)))
df.iloc[:4, 1] = np.nan  # first four rows of column 1 become missing

df.iloc[:2, 2] = np.nan  # first two rows of column 2 become missing
df

Unnamed: 0,0,1,2
0,0.282163,,
1,-0.60983,,
2,-1.248253,,-0.907155
3,-0.842707,,-0.089524
4,-0.09814,0.53387,0.18518
5,0.527915,0.92297,-0.054683
6,0.735333,1.894119,-0.226998


In [21]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.09814,0.53387,0.18518
5,0.527915,0.92297,-0.054683
6,0.735333,1.894119,-0.226998


In [22]:
# thresh=2 keeps only the rows that hold at least two non-NA values:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-1.248253,,-0.907155
3,-0.842707,,-0.089524
4,-0.09814,0.53387,0.18518
5,0.527915,0.92297,-0.054683
6,0.735333,1.894119,-0.226998


## Filling In Missing Data


#### Rather than filtering out missing data (and potentially discarding other data along with it), you may want to fill in the “holes” in any number of ways. For most purposes, the fillna method is the workhorse function to use. Calling fillna with a constant replaces missing values with that value:



In [23]:
df.fillna(0)


Unnamed: 0,0,1,2
0,0.282163,0.0,0.0
1,-0.60983,0.0,0.0
2,-1.248253,0.0,-0.907155
3,-0.842707,0.0,-0.089524
4,-0.09814,0.53387,0.18518
5,0.527915,0.92297,-0.054683
6,0.735333,1.894119,-0.226998


In [24]:
# Calling fillna with a dictionary, you can use a different fill value for each column:

df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.282163,0.5,0.0
1,-0.60983,0.5,0.0
2,-1.248253,0.5,-0.907155
3,-0.842707,0.5,-0.089524
4,-0.09814,0.53387,0.18518
5,0.527915,0.92297,-0.054683
6,0.735333,1.894119,-0.226998


In [25]:
# Start from a 6x3 block of standard-normal draws and blank out the tail of
# columns 1 and 2, leaving gaps to be filled.
values = np.random.standard_normal(size=(6, 3))
values[2:, 1] = np.nan  # column 1: missing from row 2 onward
values[4:, 2] = np.nan  # column 2: missing from row 4 onward
df = pd.DataFrame(values)
df

Unnamed: 0,0,1,2
0,-1.909085,0.925645,0.157486
1,0.74774,-0.557054,1.370068
2,1.643537,,-1.309276
3,1.166355,,1.034455
4,-0.577645,,
5,-1.244918,,


In [26]:
# Forward-fill: carry the last valid observation down each column.
# DataFrame.ffill() replaces fillna(method="ffill"); the `method` keyword of
# fillna is deprecated as of pandas 2.1.
df.ffill()

Unnamed: 0,0,1,2
0,-1.909085,0.925645,0.157486
1,0.74774,-0.557054,1.370068
2,1.643537,-0.557054,-1.309276
3,1.166355,-0.557054,1.034455
4,-0.577645,-0.557054,1.034455
5,-1.244918,-0.557054,1.034455


In [27]:
# Forward-fill, but fill at most two consecutive missing values per gap.
# DataFrame.ffill(limit=...) replaces fillna(method="ffill", limit=...); the
# `method` keyword of fillna is deprecated as of pandas 2.1.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-1.909085,0.925645,0.157486
1,0.74774,-0.557054,1.370068
2,1.643537,-0.557054,-1.309276
3,1.166355,-0.557054,1.034455
4,-0.577645,,1.034455
5,-1.244918,,1.034455


In [28]:
# With fillna you can do lots of other things such as simple data imputation using the median or mean statistics:

data = pd.Series([1., np.nan, 3.5, np.nan, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [29]:
# Table 7.2: fillna function arguments
#
# | Argument | Description |
# |----------|-------------|
# | value    | Scalar value or dictionary-like object to use to fill missing values. |
# | method   | Interpolation method: one of "bfill" (backward fill) or "ffill" (forward fill); default is None. Deprecated since pandas 2.1 in favor of the ffill() and bfill() methods. |
# | axis     | Axis to fill on ("index" or "columns"); default is axis="index". |
# | limit    | For forward and backward filling, maximum number of consecutive periods to fill. |

# 7.2 Data Transformation

## Removing Duplicates

In [30]:
# Two key columns; the last row (index 6) repeats the ("two", 4) pair of row 5.
data = pd.DataFrame(
    {
        "k1": ["one", "two", "one", "two", "one", "two", "two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [31]:
# The DataFrame method duplicated returns a Boolean Series indicating whether each row is a duplicate (its column values are exactly equal to those in an earlier row) or not:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [32]:
# Relatedly, drop_duplicates returns a DataFrame that keeps only the rows for which
# duplicated() is False (that is, the duplicate rows are filtered out):

data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [33]:
# Both methods by default consider all of the columns; alternatively, you can specify any subset of them to detect duplicates. 

data["v1"] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [34]:
data.drop_duplicates(subset=["k1"])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [35]:
# duplicated and drop_duplicates by default keep the first observed value combination. Passing keep="last" will return the last one:
data.drop_duplicates(["k1", "k2"], keep="last")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


## Transforming Data Using a Function or Mapping

In [36]:
# One row per food item, together with its weight in ounces.
data = pd.DataFrame(
    {
        "food": [
            "bacon",
            "pulled pork",
            "bacon",
            "pastrami",
            "corned beef",
            "bacon",
            "pastrami",
            "honey ham",
            "nova lox",
        ],
        "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [37]:
# Lookup table from each kind of meat in the "food" column to the animal it comes from.
meat_to_animal = {
  "bacon": "pig",
  "pulled pork": "pig",
  "pastrami": "cow",
  "corned beef": "cow",
  "honey ham": "pig",
  "nova lox": "salmon"
}

In [38]:
# The map method on a Series accepts a dictionary-like mapping: each food is looked up
# in meat_to_animal and the results are stored in a new "animal" column.
data["animal"] = data["food"].map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [40]:
def get_animal(x):
    """Return the animal for food ``x`` by looking it up in ``meat_to_animal``."""
    animal = meat_to_animal[x]
    return animal

data["food"].map(get_animal)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

## Replacing Values

#### Filling in missing data with the fillna method is a special case of more general value replacement. As you've already seen, map can be used to modify a subset of values in an object, but replace provides a simpler and more flexible way to do so. Let’s consider this Series:

In [41]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [42]:
# To replace these with NA values that pandas understands, we can use replace, producing a new Series:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [43]:
# If you want to replace multiple values at once, you instead pass a list and then the substitute value:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [44]:
# To use a different replacement for each value, pass a list of substitutes (matched by position):
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [45]:
# The argument passed can also be a dictionary:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

## Renaming Axis Indexes

#### Like values in a Series, axis labels can be similarly transformed by a function or mapping of some form to produce new, differently labeled objects.

In [46]:
# 3x4 frame holding the integers 0 through 11, with string labels on both axes.
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=["Ohio", "Colorado", "New York"],
    columns=["one", "two", "three", "four"],
)

In [48]:
# Like a Series, the axis indexes have a map method:
def transform(x):
    """Return the first four characters of ``x``, upper-cased."""
    prefix = x[:4]
    return prefix.upper()

data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [49]:
# You can assign to the index attribute, modifying the DataFrame in place:
data.index = data.index.map(transform)

data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [50]:
# If you want to create a transformed version of a dataset without modifying the original, a useful method is rename:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [51]:
# Notably, rename can be used in conjunction with a dictionary-like object, providing new values for a subset of the axis labels:
data.rename(index={"OHIO": "INDIANA"},
            columns={"three": "peekaboo"})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


#### rename saves you from the chore of copying the DataFrame manually and assigning new values to its index and columns attributes.

## Discretization and Binning


In [52]:
# Continuous data is often discretized or otherwise separated into “bins” for analysis. 

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

# Let’s divide these into bins of 18 to 25, 26 to 35, 36 to 60, and finally 61 and older. To do so, you have to use pandas.cut:
bins = [18, 25, 35, 60, 100]

age_categories = pd.cut(ages, bins)
age_categories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [53]:
# codes gives, for each age, the integer position of its bin within age_categories.categories:
age_categories.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [54]:
age_categories.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [55]:
age_categories.categories[0]

Interval(18, 25, closed='right')

In [57]:
# Bin counts for the result of pandas.cut.
# The top-level pd.value_counts function is deprecated as of pandas 2.1; the
# replacement is to wrap the values in a Series and call its value_counts method.
pd.Series(age_categories).value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
Name: count, dtype: int64

In [59]:
# In the string representation of an interval, a parenthesis means that the side is open (exclusive), while the square bracket means it is closed (inclusive). You can change which side is closed by passing right=False:
pd.cut(ages, bins, right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [60]:
# You can override the default interval-based bin labeling by passing a list or array to the labels option:
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [61]:
# If you pass an integer number of bins to pandas.cut instead of explicit bin edges, it will compute equal-length bins based on the minimum and maximum values in the data. 
data = np.random.uniform(size=20)
pd.cut(data, 4, precision=2)

[(0.26, 0.5], (0.74, 0.97], (0.5, 0.74], (0.5, 0.74], (0.026, 0.26], ..., (0.74, 0.97], (0.026, 0.26], (0.74, 0.97], (0.026, 0.26], (0.026, 0.26]]
Length: 20
Categories (4, interval[float64, right]): [(0.026, 0.26] < (0.26, 0.5] < (0.5, 0.74] < (0.74, 0.97]]

In [62]:
# The precision=2 option limits the decimal precision to two digits.
# A closely related function, pandas.qcut, bins the data based on sample quantiles. 
# Depending on the distribution of the data, using pandas.cut will not usually result in each bin having the same number of data points.
# Since pandas.qcut uses sample quantiles instead, you will obtain roughly equally sized bins:

In [64]:
data = np.random.standard_normal(1000)
quartiles = pd.qcut(data, 4, precision=2)
quartiles

[(0.67, 2.96], (0.67, 2.96], (0.032, 0.67], (0.032, 0.67], (-0.71, 0.032], ..., (-3.4, -0.71], (0.032, 0.67], (0.67, 2.96], (-3.4, -0.71], (0.032, 0.67]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.4, -0.71] < (-0.71, 0.032] < (0.032, 0.67] < (0.67, 2.96]]

In [65]:
# Count the observations that fall in each quartile bin.
# pd.Series(...).value_counts() replaces the top-level pd.value_counts function,
# which is deprecated as of pandas 2.1.
pd.Series(quartiles).value_counts()

(-3.4, -0.71]     250
(-0.71, 0.032]    250
(0.032, 0.67]     250
(0.67, 2.96]      250
Name: count, dtype: int64

In [66]:
# Similar to pandas.cut, you can pass your own quantiles (numbers between 0 and 1, inclusive):
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()

(-3.389, -1.296]    100
(-1.296, 0.0322]    400
(0.0322, 1.323]     400
(1.323, 2.957]      100
Name: count, dtype: int64

## Detecting and Filtering Outliers

In [67]:
data = pd.DataFrame(np.random.standard_normal((1000, 4)))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.008357,0.029276,0.014986,0.025019
std,0.993408,0.994509,1.012601,1.024018
min,-3.127875,-3.780591,-3.902669,-3.283597
25%,-0.672042,-0.64999,-0.67273,-0.674831
50%,0.026899,0.04045,0.029805,0.02562
75%,0.653313,0.724943,0.698471,0.7186
max,2.926928,3.092028,3.214792,3.052971


In [68]:
# Suppose you wanted to find values in one of the columns exceeding 3 in absolute value:
col = data[2]
col[col.abs() > 3]

360   -3.156996
439    3.144573
452   -3.385603
755   -3.902669
895    3.214792
Name: 2, dtype: float64

In [70]:
# To select all rows having a value exceeding 3 or –3, you can use the any method on a Boolean DataFrame:
# The parentheses around data.abs() > 3 are necessary in order to call the any method on the result of the comparison operation.
data[(data.abs() > 3).any(axis="columns")]

Unnamed: 0,0,1,2,3
101,0.049011,-3.780591,0.330536,1.474069
251,0.529903,-1.487495,-0.24533,3.052971
308,-0.808706,-3.061037,-0.822211,-0.881166
327,-3.127875,0.373101,0.032819,-0.296991
339,-1.234305,-0.559531,-0.152346,-3.266303
360,-1.227633,0.830631,-3.156996,-0.48606
395,0.048372,-0.368822,0.976508,-3.283597
439,-0.588304,-0.342834,3.144573,-1.096566
452,-0.553643,-1.300752,-3.385603,0.191461
755,-0.394793,0.482577,-3.902669,0.503071


In [71]:
# Cap values outside the interval -3 to 3: wherever the absolute value exceeds 3,
# assign sign(value) * 3, i.e. -3 or 3. This modifies data in place.
data[data.abs() > 3] = np.sign(data) * 3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.008186,0.030026,0.016072,0.025516
std,0.992884,0.991349,1.006693,1.022161
min,-3.0,-3.0,-3.0,-3.0
25%,-0.672042,-0.64999,-0.67273,-0.674831
50%,0.026899,0.04045,0.029805,0.02562
75%,0.653313,0.724943,0.698471,0.7186
max,2.926928,3.0,3.0,3.0


In [72]:
# The statement np.sign(data) produces 1 and –1 values based on whether the values in data are positive or negative:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,1.0,1.0,-1.0
1,-1.0,1.0,-1.0,-1.0
2,1.0,-1.0,-1.0,1.0
3,1.0,-1.0,-1.0,-1.0
4,-1.0,1.0,1.0,1.0
