# Data Cleaning and Preparation

Discretization and Binning

In [1]:
import pandas as pd

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

#Divide the ages into bins of 18 to 25, 26 to 35, 36 to 60, and 61 and older

bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [2]:
# The object pandas returns is a special categorical object.
# The output describes the bins computed by pandas.cut
# The output is treated like an array of strings indicating the bin name
# It contains a categories array specifying the distinct category names 
#along with a labeling of the ages data in the codes attribute

In [3]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [4]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [5]:
pd.value_counts(cats)

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [7]:
#Make the right bracket open
pd.cut(ages, bins, right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [8]:
#Give names to the categories
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [9]:
cats2 = pd.cut(ages, bins, labels=group_names)
pd.value_counts(cats2)

Youth         5
YoungAdult    3
MiddleAged    3
Senior        1
dtype: int64

In [10]:
#Passing integer number of bins
import numpy as np
data = np.random.rand(20)
data

array([0.02306707, 0.68860253, 0.48178742, 0.40989692, 0.38055007,
       0.27918344, 0.775431  , 0.57715458, 0.37087947, 0.73639779,
       0.89841292, 0.93330422, 0.19061164, 0.67085412, 0.37742531,
       0.58381466, 0.45116328, 0.74268543, 0.40412361, 0.26718128])

In [11]:
cats = pd.cut(data, 4, precision=2)
cats

[(0.022, 0.25], (0.48, 0.71], (0.48, 0.71], (0.25, 0.48], (0.25, 0.48], ..., (0.48, 0.71], (0.25, 0.48], (0.71, 0.93], (0.25, 0.48], (0.25, 0.48]]
Length: 20
Categories (4, interval[float64, right]): [(0.022, 0.25] < (0.25, 0.48] < (0.48, 0.71] < (0.71, 0.93]]

In [13]:
pd.value_counts(cats)

(0.25, 0.48]     8
(0.48, 0.71]     5
(0.71, 0.93]     5
(0.022, 0.25]    2
dtype: int64

In [14]:
#qcut bins the data based on sample quantiles, depending on the 
# distribution of the data.
# Each bin will have the same number of data points

In [15]:
data = np.random.rand(1000)
cats = pd.qcut(data,4)
cats

[(0.503, 0.752], (-1.500000000000004e-05, 0.234], (0.503, 0.752], (-1.500000000000004e-05, 0.234], (0.503, 0.752], ..., (0.752, 0.999], (0.503, 0.752], (0.752, 0.999], (0.503, 0.752], (0.234, 0.503]]
Length: 1000
Categories (4, interval[float64, right]): [(-1.500000000000004e-05, 0.234] < (0.234, 0.503] < (0.503, 0.752] < (0.752, 0.999]]

In [16]:
pd.value_counts(cats)

(-1.500000000000004e-05, 0.234]    250
(0.234, 0.503]                     250
(0.503, 0.752]                     250
(0.752, 0.999]                     250
dtype: int64

In [17]:
# You can pass your own quantiles
cats = pd.qcut(data, [0, 0.1, 0.5, 0.9, 1])

In [18]:
pd.value_counts(cats)

(0.0953, 0.503]                     400
(0.503, 0.897]                      400
(-1.500000000000004e-05, 0.0953]    100
(0.897, 0.999]                      100
dtype: int64

Detecting and Filtering Outliers

In [19]:
#Applys array operations
data = pd.DataFrame(np.random.randn(1000,4))
data

Unnamed: 0,0,1,2,3
0,1.363533,-0.088125,0.427747,1.124903
1,1.036658,1.866430,0.603163,-1.696285
2,1.421525,1.177118,0.049772,0.534240
3,0.043681,0.772190,0.759137,-0.184528
4,-2.028702,-1.270808,-0.774426,-0.764986
...,...,...,...,...
995,0.528049,1.099670,-0.753113,0.216052
996,0.636859,1.881588,0.008629,-0.740122
997,0.059114,-0.488941,0.138166,0.221155
998,-1.177525,0.576378,-0.167296,-0.581174


In [20]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.049574,-0.003423,-0.008027,0.0436
std,0.989682,1.024655,0.997067,0.990431
min,-3.564961,-3.225127,-3.119047,-3.402672
25%,-0.730819,-0.694318,-0.644127,-0.627896
50%,-0.043068,-0.026163,-0.032918,0.078573
75%,0.59899,0.64745,0.650682,0.721862
max,3.444798,3.026835,2.97842,3.541351


In [21]:
#Find values of one of the columns exceeding 3 in absolute value
col = data[2]
col[np.abs(col) > 3]

17   -3.119047
Name: 2, dtype: float64

In [26]:
# Select all the rows having a value exceeding 3 or -3
data[(np.abs(data)>3).any(1)]

Unnamed: 0,0,1,2,3
17,1.081151,0.045159,-3.119047,0.682757
151,-3.564961,0.39496,1.929402,0.258015
183,3.444798,0.242508,-0.821906,-0.639925
516,-1.126131,0.687878,1.305587,3.541351
540,0.324097,0.5759,1.359419,-3.139057
603,-0.279768,1.498356,1.811188,3.494139
691,0.998954,-0.461748,0.101801,3.294871
770,0.809136,-3.225127,-1.012992,1.187265
776,-1.434486,1.055077,0.125609,-3.402672
912,0.176124,3.026835,1.283175,-0.968402


In [27]:
#Any checkes for each row if there is at 
#least one 'True' value across its column 
# 1 determine the axis

In [28]:
# Produces 1 and -1 values based on whether the values in data
# are positive or negative
np.sign(data)

Unnamed: 0,0,1,2,3
0,1.0,-1.0,1.0,1.0
1,1.0,1.0,1.0,-1.0
2,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,-1.0
4,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...
995,1.0,1.0,-1.0,1.0
996,1.0,1.0,1.0,-1.0
997,1.0,-1.0,1.0,1.0
998,-1.0,1.0,-1.0,-1.0


In [29]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,-1.0,1.0,1.0
1,1.0,1.0,1.0,-1.0
2,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,-1.0
4,-1.0,-1.0,-1.0,-1.0


In [30]:
data.head()

Unnamed: 0,0,1,2,3
0,1.363533,-0.088125,0.427747,1.124903
1,1.036658,1.86643,0.603163,-1.696285
2,1.421525,1.177118,0.049772,0.53424
3,0.043681,0.77219,0.759137,-0.184528
4,-2.028702,-1.270808,-0.774426,-0.764986
