In [1]:
# Removing Duplicates
import pandas as pd
import numpy as np

data=pd.DataFrame({
    'k1':['one']*3+['two']*4,
    'k2':[1,1,2,3,3,4,4]
})

data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [2]:
# The DataFrame method duplicated returns a boolean Series indicating whether each
#  row is a duplicate or not

data.duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [4]:
# The DataFrame method duplicated returns a boolean Series indicating whether each
#  row is a duplicate or not:

data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [7]:
#  Both of these methods by default consider all of the columns; alternatively you can
#  specify any subset of them to detect duplicates. Suppose we had an additional column
#  of values and wanted to filter duplicates only based on the 'k1' column

data['v1']=range(7)
print(data)
data.drop_duplicates(['k1'])

    k1  k2  v1
0  one   1   0
1  one   1   1
2  one   2   2
3  two   3   3
4  two   3   4
5  two   4   5
6  two   4   6


Unnamed: 0,k1,k2,v1
0,one,1,0
3,two,3,3


In [10]:
#  duplicated and drop_duplicates by default keep the first observed value combination.
#  Passing take_last=True will return the last one

data.drop_duplicates(['k1', 'k2'], take_last=True) #Not Working

TypeError: drop_duplicates() got an unexpected keyword argument 'take_last'

In [12]:
# Transforming Data Using a Function or Mapping

data=pd.DataFrame({
    'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
            'corned beef', 'Bacon', 'pastrami', 'honey ham','nova lox'],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]}
)
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [16]:
# Suppose you wanted to add a column indicating the type of animal that each food came
#  from. Let’s write down a mapping of each distinct meat type to the kind of animal
meat_to_animal={
    'bacon':'pig',
    'pulled pork':'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}

In [20]:
# The map method on a Series accepts a function or dict-like object containing a mapping,
#  but here we have a small problem in that some of the meats above are capitalized and
#  others are not. Thus, we also need to convert each value to lower case

data['animals']=data['food'].map(str.lower).map(meat_to_animal)
data

Unnamed: 0,food,ounces,animals
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [21]:
# We could also have passed a function that does all the work
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [None]:
# Using map is a convenient way to perform element-wise transformations and other data
#  cleaning-related operations.

In [1]:
# Replacing Values

#  Filling in missing data with the fillna method can be thought of as a special case of
#  more general value replacement. While map, as you’ve seen above, can be used to modify
#  a subset of values in an object, replace provides a simpler and more flexible way to do
#  so. Let’s consider this Series

import pandas as pd
import numpy as np

data=pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [2]:
data.replace(-999.,np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [3]:
#  If you want to replace multiple values at once, you instead pass a list then the substitute
#  value
data.replace([-999.,-1000.],np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [4]:
#  To use a different replacement for each value, pass a list of substitutes
data.replace([-999.,-1000.],[np.nan,0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [7]:
# The argument passed can also be a dict
data.replace({-999.:2,-1000.:1})

0    1.0
1    2.0
2    2.0
3    2.0
4    1.0
5    3.0
dtype: float64

In [8]:
#  Renaming Axis Indexes
data=pd.DataFrame(np.arange(12).reshape(3,4),index=['Ohio','Colorado','New York'],columns=['one','two','three','four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [10]:
# Like a Series, the axis indexes have a map method
data.index.map(str.upper)

Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')

In [11]:
data.index=data.index.map(str.upper)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [12]:
#  If you want to create a transformed version of a data set without modifying the original,
#  a useful method is rename
data.rename(index=str.title,columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [15]:
# Notably, rename can be used in conjunction with a dict-like object providing new values
#  for a subset of the axis labels
data.rename(index={'OHIO': 'INDIANA'},
           columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [19]:
#  rename saves having to copy the DataFrame manually and assign to its index and col
#  umns attributes. Should you wish to modify a data set in place, pass inplace=True
data.rename(index={'OHIO': 'INDIANA'},
           columns={'three': 'peekaboo'},inplace=True)
data

          one  two  peekaboo  four
INDIANA     0    1         2     3
COLORADO    4    5         6     7
NEW YORK    8    9        10    11


Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [1]:
# Discretization and Binning`
import pandas as pd
import numpy as np
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [2]:
#  Let’s divide these into bins of 18 to 25, 26 to 35, 35 to 60, and finally 60 and older. To
#  do so, you have to use cut, a function in pandas

bins=[18,25,35,60,100]
cats=pd.cut(ages,bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [3]:
# The object pandas returns is a special Categorical object. You can treat it like an array
#  of strings indicating the bin name; internally it contains a levels array indicating the
#  distinct category names along with a labeling for the ages data in the labels attribute

# print(cats.labels) #Not Working
# print(cats.levels) #Not Working

pd.value_counts(cats)

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [4]:
# Consistent with mathematical notation for intervals, a parenthesis means that the side
#  is open while the square bracket means it is closed (inclusive). Which side is closed can
#  be changed by passing right=False

pd.cut(ages,[18,26,36,61,100],right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64, left]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [5]:
#  You can also pass your own bin names by passing a list or array to the labels option
group_names=['Youth','YoungAdult','MiddleAged','Senior']

pd.cut(ages,bins,labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [6]:
#  If you pass cut a integer number of bins instead of explicit bin edges, it will compute
#  equal-length bins based on the minimum and maximum values in the data. Consider
#  the case of some uniformly distributed data chopped into fourths

data=np.random.rand(20)
pd.cut(data,4,precision=2)

[(0.5, 0.73], (0.5, 0.73], (0.73, 0.96], (0.5, 0.73], (0.27, 0.5], ..., (0.5, 0.73], (0.031, 0.27], (0.031, 0.27], (0.5, 0.73], (0.73, 0.96]]
Length: 20
Categories (4, interval[float64, right]): [(0.031, 0.27] < (0.27, 0.5] < (0.5, 0.73] < (0.73, 0.96]]

In [7]:
#  A closely related function, qcut, bins the data based on sample quantiles. Depending
#  on the distribution of the data, using cut will not usually result in each bin having the
#  same number of data points. Since qcut uses sample quantiles instead, by definition
#  you will obtain roughly equal-size bins

data=np.random.randn(1000)
cats=pd.qcut(data,4)
print(cats)
pd.value_counts(cats)

[(0.691, 4.061], (-0.052, 0.691], (0.691, 4.061], (0.691, 4.061], (-0.749, -0.052], ..., (-0.052, 0.691], (-0.749, -0.052], (-0.749, -0.052], (-0.052, 0.691], (-3.685, -0.749]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.685, -0.749] < (-0.749, -0.052] < (-0.052, 0.691] < (0.691, 4.061]]


(-3.685, -0.749]    250
(-0.749, -0.052]    250
(-0.052, 0.691]     250
(0.691, 4.061]      250
dtype: int64

In [8]:
# Similar to cut you can pass your own quantiles (numbers between 0 and 1, inclusive)
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

# We’ll return to cut and qcut later in the chapter on aggregation and group operations,
#  as these discretization functions are especially useful for quantile and group analysis

[(1.327, 4.061], (-0.052, 1.327], (-0.052, 1.327], (1.327, 4.061], (-1.368, -0.052], ..., (-0.052, 1.327], (-1.368, -0.052], (-1.368, -0.052], (-0.052, 1.327], (-3.685, -1.368]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.685, -1.368] < (-1.368, -0.052] < (-0.052, 1.327] < (1.327, 4.061]]

In [18]:
# Detecting and Filtering Outliers
#Filtering or transforming outliers is largely a matter of applying array operations.

np.random.seed(12345)
data=pd.DataFrame(np.random.randn(1000,4))
print(data)
data.describe()

            0         1         2         3
0   -0.204708  0.478943 -0.519439 -0.555730
1    1.965781  1.393406  0.092908  0.281746
2    0.769023  1.246435  1.007189 -1.296221
3    0.274992  0.228913  1.352917  0.886429
4   -2.001637 -0.371843  1.669025 -0.438570
..        ...       ...       ...       ...
995  1.089085  0.251232 -1.451985  1.653126
996 -0.478509 -0.010663 -1.060881 -1.502870
997 -1.946267  1.013592  0.037333  0.133304
998 -1.293122 -0.322542 -0.782960 -0.303340
999  0.089987  0.292291  1.177706  0.882755

[1000 rows x 4 columns]


Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067684,0.067924,0.025598,-0.002298
std,0.998035,0.992106,1.006835,0.996794
min,-3.428254,-3.548824,-3.184377,-3.745356
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.366626,2.653656,3.260383,3.927528


In [22]:
# Suppose you wanted to find values in one of the columns exceeding three in magnitude
col=data[3]
col[np.abs(col)>3]

97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64

In [29]:
# To select all rows having a value exceeding 3 or -3, you can use the any method on a
#  boolean DataFrame
data[(np.abs(data)>3).any(1)]

Unnamed: 0,0,1,2,3
5,-0.539741,0.476985,3.248944,-1.021228
97,-0.774363,0.552936,0.106061,3.927528
102,-0.655054,-0.56523,3.176873,0.959533
305,-2.315555,0.457246,-0.025907,-3.399312
324,0.050188,1.951312,3.260383,0.963301
400,0.146326,0.508391,-0.196713,-3.745356
499,-0.293333,-0.242459,-3.05699,1.918403
523,-3.428254,-0.296336,-0.439938,-0.867165
586,0.275144,1.179227,-3.184377,1.369891
808,-0.362528,-3.548824,1.553205,-2.186301


In [31]:
# Values can just as easily be set based on these criteria. Here is code to cap values outside
#  the interval -3 to 3

data[np.abs(data)>3]=np.sign(data)*3
data.describe()

#  The ufunc np.sign returns an array of 1 and -1 depending on the sign of the values

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067623,0.068473,0.025153,-0.002081
std,0.995485,0.990253,1.003977,0.989736
min,-3.0,-3.0,-3.0,-3.0
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.0,2.653656,3.0,3.0


In [37]:
# Permutation and Random Sampling

#  Permuting (randomly reordering) a Series or the rows in a DataFrame is easy to do using
#  the numpy.random.permutation function. Calling permutation with the length of the axis
#  you want to permute produces an array of integers indicating the new ordering
df=pd.DataFrame(np.arange(5*4).reshape(5,4))
print(df)

sampler=np.random.permutation(5)
sampler

    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19


array([1, 0, 4, 3, 2])

In [38]:
# That array can then be used in ix-based indexing or the take function
df.take(sampler)

Unnamed: 0,0,1,2,3
1,4,5,6,7
0,0,1,2,3
4,16,17,18,19
3,12,13,14,15
2,8,9,10,11


In [42]:
# To select a random subset without replacement, one way is to slice off the first k ele
# ments of the array returned by permutation, where k is the desired subset size. There
#  are much more efficient sampling-without-replacement algorithms, but this is an easy
#  strategy that uses readily available tools
df.take(np.random.permutation(len(df))[:3])

Unnamed: 0,0,1,2,3
0,0,1,2,3
2,8,9,10,11
1,4,5,6,7


In [46]:
#  To generate a sample with replacement, the fastest way is to use np.random.randint to
#  draw random integers

bag=np.array([5,7,-1,6,4])

sampler=np.random.randint(0,len(bag),size=10)
print(sampler)

draws=bag.take(sampler)
draws

[2 2 2 2 4 3 1 0 2 1]


array([-1, -1, -1, -1,  4,  6,  7,  5, -1,  7])

In [3]:
#  Computing Indicator/Dummy Variables

# Another type of transformation for statistical modeling or machine learning applica
# tions is converting a categorical variable into a “dummy” or “indicator” matrix. If a
#  column in a DataFrame has k distinct values, you would derive a matrix or DataFrame
#  containing k columns containing all 1’s and 0’s. pandas has a get_dummies function for
#  doing this, though devising one yourself is not difficult.
import numpy as np
import pandas as pd

df=pd.DataFrame({'key':['b', 'b', 'a', 'c', 'a', 'b'],
                'data1':range(6)})

pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [5]:
#  In some cases, you may want to add a prefix to the columns in the indicator DataFrame,
#  which can then be merged with the other data. get_dummies has a prefix argument for
#  doing just this

dummies=pd.get_dummies(df['key'],prefix='key')

df_with_dummy=df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [None]:
# If a row in a DataFrame belongs to multiple categories, things are a bit more compli
# cated. Let’s return to the MovieLens 1M dataset from earlier in the book:
#  In [194]: mnames = ['movie_id', 'title', 'genres']
#  In [195]: movies = pd.read_table('ch07/movies.dat', sep='::', header=None,
#    .....:                         names=mnames)
#  In [196]: movies[:10]
#  Out[196]:
#    movie_id                               title                        genres
#  0         1                    Toy Story (1995)   Animation|Children's|Comedy
#  1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
#  2         3             Grumpier Old Men (1995)                Comedy|Romance
#  3         4            Waiting to Exhale (1995)                  Comedy|Drama
#  4         5  Father of the Bride Part II (1995)                        Comedy
#  5         6                         Heat (1995)         Action|Crime|Thriller
#  6         7                      Sabrina (1995)                Comedy|Romance
#  7         8                 Tom and Huck (1995)          Adventure|Children's
#  8         9                 Sudden Death (1995)                        Action
#  9        10                    GoldenEye (1995)     Action|Adventure|Thriller

In [None]:
#  Adding indicator variables for each genre requires a little bit of wrangling. First, we
#  extract the list of unique genres in the dataset (using a nice set.union trick):
#  In [197]: genre_iter = (set(x.split('|')) for x in movies.genres)
#  In [198]: genres = sorted(set.union(*genre_iter))
#  Now, one way to construct the indicator DataFrame is to start with a DataFrame of all
#  zeros:
#  In [199]: dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
#  Now, iterate through each movie and set entries in each row of dummies to 1:
#  In [200]: for i, gen in enumerate(movies.genres):
#    .....:     dummies.ix[i, gen.split('|')] = 1
#  Then, as above, you can combine this with movies:
#  In [201]: movies_windic = movies.join(dummies.add_prefix('Genre_'))
#  In [202]: movies_windic.ix[0]
#  Out[202]:
#  movie_id                                       1
#  title                           Toy Story (1995)
#  genres               Animation|Children's|Comedy
#  Genre_Action                                   0
#  Genre_Adventure                                0
#  Genre_Animation                                1
#  Genre_Children's                               1
#  Genre_Comedy                                   1
#  Genre_Crime                                    0
#  Genre_Documentary                              0
#  Genre_Drama                                    0
#  Genre_Fantasy                                  0

In [None]:
#  A useful recipe for statistical applications is to combine get_dummies with a discretiza
# tion function like cut:
#  In [204]: values = np.random.rand(10)
#  In [205]: values
#  Out[205]:
#  array([ 0.9296,  0.3164,  0.1839,  0.2046,  0.5677,  0.5955,  0.9645,
#         0.6532,  0.7489,  0.6536])
#  In [206]: bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
#  In [207]: pd.get_dummies(pd.cut(values, bins))
#  Out[207]:
#    (0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1]
#  0         0           0           0           0         1
#  1         0           1           0           0         0
#  2         1           0           0           0         0
#  3         0           1           0           0         0
#  4         0           0           1           0         0
#  5         0           0           1           0         0
#  6         0           0           0           0         1
#  7         0           0           0           1         0
#  8         0           0           0           1         0
#  9         0           0           0           1         0

In [None]:
# End