In [1]:
import pandas as pd
import numpy as np

In [2]:
values = pd.Series(['apple','orange','apple','apple']*2)

In [3]:
values

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object

In [4]:
pd.unique(values)

array(['apple', 'orange'], dtype=object)

In [5]:
pd.value_counts(values)

apple     6
orange    2
dtype: int64

In [6]:
dim = pd.Series(np.arange(8))

In [7]:
dim

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
dtype: int32

In [9]:
values.take(dim)

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object

In [10]:
m = pd.Categorical(['foo','bar','baz','foo','bar'])

In [11]:
m

[foo, bar, baz, foo, bar]
Categories (3, object): [bar, baz, foo]

In [12]:
ca = ['foo','bar','baz']


In [13]:
code = [0,1,2,0,1]

In [14]:
m2 = pd.Categorical.from_codes(code,ca)

In [15]:
m2

[foo, bar, baz, foo, bar]
Categories (3, object): [foo, bar, baz]

In [16]:
m2 = pd.Categorical.from_codes(code,ca,ordered=True)

In [17]:
m2

[foo, bar, baz, foo, bar]
Categories (3, object): [foo < bar < baz]

In [18]:
np.random.seed(12345)
draws = np.random.randn(1000)
draws[:5]

array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057])

In [19]:
bins = pd.qcut(draws,4)

In [20]:
bins

[(-0.684, -0.0101], (-0.0101, 0.63], (-0.684, -0.0101], (-0.684, -0.0101], (0.63, 3.928], ..., (-0.0101, 0.63], (-0.684, -0.0101], (-2.95, -0.684], (-0.0101, 0.63], (0.63, 3.928]]
Length: 1000
Categories (4, interval[float64]): [(-2.95, -0.684] < (-0.684, -0.0101] < (-0.0101, 0.63] < (0.63, 3.928]]

In [21]:
bins = pd.qcut(draws,4,labels=['q1','q2','q3','q4'])

In [22]:
bins

[q2, q3, q2, q2, q4, ..., q3, q2, q1, q3, q4]
Length: 1000
Categories (4, object): [q1 < q2 < q3 < q4]

In [23]:
bins.codes[:10]

array([1, 2, 1, 1, 3, 3, 2, 2, 3, 3], dtype=int8)

In [25]:
bins = pd.Series(bins,name='quartile')
results = (pd.Series(draws).groupby(bins).agg(['count','min','max']).reset_index())

In [26]:
results

Unnamed: 0,quartile,count,min,max
0,q1,250,-2.949343,-0.685484
1,q2,250,-0.683066,-0.010115
2,q3,250,-0.010032,0.628894
3,q4,250,0.634238,3.927528


In [27]:
n = 10000000
draws = pd.Series(np.random.randn(n))
labels = pd.Series(['foo','bar','baz','qux'] * (n // 4))

In [30]:
ca = labels.astype('category')

In [33]:
labels.memory_usage()

80000080

In [34]:
ca.memory_usage()

10000272

In [35]:
s= pd.Series(['a','b','c','d'] * 2)

In [36]:
cats = s.astype('category')

In [37]:
cats

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): [a, b, c, d]

In [38]:
cats.cat.codes

0    0
1    1
2    2
3    3
4    0
5    1
6    2
7    3
dtype: int8

In [39]:
cats.value_counts

<bound method IndexOpsMixin.value_counts of 0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): [a, b, c, d]>

In [40]:
cat3 = cats[cats.isin(['a','b'])]

In [41]:
cat3

0    a
1    b
4    a
5    b
dtype: category
Categories (4, object): [a, b, c, d]

In [42]:
pd.get_dummies(s)

Unnamed: 0,a,b,c,d
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1
4,1,0,0,0
5,0,1,0,0
6,0,0,1,0
7,0,0,0,1


In [43]:
df = pd.DataFrame({'key':['a','b','c'] * 4,'value':np.arange(12.)})

In [44]:
df

Unnamed: 0,key,value
0,a,0.0
1,b,1.0
2,c,2.0
3,a,3.0
4,b,4.0
5,c,5.0
6,a,6.0
7,b,7.0
8,c,8.0
9,a,9.0


In [45]:
g= df.groupby('key').value

In [46]:
g.mean()

key
a    4.5
b    5.5
c    6.5
Name: value, dtype: float64

In [47]:
g.transform('mean')

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64