In [1]:
import numpy as np; import pandas as pd

In [2]:
# Toy basket table for the categorical examples: 8 rows whose 'fruit' column
# holds only two distinct strings.
fruits = ['apple', 'orange', 'apple', 'apple'] * 2
N = len(fruits)
# Seeded generator so Restart & Run All reproduces the same 'count' and
# 'weight' values. The saved output below came from an unseeded run and will
# differ until this cell is re-executed.
rng = np.random.default_rng(12345)
df = pd.DataFrame({'fruit': fruits,
                   'basket_id': np.arange(N),
                   'count': rng.integers(3, 15, size=N),    # integers in [3, 15)
                   'weight': rng.uniform(0, 4, size=N)},    # floats in [0, 4)
                  columns=['basket_id', 'fruit', 'count', 'weight'])
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,11,3.908131
1,1,orange,5,2.317159
2,2,apple,7,1.119488
3,3,apple,3,3.752865
4,4,apple,11,3.302997
5,5,orange,8,2.260873
6,6,apple,4,1.048684
7,7,apple,8,2.367639


In [4]:
# Select the raw string column; it is stored with dtype object.
df.loc[:, 'fruit']

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: object

In [3]:
# Re-encode the string column as a pandas categorical: the distinct values
# become the categories and each row refers to one of them.
fruit_cat = df['fruit'].astype(dtype='category')
fruit_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [8]:
# The object-dtype column is backed by a plain NumPy array.
type(df['fruit'].values)

numpy.ndarray

In [5]:
# After astype('category') the container is still a Series; only its dtype changed.
type(fruit_cat)

pandas.core.series.Series

In [6]:
# The backing array is a pandas Categorical, which keeps the distinct
# categories plus one integer code per element.
type(fruit_cat.values)

pandas.core.arrays.categorical.Categorical

In [10]:
# The distinct labels of the categorical, read through the Series .cat accessor.
fruit_cat.cat.categories

Index(['apple', 'orange'], dtype='object')

In [11]:
# One small integer per row; each code is a position in .categories
# (0 -> 'apple', 1 -> 'orange').
fruit_cat.values.codes

array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

In [12]:
# A Categorical can also be built directly from a Python sequence, without
# going through a Series first.
my_cat = pd.Categorical(values=['foo', 'bar', 'baz', 'foo', 'bar'])
my_cat

['foo', 'bar', 'baz', 'foo', 'bar']
Categories (3, object): ['bar', 'baz', 'foo']

In [13]:
# Codes index into the categories, which were stored in sorted order
# ['bar', 'baz', 'foo'] -- hence 'foo' -> 2, 'bar' -> 0, 'baz' -> 1.
my_cat.codes

array([2, 0, 1, 2, 0], dtype=int8)

In [16]:
# Build a Categorical from an existing encoding: `cods` holds positions into
# `cats`, and the category order is taken from `cats` as given.
cats = ['foo', 'bar', 'baz']
cods = [2, 1, 2, 0, 0, 1]
my_cat2 = pd.Categorical.from_codes(codes=cods, categories=cats)
my_cat2

['baz', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo', 'bar', 'baz']

In [18]:
# Same encoding, but flagged as ordered: the ordering follows the position
# of each label in `cats` ('foo' < 'bar' < 'baz').
ordered_my_cat2 = pd.Categorical.from_codes(codes=cods, categories=cats, ordered=True)
ordered_my_cat2

['baz', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo' < 'bar' < 'baz']

In [21]:
# as_ordered() returns a new, ordered Categorical. The `inplace=` keyword was
# deprecated in pandas 1.3 and removed in 2.0, so rebind the name instead of
# mutating in place.
my_cat2 = my_cat2.as_ordered()
my_cat2

['baz', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo' < 'bar' < 'baz']

In [22]:
# Eight single-letter strings: the sequence a, b, c, d repeated twice.
s = pd.Series(['a', 'b', 'c', 'd'] * 2)
s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: object

In [23]:
# Categorical version of `s`; used below to demonstrate the .cat accessor.
cat_s = s.astype(dtype='category')
cat_s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [25]:
# The .cat accessor exposes the integer codes as a Series that shares
# cat_s's index.
cat_s.cat.codes

0    0
1    1
2    2
3    3
4    0
5    1
6    2
7    3
dtype: int8

In [26]:
# The distinct labels that the codes refer to.
cat_s.cat.categories

Index(['a', 'b', 'c', 'd'], dtype='object')

In [27]:
# Widen the set of allowed categories beyond what the data contains:
# 'e' becomes a valid category even though no row uses it.
cat_s2 = cat_s.cat.set_categories(new_categories=['a', 'b', 'c', 'd', 'e'])
cat_s2

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (5, object): ['a', 'b', 'c', 'd', 'e']

In [28]:
# Frequency of each category in the original categorical (four categories).
cat_s.value_counts()

d    2
c    2
b    2
a    2
dtype: int64

In [29]:
# The declared-but-unused category 'e' is still reported, with a count of 0.
cat_s2.value_counts()

d    2
c    2
b    2
a    2
e    0
dtype: int64

In [30]:
# Trim categories that never occur ('e' here). The result is only displayed,
# not assigned back to cat_s2.
cat_s2.cat.remove_unused_categories()

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [31]:
# Large sample for the memory / speed comparison below: ten million
# standard-normal draws.
N = 10_000_000
draws = pd.Series(data=np.random.randn(N))

In [None]:
# One string label per draw: the four labels cycled N // 4 times.
labels = pd.Series(data=['foo', 'bar', 'baz', 'qux'] * (N // 4))

In [47]:
# Derive the categorical version from `labels` instead of rebuilding the same
# N-element Python list a second time; the resulting Series is identical.
cats = labels.astype('category')

In [48]:
# Shallow measurement: the saved result is 8 bytes per row plus the index,
# i.e. only the object references are counted, not the strings themselves.
# NOTE(review): pass deep=True to include the string payloads.
labels.memory_usage()

80000128

In [49]:
# The categorical needs roughly 1 byte per row (the saved result is about
# N bytes plus a small overhead), about 8x less than the shallow figure for
# `labels`.
cats.memory_usage()

10000320

In [50]:
# Creating the GroupBy object alone is lazy -- the saved timing is in
# microseconds -- so this does not measure the grouping work itself.
%timeit draws.groupby(labels)

26.7 µs ± 887 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [55]:
# Same lazy construction, keyed by the integer category codes instead of the
# strings; the saved timing is again in microseconds.
%timeit draws.groupby(cats.values.codes)

25.8 µs ± 1.51 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [52]:
# The aggregation forces the actual grouping: this is the meaningful timing
# for string keys.
%timeit draws.groupby(labels).mean()

400 ms ± 8.22 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [53]:
# Grouping on the integer codes was roughly 3x faster in the saved run
# (121 ms vs 400 ms for the string keys).
%timeit draws.groupby(cats.values.codes).mean()

121 ms ± 524 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [57]:
# Small frame for the groupby / transform examples: keys a, b, c cycled three
# times against the float values 0.0 .. 8.0. Note this rebinds `df`.
df = pd.DataFrame(
    {
        'key': ['a', 'b', 'c'] * 3,
        'value': np.arange(9, dtype=np.float64),
    }
)
df

Unnamed: 0,key,value
0,a,0.0
1,b,1.0
2,c,2.0
3,a,3.0
4,b,4.0
5,c,5.0
6,a,6.0
7,b,7.0
8,c,8.0


In [58]:
# Group the 'value' column by 'key', then materialise the (key, sub-Series)
# pairs to show what each group contains.
g = df.groupby(by='key')['value']
[(key, group) for key, group in g]

[('a',
  0    0.0
  3    3.0
  6    6.0
  Name: value, dtype: float64),
 ('b',
  1    1.0
  4    4.0
  7    7.0
  Name: value, dtype: float64),
 ('c',
  2    2.0
  5    5.0
  8    8.0
  Name: value, dtype: float64)]

In [59]:
# Plain aggregation: one row per group key.
g.mean()

key
a    3.0
b    4.0
c    5.0
Name: value, dtype: float64

In [64]:
# transform broadcasts each group's aggregate back onto the rows of that
# group, so the result has the same length and index as the original column.
g.transform(lambda x: np.mean(x))

0    3.0
1    4.0
2    5.0
3    3.0
4    4.0
5    5.0
6    3.0
7    4.0
8    5.0
Name: value, dtype: float64

In [68]:
# Option 1: merge the per-group means back onto df by 'key'. The suffix
# renames the clashing 'value' column coming from the right-hand side. Note
# that the rows come out grouped by key rather than in their original order.
df.merge(g.mean(), on='key', suffixes=('', '_group_mean'))

Unnamed: 0,key,value,value_group_mean
0,a,0.0,3.0
1,a,3.0,3.0
2,a,6.0,3.0
3,b,1.0,4.0
4,b,4.0,4.0
5,b,7.0,4.0
6,c,2.0,5.0
7,c,5.0,5.0
8,c,8.0,5.0


In [71]:
# Option 2: transform already returns one value per original row, so it can
# be assigned directly as a new column (this mutates df in place).
df['value_group_mean'] = g.transform('mean')
df

Unnamed: 0,key,value,value_group_mean
0,a,0.0,3.0
1,b,1.0,4.0
2,c,2.0,5.0
3,a,3.0,3.0
4,b,4.0,4.0
5,c,5.0,5.0
6,a,6.0,3.0
7,b,7.0,4.0
8,c,8.0,5.0


In [73]:
# transform is usually used in conjunction with groupby

In [88]:
# Within-group standardisation: subtract the group mean and divide by the
# group standard deviation, row by row.
g.transform(lambda x: (x - x.mean()) / x.std())

0   -1.0
1   -1.0
2   -1.0
3    0.0
4    0.0
5    0.0
6    1.0
7    1.0
8    1.0
Name: value, dtype: float64

In [87]:
# apply with a same-length lambda gives the same values as transform in the
# saved output.
# NOTE(review): newer pandas versions may prepend the group key to the result
# index here unless the groupby was created with group_keys=False -- confirm.
g.apply(lambda x: (x - x.mean()) / x.std())

0   -1.0
1   -1.0
2   -1.0
3    0.0
4    0.0
5    0.0
6    1.0
7    1.0
8    1.0
Name: value, dtype: float64

In [84]:
# Passing the name 'mean' lets transform use the built-in group mean instead
# of calling a Python lambda per group (see the timings at the end of this
# section).
# NOTE(review): the original note said g.apply('mean') raises an error; that
# may depend on the pandas version -- confirm.
g.transform('mean')

0    3.0
1    4.0
2    5.0
3    3.0
4    4.0
5    5.0
6    3.0
7    4.0
8    5.0
Name: value, dtype: float64

In [90]:
# With an aggregating lambda, apply returns one row per group; only transform
# yields a result with the same length as the original column.
g.apply(lambda x: x.mean())

key
a    3.0
b    4.0
c    5.0
Name: value, dtype: float64

In [85]:
# Same standardisation written with two built-in transforms; both operands
# share the original index, so the arithmetic aligns row by row.
(df['value'] - g.transform('mean')) / g.transform('std')

0   -1.0
1   -1.0
2   -1.0
3    0.0
4    0.0
5    0.0
6    1.0
7    1.0
8    1.0
Name: value, dtype: float64

In [89]:
# Pitfall (intentional demonstration): the aggregating apply calls are indexed
# by group key (a, b, c) while df['value'] is indexed 0..8. The arithmetic
# aligns on the union of both indexes, no label matches, and every entry is NaN.
(df['value'] - g.apply(lambda x: x.mean())) / g.apply(lambda x: x.std())

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
6   NaN
7   NaN
8   NaN
a   NaN
b   NaN
c   NaN
Name: value, dtype: float64

In [81]:
# Timing 1 of 3: per-group Python lambda through apply.
%timeit g.apply(lambda x: (x - x.mean()) / x.std())

1.68 ms ± 32.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [80]:
# Timing 2 of 3: the same lambda through transform (similar cost in the saved run).
%timeit g.transform(lambda x: (x - x.mean()) / x.std())

1.91 ms ± 72.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [86]:
# Timing 3 of 3: two built-in transforms plus vectorised arithmetic -- the
# fastest variant in the saved run, at well under half the lambda timings.
%timeit (df['value'] - g.transform('mean')) / g.transform('std')

754 µs ± 27.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [91]:
# Minute-frequency frame for the resample examples: N consecutive timestamps
# starting at 2017-05-20 00:00, paired with the values 0 .. N-1.
# NOTE(review): the saved output below lists only 10 rows although N = 15;
# re-run the cell to refresh it.
N = 15
times = pd.date_range(start='2017-05-20 00:00', periods=N, freq='1min')
df = pd.DataFrame(dict(time=times, value=np.arange(N)))
df

Unnamed: 0,time,value
0,2017-05-20 00:00:00,0
1,2017-05-20 00:01:00,1
2,2017-05-20 00:02:00,2
3,2017-05-20 00:03:00,3
4,2017-05-20 00:04:00,4
5,2017-05-20 00:05:00,5
6,2017-05-20 00:06:00,6
7,2017-05-20 00:07:00,7
8,2017-05-20 00:08:00,8
9,2017-05-20 00:09:00,9


In [94]:
# resample bins on a datetime-like index, so 'time' is moved into the index
# first (the `on='time'` keyword of resample is the alternative). Like
# groupby, this only builds a lazy Resampler object until an aggregation is
# called.
df.set_index('time').resample('5min')

<pandas.core.resample.DatetimeIndexResampler object at 0x7fb5dabefa90>

In [97]:
# Count rows per 14-minute bin: the 15 one-minute rows split into a first bin
# of 14 rows and a second bin holding the last row.
df.set_index('time').resample('14min').count()

Unnamed: 0_level_0,value
time,Unnamed: 1_level_1
2017-05-20 00:00:00,14
2017-05-20 00:14:00,1


In [108]:
# Bracket assignment mutates a frame in place, so work on a copy to leave df
# untouched.
df2 = df.copy()
df2['k'] = 'new' + ' ' + 'assignment'   # the scalar is broadcast to every row
df2

Unnamed: 0,time,value,k
0,2017-05-20 00:00:00,0,new assignment
1,2017-05-20 00:01:00,1,new assignment
2,2017-05-20 00:02:00,2,new assignment
3,2017-05-20 00:03:00,3,new assignment
4,2017-05-20 00:04:00,4,new assignment
5,2017-05-20 00:05:00,5,new assignment
6,2017-05-20 00:06:00,6,new assignment
7,2017-05-20 00:07:00,7,new assignment
8,2017-05-20 00:08:00,8,new assignment
9,2017-05-20 00:09:00,9,new assignment


In [109]:
# assign returns a new frame with the extra column and leaves df unchanged.
# The column name is passed as a keyword argument, so it is written without
# quotes.
df3 = df.assign(k = 'new' + ' ' + 'assignment')
df3

Unnamed: 0,time,value,k
0,2017-05-20 00:00:00,0,new assignment
1,2017-05-20 00:01:00,1,new assignment
2,2017-05-20 00:02:00,2,new assignment
3,2017-05-20 00:03:00,3,new assignment
4,2017-05-20 00:04:00,4,new assignment
5,2017-05-20 00:05:00,5,new assignment
6,2017-05-20 00:06:00,6,new assignment
7,2017-05-20 00:07:00,7,new assignment
8,2017-05-20 00:08:00,8,new assignment
9,2017-05-20 00:09:00,9,new assignment
