In [1]:
import numpy as np
import pandas as pd

## Attributes of Pandas objects

In [2]:
# 8 rows x 3 columns of uniform random floats in [0, 1), with upper-case column labels.
df = pd.DataFrame(
    data=np.random.rand(8, 3),
    columns=['A', 'B', 'C'],
)

In [3]:
# Normalise the column labels to lower case via the vectorised Index string method.
df.columns = df.columns.str.lower()

In [4]:
df

Unnamed: 0,a,b,c
0,0.115929,0.582846,0.452788
1,0.795512,0.954343,0.474776
2,0.859988,0.352238,0.026058
3,0.343801,0.518575,0.169803
4,0.75907,0.211614,0.70897
5,0.277336,0.408349,0.678844
6,0.895274,0.203892,0.923852
7,0.598835,0.165248,0.232614


In [5]:
# .array exposes the extension array that backs column 'a'.
df['a'].array

<PandasArray>
[0.11592894644778973,  0.7955124900005679,  0.8599884590038056,
   0.343801129314409,  0.7590703277846061,  0.2773364686902342,
  0.8952737524523614,  0.5988354031903128]
Length: 8, dtype: float64

## Counting values in Series

In [6]:
# 50 random integers drawn from 0..6 (the upper bound 7 is exclusive).
data = np.random.randint(low=0, high=7, size=50)
data

array([3, 4, 3, 3, 4, 5, 0, 6, 3, 4, 3, 6, 5, 1, 3, 0, 3, 0, 2, 6, 3, 4,
       0, 0, 3, 4, 0, 1, 3, 6, 2, 0, 1, 0, 6, 6, 6, 3, 0, 3, 5, 1, 3, 0,
       3, 1, 0, 0, 5, 6])

In [7]:
# Wrap the 50 random integers in a Series (default integer index).
s = pd.Series(data)

In [8]:
# Count occurrences of each distinct value; the result is ordered from most to least frequent.
s.value_counts()

3    14
0    12
6     8
4     5
1     5
5     4
2     2
dtype: int64

In [9]:
# 3 and 7 both occur three times, so mode() returns both values (sorted).
s5 = pd.Series([1] * 2 + [3] * 3 + [5] * 2 + [7] * 3)
s5.mode()

0    3
1    7
dtype: int64

In [10]:
# Two integer columns with different ranges; DataFrame.mode() computes the
# mode of every column independently.
df5 = pd.DataFrame(
    {
        "A": np.random.randint(0, 7, size=50),
        "B": np.random.randint(-10, 15, size=50),
    }
)
df5.mode()

Unnamed: 0,A,B
0,4,2
1,6,6


## Altering labels

#### Reindexing

To reindex means to conform the data to match a given set of labels along a particular axis.

In [11]:
# Five standard-normal draws labelled 'a'..'e'.
s = pd.Series(
    data=np.random.randn(5),
    index=['a', 'b', 'c', 'd', 'e'],
)
s

a    1.554758
b    0.952604
c   -0.963020
d   -0.464181
e   -0.441186
dtype: float64

In [12]:
# Select and reorder by label; 'f' is not in s, so it shows up as NaN.
s.reindex(['e', 'b', 'f', 'd'])

e   -0.441186
b    0.952604
f         NaN
d   -0.464181
dtype: float64

With a DataFrame, we can simultaneously reindex the index and columns:

In [13]:
# The three Series carry different labels; the DataFrame constructor aligns
# them on the union of their indexes and fills the gaps with NaN.
df = pd.DataFrame(
    {
        'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
        'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
        'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd']),
    }
)

In [14]:
# Reindex rows and columns in a single call; the unknown row label 'f' becomes an all-NaN row.
df.reindex(index=['c', 'f', 'b'], columns=['three', 'two', 'one'])

Unnamed: 0,three,two,one
c,-0.247046,-0.464004,1.311783
f,,,
b,1.871875,-0.868649,0.965348


In [15]:
# Same row reindexing, written with the (labels, axis=...) calling convention; columns are untouched.
df.reindex(['c', 'f', 'b'], axis='index')

Unnamed: 0,one,two,three
c,1.311783,-0.464004,-0.247046
f,,,
b,0.965348,-0.868649,1.871875


Index objects containing the actual axis labels can be shared between objects. So if we have a Series and a DataFrame, the following can be done:

In [16]:
# Conform the Series to the DataFrame's row labels by passing the frame's
# Index object directly. Every label in df.index ('a'..'d') exists in s, so
# no NaN is introduced and the extra label 'e' is dropped.
rs = s.reindex(df.index)

#### Dropping labels from an axis


In [17]:
df

Unnamed: 0,one,two,three
a,-1.027283,0.457198,
b,0.965348,-0.868649,1.871875
c,1.311783,-0.464004,-0.247046
d,,-0.521668,-0.035347


In [18]:
# Drop the rows labelled 'a' and 'd' (axis=0 targets the index); a new frame is returned.
df.drop(['a', 'd'], axis=0)

Unnamed: 0,one,two,three
b,0.965348,-0.868649,1.871875
c,1.311783,-0.464004,-0.247046


In [19]:
# Drop the column 'one' (axis=1 targets the columns).
df.drop(['one'], axis=1)

Unnamed: 0,two,three
a,0.457198,
b,-0.868649,1.871875
c,-0.464004,-0.247046
d,-0.521668,-0.035347


#### Renaming

In [20]:
s

a    1.554758
b    0.952604
c   -0.963020
d   -0.464181
e   -0.441186
dtype: float64

In [21]:
# rename accepts a function: str.upper is applied to every index label.
s.rename(str.upper)

A    1.554758
B    0.952604
C   -0.963020
D   -0.464181
E   -0.441186
dtype: float64

In [22]:
df

Unnamed: 0,one,two,three
a,-1.027283,0.457198,
b,0.965348,-0.868649,1.871875
c,1.311783,-0.464004,-0.247046
d,,-0.521668,-0.035347


In [23]:
# Rename columns and row labels with mappings; labels missing from a mapping
# ('three', 'c') are left unchanged.
df.rename(columns={'one': 'foo', 'two': 'bar'},
              index={'a': 'apple', 'b': 'banana', 'd': 'durian'})

Unnamed: 0,foo,bar,three
apple,-1.027283,0.457198,
banana,0.965348,-0.868649,1.871875
c,1.311783,-0.464004,-0.247046
durian,,-0.521668,-0.035347


In [24]:
# Column renaming written with the (mapper, axis=...) calling convention.
df.rename({'one': 'foo', 'two': 'bar'}, axis='columns')

Unnamed: 0,foo,bar,three
a,-1.027283,0.457198,
b,0.965348,-0.868649,1.871875
c,1.311783,-0.464004,-0.247046
d,,-0.521668,-0.035347


In [25]:
# Row-label renaming written with the (mapper, axis=...) calling convention.
df.rename({'a': 'apple', 'b': 'banana', 'd': 'durian'}, axis='index')

Unnamed: 0,one,two,three
apple,-1.027283,0.457198,
banana,0.965348,-0.868649,1.871875
c,1.311783,-0.464004,-0.247046
durian,,-0.521668,-0.035347


## .dt and .str accessors

#### .dt accessor

A Series whose values are datetime- or period-like has a `.dt` accessor that succinctly returns datetime-like properties of those values. Each property is returned as a Series indexed like the original Series.

In [27]:
# Four consecutive daily timestamps starting at 2013-01-01 09:10:12.
s = pd.Series(
    pd.date_range(start='20130101 09:10:12', periods=4)
)
s

0   2013-01-01 09:10:12
1   2013-01-02 09:10:12
2   2013-01-03 09:10:12
3   2013-01-04 09:10:12
dtype: datetime64[ns]

In [28]:
# Hour component of each timestamp.
s.dt.hour

0    9
1    9
2    9
3    9
dtype: int64

In [31]:
# Seconds component of each timestamp.
s.dt.second

0    12
1    12
2    12
3    12
dtype: int64

In [32]:
# Day-of-month component of each timestamp.
s.dt.day

0    1
1    2
2    3
3    4
dtype: int64

In [33]:
# Day of the week as an integer with Monday=0 ... Sunday=6 (2013-01-01 was a Tuesday, hence 1).
s.dt.dayofweek

0    1
1    2
2    3
3    4
dtype: int64

In [34]:
# Attach the US/Eastern time zone to the naive timestamps; the wall-clock times are kept as-is.
stz = s.dt.tz_localize('US/Eastern')

In [35]:
stz

0   2013-01-01 09:10:12-05:00
1   2013-01-02 09:10:12-05:00
2   2013-01-03 09:10:12-05:00
3   2013-01-04 09:10:12-05:00
dtype: datetime64[ns, US/Eastern]

In [36]:
# Interpret the naive timestamps as UTC, then convert them to US/Eastern (09:10 UTC -> 04:10 -05:00).
s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern')

0   2013-01-01 04:10:12-05:00
1   2013-01-02 04:10:12-05:00
2   2013-01-03 04:10:12-05:00
3   2013-01-04 04:10:12-05:00
dtype: datetime64[ns, US/Eastern]

#### .str accessor

Series is equipped with a set of string processing methods that make it easy to operate on each element of the array.

In [37]:
# Mixed-case strings with one missing entry, stored with the dedicated
# "string" extension dtype so that the gap is represented as <NA>.
s = pd.Series(
    ['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
    dtype="string",
)

In [38]:
# Element-wise lower-casing; the missing entry stays <NA>.
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5    <NA>
6    caba
7     dog
8     cat
dtype: string

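Note: the `.str` accessor provides vectorized counterparts of most of Python's built-in string methods (e.g. `lower`, `upper`, `strip`, `split`). They are applied element-wise, and missing values are propagated as `<NA>`.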

## Sorting

#### By index

In [41]:
# Fresh frame for the sorting examples: three Series with partially
# overlapping labels, aligned on the union of their indexes.
df = pd.DataFrame(
    {
        'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
        'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
        'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd']),
    }
)
df

Unnamed: 0,one,two,three
a,-0.406794,0.483122,
b,1.538784,0.438932,-0.66811
c,-0.40942,0.455331,0.581923
d,,-0.312592,1.069413


In [42]:
# Deliberately scramble the row and column order so the sort_index examples
# below have something to sort.
unsorted_df = df.reindex(index=['a', 'd', 'c', 'b'],
                          columns=['three', 'two', 'one'])
unsorted_df

Unnamed: 0,three,two,one
a,,0.483122,-0.406794
d,1.069413,-0.312592,
c,0.581923,0.455331,-0.40942
b,-0.66811,0.438932,1.538784


In [43]:
# Sort the DataFrame rows by index label (ascending by default)

unsorted_df.sort_index()

Unnamed: 0,three,two,one
a,,0.483122,-0.406794
b,-0.66811,0.438932,1.538784
c,0.581923,0.455331,-0.40942
d,1.069413,-0.312592,


In [44]:
# Sort the rows by index label in descending order.
unsorted_df.sort_index(ascending=False)

Unnamed: 0,three,two,one
d,1.069413,-0.312592,
c,0.581923,0.455331,-0.40942
b,-0.66811,0.438932,1.538784
a,,0.483122,-0.406794


In [45]:
# Sort the DataFrame by column names (axis=1 sorts the column labels; row order is unchanged)

unsorted_df.sort_index(axis=1)

Unnamed: 0,one,three,two
a,-0.406794,,0.483122
d,,1.069413,-0.312592
c,-0.40942,0.581923,0.455331
b,1.538784,-0.66811,0.438932


In [46]:
# Sort a single column (a Series) by its index labels

unsorted_df['three'].sort_index()

a         NaN
b   -0.668110
c    0.581923
d    1.069413
Name: three, dtype: float64

#### By values

The `Series.sort_values()` method is used to sort a Series by its values. The `DataFrame.sort_values()` method is used to sort a DataFrame by its column or row values.

In [48]:
# Small integer frame with ties in column 'one', used to show single- and
# multi-key sorting.
df1 = pd.DataFrame(
    {
        'one': [2, 1, 1, 1],
        'two': [1, 3, 2, 4],
        'three': [5, 4, 3, 2],
    }
)
df1

Unnamed: 0,one,two,three
0,2,1,5
1,1,3,4
2,1,2,3
3,1,4,2


In [49]:
# Sort the rows by the values in column "two" (ascending); the original index labels travel with their rows
df1.sort_values(by='two')

Unnamed: 0,one,two,three
0,2,1,5
2,1,2,3
1,1,3,4
3,1,4,2


In [50]:
# Sort by column "one" first, breaking ties with column "two".
# The column selection keeps all three columns in their existing order.

df1[['one', 'two', 'three']].sort_values(by=['one', 'two'])

Unnamed: 0,one,two,three
2,1,2,3
1,1,3,4
3,1,4,2
0,2,1,5


These methods treat NA values specially via the `na_position` argument: missing values are placed last by default, or first with `na_position='first'`:



In [51]:
# Blank out the entry labelled 2 so the string Series holds two missing values.
s.loc[2] = np.nan

In [52]:
# Sort by value; missing values are placed at the end by default.
s.sort_values()

0       A
3    Aaba
1       B
4    Baca
6    CABA
8     cat
7     dog
2    <NA>
5    <NA>
dtype: string

In [53]:
# Same sort, but with the missing values moved to the front.
s.sort_values(na_position='first')

2    <NA>
5    <NA>
0       A
3    Aaba
1       B
4    Baca
6    CABA
8     cat
7     dog
dtype: string

##### MultiIndex

In [59]:
# Two-level MultiIndex built from (outer, inner) tuples; it intentionally
# contains duplicates and is not sorted on the second level.
idx = pd.MultiIndex.from_tuples(
    [
        ('a', 1), ('a', 2), ('a', 2),
        ('b', 2), ('b', 1), ('b', 1),
    ]
)
idx

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 2),
            ('b', 2),
            ('b', 1),
            ('b', 1)],
           )

In [60]:
# Name the two index levels; assigning to .names updates idx in place.
idx.names = ['first', 'second']
idx

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 2),
            ('b', 2),
            ('b', 1),
            ('b', 1)],
           names=['first', 'second'])

In [56]:
# Build a single-column DataFrame (A = 6, 5, ..., 1) on the MultiIndex `idx`.
# A stray pasted IPython prompt ("In [311]: ") was removed from the statement:
# IPython strips it silently, but plain Python rejects the cell.
df_multi = pd.DataFrame({'A': np.arange(6, 0, -1)},
                        index=idx)

In [57]:
df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,A
first,second,Unnamed: 2_level_1
a,1,6
a,2,5
a,2,4
b,2,3
b,1,2
b,1,1


In [58]:
# Sort DataFrame by 'second' (an index level name) and then 'A' (a column label);
# `by` accepts a mix of index level names and column labels
df_multi.sort_values(by=['second', 'A'])

Unnamed: 0_level_0,Unnamed: 1_level_0,A
first,second,Unnamed: 2_level_1
b,1,1
b,1,2
a,1,6
b,2,3
a,2,4
a,2,5
