In [2]:
import numpy as np
import pandas as pd

In [3]:
# Five standard-normal draws, labelled 'a' through 'e'.
s = pd.Series(data=np.random.randn(5), index=list('abcde'))
s

a   -0.634798
b    1.551101
c   -0.467335
d    0.558254
e    0.419427
dtype: float64

In [4]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [5]:
pd.Series(np.random.rand(5))

0    0.850729
1    0.378673
2    0.947962
3    0.065949
4    0.524387
dtype: float64

### From Dict

In [6]:
d = {'b': 1, 'a': 0, 'c': 2}
pd.Series(d)

b    1
a    0
c    2
dtype: int64

### From scalar value

In [7]:
pd.Series(5., index=['a', 'b', 'c', 'd', 'e'])

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

### Series is ndarray-like

In [8]:
# Fetch the first element by position. `.iloc` is explicit about positional
# access; an integer key passed to plain `[]` on a string-labelled Series relies
# on a positional fallback that pandas has deprecated.
s.iloc[0]

-0.634798246399784

In [9]:
s[:3]

a   -0.634798
b    1.551101
c   -0.467335
dtype: float64

In [10]:
s[s > s.median()]

b    1.551101
d    0.558254
dtype: float64

In [11]:
# Pick elements by position, in the order given. `.iloc` is used because the
# Series is labelled with strings, and integer keys passed to plain `[]` rely on
# a deprecated positional fallback.
s.iloc[[4, 3, 1]]

e    0.419427
d    0.558254
b    1.551101
dtype: float64

In [12]:
np.exp(s)

a    0.530042
b    4.716658
c    0.626670
d    1.747618
e    1.521090
dtype: float64

In [13]:
s.dtype

dtype('float64')

While Series is ndarray-like, if you need an actual ndarray, then use Series.to_numpy():

In [14]:
s.to_numpy()

array([-0.63479825,  1.55110051, -0.46733515,  0.55825381,  0.41942708])

### Series is dict-like

In [15]:
s['a']

-0.634798246399784

In [16]:
s['e']

0.4194270752638271

In [17]:
s

a   -0.634798
b    1.551101
c   -0.467335
d    0.558254
e    0.419427
dtype: float64

In [18]:
'e' in s

True

In [19]:
'f' in s

False

In [20]:
# Looking up a label that is not in the index raises KeyError. Catch it and
# print the message so the failure is still shown without stopping a
# top-to-bottom run of the notebook.
try:
    s['f']
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'f'

### Vectorized operations

In [None]:
s + s

In [None]:
s * 2

In [None]:
np.exp(s)

A key difference between Series and ndarray is that operations between Series automatically align data based on the label. Thus, you can write computations without considering whether the Series involved have the same labels.

In [None]:
s1 = s[1:]
s2 = s[:-1]
s1 + s2

### Name Attribute

In [None]:
s = pd.Series(np.random.randn(5), name='something')
s

In [None]:
s.name

## DataFrame

### From Dictionary of Series or dictionaries

In [None]:
# Build a frame from two Series whose labels only partly overlap: the row index
# is the union of the labels, and the shorter column is padded with NaN.
d = dict(
    one=pd.Series([1., 2., 3.], index=list('abc')),
    two=pd.Series([1., 2., 3., 4.], index=list('abcd')),
)
df = pd.DataFrame(data=d)
df

In [None]:
pd.DataFrame(d,index=['d','b','a'])

In [None]:
pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])

### From Dictionary of ndarrays or lists

In [None]:
# Equal-length lists become columns; no index is passed here, so pandas assigns
# the default integer row labels.
d = dict(one=[1., 2., 3., 4.], two=[4., 3., 2., 1.])
pd.DataFrame(data=d)

In [None]:
pd.DataFrame(d,index=['a', 'b', 'c', 'd'])

### From a Series

In [None]:
pd.DataFrame(pd.Series(np.random.randn(5), name='something'))

### Column selection, addition, deletion

In [None]:
df['one']

In [None]:
df['three']= df['one']*df['two']
df['flag'] = df['one'] > 2
df

In [None]:
del df['two']

In [None]:
df['foo'] = 'bar'
df

When inserting a Series that does not have the same index as the DataFrame, it will be conformed to the DataFrame's index:

In [None]:
df['one_trunc'] = df['one'][:2]
df

### Scalar Operations

In [None]:
df = pd.DataFrame(np.random.randn(8,3),  columns=list('ABC'))
df * 5 +2

In [None]:
1/df

In [None]:
df ** 4

## Boolean operators are vectorized as well

And:

In [None]:
# Two boolean frames with identical row and column labels.
df1 = pd.DataFrame({'a': [True, False, True], 'b': [False, True, True]})
df2 = pd.DataFrame({'a': [False, True, True], 'b': [True, True, False]})
print(df1, df2)
# Element-wise AND of the two frames.
df1 & df2

Or:


In [None]:
df1 | df2

Exclusive or:

In [None]:
df1 ^ df2

In [None]:
# Logical NOT. `~` is the element-wise inversion operator for boolean data;
# unary `-` is arithmetic negation, which NumPy rejects for boolean arrays.
~df1

## dtypes

In [None]:
# One column per dtype; scalar values are repeated for every row of the
# three-element array-like columns.
dft = pd.DataFrame(dict(
    A=np.random.rand(3),
    B=1,
    C='foo',
    D=pd.Timestamp('20010102'),
    E=pd.Series([1.0] * 3, dtype='float32'),
    F=False,
    G=pd.Series([1] * 3).astype('int8'),
))

dft

In [None]:
dft.dtypes

In [None]:
dft['A'].dtype

In [None]:
pd.Series([1,2, 3, 6., 'foo'])

In [None]:
df1 = pd.DataFrame(np.random.randn(8,1), columns=['A'], dtype='float32')
df1.dtypes

In [None]:
df1 = df1.astype('float64')

In [None]:
df1.dtypes

Convert certain columns to a specific dtype by passing a dict to astype():

In [None]:
# Cast selected columns by passing a {column: dtype} mapping to astype();
# columns not named in the mapping ('b') keep their dtype.
df1 = pd.DataFrame({'a': [1, 0, 1], 'b': [4, 5, 6], 'c': [7, 8, 9]})
# Use the builtin `bool`: the `np.bool` alias was deprecated in NumPy 1.20 and
# removed in 1.24, where it raises AttributeError.
df1 = df1.astype({'a': bool, 'c': np.float64})
df1

In [None]:
df1.dtypes

In [21]:
pd.__version__

'1.1.5'

In [22]:
np.__version__

'1.19.2'

## Basics

In [26]:
# 8x3 frame of standard-normal draws; the column labels are lower-cased after
# construction.
df = pd.DataFrame(np.random.randn(8, 3), columns=list('ABC'))
df.columns = df.columns.str.lower()
df

Unnamed: 0,a,b,c
0,0.572244,-0.569334,-1.298233
1,1.151113,0.721189,0.862211
2,0.826144,-1.004734,0.059331
3,-0.440114,-1.910747,1.220245
4,-0.220607,1.368263,-0.269882
5,1.135963,-1.338763,1.585172
6,-0.329171,1.08221,0.067842
7,-1.416242,-0.953708,-0.179437


In [27]:
df.a.array

<PandasArray>
[  0.5722437962903226,    1.151113283835246,   0.8261440527901767,
  -0.4401136508018495, -0.22060733730154083,   1.1359630726283219,
 -0.32917066432647646,  -1.4162419656250094]
Length: 8, dtype: float64

In [28]:
data = np.random.randint(0,7, size=50)
data

array([3, 1, 4, 3, 0, 5, 6, 1, 2, 3, 4, 5, 6, 5, 0, 4, 2, 0, 3, 3, 5, 2,
       2, 2, 3, 6, 1, 3, 3, 0, 4, 1, 4, 6, 6, 1, 4, 1, 6, 4, 2, 3, 3, 5,
       6, 0, 0, 4, 0, 4])

In [29]:
s = pd.Series(data)

In [30]:
s.value_counts()

3    10
4     9
6     7
0     7
2     6
1     6
5     5
dtype: int64

In [31]:
# 3 and 7 each occur three times (more than any other value), so mode()
# returns both.
s5 = pd.Series([1] * 2 + [3] * 3 + [5] * 2 + [7] * 3)
s5.mode()

0    3
1    7
dtype: int64

In [36]:
# Column-wise mode of two random integer columns: A drawn from [0, 7),
# B drawn from [-10, 15).
df5 = pd.DataFrame(dict(
    A=np.random.randint(0, 7, size=50),
    B=np.random.randint(-10, 15, size=50),
))
df5.mode()

Unnamed: 0,A,B
0,2,-8


In [37]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s

a   -0.162585
b   -0.004628
c    1.133393
d   -1.033480
e    1.488069
dtype: float64

In [38]:
s.reindex(['e','b','f', 'd'])

e    1.488069
b   -0.004628
f         NaN
d   -1.033480
dtype: float64

In [43]:
# Three columns with partly overlapping labels; a label that a column lacks
# shows up as NaN in that column.
df = pd.DataFrame(dict(
    one=pd.Series(np.random.randn(3), index=list('abc')),
    two=pd.Series(np.random.randn(4), index=list('abcd')),
    three=pd.Series(np.random.randn(3), index=list('bcd')),
))
df

Unnamed: 0,one,two,three
a,0.523179,-0.450956,
b,0.549698,-0.089037,0.781536
c,0.875931,-0.661037,0.308231
d,,-0.423159,1.029138


In [44]:
df.reindex(index=['c', 'f', 'b'], columns=['three', 'two', 'one'])

Unnamed: 0,three,two,one
c,0.308231,-0.661037,0.875931
f,,,
b,0.781536,-0.089037,0.549698


In [42]:
df.reindex(['c', 'f', 'b'], axis='index')

Unnamed: 0,one,two,three
c,0.875931,-0.661037,0.308231
f,,,
b,0.549698,-0.089037,0.781536


### Dropping labels from an axis

In [45]:
df

Unnamed: 0,one,two,three
a,0.523179,-0.450956,
b,0.549698,-0.089037,0.781536
c,0.875931,-0.661037,0.308231
d,,-0.423159,1.029138


In [46]:
df.drop(['a', 'd'], axis=0)

Unnamed: 0,one,two,three
b,0.549698,-0.089037,0.781536
c,0.875931,-0.661037,0.308231


In [50]:
df.drop(['one','two'], axis=1)

Unnamed: 0,three
a,
b,0.781536
c,0.308231
d,1.029138


### Renaming

In [51]:
s

a   -0.162585
b   -0.004628
c    1.133393
d   -1.033480
e    1.488069
dtype: float64

In [52]:
s.rename(str.upper)

A   -0.162585
B   -0.004628
C    1.133393
D   -1.033480
E    1.488069
dtype: float64

In [53]:
df.rename(columns={'one': 'foo', 'two': 'bar'},
              index={'a': 'apple', 'b': 'banana', 'd': 'durian'})

Unnamed: 0,foo,bar,three
apple,0.523179,-0.450956,
banana,0.549698,-0.089037,0.781536
c,0.875931,-0.661037,0.308231
durian,,-0.423159,1.029138


In [54]:
df.rename({'one': 'foo', 'two': 'bar'}, axis='columns')

Unnamed: 0,foo,bar,three
a,0.523179,-0.450956,
b,0.549698,-0.089037,0.781536
c,0.875931,-0.661037,0.308231
d,,-0.423159,1.029138


In [55]:
df.rename({'a': 'apple', 'b': 'banana', 'd': 'durian'}, axis='index')

Unnamed: 0,one,two,three
apple,0.523179,-0.450956,
banana,0.549698,-0.089037,0.781536
c,0.875931,-0.661037,0.308231
durian,,-0.423159,1.029138


### .dt and .str accessors

In [67]:
s = pd.Series(pd.date_range('20130101 09:10:12', periods=4))
s

0   2013-01-01 09:10:12
1   2013-01-02 09:10:12
2   2013-01-03 09:10:12
3   2013-01-04 09:10:12
dtype: datetime64[ns]

In [69]:
s.dt.hour

0    9
1    9
2    9
3    9
dtype: int64

In [60]:
s.dt.second

0    12
1    12
2    12
3    12
dtype: int64

In [61]:
s.dt.day

0    1
1    2
2    3
3    4
dtype: int64

In [62]:
s.dt.dayofweek

0    1
1    2
2    3
3    4
dtype: int64

Timezone-aware transformation:

In [64]:
stz = s.dt.tz_localize('US/Eastern')
stz

0   2013-01-01 09:10:12-05:00
1   2013-01-02 09:10:12-05:00
2   2013-01-03 09:10:12-05:00
3   2013-01-04 09:10:12-05:00
dtype: datetime64[ns, US/Eastern]

In [65]:
stz.dt.tz

<DstTzInfo 'US/Eastern' LMT-1 day, 19:04:00 STD>

In [71]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'], dtype='string')
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5    <NA>
6    caba
7     dog
8     cat
dtype: string

## Sorting

### By index

In [73]:
# Same three-column frame with partly overlapping labels as used earlier.
df = pd.DataFrame(dict(
    one=pd.Series(np.random.randn(3), index=list('abc')),
    two=pd.Series(np.random.randn(4), index=list('abcd')),
    three=pd.Series(np.random.randn(3), index=list('bcd')),
))
# Put rows and columns deliberately out of order.
unsorted_df = df.reindex(index=list('adcb'), columns=['three', 'two', 'one'])
unsorted_df

Unnamed: 0,three,two,one
a,,0.785705,-0.314992
d,1.969346,-0.77421,
c,0.679428,-0.749451,0.46799
b,-0.754894,-0.749979,-0.227422


In [74]:
unsorted_df.sort_index()

Unnamed: 0,three,two,one
a,,0.785705,-0.314992
b,-0.754894,-0.749979,-0.227422
c,0.679428,-0.749451,0.46799
d,1.969346,-0.77421,


In [75]:
unsorted_df.sort_index(ascending=False)

Unnamed: 0,three,two,one
d,1.969346,-0.77421,
c,0.679428,-0.749451,0.46799
b,-0.754894,-0.749979,-0.227422
a,,0.785705,-0.314992


In [76]:
unsorted_df.sort_index(axis=1)

Unnamed: 0,one,three,two
a,-0.314992,,0.785705
d,,1.969346,-0.77421
c,0.46799,0.679428,-0.749451
b,-0.227422,-0.754894,-0.749979


In [77]:
unsorted_df['three'].sort_index()

a         NaN
b   -0.754894
c    0.679428
d    1.969346
Name: three, dtype: float64

### By values

In [79]:
# Small integer frame, written row by row; column 'one' contains ties.
df1 = pd.DataFrame(
    [[2, 1, 5],
     [1, 3, 4],
     [1, 2, 3],
     [1, 4, 2]],
    columns=['one', 'two', 'three'],
)
df1

Unnamed: 0,one,two,three
0,2,1,5
1,1,3,4
2,1,2,3
3,1,4,2


In [80]:
df1.sort_values(by='two')

Unnamed: 0,one,two,three
0,2,1,5
2,1,2,3
1,1,3,4
3,1,4,2


In [81]:
df1[['one', 'two', 'three']].sort_values(by=['one', 'two'])

Unnamed: 0,one,two,three
2,1,2,3
1,1,3,4
3,1,4,2
0,2,1,5


In [82]:
s[2] = np.nan

In [83]:
s

0       A
1       B
2    <NA>
3    Aaba
4    Baca
5    <NA>
6    CABA
7     dog
8     cat
dtype: string

In [84]:
s.sort_values()

0       A
3    Aaba
1       B
4    Baca
6    CABA
8     cat
7     dog
2    <NA>
5    <NA>
dtype: string

In [85]:
s.sort_values(na_position='first')

2    <NA>
5    <NA>
0       A
3    Aaba
1       B
4    Baca
6    CABA
8     cat
7     dog
dtype: string

Pass the name of an index level in `by` to sort by a combination of index levels and columns:

In [86]:
# Two-level index with the level names supplied at construction time.
idx = pd.MultiIndex.from_tuples(
    [('a', 1), ('a', 2), ('a', 2), ('b', 2), ('b', 1), ('b', 1)],
    names=['first', 'second'],
)
idx

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 2),
            ('b', 2),
            ('b', 1),
            ('b', 1)],
           names=['first', 'second'])

In [87]:
# Single column 'A' holding 6 down to 1, indexed by the two-level `idx`
# defined in an earlier cell.
df_multi = pd.DataFrame(data=dict(A=np.arange(6, 0, -1)), index=idx)
df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,A
first,second,Unnamed: 2_level_1
a,1,6
a,2,5
a,2,4
b,2,3
b,1,2
b,1,1


In [88]:
# Sort DataFrame by 'second' (index) and 'A' (column)
df_multi.sort_values(by=['second','A'])

Unnamed: 0_level_0,Unnamed: 1_level_0,A
first,second,Unnamed: 2_level_1
b,1,1
b,1,2
a,1,6
b,2,3
a,2,4
a,2,5
