In [1]:
print("""
@Description: Essential Functionality
@Author(s): Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime: 2023-06-20 09:25:13
""")


@Description: Essential Functionality
@Author(s): Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime: 2023-06-20 09:25:13



In [2]:
import numpy as np
import pandas as pd

## Arithmetic and Data Alignment

In [3]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [4]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [5]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

The internal data alignment introduces missing values in the label locations that don’t overlap. Missing values will then propagate in further arithmetic computations.

In [6]:
df1 = pd.DataFrame(np.arange(9.).reshape(3, 3), columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [7]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [8]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


If you add DataFrame objects with no column or row labels in common, the result will contain all nulls:

In [9]:
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})
df1

Unnamed: 0,A
0,1
1,2


In [10]:
df2

Unnamed: 0,B
0,3
1,4


In [11]:
df1 + df2

Unnamed: 0,A,B
0,,
1,,


### Arithmetic methods with fill values

In [12]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [13]:
df2.loc[1, 'b'] = np.nan
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [14]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [15]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [16]:
1 / df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [17]:
df1.rdiv(1)

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [18]:
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


### Operations between DataFrame and Series

In [19]:
arr = np.arange(12.).reshape((3, 4))
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [20]:
arr[0]

array([0., 1., 2., 3.])

In [21]:
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [22]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.iloc[0]

In [23]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [24]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [25]:
frame.shape, series.shape

((4, 3), (3,))

In [26]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [27]:
series2 = pd.Series(np.arange(3), index=list('bef'))
series2

b    0
e    1
f    2
dtype: int32

In [28]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


If you want to instead broadcast over the columns, matching on the rows, you have to use one of the arithmetic methods and specify to match over the index.

In [29]:
series3 = frame['d']
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [30]:
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [31]:
series3.shape, frame.shape

((4,), (4, 3))

In [32]:
frame.sub(series3, axis='index')

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


## Function Application and Mapping

In [33]:
frame = pd.DataFrame(np.random.standard_normal(size=(4, 3)),
                     columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,0.445849,0.727656,-0.741423
Ohio,0.493071,-0.161458,-1.383022
Texas,0.115033,-0.543146,-0.497061
Oregon,0.291566,1.144579,-0.411433


In [34]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.445849,0.727656,0.741423
Ohio,0.493071,0.161458,1.383022
Texas,0.115033,0.543146,0.497061
Oregon,0.291566,1.144579,0.411433


In [35]:
def f1(x):
    return x.max() - x.min()
frame.apply(f1)

b    0.378038
d    1.687725
e    0.971589
dtype: float64

In [36]:
frame.apply(f1, axis='columns')

Utah      1.469079
Ohio      1.876093
Texas     0.658179
Oregon    1.556012
dtype: float64

In [37]:
def f2(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f2)

Unnamed: 0,b,d,e
min,0.115033,-0.543146,-1.383022
max,0.493071,1.144579,-0.411433


In [38]:
def my_format(x):
    return f'{x:.2f}'
frame.applymap(my_format)

Unnamed: 0,b,d,e
Utah,0.45,0.73,-0.74
Ohio,0.49,-0.16,-1.38
Texas,0.12,-0.54,-0.5
Oregon,0.29,1.14,-0.41


## Sorting and Ranking

In [39]:
obj = pd.Series(np.arange(4), index=list('dabc'))
obj

d    0
a    1
b    2
c    3
dtype: int32

In [40]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int32

In [43]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=['Three', 'One'],
                     columns=['d', 'a', 'b', 'c'])
frame

Unnamed: 0,d,a,b,c
Three,0,1,2,3
One,4,5,6,7


In [46]:
frame.sort_index()

Unnamed: 0,d,a,b,c
One,4,5,6,7
Three,0,1,2,3


In [47]:
frame.sort_index(axis='columns')

Unnamed: 0,a,b,c,d
Three,1,2,3,0
One,5,6,7,4


In [48]:
frame.sort_index(axis='columns', ascending=False)

Unnamed: 0,d,c,b,a
Three,0,3,2,1
One,4,7,6,5


In [51]:
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [52]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [53]:
obj.sort_index(na_position='first')

0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    2.0
dtype: float64

In [54]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [56]:
frame.sort_values(['a', 'b'], axis='index')

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [57]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [58]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [59]:
obj.rank(ascending=False)

0    1.5
1    7.0
2    1.5
3    3.5
4    5.0
5    6.0
6    3.5
dtype: float64

In [60]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [61]:
frame.rank(axis='columns')

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


### Axis Indexes with Duplicate Labels

In [62]:
obj = pd.Series(np.arange(5), index=['a', 'a', 'b', 'b', 'c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int32

In [63]:
obj.index.is_unique

False

In [64]:
obj['a']

a    0
a    1
dtype: int32

In [65]:
obj['c']

4

In [66]:
df = pd.DataFrame(np.random.standard_normal(size=(5, 3)), index=['a', 'a', 'b', 'b', 'c'])
df

Unnamed: 0,0,1,2
a,0.253651,-1.423573,-0.468814
a,-2.091572,0.721981,-1.627762
b,-2.125648,1.093773,1.281485
b,-0.085145,-1.942656,1.011311
c,-1.552392,1.193097,0.648308


In [67]:
df.loc['b']

Unnamed: 0,0,1,2
b,-2.125648,1.093773,1.281485
b,-0.085145,-1.942656,1.011311


In [68]:
df.loc['c']

0   -1.552392
1    1.193097
2    0.648308
Name: c, dtype: float64