# 常用運算
## 平均值與中位數 df.mean(); df.median()
## 累加與累乘 df.sum(); df.prod()
## 最大值與最小值 df.max(); df.min()

In [23]:
import pandas as pd
import numpy as np

# 4x3 sample table; the two None entries stand for missing values.
# Mixing None with floats makes NumPy fall back to object dtype, which is
# spelled out here explicitly.
data = np.array(
    [
        [1.39, 1.77, None],
        [0.34, 1.91, -0.05],
        [0.34, 1.47, 1.22],
        [None, 0.27, -0.61],
    ],
    dtype=object,
)
df = pd.DataFrame(
    data=data,
    columns=["c0", "c1", "c2"],
    index=["r0", "r1", "r2", "r3"],
)
print('df:\n', df)

# describe() gives a quick summary of each column; "count" is the number of
# valid (non-missing) entries. For these object-dtype columns the summary
# rows are count / unique / top / freq.
df1 = df.describe()
print('df.describe():\n', df1)

df:
       c0    c1    c2
r0  1.39  1.77  None
r1  0.34  1.91 -0.05
r2  0.34  1.47  1.22
r3  None  0.27 -0.61
df.describe():
           c0    c1    c2
count   3.00  4.00  3.00
unique  2.00  4.00  3.00
top     0.34  1.77 -0.05
freq    2.00  1.00  1.00


In [31]:
# Show the table first, then its column-wise mean: axis="index" (same as
# axis=0) collapses the rows, leaving one value per column.
print('df:\n', df)
df1 = df.mean(axis="index")
print('df1:\n', df1)

df:
       c0    c1    c2
r0  1.39  1.77  None
r1  0.34  1.91 -0.05
r2  0.34  1.47  1.22
r3  None  0.27 -0.61
df1:
 c0    0.690000
c1    1.355000
c2    0.186667
dtype: float64


In [33]:
# One very large value (100000) pulls the mean far above the median.
s = pd.Series([1000, 2000, 4000, 100000])
for label, statistic in (
    ("mean():", s.mean()),      # arithmetic mean
    ("median():", s.median()),  # median
):
    print(label, statistic)

mean(): 26750.0
median(): 3000.0


In [42]:
# 4x3 table holding the integers 0..11, filled row by row.
df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=["c0", "c1", "c2"])
print('df:\n', df)

# Each entry pairs the heading to print with the value shown under it.
reports = [
    # Sums: the default (axis=0) gives one total per column, axis=1 one per row.
    ("sum():\n", df.sum()),
    ("\nsum(axis=0):\n", df.sum(axis=0)),
    ("\nsum(axis=1):\n", df.sum(axis=1)),
    # Products follow the same axis convention.
    ("prod():\n", df.prod()),
    ("\nprod(axis=0):\n", df.prod(axis=0)),
    ("\nprod(axis=1):\n", df.prod(axis=1)),
    # Maximum of each column.
    ("max():\n", df.max()),
    # Maximum over the whole table: the max of the per-column maxima.
    ('df.max().max():\n', df.max().max()),
    # The same overall maximum computed the NumPy way on the flattened values.
    ('df.values.ravel().max():\n', df.values.ravel().max()),
]
for heading, result in reports:
    print(heading, result)


df:
    c0  c1  c2
0   0   1   2
1   3   4   5
2   6   7   8
3   9  10  11
sum():
 c0    18
c1    22
c2    26
dtype: int64

sum(axis=0):
 c0    18
c1    22
c2    26
dtype: int64

sum(axis=1):
 0     3
1    12
2    21
3    30
dtype: int64
prod():
 c0      0
c1    280
c2    880
dtype: int32

prod(axis=0):
 c0      0
c1    280
c2    880
dtype: int32

prod(axis=1):
 0      0
1     60
2    336
3    990
dtype: int32
max():
 c0     9
c1    10
c2    11
dtype: int32
df.max().max():
 11
df.values.ravel().max():
 11


## 處理空值 df.isnull(); df.notnull(); df.dropna(); df.fillna()

In [46]:
# Sample table with gaps: every None becomes a missing value (NaN).
df = pd.DataFrame(
    [
        [1, 2, 3, 0],
        [3, 4, None, 1],
        [None, None, None, None],
        [None, 3, None, 4],
    ],
    columns=["A", "B", "C", "D"],
)
print('df:\n', df)

# isnull(): True marks a missing cell.
missing_mask = df.isnull()
print("\nisnull():\n", missing_mask)

# notnull(): the opposite mask, where False marks a missing cell.
present_mask = df.notnull()
print("\nnotnull()\n", present_mask)

df:
      A    B    C    D
0  1.0  2.0  3.0  0.0
1  3.0  4.0  NaN  1.0
2  NaN  NaN  NaN  NaN
3  NaN  3.0  NaN  4.0

isnull():
        A      B      C      D
0  False  False  False  False
1  False  False   True  False
2   True   True   True   True
3   True  False   True  False

notnull()
        A      B      C      D
0   True   True   True   True
1   True   True  False   True
2  False  False  False  False
3  False   True  False   True


In [51]:
# Sample table with gaps: every None becomes a missing value (NaN).
df = pd.DataFrame(
    [
        [1, 2, 3, 0],
        [3, 4, None, 1],
        [None, None, None, None],
        [None, 3, None, 4],
    ],
    columns=["A", "B", "C", "D"],
)
print('df:\n', df)

# Default: drop every row that contains at least one missing value.
complete_rows = df.dropna()
print("df.dropna()：\n", complete_rows)

# axis=1 ("columns"): drop every column that contains a missing value.
complete_columns = df.dropna(axis="columns")
print("df.dropna(axis=1):\n", complete_columns)

# how="all": drop a row only when every value in it is missing.
# NOTE(review): the printed label says "df1", but the call is made on df.
not_entirely_empty = df.dropna(how="all")
print('df1.dropna(how="all"):\n', not_entirely_empty)

df:
      A    B    C    D
0  1.0  2.0  3.0  0.0
1  3.0  4.0  NaN  1.0
2  NaN  NaN  NaN  NaN
3  NaN  3.0  NaN  4.0
df.dropna()：
      A    B    C    D
0  1.0  2.0  3.0  0.0
df.dropna(axis=1):
 Empty DataFrame
Columns: []
Index: [0, 1, 2, 3]
df1.dropna(how="all"):
      A    B    C    D
0  1.0  2.0  3.0  0.0
1  3.0  4.0  NaN  1.0
3  NaN  3.0  NaN  4.0


In [55]:
# Sample table with gaps: every None becomes a missing value (NaN).
df = pd.DataFrame(
    [
        [1, 2, 3, 0],
        [3, 4, None, 1],
        [None, None, None, None],
        [None, 3, None, 4],
    ],
    columns=["A", "B", "C", "D"],
)
# Fallback table of the same shape, holding the integers 0..15 row by row.
df_default = pd.DataFrame(np.arange(16).reshape(4, 4), columns=["A", "B", "C", "D"])

# Fill every gap with the single constant 111.
df1 = df.fillna(111)
# Fill each column with its own constant.
df2 = df.fillna(value={"A": 0, "B": 1, "C": 2, "D": 3})

print('df:\n', df)
print('df1:\n', df1)
print('df2:\n', df2)
print("df_default:\n", df_default)

# Each gap is replaced by the value at the same position in df_default.
# NOTE(review): the printed label says "fillna(df2)", but df_default is used.
filled_from_default = df.fillna(df_default)
print("\nfillna(df2):\n", filled_from_default)

df:
      A    B    C    D
0  1.0  2.0  3.0  0.0
1  3.0  4.0  NaN  1.0
2  NaN  NaN  NaN  NaN
3  NaN  3.0  NaN  4.0
df1:
        A      B      C      D
0    1.0    2.0    3.0    0.0
1    3.0    4.0  111.0    1.0
2  111.0  111.0  111.0  111.0
3  111.0    3.0  111.0    4.0
df2:
      A    B    C    D
0  1.0  2.0  3.0  0.0
1  3.0  4.0  2.0  1.0
2  0.0  1.0  2.0  3.0
3  0.0  3.0  2.0  4.0
df_default:
     A   B   C   D
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15

fillna(df2):
       A    B     C     D
0   1.0  2.0   3.0   0.0
1   3.0  4.0   6.0   1.0
2   8.0  9.0  10.0  11.0
3  12.0  3.0  14.0   4.0


## 獲取索引 df.idxmin(); df.idxmax()

In [57]:
# Sample table; column C has one missing value (the None in row 1).
df = pd.DataFrame([[1, 2, 3, 0],
                   [3, 4, None, 1],
                   [3, 5, 2, 1],
                   [3, 2, 2, 3]],
                  columns=list("ABCD"))
print('df:\n', df)

# Index label of each column's maximum; missing values are skipped by default.
print("\nidxmax():\n", df.idxmax())

# With skipna=False a column that contains NaN has no well-defined maximum.
# Older pandas reports NaN for such a column; pandas 2.1 deprecated that and
# documents that it will raise ValueError instead. Catch the error so the
# rest of the demo still runs and the outcome is shown either way.
try:
    strict_idxmax = df.idxmax(skipna=False)
except ValueError as exc:
    strict_idxmax = f"ValueError: {exc}"
print("\nidxmax(skipna=False):\n", strict_idxmax)

# Index label of each column's minimum (the first occurrence wins on ties).
print("\nidxmin():\n", df.idxmin())

df:
    A  B    C  D
0  1  2  3.0  0
1  3  4  NaN  1
2  3  5  2.0  1
3  3  2  2.0  3

idxmax():
 A    1
B    2
C    0
D    3
dtype: int64

idxmax(skipna=False):
 A    1.0
B    2.0
C    NaN
D    3.0
dtype: float64

idxmin():
 A    0
B    0
C    2
D    0
dtype: int64


# 數據解析
## explode:
### DataFrame.explode(column, ignore_index=False)

In [4]:
import pandas as pd
import numpy as np
# Column A mixes a tuple, a plain string, an empty list and a list;
# column B is the scalar 1 repeated on every row; column C holds lists and NaN.
df = pd.DataFrame(
    {
        'A': [(0, 1, 2), 'foo', [], [3, 4]],
        'B': 1,
        'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']],
    }
)
print('df:\n', df)

# Expand each list-like entry of column A into one row per element; the other
# columns and the original index label are repeated. ignore_index=True may be
# added to renumber the rows.
df1 = df.explode(column='A')
print('df1:\n', df1)

df:
            A  B          C
0  (0, 1, 2)  1  [a, b, c]
1        foo  1        NaN
2         []  1         []
3     [3, 4]  1     [d, e]
df1:
      A  B          C
0    0  1  [a, b, c]
0    1  1  [a, b, c]
0    2  1  [a, b, c]
1  foo  1        NaN
2  NaN  1         []
3    3  1     [d, e]
3    4  1     [d, e]


## apply:
### DataFrame.apply(func, axis=0, raw=False, result_type=None, args=(), **kwargs)
### axis{0 or ‘index’, 1 or ‘columns’}, default 0

In [20]:
def func(x):
    """Return ``(first * 2, second * -1)`` for the first two elements of *x*.

    Elements are read by position from an array view of *x*. A pandas Series
    indexed by labels (such as a row handed over by
    ``DataFrame.apply(..., axis=1)``) therefore works without relying on the
    deprecated positional fallback of ``x[0]``; plain sequences and NumPy
    arrays are accepted as well.
    """
    values = np.asarray(x)
    return values[0] * 2, values[1] * -1

# Three identical rows: A is always 4, B is always 9.
df = pd.DataFrame({'A': [4] * 3, 'B': [9] * 3})

# Element-wise function applied to every column.
df1 = df.apply(np.sqrt)
# Reductions: axis=0 ("index") gives one value per column,
# axis=1 ("columns") gives one value per row.
df2 = df.apply(np.sum, axis="index")
df3 = df.apply(np.sum, axis="columns")
# A list returned per row stays a single object-valued Series by default ...
df4 = df.apply(lambda row: [1, 2], axis=1)
# ... 'expand' spreads the list over new columns ...
df5 = df.apply(lambda row: [1, 2], axis=1, result_type='expand')
# ... and 'broadcast' keeps the original shape and column labels.
df6 = df.apply(lambda row: [1, 2], axis=1, result_type='broadcast')
# Returning a Series lets the function choose the result's column names.
df7 = df.apply(lambda row: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
# A user-defined function can be applied the same way.
df8 = df.apply(func, axis=1, result_type='broadcast')

# Each entry holds the positional arguments of one print() call.
for print_args in (
    ('df:\n', df),
    ('df1:\n', df1),
    ('df2:\n', df2, type(df2)),
    ('df3:\n', df3, type(df3)),
    ('df4:\n', df4),
    ('df5:\n', df5),
    ('df6:\n', df6),
    ('df7:\n', df7),
    ('df8:\n', df8),
):
    print(*print_args)

df:
    A  B
0  4  9
1  4  9
2  4  9
df1:
      A    B
0  2.0  3.0
1  2.0  3.0
2  2.0  3.0
df2:
 A    12
B    27
dtype: int64 <class 'pandas.core.series.Series'>
df3:
 0    13
1    13
2    13
dtype: int64 <class 'pandas.core.series.Series'>
df4:
 0    [1, 2]
1    [1, 2]
2    [1, 2]
dtype: object
df5:
    0  1
0  1  2
1  1  2
2  1  2
df6:
    A  B
0  1  2
1  1  2
2  1  2
df7:
    foo  bar
0    1    2
1    1    2
2    1    2
df8:
    A  B
0  8 -9
1  8 -9
2  8 -9
