In [23]:
import numpy as np
import pandas as pd

# Numpy 教學
## 1. Numpy array 建立

In [24]:
Zeros = np.zeros((3, 3))
Ones = np.ones((3, 3))
Empty = np.empty((3, 3))
Arange = np.arange(0, 12, 3)
Line = np.linspace(0, 12, 3)
Reshape = Zeros.reshape(-1)
print(Zeros, Ones, Empty, Arange, Line, Reshape, sep = '\n\n')

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]

[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]

[[-1.28822975e-231  2.32035068e+077  2.96439388e-323]
 [ 0.00000000e+000  0.00000000e+000  0.00000000e+000]
 [-1.28822975e-231 -1.28822975e-231  1.48219694e-323]]

[0 3 6 9]

[ 0.  6. 12.]

[0. 0. 0. 0. 0. 0. 0. 0. 0.]


## 2. Numpy 計算

In [25]:
arr = np.arange(0, 12).reshape((3, 4))
sum1 = np.sum(arr, axis = 0)
sum2 = np.sum(arr, axis = 1)
sum3 = np.sum(arr, axis = (0, 1))
sum4 = np.sum(arr, axis = (1, 0))
sum5 = np.sum(arr)
print(arr, '\n')
print(f'sum1, sum2, sum3, sum4, sum5 = ({sum1}, {sum2}, {sum3}, {sum4}, {sum5})')

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]] 

sum1, sum2, sum3, sum4, sum5 = ([12 15 18 21], [ 6 22 38], 66, 66, 66)


In [26]:
print(sum1 + sum1, sum1 - sum1, sum1 * 3, sum1 / 2, sep = '\n\n')
# sum1 + sum2

[24 30 36 42]

[0 0 0 0]

[36 45 54 63]

[ 6.   7.5  9.  10.5]


In [27]:
print(arr == 5, arr > 5, arr <5, sep = '\n\n')

[[False False False False]
 [False  True False False]
 [False False False False]]

[[False False False False]
 [False False  True  True]
 [ True  True  True  True]]

[[ True  True  True  True]
 [ True False False False]
 [False False False False]]


## 3. Numpy IO

In [28]:
np.save('arr.npy', arr)
arr = np.load('arr.npy')

# Pandas 教學
## 1. Pandas 型別
* Series
* Dataframe

In [29]:
# ------------- Series -------------

# From numpy array
series1 = pd.Series(sum1)

# From Python's list
series2 = pd.Series([1, 2, 3, 4])

print(series1, series2, sep = '\n\n')

0    12
1    15
2    18
3    21
dtype: int64

0    1
1    2
2    3
3    4
dtype: int64


In [4]:
# ------------- Dataframe -------------
dates = pd.date_range('20210101',periods = 6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns = [f"col{i}" for i in range(4)])
print(df, '\n')
print(df.index, df.columns, df.describe(), sep = '\n\n')

                col0      col1      col2      col3
2021-01-01  0.634531  0.330870  1.730830  0.765785
2021-01-02 -0.000893  0.752005 -0.848354 -1.940398
2021-01-03 -0.251021  0.641798  0.492876 -0.512745
2021-01-04 -0.584171  1.173616  0.596560 -0.537101
2021-01-05 -1.140709 -0.419344 -0.588302  0.062054
2021-01-06 -0.424615 -0.085499  1.659300 -0.698844 

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06'],
              dtype='datetime64[ns]', freq='D')

Index(['col0', 'col1', 'col2', 'col3'], dtype='object')

           col0      col1      col2      col3
count  6.000000  6.000000  6.000000  6.000000
mean  -0.294480  0.398908  0.507152 -0.476875
std    0.594660  0.581835  1.083554  0.897857
min   -1.140709 -0.419344 -0.848354 -1.940398
25%   -0.544282  0.018593 -0.318007 -0.658408
50%   -0.337818  0.486334  0.544718 -0.524923
75%   -0.063425  0.724453  1.393615 -0.081646
max    0.634531  1.173616  1.730830  0.765785


## 2. 索引
* simple
* loc
* iloc
* ix(deprecated in 0.20.0 version)

In [5]:
# simple(順序沒差)
print('------- simple demo -------')
print(df['col1'], df[['col1', 'col2']], df[:'2021-01-02']['col1'], df['col1'][:'2021-01-02'], sep = '\n\n')
print()

# loc (先 row 後 column)
print('------- loc demo -------')
print(df.loc[:'2021-01-02']['col1'], sep = '\n\n')
print()
# df['col1']

# iloc (先 row 後 column)
print('------- iloc demo -------')
print(df.iloc[:2][:3], sep = '\n\n')
print()

# !! ix had deprecated in Pandas == 0.20.0 !!
# ix (先 row 後 column)
# print(df.ix[:2][:col3], sep = '\n\n')
# print()


------- simple demo -------
2021-01-01    0.330870
2021-01-02    0.752005
2021-01-03    0.641798
2021-01-04    1.173616
2021-01-05   -0.419344
2021-01-06   -0.085499
Freq: D, Name: col1, dtype: float64

                col1      col2
2021-01-01  0.330870  1.730830
2021-01-02  0.752005 -0.848354
2021-01-03  0.641798  0.492876
2021-01-04  1.173616  0.596560
2021-01-05 -0.419344 -0.588302
2021-01-06 -0.085499  1.659300

2021-01-01    0.330870
2021-01-02    0.752005
Freq: D, Name: col1, dtype: float64

2021-01-01    0.330870
2021-01-02    0.752005
Freq: D, Name: col1, dtype: float64

------- loc demo -------
2021-01-01    0.330870
2021-01-02    0.752005
Freq: D, Name: col1, dtype: float64

------- iloc demo -------
                col0      col1      col2      col3
2021-01-01  0.634531  0.330870  1.730830  0.765785
2021-01-02 -0.000893  0.752005 -0.848354 -1.940398



## 3. 合併與連接
- pandas.concat
- pandas.merge

In [6]:
# ------------- concat demo -------------
dates = pd.date_range('20210101',periods = 6)
tmp = pd.DataFrame(np.random.randn(6, 4), index=dates, columns = [f"col{i}" for i in range(2, 6)])
print(f'df: \n{df}\n')
print(f'tmp: \n{tmp}\n')

# ------- axis demo -------
print('------- axis demo -------')
df1 = pd.concat([df, tmp], axis = 1)
df2 = pd.concat([df, tmp], axis = 0)
print(f'axis = 1: \n{df1}\n')
print(f'axis = 0: \n{df2}\n')

# ------- join demo -------
print('------- join demo --------')
df1 = pd.concat([df, tmp], join = 'outer') # 聯集
df2 = pd.concat([df, tmp], join = 'inner') # 交集
print(f'join = outer: \n{df1}\n')
print(f'join = inner: \n{df2}\n')

df: 
                col0      col1      col2      col3
2021-01-01  0.634531  0.330870  1.730830  0.765785
2021-01-02 -0.000893  0.752005 -0.848354 -1.940398
2021-01-03 -0.251021  0.641798  0.492876 -0.512745
2021-01-04 -0.584171  1.173616  0.596560 -0.537101
2021-01-05 -1.140709 -0.419344 -0.588302  0.062054
2021-01-06 -0.424615 -0.085499  1.659300 -0.698844

tmp: 
                col2      col3      col4      col5
2021-01-01 -0.815611 -0.209063  0.901397  0.528315
2021-01-02 -0.587821 -0.419782 -0.276112 -1.986292
2021-01-03  0.666643 -0.008753  0.220077  0.208773
2021-01-04  1.460349 -1.097022  0.413291 -0.316742
2021-01-05 -0.132234 -1.984694  0.771532 -0.731836
2021-01-06  0.046154  0.127128  0.608383 -1.290315

------- axis demo -------
axis = 1: 
                col0      col1      col2      col3      col2      col3  \
2021-01-01  0.634531  0.330870  1.730830  0.765785 -0.815611 -0.209063   
2021-01-02 -0.000893  0.752005 -0.848354 -1.940398 -0.587821 -0.419782   
2021-01-03 -0.

In [7]:
# ------- merge -------
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                      'key2': ['K0', 'K1', 'K0', 'K1'],
                      'A': ['A0', 'A1', 'A2', 'A3'],
                      'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                       'key2': ['K0', 'K0', 'K0', 'K0'],
                       'C': ['C0', 'C1', 'C2', 'C3'],
                       'D': ['D0', 'D1', 'D2', 'D3']})

print(f'left:\n{left}\n')


print(f'right:\n{right}\n')

# -------- demo how -------
res = pd.merge(left, right, on=['key1', 'key2'], how='inner')
print(f'how = inner:\n{res}\n')

res = pd.merge(left, right, on=['key1', 'key2'], how='outer')
print(f'how = outer:\n{res}\n')

res = pd.merge(left, right, on=['key1', 'key2'], how='left')
print(f'how = left:\n{res}\n')

res = pd.merge(left, right, on=['key1', 'key2'], how='right')
print(f'how = right:\n{res}\n')

left:
  key1 key2   A   B
0   K0   K0  A0  B0
1   K0   K1  A1  B1
2   K1   K0  A2  B2
3   K2   K1  A3  B3

right:
  key1 key2   C   D
0   K0   K0  C0  D0
1   K1   K0  C1  D1
2   K1   K0  C2  D2
3   K2   K0  C3  D3

how = inner:
  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K1   K0  A2  B2  C1  D1
2   K1   K0  A2  B2  C2  D2

how = outer:
  key1 key2    A    B    C    D
0   K0   K0   A0   B0   C0   D0
1   K0   K1   A1   B1  NaN  NaN
2   K1   K0   A2   B2   C1   D1
3   K1   K0   A2   B2   C2   D2
4   K2   K1   A3   B3  NaN  NaN
5   K2   K0  NaN  NaN   C3   D3

how = left:
  key1 key2   A   B    C    D
0   K0   K0  A0  B0   C0   D0
1   K0   K1  A1  B1  NaN  NaN
2   K1   K0  A2  B2   C1   D1
3   K1   K0  A2  B2   C2   D2
4   K2   K1  A3  B3  NaN  NaN

how = right:
  key1 key2    A    B   C   D
0   K0   K0   A0   B0  C0  D0
1   K1   K0   A2   B2  C1  D1
2   K1   K0   A2   B2  C2  D2
3   K2   K0  NaN  NaN  C3  D3



## 4. 統計函數

In [21]:
df = pd.DataFrame(np.random.randn(10, 5), columns=["a", "b", "c", "d", "e"])

# ------- 平均數 -------
print('------- 平均數 -------')
print (f"mean: \n{df.mean()}\n")

# ------- 標準差 -------
print('------- 標準差 -------')
print (f"std: \n{df.std()}\n")

# ------- 共變異數 -------
print('------- 共變異數 -------')

# Cov(a, b)
print (f"Cov(a, b) = {df['a'].cov(df['b'])}\n")

# Cov(all)
print (f"Cov(all): \n{df.cov()}\n")

# ------- 相關係數 -------
print('------- 相關係數 -------')
# Corr(a, b)
print (f"Corr(a, b) = {df['a'].corr(df['b'])}\n")

# Corr(all)
print (f"Corr(all): \n{df.corr()}\n")



------- 平均數 -------
mean: 
a    0.239071
b    0.179996
c   -0.396319
d   -0.064789
e   -0.339403
dtype: float64

------- 標準差 -------
std: 
a    0.788147
b    0.729703
c    0.896874
d    0.780933
e    1.092634
dtype: float64

------- 共變異數 -------
Cov(a, b) = 0.10165984211286627

Cov(all): 
          a         b         c         d         e
a  0.621175  0.101660 -0.169685  0.461521  0.018407
b  0.101660  0.532466 -0.401042  0.023025 -0.329124
c -0.169685 -0.401042  0.804382 -0.136661 -0.050360
d  0.461521  0.023025 -0.136661  0.609856  0.118855
e  0.018407 -0.329124 -0.050360  0.118855  1.193849

------- 相關係數 -------
Corr(a, b) = 0.17676510994632588

Corr(all): 
          a         b         c         d         e
a  1.000000  0.176765 -0.240052  0.749844  0.021375
b  0.176765  1.000000 -0.612791  0.040406 -0.412800
c -0.240052 -0.612791  1.000000 -0.195119 -0.051390
d  0.749844  0.040406 -0.195119  1.000000  0.139293
e  0.021375 -0.412800 -0.051390  0.139293  1.000000



## 5. 其他
- IO(Input & Output)
- Something about nan

In [40]:
# -------- IO -------
df = pd.read_csv('homework.csv')
# df.to_csv(index = False)

In [45]:
# ------- Something about nan -------
df = pd.read_csv('homework.csv')

# drop 
df1 = df.dropna(axis = 0, how = 'any')

# 補零
df2 = df.fillna(value = 0)

# 用一個 dictionary 填補
df3 = df.fillna(value = {f"col{i}" : i for i in range(df.shape[1])})

# 判斷 df 裡是否有 nan
True in np.asarray(df.isnull())

True