In [1]:
import pandas as pd
import numpy as np

In [5]:
rng = np.random.RandomState(42)
a = rng.randint(10, size=(3, 4))
print(a)

[[6 3 7 4]
 [6 9 2 6]
 [7 4 3 7]]


In [6]:
print(a[0]) # a row
print(a - a[0])

[6 3 7 4]
[[ 0  0  0  0]
 [ 0  6 -5  2]
 [ 1  1 -4  3]]


In [7]:
df = pd.DataFrame(a, columns = list("QRST"))
df

Unnamed: 0,Q,R,S,T
0,6,3,7,4
1,6,9,2,6
2,7,4,3,7


In [8]:
# first row, just notice taht it is showed vertically
df.iloc[0]

Q    6
R    3
S    7
T    4
Name: 0, dtype: int32

In [9]:
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,0,6,-5,2
2,1,1,-4,3


In [8]:
df

Unnamed: 0,Q,R,S,T
0,6,3,7,4
1,6,9,2,6
2,7,4,3,7


In [10]:
# other columns subtract column R
df.subtract(df['R'], axis = 0)

Unnamed: 0,Q,R,S,T
0,3,0,4,1
1,-3,0,-7,-3
2,3,0,-1,3


In [11]:
# extract row 0 and step = 2
half_row = df.iloc[0, ::2]
half_row

Q    6
S    7
Name: 0, dtype: int32

In [12]:
# we can't write in this way
df - half_row

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,0.0,,-5.0,
2,1.0,,-4.0,


# Operating on Null values
- pandas provides several useful functions for detecting, removing and replacing null values
- isnull(): generate the boolean mask indicating missing values
- notnull(): opposite of isnull()
- fillna(): missing values filled

## Detecting null values

In [16]:
data = pd.Series([1, np.nan, 'hello', None])
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [17]:
# use mask to print the non-null values
data[data.notnull()]

0        1
2    hello
dtype: object

In [19]:
ser1 = data.dropna()
ser1

0        1
2    hello
dtype: object

# Operating on DataFrames

In [4]:
df = pd.DataFrame([[1, np.nan, 2],
                   [2, 3, 5], 
                  [np.nan, 4, 6]])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [5]:
df.dropna()
# by default, every row that contains NaN will be droped

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [6]:
df.dropna(axis = 1)
# when axis = 1 or axis = 'column', every column that contains NaN will be droped

Unnamed: 0,2
0,2
1,5
2,6


In [7]:
# create a new NaN column 
df[3] = np.nan # 寫一個就夠了不用三個
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [8]:
# if all values in the column are NaN, then drop the column
df.dropna(axis = 'columns', how = 'all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [9]:
# we want to keep at least 3 values when removing NaN
df.dropna(axis = 'rows', thresh = 3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


# Filling null values

In [10]:
data = pd.Series([1, np.nan, 2, None, 3], index = list('abcde'))
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [11]:
# fill every NaN with zero
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [12]:
# forward fill: 用 NaN 前面的值去填那個 NaN
data.fillna(method = 'ffill')

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [13]:
# backward fill: 用 NaN 後面的值去填那個 NaN
data.fillna(method = 'bfill')

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [14]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [15]:
# 以 column 為單位，一行一行往右跑
df.fillna(method = 'bfill', axis = 1)

Unnamed: 0,0,1,2,3
0,1.0,2.0,2.0,
1,2.0,3.0,5.0,
2,4.0,4.0,6.0,


In [16]:
# 以 row 為單位，一行一行往下跑
df.fillna(method = 'bfill', axis = 0)

Unnamed: 0,0,1,2,3
0,1.0,3.0,2,
1,2.0,3.0,5,
2,,4.0,6,


In [19]:
df.fillna(method = 'ffill', axis = 1)

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


# Hierarchical indexing

In [20]:
# tuples as keys
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]

population = [33871648, 37253956,
              18976457, 19378102,
              20851820, 25145561]

In [21]:
pop = pd.Series(population, index = index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [22]:
# slicing, notice that a index is a tuple
pop[('New York', 2000): ('Texas', 2010)]

# e.g. pop['New York':'Texas'] doesn't work

(New York, 2000)    18976457
(New York, 2010)    19378102
(Texas, 2000)       20851820
(Texas, 2010)       25145561
dtype: int64

In [23]:
# convert tuple multiIndex into pandas multiIndex

index2 = pd.MultiIndex.from_tuples(index)
index2

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [24]:
pop = pop.reindex(index2)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [38]:
# the new index is a string, while it is hierarchical
pop['New York':'Texas']

New York  2000    18976457
          2010    19378102
Texas     2000    20851820
          2010    25145561
dtype: int64

In [27]:
# multiIndex 的兩層 index 就像是 dataframe 的 row, column，想取值時寫法一樣

pop[:, 2000]

California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [28]:
# convert multiIndex series into normal dataframe using unstack()
pop_df = pop.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [29]:
# use the 'pop' Series to create a new df. Name a column 'total' and create a new column 'under18'
# 因為 pop 已經有 index 了，pandas 不會一直吵著要加 index?
pop_df =  pd.DataFrame({'total':pop,
                        'under18': [9267089, 9284094,
                                    4687371, 4318033,
                                    5906301, 6879014]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687371
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [30]:
f_u18 = pop_df['under18'] / pop_df['total']
f_u18

California  2000    0.273594
            2010    0.249211
New York    2000    0.247010
            2010    0.222831
Texas       2000    0.283251
            2010    0.273568
dtype: float64

In [12]:
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


# Methods of multiIndex creation
- List
- Dictionary 
- MultiIndex constructors

In [31]:
df = pd.DataFrame(np.random.rand(4, 2), 
                  index = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]], 
                  columns= ['data1', 'data2'])
df

# indices are a1, a2, b1, b2

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.497921,0.423628
a,2,0.95846,0.291268
b,1,0.395133,0.016655
b,2,0.471219,0.990683


In [32]:
# create a dictionary and use tuples as keys.
data = {('California', 2000): 33871648, 
        ('California', 2010): 37253956,
        ('New York', 2000): 18976457, 
        ('New York', 2010): 19378102,
        ('Texas', 2000): 20851820,
        ('Texas', 2010): 20851820}
data

{('California', 2000): 33871648,
 ('California', 2010): 37253956,
 ('New York', 2000): 18976457,
 ('New York', 2010): 19378102,
 ('Texas', 2000): 20851820,
 ('Texas', 2010): 20851820}

In [33]:
pd.Series(data)

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    20851820
dtype: int64

In [42]:
# I finally got how to produce the same fucking df!!!!!
pd.DataFrame(data, index=['population']).T

Unnamed: 0,Unnamed: 1,population
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,20851820


# Explicit multiIndex constructor

In [43]:
# from arrays
pd.MultiIndex.from_arrays([('a', 'a', 'b', 'b'), [1, 2, 1, 2]])

# it's ok to mix tuple ('a', 'a', 'b', 'b') and list [1, 2, 1, 2]

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [44]:
# from tuples
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [46]:
# from a cartesian product of single series
# notice it is 'from_product', not 'products' !!!!!!!!

pd.MultiIndex.from_product([['a', 'b'], [1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [47]:
pop # remember pop is a series with multiIndex

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [48]:
# give the columns names
pop.index.names = ['state', 'year']
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

# MultiIndex for columns

In [15]:
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names = ['year', 'visit'])

# although they are columns, their datatype can be multiIndex as well. 
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['Heart_Rate', 'Temperature']],
                                      names = ['patient', 'measurement'])
# year visit
# 2013 1
#      2
# 2014 1
#      2

# patient      Bob                      Guido                    Sue
# measurement  Heart_Rate  Temperature  Heart_Rate  Temperature  Heart_Rate  Temperature

In [16]:
# np.random.randn(4, 6): 產生4x6 隨機陣列，其值可能是負或正，可能大於一或小於一
# np.round(..., 1): 取整數到小數點後第 1 位

data = np.round(np.random.randn(4, 6), 1) # round of 1
data

array([[-0.1, -0.1,  0.8,  0.6,  0.6,  0.3],
       [-2.9, -0.4,  0.6, -0.6,  0.2,  0.4],
       [-0.2,  0.7, -0.1, -1. ,  0.3, -0.1],
       [ 0.2,  0.4,  0.1,  0.4,  1. , -1.8]])

In [17]:
# get all the rows and step = 2

data[:, ::2] *= 10
data

array([[ -1. ,  -0.1,   8. ,   0.6,   6. ,   0.3],
       [-29. ,  -0.4,   6. ,  -0.6,   2. ,   0.4],
       [ -2. ,   0.7,  -1. ,  -1. ,   3. ,  -0.1],
       [  2. ,   0.4,   1. ,   0.4,  10. ,  -1.8]])

In [18]:
data +=37
print(abs(data)) # make it more likely in a heart rate and temperature range

[[36.  36.9 45.  37.6 43.  37.3]
 [ 8.  36.6 43.  36.4 39.  37.4]
 [35.  37.7 36.  36.  40.  36.9]
 [39.  37.4 38.  37.4 47.  35.2]]


In [24]:
health_data = pd.DataFrame(data, index = index, columns = columns)
health_data

Unnamed: 0_level_0,patient,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,measurement,Heart_Rate,Temperature,Heart_Rate,Temperature,Heart_Rate,Temperature
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,36.0,36.9,45.0,37.6,43.0,37.3
2013,2,8.0,36.6,43.0,36.4,39.0,37.4
2014,1,35.0,37.7,36.0,36.0,40.0,36.9
2014,2,39.0,37.4,38.0,37.4,47.0,35.2


In [31]:
# Guido's health data
health_data['Guido']

Unnamed: 0_level_0,measurement,Heart_Rate,Temperature
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,45.0,37.6
2013,2,43.0,36.4
2014,1,36.0,36.0
2014,2,38.0,37.4


In [32]:
# Guido's health data in 2013
health_data.loc[2013, 'Guido']

# 注意我們都是用外層的 index

measurement,Heart_Rate,Temperature
visit,Unnamed: 1_level_1,Unnamed: 2_level_1
1,45.0,37.6
2,43.0,36.4


In [33]:
# Guido's heart rate
health_data['Guido', 'Heart_Rate']

year  visit
2013  1        45.0
      2        43.0
2014  1        36.0
      2        38.0
Name: (Guido, Heart_Rate), dtype: float64

In [35]:
# Bob and Guido's records in 2013
health_data.iloc[:2, :4]

Unnamed: 0_level_0,patient,Bob,Bob,Guido,Guido
Unnamed: 0_level_1,measurement,Heart_Rate,Temperature,Heart_Rate,Temperature
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2013,1,36.0,36.9,45.0,37.6
2013,2,8.0,36.6,43.0,36.4


In [52]:
# Guido's heart rate in 2013 and 2014
health_data.loc[:, ('Guido', 'Heart_Rate')]

year  visit
2013  1        29.0
      2        33.0
2014  1        43.0
      2        47.0
Name: (Guido, Heart_Rate), dtype: float64

In [53]:
# all patients' heart rate in the first visit of 2013 and 2014
# but we can't write in this way
health_data.iloc[(:, 1), {:, 'Heart_Rate'}]

SyntaxError: invalid syntax (1656083080.py, line 3)

In [36]:
# magical function makes we slice the data layer by layer with .loc[]
idx = pd.IndexSlice 

# all patients' heart rate in the first visit of 2013 and 2014
health_data.loc[idx[:, 1], idx[:, 'Heart_Rate']]

Unnamed: 0_level_0,patient,Bob,Guido,Sue
Unnamed: 0_level_1,measurement,Heart_Rate,Heart_Rate,Heart_Rate
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,36.0,45.0,43.0
2014,1,35.0,36.0,40.0


In [39]:
# Bob's heart rate in the first visit of 2014
health_data.loc[idx[2014, 1], idx['Bob', 'Heart_Rate']]

35.0

In [40]:
# Bob's and Sue's record in the first visit of 2014
health_data.loc[idx[2014, 1], idx[['Bob', 'Sue'], :]]

patient  measurement
Bob      Heart_Rate     35.0
         Temperature    37.7
Sue      Heart_Rate     40.0
         Temperature    36.9
Name: (2014, 1), dtype: float64

In [41]:
# the mean of all records in 2013 and 2014 on average

data_mean = health_data.mean(level='year')  # row wise
data_mean

  data_mean = health_data.mean(level='year')  # row wise


patient,Bob,Bob,Guido,Guido,Sue,Sue
measurement,Heart_Rate,Temperature,Heart_Rate,Temperature,Heart_Rate,Temperature
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,22.0,36.75,44.0,37.0,41.0,37.35
2014,37.0,37.55,37.0,36.7,43.5,36.05


In [42]:
data_mean.mean(axis = 1, level = 'measurement') # column wise

  data_mean.mean(axis = 1, level = 'measurement') # column wise


measurement,Heart_Rate,Temperature
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,35.666667,37.033333
2014,39.166667,36.766667


# Combine datasets
numpy can only concatenate list, tuple and so on. It is not suitable for contatenating Series or DataFrames.

In [61]:
x = [1, 2, 3]
y = [4, 5, 6]
z = [7, 8, 9]
np.concatenate([x, y, z]) 
# we CAN NOT specify axis because x, y and z are only 1-dim arrays

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [57]:
x = [[1, 2],
     [3, 4]]
np.concatenate([x, x])

array([[1, 2],
       [3, 4],
       [1, 2],
       [3, 4]])

In [58]:
np.concatenate([x, x], axis=1) 
# we CAN specify axis because x is 2-dim array
# horizontally = column wise

array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

In [59]:
x = (1, 2, 3)
y = (4, 5, 6)
np.concatenate([x, y])

array([1, 2, 3, 4, 5, 6])

# Concatenate Series and DataFrame

In [63]:
ser1 = pd.Series(['A', 'B', 'C'], index = [1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index = [4, 5, 6])

pd.concat([ser1, ser2]) # default: vertically = row wise 

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [64]:
pd.concat([ser1, ser2], axis=1)

Unnamed: 0,0,1
1,A,
2,B,
3,C,
4,,D
5,,E
6,,F


In [66]:
df1 = pd.DataFrame(np.arange(0, 9).reshape(3, 3), 
                   index = [1, 2, 3], columns = ['a', 'b', 'c'])
df1

Unnamed: 0,a,b,c
1,0,1,2
2,3,4,5
3,6,7,8


In [67]:
df2 = pd.DataFrame(np.arange(9, 18).reshape(3, 3), 
                   index = [4, 5, 6], columns = ['a', 'b', 'c'])
df2

Unnamed: 0,a,b,c
4,9,10,11
5,12,13,14
6,15,16,17


In [68]:
# combine them row wise (vertically), notice that the indices should be the same.
pd.concat([df1, df2])

Unnamed: 0,a,b,c
1,0,1,2
2,3,4,5
3,6,7,8
4,9,10,11
5,12,13,14
6,15,16,17


In [61]:
df3 = pd.DataFrame(np.arange(18, 27).reshape(3, 3), 
                   index = [1, 2, 3], columns = ['d', 'e', 'f'])
df3

Unnamed: 0,d,e,f
1,18,19,20
2,21,22,23
3,24,25,26


In [62]:
# combine them column wise (horizontally), notice that the indices should be the same.
pd.concat([df1, df3], axis = 1)

Unnamed: 0,a,b,c,d,e,f
1,0,1,2,18,19,20
2,3,4,5,21,22,23
3,6,7,8,24,25,26
