In [253]:
# Data Manipulation with Pandas

# pd is built on top of numpy - dealing with dataframe effectively and efficiently

# 这个文档 介绍 pandas


import numpy as np
import pandas as pd


In [254]:
test1 = pd.Series(np.arange(1,100), index = np.arange(2,101))
print(test1)

2       1
3       2
4       3
5       4
6       5
       ..
96     95
97     96
98     97
99     98
100    99
Length: 99, dtype: int32


In [255]:
# pd has 3 data structures: series, dataframe, and Index

# 1. pd series, which is one dimensional, but more general and flexible compared to a one dimensional array, because it has both value and index (we can get access to the value via the index), 
data= pd.Series(np.arange(0, 1, 0.25), index = [3,4,7,6])
print(data)

3    0.00
4    0.25
7    0.50
6    0.75
dtype: float64


In [256]:
print(data.iloc[2:4])
print(data.loc[3:4])

7    0.50
6    0.75
dtype: float64
3    0.00
4    0.25
dtype: float64


In [257]:
print(data.values) # if .values, then the output exclude the index
print(data[1:2])   
print(data.iloc[2:4]) #extract data based on the location of the index variable, iloc = index location
print(data.iloc[2:3,])
print(data.loc[3:7,]) # extract data based on the value of the index variable
print(data.loc[7])

print(data.loc[data.index.isin([3,7])])
print(data.loc[[3,7]])

[0.   0.25 0.5  0.75]
4    0.25
dtype: float64
7    0.50
6    0.75
dtype: float64
7    0.5
dtype: float64
3    0.00
4    0.25
7    0.50
dtype: float64
0.5
3    0.0
7    0.5
dtype: float64
3    0.0
7    0.5
dtype: float64


In [258]:
data.index

Int64Index([3, 4, 7, 6], dtype='int64')

In [259]:
# e.g., Series as generalized NumPy array - because pd series can explicitly define the index (np array implicitly use intergers as index)

data= pd.Series(np.arange(0,1, 0.25), index = ['a', 'b', 'c', 'd'])
print(data)
print(data['b'])
print(data[['b','c']])
print(data['b':'d'])

a    0.00
b    0.25
c    0.50
d    0.75
dtype: float64
0.25
b    0.25
c    0.50
dtype: float64
b    0.25
c    0.50
d    0.75
dtype: float64


In [260]:
data1 = pd.Series([0.25, 0.5, 0.75, 1.0],
index=[2, 5, 3, 7])
print(data1)

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64


In [261]:
data1.index = [3,4,55,6]
print(data1)

3     0.25
4     0.50
55    0.75
6     1.00
dtype: float64


In [262]:
# In this way, we can think of a Pandas Series a bit like a specialization of a Python
# dictionary. 我们可以从dict类型的数据直接创建pd series，By default, a Series will be created where the index is drawn from the sorted keys.

population_dict = {'California': 38332521,
'Texas': 26448193,
'New York': 19651127,
'Florida': 19552860,
'Illinois': 12882135}

population = pd.Series(population_dict)
print(population)
print(population.index)
print(population['California':'Texas'])

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64
Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
California    38332521
Texas         26448193
dtype: int64


In [263]:
#Constructing Series objects

index = ['a', 'b', 'c', 'd']
pd.Series(data, index= index)


a    0.00
b    0.25
c    0.50
d    0.75
dtype: float64

In [264]:
pd.Series(2, index = [1,2,3,4,5,6,7,8])

1    2
2    2
3    2
4    2
5    2
6    2
7    2
8    2
dtype: int64

In [265]:
pd.Series({2:'a', 3:'c', 5:'dd'})

2     a
3     c
5    dd
dtype: object

In [266]:
# The Pandas DataFrame Object
# a DataFrame is an analog of a two-dimensional array with both flexible row indices and flexible column names.

In [267]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297, 'Florida': 170312, 'Illinois': 149995}
print(area_dict)
area= pd.Series(area_dict)
print(area)

{'California': 423967, 'Texas': 695662, 'New York': 141297, 'Florida': 170312, 'Illinois': 149995}
California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64


In [268]:
print(population)
print(area)

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64
California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64


In [269]:
print(population)
states = pd.DataFrame({'population': population, 'area':area})
print(states)
print(type(states))

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64
            population    area
California    38332521  423967
Texas         26448193  695662
New York      19651127  141297
Florida       19552860  170312
Illinois      12882135  149995
<class 'pandas.core.frame.DataFrame'>


In [270]:
print(states.index)
print(states.columns)

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
Index(['population', 'area'], dtype='object')


In [271]:
# We can think of a DataFrame as a specialization of a dictionary. Where a dictionary maps a key to a value, a DataFrame maps a column name to a Series of column data.

print(states['area'])



California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64


In [272]:
print(states)

            population    area
California    38332521  423967
Texas         26448193  695662
New York      19651127  141297
Florida       19552860  170312
Illinois      12882135  149995


In [273]:
print(states.iloc[:,1])
print(states.loc[:,'area'])

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64
California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64


In [274]:
# Notice the potential point of confusion here: in a two-dimensional NumPy array,
# data[0] will return the first row. For a DataFrame, data['col0'] will return the first
# column. Because of this, it is probably better to think about DataFrames as generalized
# dictionaries rather than generalized arrays, though both ways of looking at the situation
# can be useful.

In [275]:
# Constructing DataFrame objects
# A DataFrame is a collection of Series objects, and a singlecolumn DataFrame can be constructed from a single Series:
# 我们可以从一个pd series 创建dataframe

pd.DataFrame(population, columns = ['population'])

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [276]:
# 我们也可以从一个dict object创建dataframe
data= [{'a': i, 'b': 2 * i} for i in range(3)]
print(data)
print(pd.DataFrame(data))

[{'a': 0, 'b': 0}, {'a': 1, 'b': 2}, {'a': 2, 'b': 4}]
   a  b
0  0  0
1  1  2
2  2  4


In [277]:
# 当从dict 创建dataframe时，pd 会自动补全missing value 
pd.DataFrame([{'a' :1 , 'b':2}, {'b':3, 'c':4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [278]:
pd.DataFrame({'population': population , 'area': area})

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [279]:
# 我们可以从一个multdimentional array创建一个dataframe， 注意相应的columns和index要匹配
pd.DataFrame(np.random.rand(4,2), columns = ['foo', 'bar'], index = ['a', 'b', 'c', 'd'])

Unnamed: 0,foo,bar
a,0.052901,0.254122
b,0.61323,0.938197
c,0.791225,0.498679
d,0.052215,0.206166


In [280]:
pd.DataFrame(np.random.rand(4,2), columns = ['foo', 'bar']) #或者索性用默认的index

Unnamed: 0,foo,bar
0,0.048794,0.636546
1,0.086716,0.699196
2,0.6193,0.452641
3,0.571105,0.688388


In [281]:
# 我们可以从 structured array 创建 pd dataframe
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
print(A)
print(pd.DataFrame(A))

[(0, 0.) (0, 0.) (0, 0.)]
   A    B
0  0  0.0
1  0  0.0
2  0  0.0


In [282]:
# 我们可以把 dataframe 的index看作一个 immutable array, 它与通常意义上的array的不同之处在于它包含的value是不可更改的
ind = pd.Index([2,3,4,7,11])
print(ind)
print(ind[3])
print(ind.values)

Int64Index([2, 3, 4, 7, 11], dtype='int64')
7
[ 2  3  4  7 11]


In [283]:
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])
indAB = indA & indB
print(indAB) # indA and indB


Int64Index([3, 5, 7], dtype='int64')


  indAB = indA & indB


In [284]:
indAorB = indA | indB
print(indAorB) # indA and indB



Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')


  indAorB = indA | indB


In [285]:
# data indexing and selection

In [286]:
data = pd.Series(np.arange(0,1,0.25), index=['a', 'b', 'c', 'd'])
print(data)

a    0.00
b    0.25
c    0.50
d    0.75
dtype: float64


In [287]:
data['b']

0.25

In [288]:
'a' in data

True

In [289]:
data.keys()[3]

print(data.keys().values)
print(data)

['a' 'b' 'c' 'd']
a    0.00
b    0.25
c    0.50
d    0.75
dtype: float64


In [290]:
data.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [291]:
print(data)

a    0.00
b    0.25
c    0.50
d    0.75
dtype: float64


In [292]:
print(data.items())

for i in data.items() :
    print(i)

<zip object at 0x000001675C9B0680>
('a', 0.0)
('b', 0.25)
('c', 0.5)
('d', 0.75)


In [293]:
list(data.items()) # items() is usually used for iteration for index with values.

[('a', 0.0), ('b', 0.25), ('c', 0.5), ('d', 0.75)]

In [294]:
print(data)
# 这里data是一个pd series， 我们可以通过指定的index 来access
print(data['a':'c'])
#或者通过隐含的 index 来access
print(data[0:2])

a    0.00
b    0.25
c    0.50
d    0.75
dtype: float64
a    0.00
b    0.25
c    0.50
dtype: float64
a    0.00
b    0.25
dtype: float64


In [295]:
# 可以使用masking 来获取相关value
print((data > 0.3) & (data < 0.8))
print(data[(data > 0.3) & (data < 0.8)])

a    False
b    False
c     True
d     True
dtype: bool
c    0.50
d    0.75
dtype: float64


In [296]:
# # Indexers: loc, iloc, and ix
# 当pd series 的index 是数字的时候，有可能出现误读，例如


data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
data


1    a
3    b
5    c
dtype: object

In [297]:
# 为了避免误读，我们用  loc 指代我们在series 中明确的index
print(data.loc[1])
print(data.loc[1:3])
print(data.iloc[1:3])

a
1    a
3    b
dtype: object
3    b
5    c
dtype: object


In [298]:
print(data)

1    a
3    b
5    c
dtype: object


In [299]:

# 我们用iloc 来指代我们在series 中默认的index
#  iloc = implicit location
print(data.iloc[1])
print(data.iloc[1:3])

print(data.loc[data.index.isin([1,5])])
print(data.loc[[1,5]])

b
3    b
5    c
dtype: object
1    a
5    c
dtype: object
1    a
5    c
dtype: object


In [300]:
import pandas as pd
# One guiding principle of Python code is that “explicit is better than implicit.”

In [301]:
# data selection in dataframe

area = pd.Series({'California': 423967, 'Texas': 695662,
'New York': 141297, 'Florida': 170312,
'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
'New York': 19651127, 'Florida': 19552860,
'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
print(data)

              area       pop
California  423967  38332521
Texas       695662  26448193
New York    141297  19651127
Florida     170312  19552860
Illinois    149995  12882135


In [302]:
data['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [303]:
data.iloc[3,1]

19552860

In [304]:
data.iloc[:,0]

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [305]:
# print(data.loc[:,'area'])
data[data['area']>300000].loc[:, 'area']

California    423967
Texas         695662
Name: area, dtype: int64

In [306]:
# we can use attribute-style access with column names that are strings
print(data.area)

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64


In [307]:
data.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [308]:
data['area'] is data.area

True

In [309]:
# we need to avoid using data.x1 = 'xx' to allocate value, because data.x1 may get confused with py functions.
# instead, we should use data['x1'] = 'xx' to set value
data['density'] = data['pop'] / data['area']
print(data)

              area       pop     density
California  423967  38332521   90.413926
Texas       695662  26448193   38.018740
New York    141297  19651127  139.076746
Florida     170312  19552860  114.806121
Illinois    149995  12882135   85.883763


In [310]:
print(data.iloc[:,1].values)
print(type(data.iloc[:,1]))
print(type(data.iloc[:,1].values))

xx = pd.Series(data.iloc[:,1].values) # we can convert the values of one column of pd dataframe from np array to one dimensional df.

print(xx)

[38332521 26448193 19651127 19552860 12882135]
<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>
0    38332521
1    26448193
2    19651127
3    19552860
4    12882135
dtype: int64


In [311]:
print(type(data.values))
print(type(data))

<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>


In [312]:
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
pop,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


In [313]:
data.values[0]

array([4.23967000e+05, 3.83325210e+07, 9.04139261e+01])

In [314]:
# iloc 和 loc 都是pd dataframe 定位数据的方式，iloc使用默认index， loc使用自定义的index，当自定义的index是数字方式时，两者尤其要区分
# while indexing refers to columns, slicing refers to rows:

# indexing
print(data[1:3])

# slicing
print(data.iloc[:3,:2])

            area       pop     density
Texas     695662  26448193   38.018740
New York  141297  19651127  139.076746
              area       pop
California  423967  38332521
Texas       695662  26448193
New York    141297  19651127


In [315]:
data.loc[:'Illinois', :'pop']

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [316]:
data.iloc[[2,3],2]

New York    139.076746
Florida     114.806121
Name: density, dtype: float64

In [317]:
data['density']>100

California    False
Texas         False
New York       True
Florida        True
Illinois      False
Name: density, dtype: bool

In [318]:
# 带条件的indexing，结果是针对row的
data.loc[data['density']>100, :]

Unnamed: 0,area,pop,density
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


In [319]:
# can be simplified as :
print(data[data['density']>100])

            area       pop     density
New York  141297  19651127  139.076746
Florida   170312  19552860  114.806121


In [320]:
# 带条件的slicing， 结果是针对column的
data.loc[data['density'] > 100, ['pop', 'density']]

Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [321]:
data.iloc[0,2] = 90
print(data)

              area       pop     density
California  423967  38332521   90.000000
Texas       695662  26448193   38.018740
New York    141297  19651127  139.076746
Florida     170312  19552860  114.806121
Illinois    149995  12882135   85.883763


In [322]:
# Operating on Data in Pandas

# Pandas inherits much of the functionality of ufuncs from NumPy

# Ufuncs: Index Preservation

rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0,10,4))
ser

0    6
1    3
2    7
3    4
dtype: int32

In [323]:
df = pd.DataFrame(rng.randint(0,10, (3,4)), columns = ['a', 'b', 'c', 'd'])

In [324]:
print(df)

   a  b  c  d
0  6  9  2  6
1  7  4  3  7
2  7  2  5  4


In [325]:
# If we apply a NumPy ufunc on either of these objects, the result will be another Pandas
# object with the indices preserved: e.g., the index will still be 0 1 2 3

np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [326]:
# e.g., the index will still be 0 1 2 
np.exp(df)


# %timeit print((df)**2)


Unnamed: 0,a,b,c,d
0,403.428793,8103.083928,7.389056,403.428793
1,1096.633158,54.59815,20.085537,1096.633158
2,1096.633158,7.389056,148.413159,54.59815


In [327]:
print(type(np.exp(df)))

print(df +33)

<class 'pandas.core.frame.DataFrame'>
    a   b   c   d
0  39  42  35  39
1  40  37  36  40
2  40  35  38  37


In [328]:
%timeit print(np.power(df,2))

    a   b   c   d
0  36  81   4  36
1  49  16   9  49
2  49   4  25  16
    a   b   c   d
0  36  81   4  36
1  49  16   9  49
2  49   4  25  16
    a   b   c   d
0  36  81   4  36
1  49  16   9  49
2  49   4  25  16
    a   b   c   d
0  36  81   4  36
1  49  16   9  49
2  49   4  25  16
    a   b   c   d
0  36  81   4  36
1  49  16   9  49
2  49   4  25  16
    a   b   c   d
0  36  81   4  36
1  49  16   9  49
2  49   4  25  16
    a   b   c   d
0  36  81   4  36
1  49  16   9  49
2  49   4  25  16
    a   b   c   d
0  36  81   4  36
1  49  16   9  49
2  49   4  25  16
    a   b   c   d
0  36  81   4  36
1  49  16   9  49
2  49   4  25  16
    a   b   c   d
0  36  81   4  36
1  49  16   9  49
2  49   4  25  16
    a   b   c   d
0  36  81   4  36
1  49  16   9  49
2  49   4  25  16
    a   b   c   d
0  36  81   4  36
1  49  16   9  49
2  49   4  25  16
    a   b   c   d
0  36  81   4  36
1  49  16   9  49
2  49   4  25  16
    a   b   c   d
0  36  81   4  36
1  49  16   9  49
2  49   4 

In [329]:
test1 = pd.Series([3,3,4,5,5,56,6], index = np.arange(3,10), name="xxx")
print(test1.name)
print(test1)
print(test1.index)

xxx
3     3
4     3
5     4
6     5
7     5
8    56
9     6
Name: xxx, dtype: int64
Int64Index([3, 4, 5, 6, 7, 8, 9], dtype='int64')


In [330]:
# ufuncs: index alignment
# Index alignment in Series


area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
'California': 423967}, name='area1')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
'New York': 19651127}, name='population1')

print(area)
print(population)

Alaska        1723337
Texas          695662
California     423967
Name: area1, dtype: int64
California    38332521
Texas         26448193
New York      19651127
Name: population1, dtype: int64


In [331]:
population / area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [332]:
area.index | population.index

  area.index | population.index


Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [333]:
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
print(A, B, A + B)

0    2
1    4
2    6
dtype: int64 1    1
2    3
3    5
dtype: int64 0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64


In [334]:
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [335]:
# Index alignment in DataFrame

A = pd.DataFrame(rng.randint(0, 20, (2, 2)),
columns= ['A', 'B'])
print(A)

   A   B
0  1  11
1  5   1


In [336]:
B = pd.DataFrame(rng.randint(0, 10, (3,3)), columns= list(['B','A','C']))
B

Unnamed: 0,B,A,C
0,4,0,9
1,5,8,0
2,9,2,6


In [337]:
A + B

Unnamed: 0,A,B,C
0,1.0,15.0,
1,13.0,6.0,
2,,,


In [338]:
print(A)
print(A.stack())

print(A.stack().mean())

   A   B
0  1  11
1  5   1
0  A     1
   B    11
1  A     5
   B     1
dtype: int32
4.5


In [339]:
fill = A.stack().mean()
print(fill)
print(A.stack())
A.add(B, fill_value= fill) #以这种方式把B 加到A上， 同时补全所有不匹配的数值

4.5
0  A     1
   B    11
1  A     5
   B     1
dtype: int32


Unnamed: 0,A,B,C
0,1.0,15.0,13.5
1,13.0,6.0,4.5
2,6.5,13.5,10.5


In [340]:
print(B.subtract(A))  # 另外，出来 add之外还有其他的运算符
print(B.mul(A))

     A    B   C
0 -1.0 -7.0 NaN
1  3.0  4.0 NaN
2  NaN  NaN NaN
      A     B   C
0   0.0  44.0 NaN
1  40.0   5.0 NaN
2   NaN   NaN NaN


In [341]:
# Operations between a DataFrame and a Series are similar to operations between a two-dimensional and one-dimensional
# NumPy array.

A = rng.randint(10, size = (3, 4))
A

array([[3, 8, 2, 4],
       [2, 6, 4, 8],
       [6, 1, 3, 8]])

In [342]:
# modifying values with fancy indexing
A - A[0]

array([[ 0,  0,  0,  0],
       [-1, -2,  2,  4],
       [ 3, -7,  1,  4]])

In [343]:
A - A[0,0]

array([[ 0,  5, -1,  1],
       [-1,  3,  1,  5],
       [ 3, -2,  0,  5]])

In [344]:
# pandas also follow the broadcasting rule in numpy
df = pd.DataFrame(A, columns= list('QRST'))
print(df)
print(df - df.iloc[0])


   Q  R  S  T
0  3  8  2  4
1  2  6  4  8
2  6  1  3  8
   Q  R  S  T
0  0  0  0  0
1 -1 -2  2  4
2  3 -7  1  4


In [345]:
df.subtract(df['R'], axis = 0)

Unnamed: 0,Q,R,S,T
0,-5,0,-6,-4
1,-4,0,-2,2
2,5,0,2,7


In [346]:
df.loc[:,'R']

0    8
1    6
2    1
Name: R, dtype: int32

In [347]:
df.subtract(df['R'], axis= 0)

Unnamed: 0,Q,R,S,T
0,-5,0,-6,-4
1,-4,0,-2,2
2,5,0,2,7


In [348]:
df.subtract(df.loc[:,'R'], axis= 0)

Unnamed: 0,Q,R,S,T
0,-5,0,-6,-4
1,-4,0,-2,2
2,5,0,2,7


In [349]:
df.subtract(df['R'], axis= 0)

Unnamed: 0,Q,R,S,T
0,-5,0,-6,-4
1,-4,0,-2,2
2,5,0,2,7


In [350]:
df

Unnamed: 0,Q,R,S,T
0,3,8,2,4
1,2,6,4,8
2,6,1,3,8


In [351]:
halfrow = df.iloc[0, ::2]
print(halfrow)

Q    3
S    2
Name: 0, dtype: int32


In [352]:
df

Unnamed: 0,Q,R,S,T
0,3,8,2,4
1,2,6,4,8
2,6,1,3,8


In [353]:
df -halfrow

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,-1.0,,2.0,
2,3.0,,1.0,


In [354]:
# handling missing data 

# 两种方式：
# 用一个mask array标注missing value的index，或者把missing value 替换成一个约定的数值，例如-9999, NaN, or None

vals1 = np.array([1, None, 3, 4])
print(vals1)


[1 None 3 4]


In [355]:
vals1

array([1, None, 3, 4], dtype=object)

In [356]:
# vals1.sum() will get an error

In [357]:
# NaN means Not a Number , NaN 参与运算但结果结尾NaN
vals2 = np.array([1, np.nan, 3,4])
vals2.dtype
print(vals2)

[ 1. nan  3.  4.]


In [358]:
1 + np.nan

nan

In [359]:
0 * np.nan

nan

In [360]:
vals2.sum()

nan

In [361]:
# 也可以使用一些特殊的方式来运算 - 可以忽略NaN 
print(np.nansum(vals2))
print(np.nanmax(vals2))

8.0
4.0


In [362]:
# NaN and None in pandas, NaN and None are used interchangably.

pd.Series([1, np.nan, 2, None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [363]:
x= pd.Series(range(2), dtype= int)
x

0    0
1    1
dtype: int32

In [364]:
x[0] = None
x

0    NaN
1    1.0
dtype: float64

In [365]:
# detecting missing values
data= pd.Series([1, np.nan, 'hello', None])

data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [366]:
data[data.notnull()]

0        1
2    hello
dtype: object

In [367]:
data[data.notna()]

0        1
2    hello
dtype: object

In [368]:
data.dropna()

0        1
2    hello
dtype: object

In [369]:
df= pd.DataFrame([[1, np.nan, 2], [2,3,5], [np.nan, 4,6]], columns= ['A','B','C'])
print(df)

     A    B  C
0  1.0  NaN  2
1  2.0  3.0  5
2  NaN  4.0  6


In [370]:
df.dropna() # 我们不能drop 一个df内的单一数值，我们只能drop一整行或者一整列

Unnamed: 0,A,B,C
1,2.0,3.0,5


In [371]:
df.dropna(axis= 'columns')

Unnamed: 0,C
0,2
1,5
2,6


In [372]:
df[3] = np.nan
df

Unnamed: 0,A,B,C,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [373]:
df.iloc[:,2] = np.nan
df

Unnamed: 0,A,B,C,3
0,1.0,,,
1,2.0,3.0,,
2,,4.0,,


In [374]:
df.dropna(axis= 'columns', how= 'all') # 我们可以挑选drop 的条件，例如，可以仅仅drop 数值全为non的column, how= 'all', 或许至少有多少none- NaN的columns才保留, e.g., thresh = 3

Unnamed: 0,A,B
0,1.0,
1,2.0,3.0
2,,4.0


In [375]:
df.dropna(axis='columns', thresh= 3)

0
1
2


In [376]:
# filling null values 

data= pd.Series([1, np.nan, 2, None, 3], index = list('abcde'))
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [377]:
data.fillna(99)

a     1.0
b    99.0
c     2.0
d    99.0
e     3.0
dtype: float64

In [378]:
data.fillna(method = 'ffill') # fill the none value with the previous value

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [379]:
data.fillna(method= 'bfill') # fill the none value with the next value backward

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [380]:
df

Unnamed: 0,A,B,C,3
0,1.0,,,
1,2.0,3.0,,
2,,4.0,,


In [381]:
# we can do this for dataframe as well 
df.fillna(method = 'ffill', axis=1)

Unnamed: 0,A,B,C,3
0,1.0,1.0,1.0,1.0
1,2.0,3.0,3.0,3.0
2,,4.0,4.0,4.0


In [382]:
df.fillna(method = 'ffill', axis=0)

Unnamed: 0,A,B,C,3
0,1.0,,,
1,2.0,3.0,,
2,2.0,4.0,,


In [383]:
df.fillna(method = 'ffill')

Unnamed: 0,A,B,C,3
0,1.0,,,
1,2.0,3.0,,
2,2.0,4.0,,


In [384]:
# hierarchical indexing


# a multi indexed series

index = [('California', 2000), ('California', 2010),
('New York', 2000), ('New York', 2010),
('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
18976457, 19378102,
20851820, 25145561]

pop = pd.Series(populations, index=index)
print(pop)

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64


In [385]:
pop[('California', 2010):('Texas', 2000)]

(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64

In [386]:
pop[[i for i in pop.index if i[1] == 2010]]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

In [387]:
kk = list([1,3,2,4,2,3,2,4,11])

kk1 = list([1,3,2,4,'xx',3,2,4,3])
kk2 = list(zip(kk,kk1))
print(kk2)



[(1, 1), (3, 3), (2, 2), (4, 4), (2, 'xx'), (3, 3), (2, 2), (4, 4), (11, 3)]


In [388]:
x1 = list([i for i in kk2])

print(x1)

n = 1
xx =  [x[n] for x in x1]

print(xx)

[(1, 1), (3, 3), (2, 2), (4, 4), (2, 'xx'), (3, 3), (2, 2), (4, 4), (11, 3)]
[1, 3, 2, 4, 'xx', 3, 2, 4, 3]


In [389]:
x2 = pd.Series([i[0] for i in x1], index = kk)
print(x2)

1      1
3      3
2      2
4      4
2      2
3      3
2      2
4      4
11    11
dtype: int64


In [390]:
# The better way: Pandas MultiIndex

index = [('California', 2000), ('California', 2010),
('New York', 2000), ('New York', 2010),
('Texas', 2000), ('Texas', 2010)]

index = pd.MultiIndex.from_tuples(index)
print(index)

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )


In [391]:
pop = pop.reindex(index)
print(pop)

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64


In [392]:
# MultiIndex as extra dimension
# page 130


pop_df = pop.unstack()
print(pop_df)

                2000      2010
California  33871648  37253956
New York    18976457  19378102
Texas       20851820  25145561


In [393]:
pop_df.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [394]:
test1 = pop_df.stack()
print(test1)

print(test1.unstack())

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
                2000      2010
California  33871648  37253956
New York    18976457  19378102
Texas       20851820  25145561


In [395]:
pop_df = pd.DataFrame({'total': pop,'under18': [9267089, 9284094,4687374, 4318033,5906301, 6879014]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [396]:
f_u18 = pop_df['under18'] / pop_df['total']
print(f_u18.unstack())

                2000      2010
California  0.273594  0.249211
New York    0.247010  0.222831
Texas       0.283251  0.273568


In [397]:
# Methods of MultiIndex Creationd

f = pd.DataFrame(np.random.rand(4, 2), index=[['a', 'c', 'b', 'b'], [1, 2, 1, 2]],columns=['data1', 'data2'])
df

Unnamed: 0,A,B,C,3
0,1.0,,,
1,2.0,3.0,,
2,,4.0,,


In [398]:
data = {('California', 2000): 33871648,
('California', 2010): 37253956,
('Texas', 2000): 20851820,
('Texas', 2010): 25145561,
('New York', 2000): 18976457,
('New York', 2010): 19378102}
pd.Series(data).unstack()

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [399]:
# Explicit MultiIndex constructors 建立多维索引的方式


In [400]:
list1 = ['a', 'a', 'b', 'b']
list2 = [1, 2, 1, 2]

index1 = pd.MultiIndex.from_arrays([list1, list2])
x0 = pd.Series('xxx', index = index1)

print(x0)

a  1    xxx
   2    xxx
b  1    xxx
   2    xxx
dtype: object


In [401]:
x1 = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])
print(x1)

test1 = pd.DataFrame([ range(333,337), range(1,5), range(1,5), range(1,5)], index = x1 )

print(test1.unstack())

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )
     0       1       2       3   
     1  2    1  2    1  2    1  2
a  333  1  334  2  335  3  336  4
b    1  1    2  2    3  3    4  4


In [243]:
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [404]:
# MultiIndex level names 给刚才建立的index 命名

pop.index.names = ['state', 'year']
print(pop)
# pop= pd.DataFrame(columns=["xxx1", "xxx3"])

print(pop)

pop1 = pd.DataFrame(pop, columns=['A' ])
print(pop1)


state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
                        A
state      year          
California 2000  33871648
           2010  37253956
New York   2000  18976457
           2010  19378102
Texas      2000  20851820
           2010  25145561


In [405]:
# MultiIndex for columns  对于dataframe，我们可以有多维的行，也可以有多维的列
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
names=['subject', 'type'])


In [406]:
print(index)

MultiIndex([(2013, 1),
            (2013, 2),
            (2014, 1),
            (2014, 2)],
           names=['year', 'visit'])


In [407]:
print(columns)

MultiIndex([(  'Bob',   'HR'),
            (  'Bob', 'Temp'),
            ('Guido',   'HR'),
            ('Guido', 'Temp'),
            (  'Sue',   'HR'),
            (  'Sue', 'Temp')],
           names=['subject', 'type'])


In [408]:
data = np.round(np.random.randn(4,6), 1)
# randn 是给出一个sample 的随机数
print(data)

[[-0.7 -0.2  0.8  0.4  0.6 -0.4]
 [-0.3  0.3  1.1 -0.3  0.  -0.4]
 [ 1.2 -0.3  1.4  0.2 -0.3  2.5]
 [-0.7  0.3  0.8 -0.5 -1.  -0.5]]


In [409]:
data[:, ::2]*= 10
data += 37
print(data)

[[30.  36.8 45.  37.4 43.  36.6]
 [34.  37.3 48.  36.7 37.  36.6]
 [49.  36.7 51.  37.2 34.  39.5]
 [30.  37.3 45.  36.5 27.  36.5]]


In [410]:
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,30.0,36.8,45.0,37.4,43.0,36.6
2013,2,34.0,37.3,48.0,36.7,37.0,36.6
2014,1,49.0,36.7,51.0,37.2,34.0,39.5
2014,2,30.0,37.3,45.0,36.5,27.0,36.5


In [412]:
print(type(health_data))

<class 'pandas.core.frame.DataFrame'>


In [415]:
health_data.iloc[:, 2]

year  visit
2013  1        45.0
      2        48.0
2014  1        51.0
      2        45.0
Name: (Guido, HR), dtype: float64

In [426]:
health_data.loc[:,"Guido"]
print(type(health_data.loc[:,"Guido"]))
print(type(health_data["Guido"]))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [411]:
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,45.0,37.4
2013,2,48.0,36.7
2014,1,51.0,37.2
2014,2,45.0,36.5


In [427]:
health_data['Guido']['HR']

year  visit
2013  1        45.0
      2        48.0
2014  1        51.0
      2        45.0
Name: HR, dtype: float64

In [428]:
# page 134 Indexing and Slicing a MultiIndex
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [357]:
pop['California', 2000]

33871648

In [358]:
pop['California']

year
2000    33871648
2010    37253956
dtype: int64

In [429]:
pop.loc['California':'New York']

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [430]:
pop[:, 2000]

state
California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [431]:
pop[pop > 22000000]

state       year
California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64

In [432]:
pop[['California', 'Texas']]

state       year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64

In [367]:
# Multiply indexed DataFrames
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,237.0,37.1,237.0,37.8,37.0,36.9
2013,2,-1463.0,37.6,1437.0,36.9,937.0,36.3
2014,1,1837.0,37.8,737.0,37.1,-763.0,35.8
2014,2,37.0,36.6,237.0,36.9,837.0,38.4


In [433]:
print(health_data['Guido', 'HR'])
print(health_data['Guido']['HR'])

year  visit
2013  1        45.0
      2        48.0
2014  1        51.0
      2        45.0
Name: (Guido, HR), dtype: float64
year  visit
2013  1        45.0
      2        48.0
2014  1        51.0
      2        45.0
Name: HR, dtype: float64


In [434]:
health_data.iloc[:2, :2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,30.0,36.8
2013,2,34.0,37.3


In [435]:
health_data.loc[:, ('Bob', 'HR')]

year  visit
2013  1        30.0
      2        34.0
2014  1        49.0
      2        30.0
Name: (Bob, HR), dtype: float64

In [436]:
# 在tuple 内部创建一个index 是行不通的，例如
health_data.loc[(:, 1), (:, 'HR')]

SyntaxError: invalid syntax (Temp/ipykernel_7112/2978687719.py, line 2)

In [440]:
# 我们可以通过创建一个index slice object来实现

idx = pd.IndexSlice




health_data.loc[idx[:, 1], idx[:, 'HR']]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,30.0,45.0,43.0
2014,1,49.0,51.0,34.0


In [441]:
# Rearranging Multi-Indices, page 137
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
np.random.seed(122)
data = pd.Series(np.random.randn(6), index=index)
data.index.names = ['char', 'int']
data

char  int
a     1      0.484680
      2     -0.822161
c     1     -0.336742
      2     -1.663389
b     1      1.719288
      2     -1.417075
dtype: float64

In [391]:
# 如果我们使用slice从index中截取一段，例如，直接使用 data['a':'b']
# 会导致出错 - pandas并不支持此做法
# 我们需要首先对index进行排序

In [392]:
data= data.sort_index()

In [394]:
data['a':'b']

char  int
a     1      0.484680
      2     -0.822161
b     1      1.719288
      2     -1.417075
dtype: float64

In [455]:
# Stacking and unstacking indices

pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [451]:
pop_s= pop.unstack() #unstack 使得之前的某些index中重复的value变成了columns
print(pop_s)

year            2000      2010
state                         
California  33871648  37253956
New York    18976457  19378102
Texas       20851820  25145561


In [452]:
print(list(pop_s.columns))

[2000, 2010]


In [453]:
# Index setting and resetting

In [456]:
pop_flat = pop.reset_index()
print(pop_flat)

        state  year         0
0  California  2000  33871648
1  California  2010  37253956
2    New York  2000  18976457
3    New York  2010  19378102
4       Texas  2000  20851820
5       Texas  2010  25145561


In [459]:
# 有时候一种简单粗暴的方法，是直接reset index， 把所有的index变成columns
pop_flat = pop.reset_index(name='population')
  
pop_flat

Unnamed: 0,state,year,population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


In [460]:
# 基于以上，我们可以再重新把其中的state 和year 设置成为Index
pop_flat.set_index(['state', 'year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [461]:
# Data Aggregations on Multi-Indices
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,30.0,36.8,45.0,37.4,43.0,36.6
2013,2,34.0,37.3,48.0,36.7,37.0,36.6
2014,1,49.0,36.7,51.0,37.2,34.0,39.5
2014,2,30.0,37.3,45.0,36.5,27.0,36.5


In [462]:
data_mean = health_data.groupby(level='year').mean()
print(data_mean)

subject   Bob        Guido          Sue      
type       HR   Temp    HR   Temp    HR  Temp
year                                         
2013     32.0  37.05  46.5  37.05  40.0  36.6
2014     39.5  37.00  48.0  36.85  30.5  38.0


In [423]:
data_mean = health_data.groupby(level='type', axis=1).mean()
print(data_mean)

type                HR       Temp
year visit                       
2013 1      170.333333  37.266667
     2      303.666667  36.933333
2014 1      603.666667  36.900000
     2      370.333333  37.300000
