In [1]:
import pandas as pd

In [3]:
# 1 Basic Series and DataFrame operations
# Build a Series holding the integers 1..5, labelled 'a'..'e'.
s = pd.Series(list(range(1, 6)), index=list('abcde'))
s

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [4]:
# A Series can also be created from a plain Python dict:
# the dict keys become the index labels and the dict values become the data.
records = dict(a=1, b=2, c=3, d=4, e=5)
s = pd.Series(records)
s

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [5]:
# Series also supports NumPy-style positional indexing and slicing.
# The index here holds string labels, so a bare integer slice such as s[1:3]
# is easy to misread as label-based; .iloc states explicitly that the slice
# is positional (elements at positions 1 and 2).
s.iloc[1:3]

b    2
c    3
dtype: int64

In [6]:
s[(s>1)&(s<5)]

b    2
c    3
d    4
dtype: int64

In [8]:
s[s > s.median()]

d    4
e    5
dtype: int64

In [10]:
s[s > s.mean()]

d    4
e    5
dtype: int64

In [11]:
s['c']

3

In [12]:
# DataFrame is the most commonly used pandas data structure.
# It can hold several columns, and each column may have a different dtype.
# Think of it as a spreadsheet, a SQL table, or a dict of Series objects.
# One way to create a DataFrame is from a dict of equal-length lists:
data = dict(
    a=[1, 2, 3, 4],
    b=[5, 6, 7, 8],
    c=[9, 10, 11, 12],
)
df = pd.DataFrame(data)
df

Unnamed: 0,a,b,c
0,1,5,9
1,2,6,10
2,3,7,11
3,4,8,12


In [13]:
df1 = pd.DataFrame(data, columns=['a', 'b', 'c'])
df1

Unnamed: 0,a,b,c
0,1,5,9
1,2,6,10
2,3,7,11
3,4,8,12


In [18]:
# Next, select data by row index label
df1.loc[1]

a     2
b     6
c    10
Name: 1, dtype: int64

In [19]:
df1.iloc[2]

a     3
b     7
c    11
Name: 2, dtype: int64

In [21]:
# Columns can also be handled dict-style:
# select, assign, and delete data by column name
df1['c']

0     9
1    10
2    11
3    12
Name: c, dtype: int64

In [22]:
df1['c'] = df1['c'] + 1
df1['c']

0    10
1    11
2    12
3    13
Name: c, dtype: int64

In [24]:
# Delete the column
del df1['c']
df1

Unnamed: 0,a,b
0,1,5
1,2,6
2,3,7
3,4,8


In [25]:
# The column selection shown above can also be written as attribute access (df1.c)
# Rows can additionally be selected with a boolean mask
df1[df1.a > 2]

Unnamed: 0,a,b
2,3,7
3,4,8


In [26]:
import numpy as np
# 2. Arithmetic operations and data alignment
df = pd.DataFrame(np.random.randint(low=1, high=10, size=(3,3)), columns=['a','b','c'])
df

Unnamed: 0,a,b,c
0,8,9,7
1,3,2,9
2,6,7,6


In [27]:
# Multiply every element of the DataFrame by 2
df = df * 2
df

Unnamed: 0,a,b,c
0,16,18,14
1,6,4,18
2,12,14,12


In [28]:
# Square every element
df = df ** 2
df

Unnamed: 0,a,b,c
0,256,324,196
1,36,16,324
2,144,196,144


In [29]:
# Some NumPy functions, such as exp and log, can be applied directly to a DataFrame
# Exponential with base e
np.exp(df)

Unnamed: 0,a,b,c
0,1.511428e+111,5.145317e+140,1.323483e+85
1,4311232000000000.0,8886111.0,5.145317e+140
2,3.454661e+62,1.323483e+85,3.454661e+62


In [30]:
# Natural logarithm (base e)
np.log(df)

Unnamed: 0,a,b,c
0,5.545177,5.780744,5.278115
1,3.583519,2.772589,5.780744
2,4.969813,5.278115,4.969813


In [33]:
# Besides operations on a single DataFrame, pandas also supports arithmetic between multiple DataFrames
df1 = pd.DataFrame(np.random.randint(low=1, high=10, size=(3,3)), columns=['a','b','c'])
df2 = pd.DataFrame(np.random.randint(low=1, high=10, size=(3,3)), columns=['a','b','c'])
df1

Unnamed: 0,a,b,c
0,9,6,9
1,3,9,8
2,5,2,1


In [34]:
df2

Unnamed: 0,a,b,c
0,8,6,8
1,3,1,8
2,9,3,6


In [35]:
df1 + df2 * 2

Unnamed: 0,a,b,c
0,25,18,25
1,9,11,24
2,23,8,13


In [36]:
(df1 + df2)/df1

Unnamed: 0,a,b,c
0,1.888889,2.0,1.888889
1,2.0,1.111111,2.0
2,2.8,2.5,7.0


In [39]:
# In addition, pandas aligns data automatically by index
# Positions where the indexes do not overlap are set to NaN
df1 = pd.DataFrame(np.random.randint(low=1, high=10, size=(3,4)), columns=['a','b','c','d'])
df2 = pd.DataFrame(np.random.randint(low=1, high=10, size=(4,5)), columns=['a','b','c','d','e'])

In [40]:
df1

Unnamed: 0,a,b,c,d
0,1,2,5,2
1,6,2,4,4
2,9,2,3,6


In [41]:
df2

Unnamed: 0,a,b,c,d,e
0,7,2,2,8,6
1,7,5,6,1,6
2,3,3,9,3,8
3,5,3,4,4,2


In [42]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,8.0,4.0,7.0,10.0,
1,13.0,7.0,10.0,5.0,
2,12.0,5.0,12.0,9.0,
3,,,,,


In [43]:
# 3 Statistics and data summaries
df = pd.DataFrame(np.random.randint(low=1, high=10, size=(3,4)), columns=['a','b','c','e'])
df

Unnamed: 0,a,b,c,e
0,5,4,6,7
1,8,1,8,6
2,9,1,8,9


In [44]:
# With no arguments, sum() sums each column by default
df.sum()

a    22
b     6
c    22
e    22
dtype: int64

In [45]:
# Mean of each column (axis=0 reduces over the rows)
df.mean(axis=0)

a    7.333333
b    2.000000
c    7.333333
e    7.333333
dtype: float64

In [50]:
# Mean of each row (axis=1 reduces over the columns)
df.mean(axis=1)

0    5.50
1    5.75
2    6.75
dtype: float64

In [51]:
# describe() computes a range of summary statistics in a single call
df.describe()

Unnamed: 0,a,b,c,e
count,3.0,3.0,3.0,3.0
mean,7.333333,2.0,7.333333,7.333333
std,2.081666,1.732051,1.154701,1.527525
min,5.0,1.0,6.0,6.0
25%,6.5,1.0,7.0,6.5
50%,8.0,1.0,8.0,7.0
75%,8.5,2.5,8.0,8.0
max,9.0,4.0,8.0,9.0


In [53]:
# 4 Sorting data
# pandas supports several ways of sorting,
# such as sorting by index and sorting by value.
# sort_index() sorts pandas objects (such as Series and DataFrame) by index
df = pd.DataFrame(np.random.randint(low=1, high=10, size=(3,4)), columns=['b','c','a','d'])
df

Unnamed: 0,b,c,a,d
0,7,8,2,3
1,8,1,8,5
2,5,7,8,4


In [54]:
# Sort by the row index
df.sort_index()

Unnamed: 0,b,c,a,d
0,7,8,2,3
1,8,1,8,5
2,5,7,8,4


In [55]:
# Sort by the column index
df.sort_index(axis=1)

Unnamed: 0,a,b,c,d
0,2,7,8,3
1,8,8,1,5
2,8,5,7,4


In [56]:
# Sort by the values of column 'c'
df.sort_values(by='c')

Unnamed: 0,b,c,a,d
1,8,1,8,5
2,5,7,8,4
0,7,8,2,3


In [57]:
# 5 Applying functions
# pandas supports applying custom functions to the rows and columns of a DataFrame via apply()
df = pd.DataFrame(np.random.randint(low=1, high=10, size=(3,4)), columns=['b','c','a','d'])
df

Unnamed: 0,b,c,a,d
0,9,5,1,2
1,9,6,1,4
2,6,5,8,3


In [58]:
# Example 1
def func(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    Written as a ``def`` rather than a lambda bound to a name, so the
    function carries a real ``__name__`` and a docstring.

    Parameters
    ----------
    x : any object exposing ``max()`` and ``min()`` (e.g. a pandas Series)

    Returns
    -------
    The value of ``x.max() - x.min()``.
    """
    return x.max() - x.min()
df.apply(func)

b    3
c    1
a    7
d    2
dtype: int64

In [59]:
# Example 2
def func1(df, a, b=1):
    """Return ``(df.max() - df.min() + a) * b``.

    Parameters
    ----------
    df : any object exposing ``max()`` and ``min()``
    a : offset added to the max-min spread
    b : multiplier applied to the offset spread (defaults to 1)
    """
    spread = df.max() - df.min()
    return (spread + a) * b
df.apply(func1, args=(2,),b=2)

b    10
c     6
a    18
d     8
dtype: int64

In [61]:
# 6 Handling missing values
series = pd.Series([1,2,3,np.nan,5])
series.isnull()

0    False
1    False
2    False
3     True
4    False
dtype: bool

In [62]:
series.notnull()

0     True
1     True
2     True
3    False
4     True
dtype: bool

In [64]:
# dropna() can discard rows and columns that contain missing values
# By default it drops rows that contain a missing value
# A parameter can restrict this to dropping only rows that are entirely missing
series.dropna()

0    1.0
1    2.0
2    3.0
4    5.0
dtype: float64

In [65]:
# NOTE(review): this returns the same result as series.dropna(). Per the pandas
# docs, Series.dropna accepts `how` only for API compatibility and does not use
# it; how='all' vs how='any' only makes a difference for DataFrame.dropna.
series.dropna(how='all')

0    1.0
1    2.0
2    3.0
4    5.0
dtype: float64

In [66]:
# Missing values can also be filled in (here with the mean of the series)
series.fillna(series.mean())

0    1.00
1    2.00
2    3.00
3    2.75
4    5.00
dtype: float64

In [67]:
# 7. Time series
time = pd.Series(np.random.randn(8), index=pd.date_range('2018-06-01',periods=8))
time

2018-06-01   -0.437153
2018-06-02   -0.618781
2018-06-03   -1.123876
2018-06-04    1.444378
2018-06-05   -0.787557
2018-06-06   -1.164841
2018-06-07   -1.065258
2018-06-08   -0.358076
Freq: D, dtype: float64

In [68]:
# Indexing and slicing by date
time['2018-06-01']

-0.4371529366986666

In [70]:
time['2018/06/03']

-1.123876383684177

In [72]:
time['2018-06-01':'2018-06-05']

2018-06-01   -0.437153
2018-06-02   -0.618781
2018-06-03   -1.123876
2018-06-04    1.444378
2018-06-05   -0.787557
Freq: D, dtype: float64

In [73]:
time['2018-06']

2018-06-01   -0.437153
2018-06-02   -0.618781
2018-06-03   -1.123876
2018-06-04    1.444378
2018-06-05   -0.787557
2018-06-06   -1.164841
2018-06-07   -1.065258
2018-06-08   -0.358076
Freq: D, dtype: float64

In [74]:
# For a time series whose index contains duplicate timestamps,
# the data can be aggregated with groupby().
# 2018-06-07 deliberately appears three times in the index.
dates = pd.DatetimeIndex(
    ['2018-06-06'] + ['2018-06-07'] * 3 + ['2018-06-08', '2018-06-09']
)
time = pd.Series(np.arange(6), index=dates)
time

2018-06-06    0
2018-06-07    1
2018-06-07    2
2018-06-07    3
2018-06-08    4
2018-06-09    5
dtype: int32

In [75]:
time.groupby(level=0).sum()

2018-06-06    0
2018-06-07    6
2018-06-08    4
2018-06-09    5
dtype: int32

In [76]:
# 8 Saving and loading data
df = pd.DataFrame(np.random.randint(low=1, high=10, size=(3,4)), columns=['b','c','a','d'])
df

Unnamed: 0,b,c,a,d
0,3,4,1,3
1,1,9,3,3
2,9,4,3,9


In [82]:
# Save as a CSV file (index=False leaves the row index out of the file)
df.to_csv('demo_data.csv', index=False)

In [83]:
# Read the CSV file back
df_csv = pd.read_csv('demo_data.csv')
df_csv

Unnamed: 0,b,c,a,d
0,3,4,1,3
1,1,9,3,3
2,9,4,3,9


In [84]:
# Save as a JSON file
df.to_json('data.json')

In [85]:
# Read the JSON file back
df_json = pd.read_json('data.json')
df_json


Unnamed: 0,b,c,a,d
0,3,4,1,3
1,1,9,3,3
2,9,4,3,9
