# Pandas

## Series

### Series的创建

In [16]:
import pandas as pd

s = pd.Series([10,2,3,4,5])
print(s)
# 第一列 索引 第一列以后是值

# 自定义索引
s = pd.Series([10,2,3,4,5], index=['A','B','C','D','E'])
print(s)

# 定义name
s = pd.Series([10,2,3,4,5], index=['A','B', 'C', 'D', 'E'], name = '月份')
print(s)

0    10
1     2
2     3
3     4
4     5
dtype: int64
A    10
B     2
C     3
D     4
E     5
dtype: int64
A    10
B     2
C     3
D     4
E     5
Name: 月份, dtype: int64


In [17]:
# 通过字典创建
# s = pd.Series([10,2,3,4,5], index=['A','B','C','D','E'])
s = pd.Series({'a':1, 'b':2, 'c':3, 'd':4, 'e':5})
print(s)
# 在s基础上取一个index获取一个新的Series 
s1 = pd.Series(s, index=['a','c'])
print(s1)

a    1
b    2
c    3
d    4
e    5
dtype: int64
a    1
c    3
dtype: int64


### Series的属性

In [14]:
print(s.index)
print(s.values)
print(s.shape, s.ndim, s.size, sep='\n')
s.name = 'test'
print(s.dtype, s.name)
print(s)

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
[1 2 3 4 5]
(5,)
1
5
int64 test
a    1
b    2
c    3
d    4
e    5
Name: test, dtype: int64


In [22]:
print(s.loc['b']) # 显示索引 location
print(s.iloc[1])  # 隐式索引

print(s.iloc[0:2])
print(s.at['a']) # at不支持切片
print(s.iat[0])

2
2
a    1
b    2
dtype: int64
1
1


In [32]:
# 访问数据
# print(s[1]) # 1标签还是位置，python搞不清
print(s['a'])
print(s[s<3])
s['f'] = 6 #增加一行
print(s)
print(s.head()) # s.head()获取数据最前N行，默认只打印五行

print(s.tail()) # s.head()获取数据最后N行，默认只打印五行

1
a    1
b    2
dtype: int64
a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64
a    1
b    2
c    3
d    4
e    5
dtype: int64
b    2
c    3
d    4
e    5
f    6
dtype: int64


### Series的常见函数

In [38]:
import numpy as np
s = pd.Series([10,2,np.nan,None,3,4,5], index=['A', 'B', 'C', 'D', 'E', 'F', 'G'], name='Data')
print(s)

A    10.0
B     2.0
C     NaN
D     NaN
E     3.0
F     4.0
G     5.0
Name: Data, dtype: float64


In [None]:
s.head(3) # 取前3行
s.tail(2) # 取后2行

F    4.0
G    5.0
Name: Data, dtype: float64

In [None]:
# 查看所有的描述性信息
s.describe() # 忽略缺失值

count     5.000000
mean      4.800000
std       3.114482
min       2.000000
25%       3.000000
50%       4.000000
75%       5.000000
max      10.000000
Name: Data, dtype: float64

In [None]:
# 获取元素个数(忽略缺失值)
print(s.count())
print(s.keys()) # 方法
print(s.index)  # 属性

5
Index(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='object')
Index(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='object')


In [52]:
print(s.isna()) # 检查Series里的每一个元素是否为缺失值
s.isin([4,5,6])

A    False
B    False
C     True
D     True
E    False
F    False
G    False
Name: Data, dtype: bool


A    False
B    False
C    False
D    False
E    False
F     True
G     True
Name: Data, dtype: bool

In [56]:
print(s)
print(s.mean())
print(s.std())
print(s.var())
print(s.min())
print(s.max())
print(s.median())

A    10.0
B     2.0
C     NaN
D     NaN
E     3.0
F     4.0
G     5.0
Name: Data, dtype: float64
4.8
3.1144823004794877
9.700000000000001
2.0
10.0
4.0


In [None]:
print(s.sort_values())
print(s.quantile(0.8))
# ---------------------
# 2   3   4   5   10
# 4（四段）*0.8=3.2
# 比第三段（4-5）大，所以比5大，比10小
# 5 + （10-5）* 0.2 = 6

B     2.0
E     3.0
F     4.0
G     5.0
A    10.0
C     NaN
D     NaN
Name: Data, dtype: float64
6.000000000000001


In [None]:
# 众数 
print(s.mode()) # 出现频率一样

s['H'] = 4
print(s.mode()) # 此时显示4，出现次数最多

0     2.0
1     3.0
2     4.0
3     5.0
4    10.0
Name: Data, dtype: float64
0    4.0
Name: Data, dtype: float64


In [64]:
print(s.value_counts())

Data
4.0     2
10.0    1
2.0     1
3.0     1
5.0     1
Name: count, dtype: int64


In [67]:
# 去重
print(s.drop_duplicates())
print(s.unique())
print(s.nunique()) # 去重后的元素个数


A    10.0
B     2.0
C     NaN
E     3.0
F     4.0
G     5.0
Name: Data, dtype: float64
[10.  2. nan  3.  4.  5.]
5


In [None]:
# 排序、值、索引
print(s.sort_index())  # 按索引排序
print(s.sort_values()) # 按值排序

A    10.0
B     2.0
C     NaN
D     NaN
E     3.0
F     4.0
G     5.0
H     4.0
Name: Data, dtype: float64
B     2.0
E     3.0
F     4.0
H     4.0
G     5.0
A    10.0
C     NaN
D     NaN
Name: Data, dtype: float64
