# Pandas入门

## Pandas基本数据结构-Series

In [20]:
import pandas as pd
import numpy as np

In [3]:
# Build a Series with an explicit label index. The class is reached through
# the `pd` namespace because only `import pandas as pd` is in scope here.
obj = pd.Series([4, 0, 2, -1], index=['d', 'a', 'b', 'c'])
obj

d    4
a    0
b    2
c   -1
dtype: int64

In [7]:
# Build a Series from a dict: the dict keys become the index labels.
dic = dict(a=1, b=2, c=3, d=4)
pd.Series(dic)

a    1
b    2
c    3
d    4
dtype: int64

In [9]:
# Pass an explicit index together with the dict: labels that are not keys of
# the dict come out as NaN, and dict keys not listed in the index are left out.
states = list('eadh')
pd.Series(dic, index=states)

e    NaN
a    1.0
d    4.0
h    NaN
dtype: float64

In [10]:
# Relabel the Series in place by assigning a new list of labels to its index.
obj.index = ['e', 'f', 'g', 'h']
# Display the Series itself (pandas has no attribute named `obj`, so `pd.obj`
# would raise AttributeError).
obj

e    4
f    0
g    2
h   -1
dtype: int64

## Pandas基本数据结构-DataFrame

In [14]:
# Column-oriented input: each dict key becomes a column name and each list
# supplies that column's values.
data = {
    'a': [1.2, 4.1, 7.1],
    'b': list('xyz'),
    'c': [100, 200, 300],
}
frame = pd.DataFrame(data=data)
frame

Unnamed: 0,a,b,c
0,1.2,x,100
1,4.1,y,200
2,7.1,z,300


In [15]:
pd.DataFrame(data, columns=['c','b','a'])  # columns= sets the order in which the columns appear

Unnamed: 0,c,b,a
0,100,x,1.2
1,200,y,4.1
2,300,z,7.1


In [16]:
frame.a  # attribute-style access to column 'a'

0    1.2
1    4.1
2    7.1
Name: a, dtype: float64

In [17]:
frame['a']  # dict-style access to column 'a'; gives the same Series as frame.a

0    1.2
1    4.1
2    7.1
Name: a, dtype: float64

In [26]:
frame['d'] = 15  # assigning a scalar creates column 'd' holding that value in every row
frame

Unnamed: 0,a,b,c,d
0,1.2,x,100,15
1,4.1,y,200,15
2,7.1,z,300,15


In [27]:
frame['e'] = np.arange(3)  # assign an array with one value per row as the new column 'e'
frame

Unnamed: 0,a,b,c,d,e
0,1.2,x,100,15,0
1,4.1,y,200,15,1
2,7.1,z,300,15,2


In [28]:
# Assigning a Series aligns it on the frame's row labels: only row 2 has a
# matching label, so the other rows of the new column 'f' are NaN.
val = pd.Series([44], index=[2])
frame['f'] = val
frame

Unnamed: 0,a,b,c,d,e,f
0,1.2,x,100,15,0,
1,4.1,y,200,15,1,
2,7.1,z,300,15,2,44.0


In [39]:
frame['g'] = frame['e']==0  # boolean column: True where column 'e' equals 0
frame

Unnamed: 0,a,b,c,d,e,f,g
0,1.2,x,100,15,0,,True
1,4.1,y,200,15,1,,False
2,7.1,z,300,15,2,44.0,False


In [40]:
del frame['g'] # delete a column
frame.columns

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')

In [58]:
# Nested-dict input: the outer keys become columns and the inner keys become
# row labels; a row label missing from one inner dict is filled with NaN.
dic = {
    'x': dict(a=1, b=2, c=3),
    'y': dict(b=4, c=5),
}
frame = pd.DataFrame(data=dic)
frame

Unnamed: 0,x,y
a,1,
b,2,4.0
c,3,5.0


In [59]:
# Transpose: rows become columns and columns become rows
frame.T

Unnamed: 0,a,b,c
x,1.0,2.0,3.0
y,,4.0,5.0


In [60]:
# Give a name to the row index and to the column index
frame.index.name = 'abc'
frame.columns.name = 'xy'
frame

xy,x,y
abc,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1,
b,2,4.0
c,3,5.0


In [61]:
# The frame's values as a NumPy array
frame.values

array([[ 1., nan],
       [ 2.,  4.],
       [ 3.,  5.]])

# Pandas进阶

# Pandas删除操作

### 在numpy和pandas中出现的axis=0和axis=1的区分方法：   
### axis=0 表示跨行，axis=1表示跨列，df.sum(axis=1)代表沿着水平方向求和(跨列)(沿着列标签横向执行sum方法);df.dropna(axis=1,how='any') 代表将列标签们中含有nan的列沿着水平的方向依次删掉(沿着列标签方向执行dropna方法) 细品

In [62]:
frame

xy,x,y
abc,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1,
b,2,4.0
c,3,5.0


In [63]:
frame.drop('x',axis=1) # drop a column; axis=1 is needed because drop() targets row labels by default (columns='x' is an equivalent spelling)

xy,y
abc,Unnamed: 1_level_1
a,
b,4.0
c,5.0


In [64]:
frame.drop(['b','c']) # drop two rows by label

xy,x,y
abc,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1,


## Series索引

In [68]:
# A four-element Series labelled 'a' through 'd'.
s = pd.Series(data=[1, 2, 3, 4], index=list('abcd'))
s

a    1
b    2
c    3
d    4
dtype: int64

In [69]:
s['b':'d'] # slicing by index label includes the end label

b    2
c    3
d    4
dtype: int64

In [70]:
s[1:3]    # slicing by integer position excludes the end position

b    2
c    3
dtype: int64

## 数据运算与算数对齐

In [71]:
# Series arithmetic aligns on the union of the two indexes; a label present
# in only one operand yields NaN in the result. The class is reached through
# the `pd` namespace because only `import pandas as pd` is in scope here.
s1 = pd.Series([1, 2, 3], index=['a', 'c', 'd'])
s2 = pd.Series([100, 100, 100], index=['b', 'c', 'd'])
s1 + s2

a      NaN
b      NaN
c    102.0
d    103.0
dtype: float64

In [78]:
# DataFrame arithmetic aligns on the row labels and the column labels at once.
df1 = pd.DataFrame(
    np.arange(9.).reshape(3, 3),
    index=['2000', '2001', '2002'],
    columns=['a', 'b', 'c'],
)
df2 = pd.DataFrame(
    np.arange(12.).reshape(3, 4),
    index=['2001', '2002', '2003'],
    columns=['b', 'c', 'd', 'e'],
)

In [81]:
df1 + df2  # cells whose row or column label is missing from either frame come out as NaN

Unnamed: 0,a,b,c,d,e
2000,,,,,
2001,,4.0,6.0,,
2002,,11.0,13.0,,
2003,,,,,


In [82]:
df1.add(df2)  # method form of df1 + df2

Unnamed: 0,a,b,c,d,e
2000,,,,,
2001,,4.0,6.0,,
2002,,11.0,13.0,,
2003,,,,,


In [85]:
df1

Unnamed: 0,a,b,c
2000,0.0,1.0,2.0
2001,3.0,4.0,5.0
2002,6.0,7.0,8.0


In [88]:
df2

Unnamed: 0,b,c,d,e
2001,0.0,1.0,2.0,3.0
2002,4.0,5.0,6.0,7.0
2003,8.0,9.0,10.0,11.0


In [83]:
df1.add(df2, fill_value=0)  # a cell missing from one frame is treated as 0; cells missing from both frames stay NaN

Unnamed: 0,a,b,c,d,e
2000,0.0,1.0,2.0,,
2001,3.0,4.0,6.0,2.0,3.0
2002,6.0,11.0,13.0,6.0,7.0
2003,,8.0,9.0,10.0,11.0


In [84]:
df1.add(df2).fillna(0)  # add first, then replace every NaN in the result with 0

Unnamed: 0,a,b,c,d,e
2000,0.0,0.0,0.0,0.0,0.0
2001,0.0,4.0,6.0,0.0,0.0
2002,0.0,11.0,13.0,0.0,0.0
2003,0.0,0.0,0.0,0.0,0.0


In [87]:
df1

Unnamed: 0,a,b,c
2000,0.0,1.0,2.0
2001,3.0,4.0,5.0
2002,6.0,7.0,8.0


In [92]:
# Spread of a Series: its largest value minus its smallest value.
f = lambda x:x.max()-x.min()
df1.apply(f) # applied to each column

a    6.0
b    6.0
c    6.0
dtype: float64

In [93]:
df1.apply(f,axis=1) # applied to each row

2000    2.0
2001    2.0
2002    2.0
dtype: float64

In [95]:
# Render a number as a string with two decimal places.
# NOTE(review): the name `format` shadows the built-in format(); it is left as
# is because the following cell calls the function by this name.
format = lambda x : "%.2f" % x
df1.applymap(format)  # apply the function to every element; applymap is deprecated in pandas >= 2.1 in favour of DataFrame.map

Unnamed: 0,a,b,c
2000,0.00,1.00,2.00
2001,3.00,4.00,5.00
2002,6.00,7.00,8.00


In [97]:
df1['c'].map(format)  # Series.map applies the function to each element of the column

2000    2.00
2001    5.00
2002    8.00
Name: c, dtype: object

## 排序

In [112]:
# A two-row frame whose rows are labelled 'd' and 'c'.
obj = pd.DataFrame(
    np.array([[1, 2, 3], [0, 4, -1]]),
    index=['d', 'c'],
    columns=list('xyz'),
)
obj.sort_index(ascending=False)  # ascending=False sorts the row labels in descending order

Unnamed: 0,x,y,z
d,1,2,3
c,0,4,-1


In [110]:
# Column 'b' is given first, so it is also the first column of the frame.
frame = pd.DataFrame(dict(b=[4, 7, -3, 2], a=[0, 1, 0, 1]))
frame.sort_values('b')  # order the rows by the values in column 'b'

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [113]:
frame.index.is_unique  # whether every index label is unique

True

In [114]:
frame.sum(axis=1)  # sum across the columns, giving one total per row

0    4
1    8
2   -3
3    3
dtype: int64

In [115]:
frame.mean(axis=1,skipna=False) # skipna=False: NaN is not skipped, so a row containing NaN would give NaN; this frame has no NaN, so the ordinary row means are returned

0    2.0
1    4.0
2   -1.5
3    1.5
dtype: float64

In [120]:
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [119]:
frame.cumsum() # cumulative sum, computed down each column by default

Unnamed: 0,b,a
0,4,0
1,11,1
2,8,1
3,10,2


In [121]:
frame.describe()  # per-column summary statistics (count, mean, std, min, quartiles, max)

Unnamed: 0,b,a
count,4.0,4.0
mean,2.5,0.5
std,4.203173,0.57735
min,-3.0,0.0
25%,0.75,0.0
50%,3.0,0.5
75%,4.75,1.0
max,7.0,1.0


In [124]:
# Sixteen values drawn from three distinct letters. The class is reached
# through the `pd` namespace because only `import pandas as pd` is in scope.
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj.unique()  # the distinct values, in order of first appearance

array(['a', 'b', 'c'], dtype=object)

In [125]:
obj.value_counts()  # how many times each distinct value occurs

a    8
b    4
c    4
dtype: int64

## 缺失值处理

In [126]:
# A Series with two missing values. The class is reached through the `pd`
# namespace because only `import pandas as pd` is in scope here.
data = pd.Series([1, np.nan, 2, np.nan])
data.dropna()  # keep only the non-NaN entries; the original labels are preserved

0    1.0
2    2.0
dtype: float64

In [129]:
# DataFrame.dropna drops every row that contains a NaN by default; with
# how='all' only the rows that are NaN in every column are dropped.
data = pd.DataFrame(
    [
        [1, 4, 7],
        [np.nan, 2, 5],
        [np.nan, np.nan, np.nan],
        [4, np.nan, np.nan],
    ]
)
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,4.0,7.0
1,,2.0,5.0
3,4.0,,


In [130]:
# Fill every NaN with 0; this returns a new frame and leaves data unchanged
data.fillna(0)

Unnamed: 0,0,1,2
0,1.0,4.0,7.0
1,0.0,2.0,5.0
2,0.0,0.0,0.0
3,4.0,0.0,0.0


In [131]:
data.fillna({0:100, 2:-100, 3:1000}) # pass a dict to choose a fill value per column; column 1 is not listed, so its NaN values remain

Unnamed: 0,0,1,2
0,1.0,4.0,7.0
1,100.0,2.0,5.0
2,100.0,,-100.0
3,4.0,,-100.0


In [132]:
data.fillna(0, inplace=True) # fill in place, modifying data itself
data

Unnamed: 0,0,1,2
0,1.0,4.0,7.0
1,0.0,2.0,5.0
2,0.0,0.0,0.0
3,4.0,0.0,0.0


# 字符串操作

In [133]:
val = 'a,b, ccc'  # note the space in front of 'ccc'

In [134]:
val.split(',') # split() breaks the string apart at every occurrence of the separator

['a', 'b', ' ccc']

In [135]:
# Split on commas, then strip the whitespace surrounding each piece.
pieces = [piece.strip() for piece in val.split(',')]
pieces

['a', 'b', 'ccc']

In [136]:
s1, s2, s3 = pieces
print(s1 + '::' + s2 + '::' + s3)  # concatenate the strings with +
print('::'.join(pieces))           # build the same string with join

a::b::ccc
a::b::ccc


In [137]:
'ccc' in pieces  # pieces is a list, so `in` tests element membership here (on a str it would test for a substring)

True

In [138]:
val.index('b')   # position of the first occurrence of the substring; unlike val.find('b'), which returns -1, index() raises ValueError when it is absent

2

In [139]:
val.count(',')   # number of times the substring occurs

2

In [141]:
val.replace(',', '::')  # replace every occurrence of the substring

'a::b:: ccc'

In [142]:
import torch
a = torch.tensor([1,2])  # build a tensor from a Python list
a

tensor([1, 2])

In [143]:
a[0]  # indexing gives back a tensor, not a plain Python number

tensor(1)

In [145]:
a[0].item()  # .item() extracts the value as a plain Python number

1