In [5]:
# Build a DataFrame from a 2-D array, naming the columns and the row index explicitly.
import pandas as pd
import numpy as np

# 3x2 grid of random integers drawn from [0, 10); the generator is not seeded,
# so the values differ on every run.
a = pd.DataFrame(
    data=np.random.randint(low=0, high=10, size=(3, 2)),
    index=['e', 'f', 'g'],
    columns=['a', 'b'],
)
a

Unnamed: 0,a,b
e,2,9
f,8,3
g,0,6


In [4]:
### Accessing elements of a Series / DataFrame

In [6]:
# Select column 'a' of the DataFrame by its key; a single key returns that column as a Series.
display('根据key获取', a['a'])

'根据key获取'

e    2
f    8
g    0
Name: a, dtype: int32

In [27]:
# A labelled Series: four integers indexed by the letters a-d, with the Series itself named 'data'.
data = pd.Series(
    data=[5, 6, 7, 8],
    index=list('abcd'),
    name='data',
)
data

a    5
b    6
c    7
d    8
Name: data, dtype: int64

In [19]:
# Label-based selection with .loc: look up the entry whose index label is 'c'.
data.loc['c']
# Further .loc patterns; those with a row and a column selector need a 2-D DataFrame (a Series has a single axis):
# columns 'b' through 'd' for every row (label slices include both endpoints): data.loc[:, 'b':'d']
# entries labelled 'a' through 'b' (this slices the index, i.e. rows, not columns): data.loc['a': 'b']
# rows a and c together with columns A, B and D: data.loc[['a', 'c'], ['A', 'B', 'D']]

7

In [21]:
# Position-based selection with .iloc: fetch the entry at integer position 1 (the second one).
data.iloc[1]
# Further .iloc patterns; those with a row and a column selector need a 2-D DataFrame:
# first column: data.iloc[:,0]
# first and third columns: data.iloc[:, [0,2]]
# three consecutive columns at positions 1-3 (the stop position 4 is excluded): data.iloc[:,1:4]
# rows a and c with columns A, B and D, addressed by position: data.iloc[[0,2], [0,1,3]]

6

In [22]:
# Arithmetic mean of the values in data.
data.mean()

6.5

In [23]:
# Largest value in data.
data.max()

8

In [24]:
# Smallest value in data.
data.min()

5

In [25]:
# Standard deviation of data. pandas divides by n - 1 by default (sample standard deviation),
# which is why [5, 6, 7, 8] gives about 1.29 here rather than the population value of about 1.12.
data.std()

1.2909944487358056

In [28]:
# Sort data by value in descending order (largest first); the sorted Series is returned, not assigned.
data.sort_values(ascending=False)

d    8
c    7
b    6
a    5
Name: data, dtype: int64

In [29]:
# Filtering a dataset with a boolean mask.
# Only the keys listed in `columns` make it into the frame, so 'Sex' is left out
# even though it is present in the input dict.
data = pd.DataFrame(
    data={
        'Name': ['zs', 'lisi', 'ww'],
        'Sno': ['1001', '1002', '1003'],
        'Sex': ['male', 'female', 'male'],
        'Age': [10, 20, 30],
        'Score': [100, 200, 300],
    },
    index=['zs', 'lisi', 'ww'],
    columns=['Name', 'Sno', 'Age', 'Score'],
)
display('数据集', data)

# Keep only the scores that are strictly greater than the mean score.
scores = data['Score']
display('筛选出成绩大于平均值的数据: ', scores.loc[scores.gt(scores.mean())])

'数据集'

Unnamed: 0,Name,Sno,Age,Score
zs,zs,1001,10,100
lisi,lisi,1002,20,200
ww,ww,1003,30,300


'筛选出成绩大于平均值的数据: '

ww    300
Name: Score, dtype: int64

In [30]:
# Inspecting missing values: pandas shows every missing entry as NaN.
df = pd.DataFrame(
    data=[
        [1, 2, np.nan],
        [4, np.nan, 6],
        [4, 5, 9],
    ]
)
df

Unnamed: 0,0,1,2
0,1,2.0,
1,4,,6.0
2,4,5.0,9.0


In [31]:
# Per-column summary: the non-null counts show how many entries are missing in each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       3 non-null      int64  
 1   1       2 non-null      float64
 2   2       2 non-null      float64
dtypes: float64(2), int64(1)
memory usage: 204.0 bytes


In [32]:
# Element-wise missing-value test: True marks an entry that is NaN.
df.isnull()

Unnamed: 0,0,1,2
0,False,False,True
1,False,True,False
2,False,False,False


In [33]:
# Mask df with its own null-mask. Indexing with a boolean frame keeps an entry only where the
# mask is True and puts NaN everywhere else; the kept entries are themselves NaN, so the
# result is an all-NaN frame.
# To pull out the rows that contain a missing value, use df[df.isnull().any(axis=1)] instead.
df[df.isnull()]

Unnamed: 0,0,1,2
0,,,
1,,,
2,,,


In [34]:
# Keep the non-null entries: positions where the mask is False (the NaNs) are filled with NaN,
# so the result looks exactly like df itself.
df[df.notnull()] 

Unnamed: 0,0,1,2
0,1,2.0,
1,4,,6.0
2,4,5.0,9.0


In [35]:
# Missing data usually takes one of two forms: a row with some fields missing,
# or a row with every field missing.
# dropna works row-wise by default: any row containing at least one NaN is dropped.
df.dropna()

Unnamed: 0,0,1,2
2,4,5.0,9.0


In [37]:
# Drop column-wise instead: every column containing at least one NaN is removed.
df.dropna(axis='columns')

Unnamed: 0,0
0,1
1,4
2,4


In [39]:
# Remove blank rows only:
# with how='all' a row is dropped only when every one of its values is NaN (df has no such row).
df.dropna(how = 'all')

Unnamed: 0,0,1,2
0,1,2.0,
1,4,,6.0
2,4,5.0,9.0


In [40]:
# how='any' drops a row as soon as it contains a single NaN; this matches the plain df.dropna() result.
df.dropna(how = 'any')

Unnamed: 0,0,1,2
2,4,5.0,9.0


In [42]:
# Filling missing values: a sample Series in which both np.nan and None end up as NaN.
dataDF = pd.Series(
    data=[3, 4, np.nan, 1, 5, None],
    name='dataDF',
)
dataDF

0    3.0
1    4.0
2    NaN
3    1.0
4    5.0
5    NaN
Name: dataDF, dtype: float64

In [45]:
# dataDF.fillna(0) # fill every missing value with 0

In [48]:
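# dataDF.fillna(method = 'ffill') # forward fill: replace each NaN with the previous valid value
# dataDF.fillna(method = 'bfill') # backward fill: replace each NaN with the next valid value
# dataDF.fillna(method = 'bfill').fillna(method = 'ffill') # backward fill first, then forward fill whatever is still NaN (e.g. a trailing NaN)
# dataDF.fillna(method = 'bfill', axis = 1) # backward fill along each row (from the next column); axis=1 only exists for a DataFrame, not for a Series such as dataDF
# NOTE: fillna(method=...) is deprecated since pandas 2.1; prefer dataDF.ffill() / dataDF.bfill().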

In [49]:
# Fill each missing value with the mean of its own column.
# DataFrame.mean() skips NaN by default, and fillna aligns the resulting Series with the
# column labels, so one vectorized call replaces an explicit loop over df.columns.
# numeric_only=True keeps this working if non-numeric columns are present (they are left as-is).
df = df.fillna(df.mean(numeric_only=True))
df

Unnamed: 0,0,1,2
0,1,2.0,7.5
1,4,3.5,6.0
2,4,5.0,9.0


In [54]:
# Concatenating Series objects: the pieces are stacked end to end and each keeps its own
# index labels, so the labels 0-4 appear twice in the result.
ser1 = pd.Series(range(1, 6))
ser2 = ser1.copy()
pd.concat(objs=[ser1, ser2])

0    1
1    2
2    3
3    4
4    5
0    1
1    2
2    3
3    4
4    5
dtype: int64

In [55]:
def make_df(cols, index):
    """Build a demo DataFrame whose cells spell out their own position.

    Each cell holds str(column label) + str(row label), e.g. 'a5' for
    column 'a' and row '5'.

    Args:
        cols: iterable of column labels.
        index: sequence of row labels; also used as the index of the frame.

    Returns:
        A pd.DataFrame with one column per entry of cols and one row per
        entry of index.
    """
    cells = {}
    for col in cols:
        cells[col] = [str(col) + str(row) for row in index]
    return pd.DataFrame(cells, index=index)

In [57]:
# Two 4x4 sample frames with identical columns but disjoint row labels.
df1 = make_df(list('abcd'), index=list('5678'))
df2 = make_df(list('abcd'), index=list('1234'))
df1

Unnamed: 0,a,b,c,d
5,a5,b5,c5,d5
6,a6,b6,c6,d6
7,a7,b7,c7,d7
8,a8,b8,c8,d8


In [58]:
# Display the second sample frame.
df2

Unnamed: 0,a,b,c,d
1,a1,b1,c1,d1
2,a2,b2,c2,d2
3,a3,b3,c3,d3
4,a4,b4,c4,d4


In [62]:
# Stack df1 on top of df2 (axis=0). ignore_index=True discards the original row labels and
# renumbers the rows from 0, which is one way to avoid duplicate index labels.
# The alternative is to keep the labels and tag each input with an outer index level:
#     pd.concat([df1, df2], axis=0, keys=list('xy'))
# Use one or the other: with ignore_index=True the keys never show up in the result,
# so passing both only hides that fact.
pd.concat([df1, df2], axis=0, ignore_index=True)

Unnamed: 0,a,b,c,d
0,a5,b5,c5,d5
1,a6,b6,c6,d6
2,a7,b7,c7,d7
3,a8,b8,c8,d8
4,a1,b1,c1,d1
5,a2,b2,c2,d2
6,a3,b3,c3,d3
7,a4,b4,c4,d4


In [67]:
# merge: join two DataFrames on the values of a column they share.
# `how` selects the join type: 'left', 'right', 'outer' or 'inner' (the default).
left = pd.DataFrame(
    {
        'key': ['k0', 'k1', 'k2', 'k3'],
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
    }
)
# Same keys as `left`, but listed in a different order.
right = pd.DataFrame(
    {
        'key': ['k0', 'k2', 'k3', 'k1'],
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    }
)
# Outer join: keep every key that appears in either frame.
result = left.merge(right, how='outer', on='key')
result

Unnamed: 0,key,A,B,C,D
0,k0,A0,B0,C0,D0
1,k1,A1,B1,C3,D3
2,k2,A2,B2,C1,D1
3,k3,A3,B3,C2,D2


In [68]:
# Inner join: keep only the keys that are present in both frames.
result2 = pd.merge(left, right, on='key', how='inner')
# Show result2, the frame built in this cell, rather than the outer-join `result`.
result2

Unnamed: 0,key,A,B,C,D
0,k0,A0,B0,C0,D0
1,k1,A1,B1,C3,D3
2,k2,A2,B2,C1,D1
3,k3,A3,B3,C2,D2


In [69]:
# Left join: keep every row of `left`, attaching the matching columns from `right`.
result3 = pd.merge(left, right, on='key', how='left')
result3

Unnamed: 0,key,A,B,C,D
0,k0,A0,B0,C0,D0
1,k1,A1,B1,C3,D3
2,k2,A2,B2,C1,D1
3,k3,A3,B3,C2,D2
