# Pandas 的一些常用操作

In [1]:
import pandas as pd
# Toy frame: three groups ('a', 'b', 'c'), each with three integer readings.
group_labels = list('aaabbbccc')
readings = [4, 3, 2, 1, 12, 3, 4, 5, 7]
data = pd.DataFrame({'group': group_labels, 'data': readings})
data

Unnamed: 0,data,group
0,4,a
1,3,a
2,2,a
3,1,b
4,12,b
5,3,b
6,4,c
7,5,c
8,7,c


### 排序
* .sort_values(by = ,ascending = )

In [2]:
# Sort by `group` in descending order; within each group, sort `data` ascending.
# The sorted result is rebound to `data` instead of using inplace=True, so the
# frame object shown by the earlier cell is not mutated behind the reader's back.
data = data.sort_values(by=['group', 'data'], ascending=[False, True])
data

Unnamed: 0,data,group
6,4,c
7,5,c
8,7,c
3,1,b
5,3,b
4,12,b
2,2,a
1,3,a
0,4,a


#### ============================================================

In [3]:
# Two-column frame containing repeated (k1, k2) pairs, used for the sorting
# and de-duplication examples that follow.
k1_values = ['one', 'one', 'one', 'two', 'two', 'two', 'two']
k2_values = [3, 2, 1, 3, 3, 4, 4]
data = pd.DataFrame(dict(k1=k1_values, k2=k2_values))
data

Unnamed: 0,k1,k2
0,one,3
1,one,2
2,one,1
3,two,3
4,two,3
5,two,4
6,two,4


In [4]:
# Sort the rows by the k2 column (ascending is the default); the sorted frame
# is returned and displayed, `data` itself is not modified.
data.sort_values(by='k2')

Unnamed: 0,k1,k2
2,one,1
1,one,2
0,one,3
3,two,3
4,two,3
5,two,4
6,two,4


### 删除重复的数据
* .drop_duplicates()

In [5]:
data.drop_duplicates() # Drop a row only when both k1 and k2 duplicate an earlier row

Unnamed: 0,k1,k2
0,one,3
1,one,2
2,one,1
3,two,3
5,two,4


In [6]:
data.drop_duplicates(subset='k1')  # De-duplicate on k1 alone; the first row of each k1 value is kept

Unnamed: 0,k1,k2
0,one,3
3,two,3


#### ============================================================

In [7]:
# Seven food codes (the leading letter is the category) paired with the values 1..7.
food_codes = ['A1', 'A2', 'B1', 'B2', 'B3', 'C1', 'C2']
data = pd.DataFrame({'food': food_codes, 'data': list(range(1, 8))})
data

Unnamed: 0,data,food
0,1,A1
1,2,A2
2,3,B1
3,4,B2
4,5,B3
5,6,C1
6,7,C2


### 希望把A1和A2都归类为A类; B1,B2,B3归为B类; C1,C2归为C类
* .apply()

或

* .map()   

.map()中传入一个字典

用apply()函数可以对数据的每一行或每一列都进行同样的操作

In [8]:
def food_map(series):
    """Return the category letter for a row's ``food`` code.

    Args:
        series: A row-like object that supports ``series['food']``
            (for example a row passed in by ``DataFrame.apply``).

    Returns:
        'A' for 'A1'/'A2', 'B' for 'B1'/'B2'/'B3', 'C' for 'C1'/'C2',
        and None for any other value.
    """
    code = series['food']
    if code in ('A1', 'A2'):
        return 'A'
    if code in ('B1', 'B2', 'B3'):
        return 'B'
    if code in ('C1', 'C2'):
        return 'C'
    # Unknown codes fall through without a category.
    return None
# Call food_map once per row (axis='columns' hands each row to the function)
# and store the returned letters in a new 'food_map' column.
data['food_map'] = data.apply(food_map,axis = 'columns')
data

Unnamed: 0,data,food,food_map
0,1,A1,A
1,2,A2,A
2,3,B1,B
3,4,B2,B
4,5,B3,B
5,6,C1,C
6,7,C2,C


In [9]:
# Lookup table from each food code to its category, which is the code's first letter.
food2Upper = {code: code[0] for code in ('A1', 'A2', 'B1', 'B2', 'B3', 'C1', 'C2')}

# Series.map with a dict translates every value of the 'food' column through the table.
upper_letters = data['food'].map(food2Upper)
data['upper'] = upper_letters
data

Unnamed: 0,data,food,food_map,upper
0,1,A1,A,A
1,2,A2,A,A
2,3,B1,B,B
3,4,B2,B,B
4,5,B3,B,B
5,6,C1,C,C
6,7,C2,C,C


#### =====================================================================

In [10]:
import numpy as np
# Two columns of five standard-normal draws each. No seed is set, so the
# numbers differ on every run.
df = pd.DataFrame({
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})
# assign() returns a new frame with the extra 'ration' column (data1 / data2);
# df itself keeps only its two original columns.
df2 = df.assign(ration=lambda frame: frame['data1'] / frame['data2'])
df2

Unnamed: 0,data1,data2,ration
0,0.424443,-0.888068,-0.477939
1,1.316645,-1.126307,-1.168993
2,0.829255,-0.055029,-15.069374
3,-2.495661,0.097727,-25.537163
4,-0.420608,1.228682,-0.342324


### 删除指定的一列
* .drop(labels, axis = 'columns')

In [11]:
# Remove the derived 'ration' column. drop() returns a new frame, so the name
# is rebound to the result rather than mutating the frame with inplace=True.
df2 = df2.drop('ration', axis='columns')
df2

Unnamed: 0,data1,data2
0,0.424443,-0.888068
1,1.316645,-1.126307
2,0.829255,-0.055029
3,-2.495661,0.097727
4,-0.420608,1.228682


### 替换

In [12]:
# Integer Series holding 1 through 9 on the default 0..8 index.
data = pd.Series(list(range(1, 10)))
data

0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
8    9
dtype: int64

In [14]:
# Replace the value 9 with NaN (pass lists to replace several values at once).
# The result is rebound to `data` rather than using inplace=True; introducing
# NaN upcasts the Series from int64 to float64.
data = data.replace(9, np.nan)
data

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    6.0
6    7.0
7    8.0
8    NaN
dtype: float64

### 区间分割(将连续值进行离散化)
* pd.cut()

In [15]:
# Discretise the ages into two right-closed intervals: (10, 40] and (40, 80].
ages = [15, 18, 20, 21, 22, 34, 41, 52, 63, 79]
bins = [10, 40, 80]
bins_res = pd.cut(ages, bins=bins)
bins_res

[(10, 40], (10, 40], (10, 40], (10, 40], (10, 40], (10, 40], (40, 80], (40, 80], (40, 80], (40, 80]]
Categories (2, interval[int64]): [(10, 40] < (40, 80]]

In [16]:
# Integer bin index of each age (0 = first interval, 1 = second, ...).
# Categorical.labels was deprecated and later removed from pandas; .codes is
# the supported accessor for the same array.
bins_res.codes

  """Entry point for launching an IPython kernel.


array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1], dtype=int8)

* pd.value_counts（注意：顶层函数 pd.value_counts 自 pandas 2.1 起已弃用，建议改用 Series.value_counts()）

In [18]:
# Count how many ages fall into each interval. The top-level pd.value_counts()
# is deprecated since pandas 2.1; wrapping the Categorical in a Series and
# calling Series.value_counts() is the documented replacement.
pd.Series(bins_res).value_counts()

(10, 40]    6
(40, 80]    4
dtype: int64

In [19]:
# Same ages, now cut into three right-closed intervals: (10, 30], (30, 50], (50, 80].
pd.cut(ages,[10,30,50,80])

[(10, 30], (10, 30], (10, 30], (10, 30], (10, 30], (30, 50], (30, 50], (50, 80], (50, 80], (50, 80]]
Categories (3, interval[int64]): [(10, 30] < (30, 50] < (50, 80]]

### 也可以指定切割完后不同区间的名字
* pd.cut(x, bins, labels= )

In [20]:
# Display names for the three bins below, in bin order.
# NOTE(review): 'Yonth' and 'Mille' look like typos for 'Youth' and 'Middle';
# they are left unchanged so the recorded output still matches.
group_names = ['Yonth','Mille','Old']
# labels= replaces the interval notation with group_names; the labelled result
# is then tallied per label. Series.value_counts() is used because the
# top-level pd.value_counts() is deprecated since pandas 2.1.
pd.Series(pd.cut(ages, [10, 20, 50, 80], labels=group_names)).value_counts()

Mille    4
Old      3
Yonth    3
dtype: int64

### ==========================================================

In [21]:
# 4x3 frame: rows 0 and 3 are complete (0, 1, 2); row 1 is missing its middle
# value and row 2 is missing its last value.
complete_row = range(3)
rows = [
    complete_row,
    [0, np.nan, 0],
    [0, 0, np.nan],
    complete_row,
]
df = pd.DataFrame(rows)
df

Unnamed: 0,0,1,2
0,0,1.0,2.0
1,0,,0.0
2,0,0.0,
3,0,1.0,2.0


### 查找缺失值
* .isnull()

In [22]:
# Boolean mask with the same shape as df: True wherever a value is missing.
df.isnull()

Unnamed: 0,0,1,2
0,False,False,False
1,False,True,False
2,False,False,True
3,False,False,False


In [24]:
df.isnull().any(axis = 0) # Per column: does the column contain any missing value?
# Equivalent shorter form, since axis=0 is the default: df.isnull().any()

0    False
1     True
2     True
dtype: bool

In [25]:
df.isnull().any(axis = 1) # Per row: does the row contain any missing value?

0    False
1     True
2     True
3    False
dtype: bool

### 填充缺失值
* .fillna()

In [26]:
# Show a copy of df in which every missing value is replaced by 5; df itself
# is not modified (no assignment, no inplace).
df.fillna(5)

Unnamed: 0,0,1,2
0,0,1.0,2.0
1,0,5.0,0.0
2,0,0.0,5.0
3,0,1.0,2.0


In [27]:
df[df.isnull().any(axis = 1)] # Select the rows that contain at least one missing value

Unnamed: 0,0,1,2
1,0,,0.0
2,0,0.0,
