#  pandas数据处理

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

### 1、删除重复元素

In [2]:
df = DataFrame({"color":["red","white","red","green"],
               "size":[10,20,10,30]})
df

Unnamed: 0,color,size
0,red,10
1,white,20
2,red,10
3,green,30


使用duplicated()函数检测重复的行，返回元素为布尔类型的Series对象，每个元素对应一行，如果该行不是第一次出现，则元素为True

In [3]:
df.duplicated()

0    False
1    False
2     True
3    False
dtype: bool

使用drop_duplicates()函数删除重复的行

In [4]:
df.drop_duplicates()

Unnamed: 0,color,size
0,red,10
1,white,20
3,green,30


### 2. 映射
映射的含义：创建一个映射关系列表，把values元素和一个特定的标签或者字符串绑定

需要使用字典：

map = {
    'label1':'value1',
    'label2':'value2',
    ...
    }

包含三种操作：

replace()函数：替换元素
最重要：map()函数：新建一列
rename()函数：替换索引
#### 1) replace()函数：替换元素
使用replace()函数，对values进行替换操作

In [5]:
df

Unnamed: 0,color,size
0,red,10
1,white,20
2,red,10
3,green,30


In [6]:
# 定义一个字典
color = {"red":10,"green":20}

In [7]:
# 调用replace
df.replace(color,inplace=True)
df

Unnamed: 0,color,size
0,10,10
1,white,20
2,10,10
3,20,30


replace还经常用来替换NaN元素

In [8]:
df.loc[1] = np.nan

In [9]:
df

Unnamed: 0,color,size
0,10.0,10.0
1,,
2,10.0,10.0
3,20.0,30.0


In [10]:
v = {np.nan:0.1}
df.replace(v)

Unnamed: 0,color,size
0,10.0,10.0
1,0.1,0.1
2,10.0,10.0
3,20.0,30.0


#### 2) map()函数：新建一列
    通过已有的列新建一列,适合处理某一单独的列。
    

In [11]:
df = DataFrame(np.random.randint(0,100,size = (4,4)), 
              columns = ["Python","Java","PHP","HTML"],
              index = ["Tom","Jerry","Spicke","Bob"])
df

Unnamed: 0,Python,Java,PHP,HTML
Tom,25,64,18,14
Jerry,91,23,6,65
Spicke,24,77,97,13
Bob,15,7,69,40


In [14]:
# 新建一个字典
v = {25:52,91:46,24:81,15:41}
df["Go"] = df["Python"].map(v)
df

Unnamed: 0,Python,Java,PHP,HTML,Go
Tom,25,64,18,14,52
Jerry,91,23,6,65,46
Spicke,24,77,97,13,81
Bob,15,7,69,40,41


map()函数中可以使用lambda函数

In [15]:
df["C"] = df["Go"].map(lambda x : x-3)
df

Unnamed: 0,Python,Java,PHP,HTML,Go,C
Tom,25,64,18,14,52,49
Jerry,91,23,6,65,46,43
Spicke,24,77,97,13,81,78
Bob,15,7,69,40,41,38


In [17]:
# 还可以传一个回调函数
def mp(x):
    if x < 40:
        return "Fail"
    else:
        return "excellent"

In [19]:
df["score"] = df["Python"].map(mp)
df

Unnamed: 0,Python,Java,PHP,HTML,Go,C,score
Tom,25,64,18,14,52,49,Fail
Jerry,91,23,6,65,46,43,excellent
Spicke,24,77,97,13,81,78,Fail
Bob,15,7,69,40,41,38,Fail


transform()和map()类似

In [20]:
df["score2"] = df["C"].transform(mp)
df

Unnamed: 0,Python,Java,PHP,HTML,Go,C,score,score2
Tom,25,64,18,14,52,49,Fail,excellent
Jerry,91,23,6,65,46,43,excellent,excellent
Spicke,24,77,97,13,81,78,Fail,excellent
Bob,15,7,69,40,41,38,Fail,Fail


In [None]:
"""
map函数传三种  一种是字典， 一种是lambda 函数， 一种回调函数
"""

#### 3) rename()函数：替换索引

In [21]:
df 

Unnamed: 0,Python,Java,PHP,HTML,Go,C,score,score2
Tom,25,64,18,14,52,49,Fail,excellent
Jerry,91,23,6,65,46,43,excellent,excellent
Spicke,24,77,97,13,81,78,Fail,excellent
Bob,15,7,69,40,41,38,Fail,Fail


In [22]:
def cols(x):
    if x == "Python":
        return "蟒蛇"
    if x == "PHP":
        return "php"
    else:
        return x

In [23]:
df.rename(columns=cols)

Unnamed: 0,蟒蛇,Java,php,HTML,Go,C,score,score2
Tom,25,64,18,14,52,49,Fail,excellent
Jerry,91,23,6,65,46,43,excellent,excellent
Spicke,24,77,97,13,81,78,Fail,excellent
Bob,15,7,69,40,41,38,Fail,Fail


In [24]:
inds = {"Tom":"Cat","Spicke":"Dog"}
df.rename(index=inds, columns = cols)

Unnamed: 0,蟒蛇,Java,php,HTML,Go,C,score,score2
Cat,25,64,18,14,52,49,Fail,excellent
Jerry,91,23,6,65,46,43,excellent,excellent
Dog,24,77,97,13,81,78,Fail,excellent
Bob,15,7,69,40,41,38,Fail,Fail


### 3. 异常值检测和过滤
使用describe()函数查看每一列的描述性统计量

In [25]:
df

Unnamed: 0,Python,Java,PHP,HTML,Go,C,score,score2
Tom,25,64,18,14,52,49,Fail,excellent
Jerry,91,23,6,65,46,43,excellent,excellent
Spicke,24,77,97,13,81,78,Fail,excellent
Bob,15,7,69,40,41,38,Fail,Fail


In [26]:
df.describe()

Unnamed: 0,Python,Java,PHP,HTML,Go,C
count,4.0,4.0,4.0,4.0,4.0,4.0
mean,38.75,42.75,47.5,33.0,55.0,52.0
std,35.122405,33.129795,42.8369,24.725156,17.907168,17.907168
min,15.0,7.0,6.0,13.0,41.0,38.0
25%,21.75,19.0,15.0,13.75,44.75,41.75
50%,24.5,43.5,43.5,27.0,49.0,46.0
75%,41.5,67.25,76.0,46.25,59.25,56.25
max,91.0,77.0,97.0,65.0,81.0,78.0


使用std()函数可以求得DataFrame对象每一列的标准差

In [27]:
df.std()  # 描述一个数据的离散程度

Python    35.122405
Java      33.129795
PHP       42.836900
HTML      24.725156
Go        17.907168
C         17.907168
dtype: float64

根据每一列的标准差，对DataFrame元素进行过滤。

借助any()函数, 测试是否有True，有一个或以上返回True，反之返回False

对每一列应用筛选条件,去除标准差太大的数据

删除特定索引df.drop(labels,inplace = True)

In [28]:
n = np.random.randn(10000,3)
df = DataFrame(n)
df

Unnamed: 0,0,1,2
0,-0.921324,0.723924,1.086944
1,0.359821,-1.226488,-0.161812
2,0.025334,0.031136,0.388976
3,0.551758,-0.206229,-0.627329
4,-1.511677,1.058655,0.533971
5,-0.054549,-1.659614,-0.544527
6,-0.282066,-0.133029,-0.674234
7,0.693662,1.260988,0.898516
8,0.695469,0.115862,0.333974
9,-0.447224,1.757729,-0.344110


In [30]:
cond = np.abs(df) > 3 * df.std()
cond

Unnamed: 0,0,1,2
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False
6,False,False,False
7,False,False,False
8,False,False,False
9,False,False,False


In [31]:
drop_index = df[cond.any(axis=1)].index

In [32]:
drop_index

Int64Index([  30,   49,  104,  572,  596,  665,  666,  671,  686,  819,  892,
            1018, 1137, 1169, 1223, 1278, 1287, 1430, 1454, 1499, 1730, 1765,
            1898, 2149, 2270, 2375, 2382, 2396, 2587, 2702, 2717, 2810, 2875,
            2943, 2988, 3235, 3246, 3302, 3444, 3473, 3590, 3712, 3723, 3964,
            4071, 4123, 4149, 4451, 4490, 4552, 4613, 4736, 4785, 4997, 5140,
            5320, 5324, 5672, 5720, 5871, 5885, 6065, 6083, 6137, 6303, 6329,
            6341, 6367, 6479, 6483, 6495, 6574, 6718, 6734, 6844, 7142, 7363,
            7553, 7728, 7870, 7893, 8246, 8480, 8542, 8571, 8580, 8633, 8634,
            8832, 9137, 9397, 9513, 9583, 9893],
           dtype='int64')

In [33]:
df2 = df.drop(drop_index)
df2.shape

(9906, 3)

### 4.排序
使用.take()函数排序

可以借助np.random.permutation()函数随机排序

In [34]:
df = DataFrame(np.random.randint(0,100,size = (4,4)), 
              columns = ["Python","Java","PHP","HTML"],
              index = ["Tom","Jerry","Spicke","Bob"])
df

Unnamed: 0,Python,Java,PHP,HTML
Tom,52,39,15,20
Jerry,49,32,44,44
Spicke,7,85,27,59
Bob,96,88,83,34


In [35]:
df.take([2,1,0,3])

Unnamed: 0,Python,Java,PHP,HTML
Spicke,7,85,27,59
Jerry,49,32,44,44
Tom,52,39,15,20
Bob,96,88,83,34


In [36]:
ind = np.random.permutation(4)
ind

array([1, 2, 0, 3])

In [37]:
df.take(ind)

Unnamed: 0,Python,Java,PHP,HTML
Jerry,49,32,44,44
Spicke,7,85,27,59
Tom,52,39,15,20
Bob,96,88,83,34


#### 随机抽样
当DataFrame规模足够大时，直接使用np.random.randint()函数，就配合take()函数实现随机抽样

In [39]:
df2 = DataFrame(np.random.randn(10000,3))
df2

Unnamed: 0,0,1,2
0,-0.764153,-0.336722,0.162577
1,-1.195598,-1.089385,2.257381
2,0.190486,-0.662317,-0.106455
3,1.378749,1.559726,0.539209
4,0.699820,1.479218,-0.992535
5,0.095530,-0.332046,-0.337414
6,0.690454,0.302051,1.332080
7,1.458370,1.688033,0.678764
8,0.476406,0.077611,-1.832954
9,-1.042857,0.618565,-0.683368


In [40]:
ind = np.random.randint(0,10000,size = 10)
ind

array([1940,  144, 1197, 1938, 7857, 9943, 4162, 7026, 5619,  158])

In [41]:
df2.take(ind)

Unnamed: 0,0,1,2
1940,-1.290047,1.844277,1.139849
144,-0.051027,-0.454274,0.808537
1197,-1.199029,0.869545,-0.684923
1938,-0.319641,0.829842,0.344155
7857,-0.95854,-0.960169,1.618757
9943,-0.277098,-0.978857,-1.473671
4162,-2.090547,0.005401,0.924486
7026,-1.081062,-0.448093,2.293405
5619,0.688181,0.786217,0.832769
158,-0.179757,0.709034,-0.037557


### 5. 数据聚合【重点】
数据聚合是数据处理的最后一步，通常是要使每一个数组生成一个单一的数值。

数据分类处理：

分组：先把数据分为几组
用函数处理：为不同组的数据应用不同的函数以转换数据
合并：把不同组得到的结果合并起来
数据分类处理的核心： groupby()函数

In [42]:
#分组：非常重要， 但是很简单， 几乎百分八十以上的数据分析都会用到 分组
df = DataFrame({"color":["red","white","red","cyan","cyan","green",'white',"cyan"],
               "price":np.random.randint(0,8,size = 8),
               "weight":np.random.randint(50,55,size = 8)})
df

Unnamed: 0,color,price,weight
0,red,7,54
1,white,5,53
2,red,5,50
3,cyan,5,50
4,cyan,5,51
5,green,1,50
6,white,3,53
7,cyan,5,51


In [43]:
#对颜色来进行分组,groupby之后的数据是一个对象 ，一定要和聚合函数连用
df_sum_weight = df.groupby(["color"])[["weight"]].sum()
df_sum_weight

Unnamed: 0_level_0,weight
color,Unnamed: 1_level_1
cyan,152
green,50
red,104
white,106


In [44]:
df_price_mean = df.groupby(["color"])[["price"]].min()
df_price_mean

Unnamed: 0_level_0,price
color,Unnamed: 1_level_1
cyan,5
green,1
red,5
white,3


### 6.0 高级数据聚合
可以使用pd.merge()函数将聚合操作的计算结果添加到df的每一行
使用groupby分组后调用加和等函数进行运算，让后最后可以调用add_prefix()，来修改列名
#### 可以使用transform和apply实现相同功能
在transform或者apply中传入函数即可

In [45]:
df

Unnamed: 0,color,price,weight
0,red,7,54
1,white,5,53
2,red,5,50
3,cyan,5,50
4,cyan,5,51
5,green,1,50
6,white,3,53
7,cyan,5,51


In [46]:
df.groupby("color").transform(sum)

Unnamed: 0,price,weight
0,12,104
1,8,106
2,12,104
3,15,152
4,15,152
5,1,50
6,8,106
7,15,152


In [47]:
df.groupby("color").apply(sum)

Unnamed: 0_level_0,color,price,weight
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cyan,cyancyancyan,15,152
green,green,1,50
red,redred,12,104
white,whitewhite,8,106
