# pandas数据处理

In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

## 1、删除重复元素

In [2]:
df = DataFrame({
    "name":["Jack","xiaoming","xiaohuang","xiaoming","laowang","laozhang","laowang"],
    "score":[120,100,100,50,90,100,90]
})
df

Unnamed: 0,name,score
0,Jack,120
1,xiaoming,100
2,xiaohuang,100
3,xiaoming,50
4,laowang,90
5,laozhang,100
6,laowang,90


检测重复，duplicated(),返回值是一个Series，记录某一行是否重复，如果这一行不是第一次出现，就认为是重复

In [59]:
df.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

删除重复，drop_duplicates()

In [60]:
df.drop_duplicates(inplace=True)

In [61]:
df

Unnamed: 0,name,score
0,Jack,120
1,xiaoming,100
2,xiaohuang,100
3,xiaoming,50
4,laowang,90
5,laozhang,100


In [62]:
df1 = pd.concat([df,df],axis=1)
df1

Unnamed: 0,name,score,name.1,score.1
0,Jack,120,Jack,120
1,xiaoming,100,xiaoming,100
2,xiaohuang,100,xiaohuang,100
3,xiaoming,50,xiaoming,50
4,laowang,90,laowang,90
5,laozhang,100,laozhang,100


In [63]:
df1.duplicated() # 如果列上有重复，这个方法不能用

ValueError: Buffer has wrong number of dimensions (expected 1, got 2)

In [64]:
df1.drop_duplicates()

ValueError: Buffer has wrong number of dimensions (expected 1, got 2)

## 2. 映射

映射的含义：一个关系表，把某些值和某个特定的键绑定在一起

字典{"12":123,"w":laowang,"t":[1,2,43]}

y = 2x+3

def func(x,y):
   
   print(x+y)
   return x**y
一个函数，入口是参数，出口是返回值
由入口的参数映射到出口的返回值

lambda x: x**2

### 1) replace()函数：替换元素

In [65]:
df

Unnamed: 0,name,score
0,Jack,120
1,xiaoming,100
2,xiaohuang,100
3,xiaoming,50
4,laowang,90
5,laozhang,100


In [66]:
# 把所有的xiaoming替换成daming
df["name"][df["name"] == "xiaoming"] = "daming"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [67]:
df

Unnamed: 0,name,score
0,Jack,120
1,daming,100
2,xiaohuang,100
3,daming,50
4,laowang,90
5,laozhang,100


In [68]:
# 用映射的方式把daming替换成xiaoming
dic = {"daming":"xiaoming"}
df.replace(dic,inplace=True) 
# replace函数的作用：根据我们传过来的那个字典，把df表中能够和字典键匹配的那些值替换成
# 字典的值

In [69]:
df

Unnamed: 0,name,score
0,Jack,120
1,xiaoming,100
2,xiaohuang,100
3,xiaoming,50
4,laowang,90
5,laozhang,100


In [70]:
def func1(name):
    
    return "daming" 

In [71]:
df.replace(func1) # replace函数只支持字典映射，不支持函数映射

TypeError: cannot replace [<function func1 at 0x000002322B7EF7B8>] with method pad on a DataFrame

============================================

练习19：

    假设张三李四的课表里有满分的情况，老师认为是作弊，把所有满分的情况（包括150,300分）都记0分，如何实现？

============================================

### 2) map()函数：新建一列

In [72]:
df = DataFrame(np.random.randint(0,150,size=(4,4)),
               index=["张三","李四","达康","陈院长"],
               columns=["python","外语","数学","java"])
df

Unnamed: 0,python,外语,数学,java
张三,24,69,57,22
李四,127,43,100,111
达康,95,148,119,0
陈院长,142,90,59,51


In [73]:
# 用映射新增一列
score = {148:120,110:100,94:67,126:127,110:180}
df["go"] = df["python"] # 用python这一列生成了一个go这一列

df["go"] = df["python"].map(score) # 通过map函数可以将一列的内容映射出另一列一个series
df

Unnamed: 0,python,外语,数学,java,go
张三,24,69,57,22,
李四,127,43,100,111,
达康,95,148,119,0,
陈院长,142,90,59,51,


lambda函数映射

In [74]:
v = lambda x : x*2

In [75]:
v(10)

20

In [76]:
df["外语"] = df["外语"].map(v) 
# 如果映射是一个函数，会把series里面所有的值依次取出，带入函数中，得到映射结果

In [77]:
df

Unnamed: 0,python,外语,数学,java,go
张三,24,138,57,22,
李四,127,86,100,111,
达康,95,296,119,0,
陈院长,142,180,59,51,


用普通函数映射

In [78]:
def mp(x):
    if x > 120:
        return "优秀"
    elif x > 90:
        return "及格"
    else:
        return "不及格"
    

In [79]:
df["数学成绩"] = df["数学"].map(mp)

In [80]:
df

Unnamed: 0,python,外语,数学,java,go,数学成绩
张三,24,138,57,22,,不及格
李四,127,86,100,111,,及格
达康,95,296,119,0,,及格
陈院长,142,180,59,51,,不及格


============================================

练习20：

新增两列，分别为张三、李四的成绩状态，如果分数低于90，则为"failed"，如果分数高于120，则为"excellent"，其他则为"pass"

【提示】使用函数作为map的参数

============================================


### 3) rename()函数：替换索引

In [81]:
df

Unnamed: 0,python,外语,数学,java,go,数学成绩
张三,24,138,57,22,,不及格
李四,127,86,100,111,,及格
达康,95,296,119,0,,及格
陈院长,142,180,59,51,,不及格


In [85]:
cols = {"外语":"English"}
rows = {"陈院长":"MrChen"}


In [87]:
df.rename(rows,cols)
# 第一个参数是指替换行的那个映射
# 第二个参数替换列的那个映射

Unnamed: 0,python,English,数学,java,go,数学成绩
张三,24,138,57,22,,不及格
李四,127,86,100,111,,及格
达康,95,296,119,0,,及格
MrChen,142,180,59,51,,不及格


In [88]:
def func2(x):
    return "李富贵"

In [92]:
df.rename(columns=func2,index=func2)

Unnamed: 0,李富贵,李富贵.1,李富贵.2,李富贵.3,李富贵.4,李富贵.5
李富贵,24,138,57,22,,不及格
李富贵,127,86,100,111,,及格
李富贵,95,296,119,0,,及格
李富贵,142,180,59,51,,不及格


## 3. 异常值检测和过滤

In [93]:
df = DataFrame(np.random.randint(0,100,size=(4,4)),
               index=["LiJiacheng","JackMa","Wangjianlin","MrFan"],
               columns=["money","age","score","yanzhi"])
df

Unnamed: 0,money,age,score,yanzhi
LiJiacheng,18,14,42,54
JackMa,21,53,21,57
Wangjianlin,91,78,32,34
MrFan,20,9,96,75


In [94]:
df.describe()
# 描述数据表属性相关的统计量

Unnamed: 0,money,age,score,yanzhi
count,4.0,4.0,4.0,4.0
mean,37.5,38.5,47.75,55.0
std,35.688467,32.868425,33.290389,16.792856
min,18.0,9.0,21.0,34.0
25%,19.5,12.75,29.25,49.0
50%,20.5,33.5,37.0,55.5
75%,38.5,59.25,55.5,61.5
max,91.0,78.0,96.0,75.0


In [96]:
df.std(axis=0)

money     35.688467
age       32.868425
score     33.290389
yanzhi    16.792856
dtype: float64

In [98]:
df.mean(axis=1)

LiJiacheng     32.00
JackMa         38.00
Wangjianlin    58.75
MrFan          50.00
dtype: float64

什么是异常值？根据需求来定义

In [100]:
# 定义大于标准差的2倍为异常值
cond = df>df.std()*2
cond # 符合异常条件都是True

Unnamed: 0,money,age,score,yanzhi
LiJiacheng,False,False,False,True
JackMa,False,False,False,True
Wangjianlin,True,True,False,True
MrFan,False,False,True,True


In [101]:
# 输出
df[cond]

Unnamed: 0,money,age,score,yanzhi
LiJiacheng,,,,54
JackMa,,,,57
Wangjianlin,91.0,78.0,,34
MrFan,,,96.0,75


In [106]:
# 输出正常值
df1 = df[cond.replace({False:True,True:False})]
df1

Unnamed: 0,money,age,score,yanzhi
LiJiacheng,18.0,14.0,42.0,
JackMa,21.0,53.0,21.0,
Wangjianlin,,,32.0,
MrFan,20.0,9.0,,


In [107]:
df1.dropna()

Unnamed: 0,money,age,score,yanzhi


处理异常：所有大于平均值2倍都是异常

In [109]:
# 1、根据过滤条件查找异常
cond = df > df.mean()*2
cond

Unnamed: 0,money,age,score,yanzhi
LiJiacheng,False,False,False,False
JackMa,False,False,False,False
Wangjianlin,True,True,False,False
MrFan,False,False,True,False


In [112]:
df2 = df[cond.replace({False:True,True:False})]
df2

Unnamed: 0,money,age,score,yanzhi
LiJiacheng,18.0,14.0,42.0,54
JackMa,21.0,53.0,21.0,57
Wangjianlin,,,32.0,34
MrFan,20.0,9.0,,75


In [113]:
df2.dropna()

Unnamed: 0,money,age,score,yanzhi
LiJiacheng,18.0,14.0,42.0,54
JackMa,21.0,53.0,21.0,57


============================================

练习21：

    新建一个形状为10000*3的标准正态分布的DataFrame(np.random.randn)，去除掉所有满足以下情况的行：其中任一元素绝对值大于3倍标准差

============================================

In [114]:
df = DataFrame(np.random.randn(10000,3))
df

Unnamed: 0,0,1,2
0,-0.591876,-0.009506,-0.103773
1,0.390625,-1.798268,0.623583
2,0.040504,-0.072063,0.176879
3,0.136485,0.684327,-1.189321
4,-1.117789,0.516447,-2.366432
5,1.018096,1.431084,-0.095307
6,0.622351,-0.644622,-0.186716
7,0.391134,-1.439146,-0.218426
8,-1.071934,-0.167831,1.412367
9,1.985704,0.086298,-0.410040


In [118]:
cond = np.abs(df) > df.std()*3
cond

Unnamed: 0,0,1,2
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False
6,False,False,False
7,False,False,False
8,False,False,False
9,False,False,False


In [121]:
df1 = df[cond.replace({False:True,True:False})]
df1

Unnamed: 0,0,1,2
0,-0.591876,-0.009506,-0.103773
1,0.390625,-1.798268,0.623583
2,0.040504,-0.072063,0.176879
3,0.136485,0.684327,-1.189321
4,-1.117789,0.516447,-2.366432
5,1.018096,1.431084,-0.095307
6,0.622351,-0.644622,-0.186716
7,0.391134,-1.439146,-0.218426
8,-1.071934,-0.167831,1.412367
9,1.985704,0.086298,-0.410040


In [122]:
df2 = df1.dropna()

In [123]:
df2.shape

(9924, 3)

## 4. 随机排序

In [124]:
df1 = DataFrame(np.random.randint(0,100,size=(100,4)))
df1

Unnamed: 0,0,1,2,3
0,14,89,67,36
1,97,46,49,33
2,28,58,24,15
3,71,2,23,15
4,9,96,33,36
5,29,95,84,10
6,82,44,40,32
7,53,34,5,95
8,43,24,20,38
9,43,3,65,89


1、随机打乱次序

In [128]:
ind = np.random.permutation(100)
ind

array([28, 83,  6, 62, 92, 18, 91, 71, 68,  3, 54, 80, 66, 34, 88, 98, 51,
       29, 73, 47, 78, 64, 82, 31,  7, 26,  2, 27, 19, 85, 42, 65, 97, 89,
       55, 25, 22, 17,  1, 94, 53, 77, 67, 44, 75, 59, 56, 37, 46,  0, 30,
       99, 45, 13, 93, 69, 63, 81, 72, 40, 58, 41, 57, 87, 74, 21, 14, 50,
        4, 95, 11, 48, 36, 39, 38, 52, 76, 32, 70, 43, 79, 84, 20, 61, 86,
       33, 35,  8,  9, 90, 12,  5, 49, 96, 60, 15, 10, 23, 16, 24])

In [130]:
ind[30:40]

array([42, 65, 97, 89, 55, 25, 22, 17,  1, 94])

2、在任意一个连续的区间中按照需求取样

In [132]:
df1.loc[ind[20:30]]

Unnamed: 0,0,1,2,3
78,52,64,73,27
64,88,71,30,68
82,99,35,4,55
31,81,61,75,86
7,53,34,5,95
26,71,2,41,93
2,28,58,24,15
27,51,97,17,25
19,23,84,33,3
85,16,88,86,1


In [133]:
df.take(ind[30:40])

Unnamed: 0,0,1,2
42,0.338968,-1.538887,0.895249
65,-0.638738,0.035171,-0.907494
97,-1.44304,-0.888129,-0.20966
89,-1.225739,-1.683341,-1.688559
55,0.502262,-2.146719,0.013832
25,1.165169,-0.112182,-1.848606
22,0.68828,1.331032,-0.380653
17,1.47274,-1.43238,1.402332
1,0.390625,-1.798268,0.623583
94,0.585316,0.086405,-2.547239


============================================
练习22：

   假设有张三李四王老五的期中考试成绩ddd2，对着三名同学随机排序

============================================

## 5. 数据聚合【重点】

分组聚合：

根据属性中值的不同先进行分组，然后再组内聚合

分组：根据某个属性的值进行分组

聚合：根据属性分组以后，对另外一个属性组内聚合

处理过程：分组=>聚合=>得到一个聚合以后的数据结构=>把得到的数据结构拼接到原表中

核心函数：groupby()



In [137]:
df = DataFrame({
    "item":["萝卜","白菜","辣椒","冬瓜","萝卜","萝卜","白菜","辣椒"],
    "color":["白","白","青","青","红","青","青","红"],
    "weight":np.random.randint(1,100,size=8),
    "price":np.random.randint(1,10,size=8)
})
df

Unnamed: 0,color,item,price,weight
0,白,萝卜,4,88
1,白,白菜,6,75
2,青,辣椒,9,86
3,青,冬瓜,8,46
4,红,萝卜,9,6
5,青,萝卜,5,54
6,青,白菜,7,3
7,红,辣椒,3,96


In [138]:
df.sum()

color             白白青青红青青红
item      萝卜白菜辣椒冬瓜萝卜萝卜白菜辣椒
price                   51
weight                 454
dtype: object

In [139]:
df.mean()

price      6.375
weight    56.750
dtype: float64

In [140]:
df.var()

price        5.125000
weight    1327.642857
dtype: float64

分组聚合

In [145]:
price_sum = df.groupby(["color"])[["price"]].sum()
price_sum

Unnamed: 0_level_0,price
color,Unnamed: 1_level_1
白,10
红,12
青,29


In [149]:
df.merge(price_sum,left_on="color",right_index=True,suffixes=["_单价","_总价"])

Unnamed: 0,color,item,price_单价,weight,price_总价
0,白,萝卜,4,88,10
1,白,白菜,6,75,10
2,青,辣椒,9,86,29
3,青,冬瓜,8,46,29
5,青,萝卜,5,54,29
6,青,白菜,7,3,29
4,红,萝卜,9,6,12
7,红,辣椒,3,96,12


============================================

练习23：

   假设菜市场张大妈在卖菜，有以下属性：
   
   菜品(item)：萝卜，白菜，辣椒，冬瓜
   
   颜色(color)：白，青，红
   
   重量(weight)
   
   价格(price)
   
1. 要求以属性作为列索引，新建一个ddd
2. 对ddd进行聚合操作，求出颜色为白色的价格总和
3. 对ddd进行聚合操作，求出萝卜的所有重量(包括白萝卜，胡萝卜，青萝卜）以及平均价格
4. 使用merge合并总重量及平均价格

============================================

## 6.0 高级数据聚合

可以使用pd.merge()函数将聚合操作的计算结果添加到df的每一行  
使用groupby分组后调用加和等函数进行运算，让后最后可以调用add_prefix()，来修改列名

============================================

练习24：

   使用transform与apply实现练习23的功能

============================================