# 数据转换
## 移除重复数据

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Sample frame for the de-duplication examples: seven rows in which several
# (k1, k2) combinations occur more than once.
data = DataFrame({
    'k1': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})

In [3]:
# Show the sample frame before any de-duplication.
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [4]:
# Boolean Series: True for every row that exactly repeats (in all columns) a row
# appearing earlier in the frame.
data.duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [5]:
# Keep only the first occurrence of each fully identical row; the result is a
# new frame and `data` itself is left untouched.
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [6]:
# Add a column holding a distinct value (0..6) per row, so rows that share the
# same k1/k2 values can still be told apart in the following examples.
data['va'] = range(7)

In [7]:
# Show the frame again, now including the 'va' column.
data

Unnamed: 0,k1,k2,va
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


In [8]:
# Judge duplicates by the 'k1' column only: the first row seen for each k1
# value is kept and every later row with that value is dropped.
data.drop_duplicates(subset=['k1'])

Unnamed: 0,k1,k2,va
0,one,1,0
3,two,3,3


In [9]:
# Judge duplicates by the (k1, k2) pair and, for each pair, retain the last
# matching row rather than the first.
data.drop_duplicates(subset=['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,va
1,one,1,1
2,one,2,2
4,two,3,4
6,two,4,6


In [10]:
# Print the method's docstring to look up the accepted parameters
# (subset, keep, inplace) and their defaults.
help(data.drop_duplicates)

Help on method drop_duplicates in module pandas.core.frame:

drop_duplicates(subset=None, keep='first', inplace=False) method of pandas.core.frame.DataFrame instance
    Return DataFrame with duplicate rows removed, optionally only
    considering certain columns
    
    Parameters
    ----------
    subset : column label or sequence of labels, optional
        Only consider certain columns for identifying duplicates, by
        default use all of the columns
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Drop duplicates except for the first occurrence.
        - ``last`` : Drop duplicates except for the last occurrence.
        - False : Drop all duplicates.
    take_last : deprecated
    inplace : boolean, default False
        Whether to drop duplicates in place or to return a copy
    
    Returns
    -------
    deduplicated : DataFrame



In [11]:
# keep=False discards every row whose (k1, k2) pair occurs more than once, so
# only rows with a unique pair survive.
data.drop_duplicates(subset=['k1', 'k2'], keep=False)

Unnamed: 0,k1,k2,va
2,one,2,2


In [12]:
# The original frame still holds all seven rows: none of the drop_duplicates
# calls above passed inplace=True, so each returned a new frame.
data

Unnamed: 0,k1,k2,va
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


## 利用函数或映射进行数据转换

In [13]:
# Sample frame for the mapping examples: a food name (with inconsistent
# capitalisation) and a weight in ounces for each row.
# NOTE(review): 'forned beef' looks like a typo for 'corned beef'; it is left
# unchanged here because the recorded outputs below depend on it.
data = DataFrame(dict(
    food=[
        'bacon',
        'pulled pork',
        'bacon',
        'Pastrami',
        'forned beef',
        'Bacon',
        'pastrami',
        'honey ham',
        'nova lox',
    ],
    ounces=[4, 3, 12, 6, 7.5, 8, 3, 5, 6],
))

In [14]:
# Show the food frame; 'ounces' is displayed as floats because one of the
# values (7.5) is fractional.
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,forned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [15]:
# Lookup table from a lower-case food name to the animal it comes from.
# NOTE(review): 'pulled pork' is mapped to 'cow', although pulled pork is
# normally a pork dish -- confirm whether 'pig' was intended.
# NOTE(review): there is no entry matching the 'forned beef' row of `data`, so
# that food cannot be translated by this table.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'cow'),
    ('pastrami', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

In [16]:
# Normalise the food names to lower case with the vectorised .str accessor
# (unlike .map(str.lower), it passes missing values through instead of raising
# TypeError), then translate each name through the lookup dict. Foods that have
# no key in meat_to_animal end up as NaN in the new 'animal' column.
data['animal'] = data['food'].str.lower().map(meat_to_animal)

In [17]:
# Show the frame with the derived 'animal' column; the 'forned beef' row has no
# animal because that name is not a key of meat_to_animal.
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,cow
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,forned beef,7.5,
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [18]:
# Why does this fail when the dict-based map above did not?
# Series.map(meat_to_animal) turns values without a matching key into NaN, but
# here the lambda indexes the dict itself, and plain dict indexing raises
# KeyError for a missing key -- in this data, 'forned beef'.
# The error is caught and printed so the demonstration stays visible without
# aborting a full "Run All" of the notebook.
try:
    animals_by_lookup = data['food'].map(lambda x: meat_to_animal[x.lower()])
except KeyError as err:
    print('KeyError: {}'.format(err))
else:
    # Only reached when every food has an entry in meat_to_animal.
    print(animals_by_lookup)

KeyError: 'forned beef'

## 替换值