## Removing Duplicates

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.DataFrame({'k1': ['one'] * 2 + ['two'] * 2 + ['three'] * 2 + ['four'], 'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,two,2
3,two,3
4,three,3
5,three,4
6,four,4


In [3]:
data.duplicated()

0    False
1     True
2    False
3    False
4    False
5    False
6    False
dtype: bool

In [4]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
2,two,2
3,two,3
4,three,3
5,three,4
6,four,4


In [5]:
data['v1'] = range(7)

In [6]:
data.drop_duplicates('k1')

Unnamed: 0,k1,k2,v1
0,one,1,0
2,two,2,2
4,three,3,4
6,four,4,6


In [7]:
data.drop_duplicates(['k1', 'k2'], keep='last') 

Unnamed: 0,k1,k2,v1
1,one,1,1
2,two,2,2
3,two,3,3
4,three,3,4
5,three,4,5
6,four,4,6


## Transforming Data Using a Function or Mapping 

In [8]:
v2 = {'one':1, 'two':2, 'three':3, 'four':4, 'five':5, 'six':6}

In [9]:
data['v2'] = data['k1'].map(v2)

In [10]:
data

Unnamed: 0,k1,k2,v1,v2
0,one,1,0,1
1,one,1,1,1
2,two,2,2,2
3,two,3,3,2
4,three,3,4,3
5,three,4,5,3
6,four,4,6,4


In [11]:
data['k1'].map(lambda x: v2[x.lower()])

0    1
1    1
2    2
3    2
4    3
5    3
6    4
Name: k1, dtype: int64

## Replacing Values 

In [12]:
data1 = pd.Series([-999, 10, 77, 4.59, -12.96])

In [13]:
data1

0   -999.00
1     10.00
2     77.00
3      4.59
4    -12.96
dtype: float64

In [14]:
data1.replace(-999, np.nan)

0      NaN
1    10.00
2    77.00
3     4.59
4   -12.96
dtype: float64

In [15]:
data1.replace([-999, -12.96], [np.nan, 0])

0      NaN
1    10.00
2    77.00
3     4.59
4     0.00
dtype: float64

In [16]:
data1.replace({-999:0, -12.96:np.nan})

0     0.00
1    10.00
2    77.00
3     4.59
4      NaN
dtype: float64

## Renaming Axis Indexes 

In [7]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)), 
                  index=['Ohio', 'Colorado', 'New York'],

                  columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [5]:
data.index.map(str.upper)

Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')

In [6]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [10]:
 data.rename(index={'OHIO': 'INDIANA'}, columns={'three': 'two/2'}) 

Unnamed: 0,one,two,two/2,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11
