# Data Transformation
## Removing duplicates

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Seven-row demo frame; the final two rows hold the same (k1, k2) pair.
data = pd.DataFrame(
    {
        "k1": ["one", "two", "one", "two", "one", "two", "two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [3]:
# Boolean Series: True where a row repeats an earlier row in every column (only row 6 here).
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [4]:
# Show the frame without fully duplicated rows: row 6 is dropped, its first occurrence (row 5) stays.
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [5]:
# Add a running counter column; rows 5 and 6 now differ in v1 even though k1/k2 match.
# The range is sized from the frame itself so the assignment cannot hit a
# length mismatch if the number of rows above is ever changed.
data["v1"] = range(len(data))
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [6]:
# Judge duplicates on column k1 only: the first "one" row and the first "two" row are kept.
data.drop_duplicates(subset='k1')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [7]:
# Judge duplicates on column k2 only: one row per distinct k2 value (1, 2, 3, 4) is kept.
data.drop_duplicates(subset='k2')

Unnamed: 0,k1,k2,v1
0,one,1,0
2,one,2,2
3,two,3,3
5,two,4,5


In [10]:
# Compare rows on the (k1, k2) pair; v1 is ignored, so row 6 counts as a repeat of row 5.
data.drop_duplicates(subset=["k1", "k2"])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5


In [11]:
# Same (k1, k2) comparison, but keep the last occurrence of each pair (row 6 instead of row 5).
data.drop_duplicates(subset=["k1", "k2"], keep="last")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


Notice that with `keep="last"` the last occurrence of each duplicate is kept (row 6), whereas the default behaviour keeps the first occurrence (row 5).

## Transforming Data Using a Function or Mapping

In [13]:
# Foods and their weights; `ounces` mixes ints with 7.5, so the column is stored as float.
data = pd.DataFrame(
    {
        "food": [
            "cookie", "pulled york", "cookie",
            "pasta", "corned cheez", "cookie",
            "pasta", "honey", "nova",
        ],
        "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)
data

Unnamed: 0,food,ounces
0,cookie,4.0
1,pulled york,3.0
2,cookie,12.0
3,pasta,6.0
4,corned cheez,7.5
5,cookie,8.0
6,pasta,3.0
7,honey,5.0
8,nova,6.0


In [16]:
# Lookup table: food name -> ingredient label.
# It has an entry for every distinct value in the `food` column built above.
food_to_ingre = {
    "cookie": "milk",
    "pulled york": "milk",
    "pasta": "chocolate",
    "corned cheez": "chocolate",
    "honey": "milk",
    "nova": "bread"
}

In [17]:
# Series.map with a dict looks up each food name; the result is stored as a new "ingre" column.
data['ingre'] = data['food'].map(food_to_ingre)
data

Unnamed: 0,food,ounces,ingre
0,cookie,4.0,milk
1,pulled york,3.0,milk
2,cookie,12.0,milk
3,pasta,6.0,chocolate
4,corned cheez,7.5,chocolate
5,cookie,8.0,milk
6,pasta,3.0,chocolate
7,honey,5.0,milk
8,nova,6.0,bread


In [18]:
def get_ingredient(x):
    """Return the ingredient that `food_to_ingre` associates with food `x`.

    Uses plain dict indexing, so a food missing from the mapping raises KeyError.
    """
    ingredient = food_to_ingre[x]
    return ingredient

In [19]:
# Series.map also accepts a function; the values match those produced by mapping with the dict.
data['food'].map(get_ingredient)

0         milk
1         milk
2         milk
3    chocolate
4    chocolate
5         milk
6    chocolate
7         milk
8        bread
Name: food, dtype: object

## Replacing Values

In [20]:
# Float series containing the marker values -999 and -1000 that the following cells replace.
data = pd.Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])

In [21]:
# Show the Series; the mixed int/float literals are stored as float64.
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [22]:
# Replace every -999 with NaN; -1000 is left alone. The result is displayed, not assigned back.
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [23]:
# A list of targets with one replacement value: both -999 and -1000 become NaN.
data.replace([-999,-1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [24]:
# Two parallel lists replace pairwise: -999 -> NaN, -1000 -> 0.
data.replace([-999,-1000],[np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [25]:
# The same pairwise replacement written as an {old: new} dict.
data.replace({-999:np.nan, -1000:0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

> The `data.replace` method is distinct from `data.str.replace`, which performs element-wise string substitution.
## Renaming Axis Indexes

In [26]:
# 3x4 grid of the integers 0..11, with state names as row labels and number words as columns.
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=["Ohio", "Colorado", "New York"],
    columns=["one", "two", "three", "four"],
)

In [27]:
# Display the frame with its original row labels.
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [28]:
def transform(x):
    """Return the first four characters of `x`, upper-cased.

    Slicing keeps whitespace, so "New York" becomes "NEW " (with a trailing space).
    """
    head = x[:4]
    return head.upper()

In [29]:
# Apply transform to every row label; the resulting Index is shown but not stored.
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [30]:
# Assign the mapped labels back, replacing the index on `data` itself.
data.index = data.index.map(transform)

In [31]:
# The row labels now show the transformed (four-character, upper-case) values.
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


- To create a transformed version of a dataset without modifying the original, use the `rename` method.

In [32]:
# rename takes one function per axis: title-case the row labels, upper-case the column names.
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [33]:
# Same call shape, this time lower-casing the row labels.
data.rename(index=str.lower, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
ohio,0,1,2,3
colo,4,5,6,7
new,8,9,10,11


In [34]:
# `data` is unchanged by the rename calls: row labels are still upper-case, column names lower-case.
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


## Discretization and Binning