# Data Transformation
## Removing duplicates

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Seven-row frame whose final row repeats the (k1, k2) pair of the row
# before it, so there is exactly one fully duplicated row to detect.
data = pd.DataFrame(
    dict(
        k1=["one", "two", "one", "two", "one", "two", "two"],
        k2=[1, 1, 2, 3, 3, 4, 4],
    )
)
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [3]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [4]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [5]:
data["v1"] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [6]:
data.drop_duplicates(subset='k1')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [7]:
data.drop_duplicates(subset='k2')

Unnamed: 0,k1,k2,v1
0,one,1,0
2,one,2,2
3,two,3,3
5,two,4,5


In [10]:
# Rows count as duplicates only when both k1 and k2 match an earlier row.
data.drop_duplicates(subset=["k1", "k2"])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5


In [11]:
# Same key columns, but retain the last occurrence of each duplicate group.
data.drop_duplicates(subset=["k1", "k2"], keep="last")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


Notice that with `keep="last"` the last occurrence of each duplicate is retained (row 6 instead of row 5), whereas the default, `keep="first"`, retains the first occurrence.

## Transforming Data Using a Function or Mapping

In [13]:
# Food items with their weights; the repeated foods make the mapping
# examples below visibly many-to-one.
data = pd.DataFrame(
    dict(
        food=[
            "cookie",
            "pulled york",
            "cookie",
            "pasta",
            "corned cheez",
            "cookie",
            "pasta",
            "honey",
            "nova",
        ],
        ounces=[4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    )
)
data

Unnamed: 0,food,ounces
0,cookie,4.0
1,pulled york,3.0
2,cookie,12.0
3,pasta,6.0
4,corned cheez,7.5
5,cookie,8.0
6,pasta,3.0
7,honey,5.0
8,nova,6.0


In [16]:
# Lookup table from each food label to its ingredient, built from
# (food, ingredient) pairs so the insertion order stays explicit.
food_to_ingre = dict(
    [
        ("cookie", "milk"),
        ("pulled york", "milk"),
        ("pasta", "chocolate"),
        ("corned cheez", "chocolate"),
        ("honey", "milk"),
        ("nova", "bread"),
    ]
)

In [17]:
data['ingre'] = data['food'].map(food_to_ingre)
data

Unnamed: 0,food,ounces,ingre
0,cookie,4.0,milk
1,pulled york,3.0,milk
2,cookie,12.0,milk
3,pasta,6.0,chocolate
4,corned cheez,7.5,chocolate
5,cookie,8.0,milk
6,pasta,3.0,chocolate
7,honey,5.0,milk
8,nova,6.0,bread


In [18]:
def get_ingredient(x):
    """Return the ingredient that ``food_to_ingre`` associates with ``x``.

    Raises ``KeyError`` when ``x`` is not a key of ``food_to_ingre``.
    """
    ingredient = food_to_ingre[x]
    return ingredient

In [19]:
data['food'].map(get_ingredient)

0         milk
1         milk
2         milk
3    chocolate
4    chocolate
5         milk
6    chocolate
7         milk
8        bread
Name: food, dtype: object

## Replacing Values

In [20]:
# -999 and -1000 act as sentinel values to be replaced in the cells below.
data = pd.Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])

In [21]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [22]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [23]:
data.replace([-999,-1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [24]:
data.replace([-999,-1000],[np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [25]:
data.replace({-999:np.nan, -1000:0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

> The `data.replace` method is distinct from `data.str.replace`, which performs element-wise string substitution.

## Renaming Axis Indexes

In [26]:
# 3x4 grid of the integers 0..11, labelled by state (rows) and by
# spelled-out position (columns).
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=["Ohio", "Colorado", "New York"],
    columns=["one", "two", "three", "four"],
)

In [27]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [28]:
def transform(x):
    """Return the first four characters of ``x`` in upper case.

    Inputs shorter than four characters are upper-cased whole, and any
    whitespace within the first four characters is kept
    (``"New York"`` becomes ``"NEW "``).
    """
    head = x[:4]
    return head.upper()

In [29]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [30]:
data.index = data.index.map(transform)

In [31]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


- If we want to create a transformed version of the dataset without modifying the original, a useful method is `rename`

In [32]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [33]:
data.rename(index=str.lower, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
ohio,0,1,2,3
colo,4,5,6,7
new,8,9,10,11


In [34]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


## Discretization and Binning

Continuous data is often discretized or otherwise separated into "bins" for analysis. Suppose we have data about a group of people in a study, and we want to group them into discrete age buckets.

In [40]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

Let's divide these into bins of 18 to 25, 26 to 35, 36 to 60, and finally 61 and older.
For this, we use `pandas.cut`.

In [38]:
bins = [18, 25, 35, 60, 100]

In [41]:
age_categories = pd.cut(ages, bins)

In [42]:
age_categories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

- The object pandas returns is a special Categorical object. 
- The output we see describes the bins computed by `pd.cut`.
- Each bin is identified by a special (unique to pandas) interval value type containing the lower and upper limit of each bin

In [43]:
age_categories.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [44]:
age_categories.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [45]:
age_categories.categories[0]

Interval(18, 25, closed='right')

In [46]:
age_categories.codes[0]

np.int8(0)

In [47]:
# The top-level `pd.value_counts` is deprecated (pandas >= 2.1) and emits a
# FutureWarning; wrap the Categorical in a Series and call the method instead.
# The result is still one count per bin, sorted by descending frequency.
pd.Series(age_categories).value_counts()

  pd.value_counts(age_categories)


(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
Name: count, dtype: int64

Note that the counts returned by `value_counts` are the number of observations that fall into each bin produced by `pandas.cut`.

By default each bin is open on the left and closed on the right; we can change which side is closed by passing `right=False`.

In [48]:
pd.cut(ages, bins, right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

- We can override the default interval-based bin labeling by passing a list or array to the `labels` option.

In [49]:
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]

In [50]:
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

- We can pass an integer number of bins to `pandas.cut` instead of explicit bin edges.
- It will compute equal-length bins based on the minimum and maximum values in the data.
- Consider the case of some uniformly distributed data chopped into fourths:

In [51]:
data = np.random.uniform(size=20)
data

array([0.56154529, 0.64079031, 0.54605278, 0.43535345, 0.17566608,
       0.39566613, 0.02800989, 0.29128847, 0.65130272, 0.06638601,
       0.93824308, 0.29744336, 0.41561017, 0.15086483, 0.73257327,
       0.49323869, 0.18492483, 0.5353046 , 0.27634234, 0.1198374 ])

In [52]:
pd.cut(data, 4, precision=2)

[(0.48, 0.71], (0.48, 0.71], (0.48, 0.71], (0.26, 0.48], (0.027, 0.26], ..., (0.48, 0.71], (0.027, 0.26], (0.48, 0.71], (0.26, 0.48], (0.027, 0.26]]
Length: 20
Categories (4, interval[float64, right]): [(0.027, 0.26] < (0.26, 0.48] < (0.48, 0.71] < (0.71, 0.94]]

The `precision=2` option limits the decimal precision to two digits.

- A closely related function, `pandas.qcut`, bins the data based on sample quantiles.
- Depending on the distribution of the data, using `pandas.cut` will not usually result in each bin having the same number of data points.
- Since `pandas.qcut` uses sample quantiles instead, we will obtain roughly equally sized bins.

In [53]:
data = np.random.standard_normal(1000)

In [55]:
quartiles = pd.qcut(data, 4, precision=2)

In [56]:
quartiles

[(0.032, 0.65], (0.032, 0.65], (0.65, 3.6], (-0.68, 0.032], (-3.3099999999999996, -0.68], ..., (-3.3099999999999996, -0.68], (0.65, 3.6], (0.032, 0.65], (-0.68, 0.032], (0.032, 0.65]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.3099999999999996, -0.68] < (-0.68, 0.032] < (0.032, 0.65] < (0.65, 3.6]]

In [57]:
# The top-level `pd.value_counts` is deprecated (pandas >= 2.1) and emits a
# FutureWarning; wrap the Categorical in a Series and call the method instead.
# Each quantile bin is expected to hold roughly the same number of points.
pd.Series(quartiles).value_counts()

  pd.value_counts(quartiles)


(-3.3099999999999996, -0.68]    250
(-0.68, 0.032]                  250
(0.032, 0.65]                   250
(0.65, 3.6]                     250
Name: count, dtype: int64