# Data Cleaning and Preparation

In [1]:
import numpy as np
import pandas as pd
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

In [103]:
pd.options.display.max_rows =10

## Handling Missing Data

Missing data occurs commonly in many data analysis applications. One of the goals of pandas is to make working with missing data as painless as possible. For example, all of the descriptive statistics on pandas objects exclude missing data by default. The way that missing data is represented in pandas objects is somewhat imperfect, but it is functional for a lot of users. For numeric data, pandas uses the floating-point value NaN (Not a Number) to represent missing data. We call this a sentinel value that can be easily detected:

(Page 191). 

In [2]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [3]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In pandas, we’ve adopted a convention used in the R programming language by refer‐ ring to missing data as NA, which stands for not available. In statistics applications, NA data may either be data that does not exist or that exists but was not observed (through problems with data collection, for example). When cleaning up data for analysis, it is often important to do analysis on the missing data itself to identify data collection problems or potential biases in the data caused by missing data.

(Page 192). 

In [4]:
string_data[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

There is work ongoing in the pandas project to improve the internal details of how missing data is handled, but the user API functions, like pandas.isnull, abstract away many of the annoying details. See Table 7-1 for a list of some functions related to missing data handling.

(Page 192). 

![T7-1](images/T7-1.png)

### Filtering Out Missing Data

In [7]:
from numpy import nan as NA
data = pd.Series([1, NA, 3.5, NA, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [8]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [10]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
cleaned = data.dropna()

In [11]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [12]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [13]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [14]:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [15]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [35]:
df = pd.DataFrame(np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,0.476985,1.073002,0.53693
1,-0.243824,-0.369032,1.495228
2,-0.905491,-2.611725,-0.463787
3,1.285044,0.687209,1.825932
4,0.542931,-0.379708,1.09684
5,1.629229,-0.427563,-0.127973
6,-2.355273,1.130334,-0.582393


In [36]:
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,0.476985,,
1,-0.243824,,
2,-0.905491,,-0.463787
3,1.285044,,1.825932
4,0.542931,-0.379708,1.09684
5,1.629229,-0.427563,-0.127973
6,-2.355273,1.130334,-0.582393


In [37]:
df.dropna()

Unnamed: 0,0,1,2
4,0.542931,-0.379708,1.09684
5,1.629229,-0.427563,-0.127973
6,-2.355273,1.130334,-0.582393


A related way to filter out DataFrame rows tends to concern time series data. Suppose you want to keep only rows containing a certain number of observations. You can indicate this with the thresh argument:

(Page 194). 

In [38]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-0.905491,,-0.463787
3,1.285044,,1.825932
4,0.542931,-0.379708,1.09684
5,1.629229,-0.427563,-0.127973
6,-2.355273,1.130334,-0.582393


### Filling In Missing Data

In [39]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.476985,0.0,0.0
1,-0.243824,0.0,0.0
2,-0.905491,0.0,-0.463787
3,1.285044,0.0,1.825932
4,0.542931,-0.379708,1.09684
5,1.629229,-0.427563,-0.127973
6,-2.355273,1.130334,-0.582393


Calling fillna with a dict, you can use a different fill value for each column:

(Page 195). 

In [40]:
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.476985,0.5,0.0
1,-0.243824,0.5,0.0
2,-0.905491,0.5,-0.463787
3,1.285044,0.5,1.825932
4,0.542931,-0.379708,1.09684
5,1.629229,-0.427563,-0.127973
6,-2.355273,1.130334,-0.582393


fillna returns a new object, but you can modify the existing object in-place:

(Page 195). 

In [41]:
_ = df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,0.476985,0.0,0.0
1,-0.243824,0.0,0.0
2,-0.905491,0.0,-0.463787
3,1.285044,0.0,1.825932
4,0.542931,-0.379708,1.09684
5,1.629229,-0.427563,-0.127973
6,-2.355273,1.130334,-0.582393


In [42]:
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df


Unnamed: 0,0,1,2
0,0.153235,0.069835,-0.150774
1,0.717003,1.168658,-0.703106
2,-0.610642,,0.154646
3,0.214508,,1.732811
4,0.770218,,
5,-1.502118,,


In [43]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,0.153235,0.069835,-0.150774
1,0.717003,1.168658,-0.703106
2,-0.610642,1.168658,0.154646
3,0.214508,1.168658,1.732811
4,0.770218,1.168658,1.732811
5,-1.502118,1.168658,1.732811


In [44]:
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,0.153235,0.069835,-0.150774
1,0.717003,1.168658,-0.703106
2,-0.610642,1.168658,0.154646
3,0.214508,1.168658,1.732811
4,0.770218,,1.732811
5,-1.502118,,1.732811


In [45]:
data = pd.Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

![T7-2](images/T7-2.png)

## Data Transformation

### Removing Duplicates

In [46]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [47]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [48]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


Both of these methods by default consider all of the columns; alternatively, you can specify any subset of them to detect duplicates. Suppose we had an additional column of values and wanted to filter duplicates only based on the 'k1' column:

(Page 198). 

In [49]:
data['v1'] = range(7)
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [50]:
data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### Transforming Data Using a Function or Mapping

In [70]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [71]:
meat_to_animal = {
  'bacon': 'pig',
  'pulled pork': 'pig',
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}

meat_to_animal

{'bacon': 'pig',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon',
 'pastrami': 'cow',
 'pulled pork': 'pig'}

The map method on a Series accepts a function or dict-like object containing a map‐ ping, but here we have a small problem in that some of the meats are capitalized and others are not. Thus, we need to convert each value to lowercase using the str.lower Series method:

(Page 199). 

In [54]:
lowercased = data['food'].str.lower()
lowercased


0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [55]:
data['animal'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [56]:
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

> Using map is a convenient way to perform element-wise transformations and other data cleaning–related operations.



As you’ve already seen, map can be used to modify a subset of values in an object but replace provides a simpler and more flexible way to do so. Let’s con‐ sider this Series

In [73]:
data['food'].replace(meat_to_animal)

0         pig
1         pig
2         pig
3    Pastrami
4         cow
5       Bacon
6         cow
7         pig
8      salmon
Name: food, dtype: object

### Replacing Values

Filling in missing data with the fillna method is a special case of more general value replacement. As you’ve already seen, map can be used to modify a subset of values in an object but replace provides a simpler and more flexible way to do so. Let’s con‐ sider this Series:

(Page 200). 

In [59]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [60]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [61]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [62]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [63]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

The data.replace method is distinct from ***data.str.replace***, which performs string substitution element-wise. We look at these string methods on Series later in the chapter.



### Renaming Axis Indexes

In [76]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


Like a Series, the axis indexes have a map method:

(Page 202). 

In [77]:
data.index.map(lambda x : x[:4].upper())

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [80]:
data.index = data.index.map(lambda x : x[:4].upper())
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


If you want to create a transformed version of a dataset without modifying the origi‐ nal, a useful method is rename:

(Page 202). 

In [81]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


Notably, rename can be used in conjunction with a dict-like object providing new val‐ ues for a subset of the axis labels:

(Page 202). 

In [82]:
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


rename saves you from the chore of copying the DataFrame manually and assigning to its index and columns attributes. Should you wish to modify a dataset in-place, pass inplace=True:

(Page 202). 

In [83]:
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


### Discretization and Binning

Continuous data is often discretized or otherwise separated into “bins” for analysis. Suppose you have data about a group of people in a study, and you want to group them into discrete age buckets:

(Page 203). 

In [84]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [85]:
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

The object pandas returns is a special Categorical object. The output you see describes the bins computed by pandas.cut. You can treat it like an array of strings indicating the bin name; internally it contains a categories array specifying the dis‐ tinct category names along with a labeling for the ages data in the codes attribute:

(Page 203). 

In [86]:
cats.codes


array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [87]:
cats.categories


IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

In [88]:
pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

Note that pd.value_counts(cats) are the bin counts for the result of pandas.cut. 

Consistent with mathematical notation for intervals, a parenthesis means that the side is open, while the square bracket means it is closed (inclusive). You can change which side is closed by passing right=False:

(Page 203). 

In [89]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [90]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

If you pass an integer number of bins to cut instead of explicit bin edges, it will com‐ pute equal-length bins based on the minimum and maximum values in the data. Consider the case of some uniformly distributed data chopped into fourths:

(Page 204). 

In [98]:
data = np.random.rand(20)
cats = pd.cut(data, 4, precision=2)

In [100]:
pd.value_counts(cats)

(0.7, 0.92]      6
(0.27, 0.49]     5
(0.055, 0.27]    5
(0.49, 0.7]      4
dtype: int64

A closely related function, qcut, bins the data based on sample quantiles. Depending on the distribution of the data, using cut will not usually result in each bin having the same number of data points. Since qcut uses sample quantiles instead, by definition you will obtain roughly equal-size bins:

(Page 204). 

In [92]:
data = np.random.randn(1000)  # Normally distributed
cats = pd.qcut(data, 4)  # Cut into quartiles
cats


[(-2.9019999999999997, -0.619], (-0.619, -0.0565], (-0.619, -0.0565], (-0.619, -0.0565], (-0.0565, 0.647], ..., (-0.0565, 0.647], (-0.619, -0.0565], (-0.619, -0.0565], (-2.9019999999999997, -0.619], (0.647, 3.217]]
Length: 1000
Categories (4, interval[float64]): [(-2.9019999999999997, -0.619] < (-0.619, -0.0565] < (-0.0565, 0.647] < (0.647, 3.217]]

In [93]:
pd.value_counts(cats)

(0.647, 3.217]                   250
(-0.0565, 0.647]                 250
(-0.619, -0.0565]                250
(-2.9019999999999997, -0.619]    250
dtype: int64

In [94]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(-1.192, -0.0565], (-1.192, -0.0565], (-1.192, -0.0565], (-1.192, -0.0565], (-0.0565, 1.267], ..., (-0.0565, 1.267], (-1.192, -0.0565], (-1.192, -0.0565], (-2.9019999999999997, -1.192], (1.267, 3.217]]
Length: 1000
Categories (4, interval[float64]): [(-2.9019999999999997, -1.192] < (-1.192, -0.0565] < (-0.0565, 1.267] < (1.267, 3.217]]

### Detecting and Filtering Outliers

Filtering or transforming outliers is largely a matter of applying array operations. Consider a DataFrame with some normally distributed data:

(Page 205). 

In [112]:
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.027114,0.025821,-0.019453,0.01567
std,1.023789,0.972475,0.980299,1.000029
min,-3.117113,-2.750725,-3.432328,-3.227351
25%,-0.666207,-0.630401,-0.659293,-0.635584
50%,-0.010172,0.066768,-0.055669,0.001547
75%,0.780902,0.693279,0.600337,0.689406
max,3.137467,3.07158,3.17649,3.176421


In [118]:
col = data[2]
col

0      0.980769
1      0.509258
2     -1.590049
3     -0.509216
4     -1.579272
         ...   
995   -0.862144
996    1.973399
997   -0.204456
998   -0.555508
999   -0.402180
Name: 2, Length: 1000, dtype: float64

Suppose you wanted to find values in one of the columns exceeding 3 in absolute value:

(Page 205). 

In [120]:
col[np.abs(col) > 3]

30     3.075242
364    3.176490
415   -3.432328
451   -3.282686
Name: 2, dtype: float64

To select all rows having a value exceeding 3 or –3, you can use the any method on a boolean DataFrame:

(Page 206). 

In [127]:
data[(np.abs(data) > 3).any(1)]

Unnamed: 0,0,1,2,3
30,-1.272166,-1.569578,3.075242,-0.488119
37,-3.117113,1.329890,0.129447,0.595403
38,-0.045376,1.138740,-0.595461,3.139557
54,-3.088012,1.707443,-0.830500,-0.590273
271,3.137467,0.474998,1.379609,0.720891
...,...,...,...,...
415,1.771129,1.108603,-3.432328,-1.391596
451,-0.093961,0.605784,-3.282686,0.066836
697,1.026916,3.071580,0.599561,-0.346484
857,0.918342,-0.301441,1.377244,3.176421


Values can be set based on these criteria. Here is code to cap values outside the inter‐ val –3 to 3:

(Page 206). 

In [143]:
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.027181,0.025749,-0.01899,0.015581
std,1.022762,0.972253,0.977186,0.998347
min,-3.0,-2.750725,-3.0,-3.0
25%,-0.666207,-0.630401,-0.659293,-0.635584
50%,-0.010172,0.066768,-0.055669,0.001547
75%,0.780902,0.693279,0.600337,0.689406
max,3.0,3.0,3.0,3.0


The statement np.sign(data) produces 1 and –1 values based on whether the values in data are positive or negative:

(Page 206). 

In [144]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,-1.0,1.0,-1.0
1,1.0,-1.0,1.0,-1.0
2,1.0,1.0,-1.0,1.0
3,-1.0,-1.0,-1.0,-1.0
4,1.0,-1.0,-1.0,-1.0


### Permutation and Random Sampling

Permuting (randomly reordering) a Series or the rows in a DataFrame is easy to do using the numpy.random.permutation function. Calling permutation with the length of the axis you want to permute produces an array of integers indicating the new ordering:

(Page 206). 

In [145]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [146]:
sampler = np.random.permutation(5)
sampler

array([1, 0, 4, 3, 2])

That array can then be used in iloc-based indexing or the equivalent take function:

(Page 207). 

In [147]:
df.take(sampler)

Unnamed: 0,0,1,2,3
1,4,5,6,7
0,0,1,2,3
4,16,17,18,19
3,12,13,14,15
2,8,9,10,11


In [148]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
0,0,1,2,3
3,12,13,14,15
4,16,17,18,19


To generate a sample with replacement (to allow repeat choices), pass replace=True to sample:

(Page 207). 

In [151]:
choices = pd.Series([5, 7, -1, 6, 4])


In [150]:
draws = choices.sample(n=10, replace=True)
draws

0    5
2   -1
3    6
2   -1
1    7
0    5
2   -1
4    4
2   -1
2   -1
dtype: int64

### Computing Indicator/Dummy Variables

Another type of transformation for statistical modeling or machine learning applica‐ tions is converting a categorical variable into a “dummy” or “indicator” matrix. If a column in a DataFrame has k distinct values, you would derive a matrix or Data‐ Frame with k columns containing all 1s and 0s. pandas has a get_dummies function for doing this, though devising one yourself is not difficult. Let’s return to an earlier example DataFrame:

(Page 208). 

In [152]:
df = pd.DataFrame({'key':   ['b','b','a','c','a','b'],
                   'data1': range(6)})
df

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,b


In [153]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [158]:
dummies = pd.get_dummies(df['key'], prefix='key')
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [159]:
mnames = ['movie_id', 'title', 'genres']

In [199]:
movies = pd.read_table('datasets/movielens/movies.dat', sep='::', 
                        header=None, names=mnames)
movies

  


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


Adding indicator variables for each genre requires a little bit of wrangling. First, we extract the list of unique genres in the dataset:

(Page 209). 

In [205]:
all_genres = []
for x in movies.genres:
    all_genres.extend(x.split('|'))

In [206]:
all_genres

['Animation',
 "Children's",
 'Comedy',
 'Adventure',
 "Children's",
 'Fantasy',
 'Comedy',
 'Romance',
 'Comedy',
 'Drama',
 'Comedy',
 'Action',
 'Crime',
 'Thriller',
 'Comedy',
 'Romance',
 'Adventure',
 "Children's",
 'Action',
 'Action',
 'Adventure',
 'Thriller',
 'Comedy',
 'Drama',
 'Romance',
 'Comedy',
 'Horror',
 'Animation',
 "Children's",
 'Drama',
 'Action',
 'Adventure',
 'Romance',
 'Drama',
 'Thriller',
 'Drama',
 'Romance',
 'Thriller',
 'Comedy',
 'Action',
 'Action',
 'Comedy',
 'Drama',
 'Crime',
 'Drama',
 'Thriller',
 'Thriller',
 'Drama',
 'Sci-Fi',
 'Drama',
 'Romance',
 'Drama',
 'Drama',
 'Romance',
 'Adventure',
 'Sci-Fi',
 'Drama',
 'Drama',
 'Drama',
 'Sci-Fi',
 'Adventure',
 'Romance',
 "Children's",
 'Comedy',
 'Drama',
 'Drama',
 'Romance',
 'Drama',
 'Documentary',
 'Comedy',
 'Comedy',
 'Romance',
 'Drama',
 'Drama',
 'War',
 'Action',
 'Crime',
 'Drama',
 'Drama',
 'Action',
 'Adventure',
 'Comedy',
 'Drama',
 'Drama',
 'Romance',
 'Crime',
 'Thrill

In [207]:
genres = pd.unique(all_genres)
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [209]:
zero_matrix = np.zeros((len(movies), len(genres)))
dummies = pd.DataFrame(zero_matrix, columns=genres)
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Now, iterate through each movie and set entries in each row of dummies to 1. To do this, we use the dummies.columns to compute the column indices for each genre:

(Page 209). 

In [211]:
gen = movies.genres[0]
gen.split('|')


['Animation', "Children's", 'Comedy']

In [212]:
dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2], dtype=int64)

Then, we can use .iloc to set values based on these indices:

(Page 210). 

In [213]:
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1

In [217]:
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [219]:
dummies.sum()

Animation      105.0
Children's     251.0
Comedy        1200.0
Adventure      283.0
Fantasy         68.0
               ...  
War            143.0
Musical        114.0
Mystery        106.0
Film-Noir       44.0
Western         68.0
Length: 18, dtype: float64

Then, as before, you can combine this with movies:

(Page 210). 

In [214]:
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]

movie_id                                      1
title                          Toy Story (1995)
genres              Animation|Children's|Comedy
Genre_Animation                               1
Genre_Children's                              1
                               ...             
Genre_War                                     0
Genre_Musical                                 0
Genre_Mystery                                 0
Genre_Film-Noir                               0
Genre_Western                                 0
Name: 0, Length: 21, dtype: object

A useful recipe for statistical applications is to combine get_dummies with a discreti‐ zation function like cut:

(Page 210). 

In [215]:
np.random.seed(12345)
values = np.random.rand(10)
values
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


## String Manipulation

### String Object Methods

In [None]:
val = 'a,b,  guido'
val.split(',')

In [None]:
pieces = [x.strip() for x in val.split(',')]
pieces

In [None]:
first, second, third = pieces
first + '::' + second + '::' + third

In [None]:
'::'.join(pieces)

In [None]:
'guido' in val
val.index(',')
val.find(':')

In [None]:
val.index(':')

In [None]:
val.count(',')

In [None]:
val.replace(',', '::')
val.replace(',', '')

### Regular Expressions

In [None]:
import re
text = "foo    bar\t baz  \tqux"
re.split('\s+', text)

In [None]:
regex = re.compile('\s+')
regex.split(text)

In [None]:
regex.findall(text)

In [None]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)

In [None]:
regex.findall(text)

In [None]:
m = regex.search(text)
m
text[m.start():m.end()]

In [None]:
print(regex.match(text))

In [None]:
print(regex.sub('REDACTED', text))

In [None]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)

In [None]:
m = regex.match('wesm@bright.net')
m.groups()

In [None]:
regex.findall(text)

In [None]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

### Vectorized String Functions in pandas

In [None]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data
data.isnull()

In [None]:
data.str.contains('gmail')

In [None]:
pattern
data.str.findall(pattern, flags=re.IGNORECASE)

In [None]:
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches

In [None]:
matches.str.get(1)
matches.str[0]

In [None]:
data.str[:5]

In [None]:
pd.options.display.max_rows = PREVIOUS_MAX_ROWS

## Conclusion