In [None]:
'''
Other Useful Features
    - Series have many useful string methods (accessed via 'str')
    - replace all instances of a value (supports 'inplace=True' argument)
    - map values to other values
    - convert a range of values into descriptive groups
    - create dummy variables for 'continent' and add them to the DataFrame
    - randomly sample a DataFrame
'''

In [None]:
# limit which rows are read when reading in a file
# (exercise: each `None` below is a placeholder to be replaced with a call that
#  reads the file; the filled-in version appears in the Answer section)
None          # only read first 10 rows
None          # skip the first two rows of data



In [None]:
# replace existing column headers when reading in a file
# NOTE(review): the Answer section reads '../data/drinks.csv' while this cell
# reads './data/drinks.csv' -- confirm which relative path is correct
col_names = [
    'country', 'beer', 'spirit',
    'wine', 'alcohol', 'continent',
]
pd.read_csv(
    './data/drinks.csv',
    header=0,           # row 0 holds the file's own headers...
    names=col_names,    # ...which are replaced by these names
)



In [None]:
# create a DataFrame from a dictionary of lists
# each key becomes a column name; each list supplies that column's values
pd.DataFrame({
    'state': ['AL', 'AK', 'AZ'],
    'capital': ['Montgomery', 'Juneau', 'Phoenix'],
})



In [None]:
# Series have many useful string methods (accessed via 'str')
drinks['country'].str.upper()                       # uppercase copy of the column
aus_mask = drinks['country'].str.contains('Aus')    # boolean Series: True where the name contains 'Aus'
drinks.loc[aus_mask]                                # keep only the rows where the mask is True



In [None]:
# only select columns with names that match a specific pattern
cols = pd.Series(drinks.columns)                        # wrap the column labels so 'str' methods apply
servings_cols = cols[cols.str.contains('servings')]     # labels containing 'servings'
drinks[servings_cols]



In [None]:
# replace all instances of a value (supports 'inplace=True' argument)
# neither result is assigned here, so 'drinks' itself is left unchanged
drinks['continent'].replace(to_replace='EU', value='EUR')   # within a single Series
drinks.replace(to_replace='USA', value='United States')     # across every column of the DataFrame



In [None]:
# map values to other values
# NOTE(review): read_csv treats the string 'NA' as missing by default; if
# 'drinks' was loaded that way the 'NA' key never matches -- verify the load
hemisphere_by_continent = {
    'NA': 'West',
    'SA': 'West',
    'EU': 'East',
    'AF': 'East',
    'AS': 'East',
    'OC': 'East',
}
# continents that are not keys of the dictionary come out as NaN
drinks['hemisphere'] = drinks['continent'].map(hemisphere_by_continent)


In [None]:
# convert a range of values into descriptive groups
# between() includes both endpoints; anything outside 101-400 keeps 'low'
is_med = drinks['beer_servings'].between(101, 200)
is_high = drinks['beer_servings'].between(201, 400)
drinks['beer_level'] = 'low'                # default label for every row
drinks.loc[is_med, 'beer_level'] = 'med'    # 101-200 servings
drinks.loc[is_high, 'beer_level'] = 'high'  # 201-400 servings



In [None]:
# display a cross-tabulation of two Series
# rows are continents, columns are beer levels, cells are row counts
pd.crosstab(index=drinks['continent'], columns=drinks['beer_level'])



In [None]:
# convert 'beer_level' into the 'category' data type (new in pandas 0.15.0)
# listing the categories explicitly fixes the order that sorting will follow
drinks['beer_level'] = pd.Categorical(drinks['beer_level'], categories=['low', 'med', 'high'])
# sort_values replaces the removed sort_index(by=...) spelling; rows are
# ordered by category position (low, med, high), not alphabetically
drinks.sort_values('beer_level')



In [None]:
# create dummy variables for 'continent' and add them to the DataFrame
cont_dummies = pd.get_dummies(drinks['continent'], prefix='cont').iloc[:, 1:]  # exclude first column
# drop dummy columns left over from an earlier run so that re-running this
# code does not append duplicate 'cont_*' columns
drinks = drinks.drop(cont_dummies.columns, axis=1, errors='ignore')
drinks = pd.concat([drinks, cont_dummies], axis=1)  # axis=0 for rows, axis=1 for columns



In [None]:
# randomly sample a DataFrame
rng = np.random.RandomState(42)             # local, seeded generator so the split is reproducible
mask = rng.rand(len(drinks)) < 0.66         # NumPy boolean array, one entry per row
train = drinks[mask]                        # will contain about 66% of the rows
test = drinks[~mask]                        # will contain the remaining rows

## Answer

In [None]:
'''
Other Useful Features
'''

# limit which rows are read when reading in a file
drinks_path = '../data/drinks.csv'
pd.read_csv(drinks_path, nrows=10)          # stop after the first 10 rows of data
# skiprows takes 0-based file line numbers: line 0 (the header) is kept,
# lines 1 and 2 (the first two rows of data) are skipped
pd.read_csv(drinks_path, skiprows=[1, 2])

# replace existing column headers when reading in a file
col_names = [
    'country', 'beer', 'spirit',
    'wine', 'alcohol', 'continent',
]
pd.read_csv(
    '../data/drinks.csv',
    header=0,           # row 0 holds the file's own headers...
    names=col_names,    # ...which are replaced by these names
)

# create a DataFrame from a dictionary of lists
# each key becomes a column name; each list supplies that column's values
pd.DataFrame({
    'state': ['AL', 'AK', 'AZ'],
    'capital': ['Montgomery', 'Juneau', 'Phoenix'],
})

# Series have many useful string methods (accessed via 'str')
drinks['country'].str.upper()                       # uppercase copy of the column
aus_mask = drinks['country'].str.contains('Aus')    # boolean Series: True where the name contains 'Aus'
drinks.loc[aus_mask]                                # keep only the rows where the mask is True

# only select columns with names that match a specific pattern
cols = pd.Series(drinks.columns)                        # wrap the column labels so 'str' methods apply
servings_cols = cols[cols.str.contains('servings')]     # labels containing 'servings'
drinks[servings_cols]

# replace all instances of a value (supports 'inplace=True' argument)
# neither result is assigned here, so 'drinks' itself is left unchanged
drinks['continent'].replace(to_replace='EU', value='EUR')   # within a single Series
drinks.replace(to_replace='USA', value='United States')     # across every column of the DataFrame

# map values to other values
# NOTE(review): read_csv treats the string 'NA' as missing by default; if
# 'drinks' was loaded that way the 'NA' key never matches -- verify the load
hemisphere_by_continent = {
    'NA': 'West',
    'SA': 'West',
    'EU': 'East',
    'AF': 'East',
    'AS': 'East',
    'OC': 'East',
}
# continents that are not keys of the dictionary come out as NaN
drinks['hemisphere'] = drinks['continent'].map(hemisphere_by_continent)

# convert a range of values into descriptive groups
# between() includes both endpoints; anything outside 101-400 keeps 'low'
is_med = drinks['beer_servings'].between(101, 200)
is_high = drinks['beer_servings'].between(201, 400)
drinks['beer_level'] = 'low'                # default label for every row
drinks.loc[is_med, 'beer_level'] = 'med'    # 101-200 servings
drinks.loc[is_high, 'beer_level'] = 'high'  # 201-400 servings

# display a cross-tabulation of two Series
# rows are continents, columns are beer levels, cells are row counts
pd.crosstab(index=drinks['continent'], columns=drinks['beer_level'])

# convert 'beer_level' into the 'category' data type (new in pandas 0.15.0)
# listing the categories explicitly fixes the order that sorting will follow
drinks['beer_level'] = pd.Categorical(drinks['beer_level'], categories=['low', 'med', 'high'])
# sort_values replaces the removed sort_index(by=...) spelling; rows are
# ordered by category position (low, med, high), not alphabetically
drinks.sort_values('beer_level')

# create dummy variables for 'continent' and add them to the DataFrame
cont_dummies = pd.get_dummies(drinks['continent'], prefix='cont').iloc[:, 1:]  # exclude first column
# drop dummy columns left over from an earlier run so that re-running this
# code does not append duplicate 'cont_*' columns
drinks = drinks.drop(cont_dummies.columns, axis=1, errors='ignore')
drinks = pd.concat([drinks, cont_dummies], axis=1)  # axis=0 for rows, axis=1 for columns

# randomly sample a DataFrame
rng = np.random.RandomState(42)             # local, seeded generator so the split is reproducible
mask = rng.rand(len(drinks)) < 0.66         # NumPy boolean array, one entry per row
train = drinks[mask]                        # will contain about 66% of the rows
test = drinks[~mask]                        # will contain the remaining rows
