In [None]:
'''
Adding, Renaming, and Removing Columns

    - add a new column as a function of existing columns
    - rename a column
    - hide a column (temporarily)
    - delete a column (permanently)

Advanced Filtering (of rows) and Selecting (of columns)

    - loc: filter rows by LABEL, and select columns by LABEL
    - iloc: filter rows by POSITION, and select columns by POSITION
    - mixing: select columns by LABEL, then filter rows by POSITION

'''

In [1]:
import pandas as pd
import numpy as np

In [3]:
# (re)load the drinks data from disk so the examples below start from a fresh DataFrame
# na_filter=False switches off missing-value detection, so no cell is converted to NaN
# NOTE(review): presumably this keeps a literal 'NA' continent code as text -- confirm against the CSV
drinks = pd.read_csv(
    './data/drinks.csv',
    na_filter=False,
)



In [4]:
# derive two new columns from existing ones
# note: new columns must be created with bracket syntax; assigning to an
# attribute (e.g., 'drinks.total_servings = ...') (usually) does not work
drinks['total_servings'] = (
    drinks['beer_servings']
    + drinks['spirit_servings']
    + drinks['wine_servings']
)
# litres -> millilitres
drinks['alcohol_mL'] = drinks['total_litres_of_pure_alcohol'] * 1000
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent,total_servings,alcohol_mL
0,Afghanistan,0,0,0,0.0,AS,0,0.0
1,Albania,89,132,54,4.9,EU,275,4900.0
2,Algeria,25,0,14,0.7,AF,39,700.0
3,Andorra,245,138,312,12.4,EU,695,12400.0
4,Angola,217,57,45,5.9,AF,319,5900.0


In [None]:
# alternative method: slice the serving columns by label, then total each row
# (sum() adds down each column by default; axis='columns', i.e. axis=1, adds across each row)
drinks['total_servings'] = (
    drinks.loc[:, 'beer_servings':'wine_servings']
    .sum(axis='columns')
)



In [5]:
# rename a column
# rename() returns a renamed copy by default, so 'drinks' itself keeps the
# original column name; assign the result back to make the rename stick
drinks.rename(columns={'total_litres_of_pure_alcohol': 'alcohol_litres'})


Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,alcohol_litres,continent,total_servings,alcohol_mL
0,Afghanistan,0,0,0,0.0,AS,0,0.0
1,Albania,89,132,54,4.9,EU,275,4900.0
2,Algeria,25,0,14,0.7,AF,39,700.0
3,Andorra,245,138,312,12.4,EU,695,12400.0
4,Angola,217,57,45,5.9,AF,319,5900.0
5,Antigua & Barbuda,102,128,45,4.9,,275,4900.0
6,Argentina,193,25,221,8.3,SA,439,8300.0
7,Armenia,21,179,11,3.8,EU,211,3800.0
8,Australia,261,72,212,10.4,OC,545,10400.0
9,Austria,279,75,191,9.7,EU,545,9700.0


In [6]:
# hide a column (temporarily): two equivalent ways to get a view of the data without 'alcohol_mL'
# neither result is assigned, so 'drinks' itself is left unchanged
# note: a cell only displays the value of its last expression
drinks.drop(['alcohol_mL'], axis=1)     # use 'axis=0' to drop rows instead
drinks[drinks.columns[:-1]]             # slice 'columns' attribute like a list (all but the last column, here 'alcohol_mL')


Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent,total_servings
0,Afghanistan,0,0,0,0.0,AS,0
1,Albania,89,132,54,4.9,EU,275
2,Algeria,25,0,14,0.7,AF,39
3,Andorra,245,138,312,12.4,EU,695
4,Angola,217,57,45,5.9,AF,319
5,Antigua & Barbuda,102,128,45,4.9,,275
6,Argentina,193,25,221,8.3,SA,439
7,Armenia,21,179,11,3.8,EU,211
8,Australia,261,72,212,10.4,OC,545
9,Austria,279,75,191,9.7,EU,545


In [7]:
# delete a column (permanently): 'del' removes the column from 'drinks' in place
# note: re-running this cell raises a KeyError because the column is already gone
del drinks['alcohol_mL']

## Advanced Filtering (of rows) and Selecting (of columns)

In [10]:
# read the pipe-delimited file 'u.user' into 'users'
# header=None + names: the column names are supplied here rather than read from the file
# 'user_id' becomes the row index, and zip codes are kept as strings
u_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv(
    './data/u.user',
    sep='|',
    header=None,
    names=u_cols,
    index_col='user_id',
    dtype={'zip_code': str},
)

In [11]:
users.head(3)

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067


In [None]:
# loc: filter rows by LABEL, and select columns by LABEL
# row labels here are the 'user_id' index values; label slices include BOTH endpoints
# note: a cell only displays the value of its last expression
users.loc[1]                        # row with label 1
users.loc[1:3]                      # rows with labels 1 through 3 (3 is included)
users.loc[1:3, 'age':'occupation']  # rows 1-3, columns 'age' through 'occupation' (inclusive)
users.loc[:, 'age':'occupation']    # all rows, columns 'age' through 'occupation'
users.loc[[1,3], ['age','gender']]  # rows 1 and 3, columns 'age' and 'gender'

In [15]:
# list-based label selection: rows labelled 1 and 3, columns 'age' and 'gender'
users.loc[[1,3], ['age','gender']]

Unnamed: 0_level_0,age,gender
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,24,M
3,23,M


In [None]:
# iloc: filter rows by POSITION, and select columns by POSITION
# positions are 0-based and, unlike loc label slices, the stop position is EXCLUDED
# note: a cell only displays the value of its last expression
users.iloc[0]                       # row with 0th position (first row)
users.iloc[0:3]                     # rows with positions 0 through 2 (not 3)
users.iloc[0:3, 0:3]                # rows and columns with positions 0 through 2
users.iloc[:, 0:3]                  # all rows, columns with positions 0 through 2
users.iloc[[0,2], [0,1]]            # 1st and 3rd row, 1st and 2nd column

In [17]:
# first three rows by position (positions 0, 1 and 2)
users.iloc[0:3]

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067


In [None]:
# mixing: select columns by LABEL, then filter rows by POSITION
# the index holds integer 'user_id' labels, so a bare [0:3] is easy to misread
# as a label slice; '.iloc' makes it explicit that the first three rows are
# taken by position (positions 0, 1 and 2)
# note: a cell only displays the value of its last expression
users['age'].iloc[0:3]
users[['age', 'gender', 'occupation']].iloc[0:3]

