## Unit 2 - Subsetting DataFrames and Series

In [28]:
import pandas as pd

In [29]:
df_raw = pd.read_csv('../data/airbnb_rooms.csv')
columns = ['room_id', 'host_id', 'room_type', 'neighborhood', 'reviews',
           'overall_satisfaction', 'accommodates', 'bedrooms', 'bathrooms', 'price', 'minstay']
df_raw = df_raw[columns]
df_raw = df_raw.set_index('room_id', drop=True, verify_integrity=True).sort_index()
df = df_raw.head(7)

## Row selection

### Selecting rows by index position - iloc

1 - Select the first row and get it as a pandas Series

In [30]:
df.iloc[0]

host_id                           14455
room_type               Entire home/apt
neighborhood                      Belém
reviews                               8
overall_satisfaction                  5
accommodates                          2
bedrooms                              1
bathrooms                           NaN
price                                57
minstay                             NaN
Name: 6499, dtype: object

2 - Select the first row and get it as a pandas DataFrame

In [31]:
df.iloc[[0]]

Unnamed: 0_level_0,host_id,room_type,neighborhood,reviews,overall_satisfaction,accommodates,bedrooms,bathrooms,price,minstay
room_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6499,14455,Entire home/apt,Belém,8,5.0,2,1.0,,57.0,


3 - Select rows by specifying its positions with a list

In [32]:
df.iloc[[0, 1, 2, 3]]

Unnamed: 0_level_0,host_id,room_type,neighborhood,reviews,overall_satisfaction,accommodates,bedrooms,bathrooms,price,minstay
room_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6499,14455,Entire home/apt,Belém,8,5.0,2,1.0,,57.0,
17031,66015,Entire home/apt,Alvalade,0,0.0,2,1.0,,46.0,
25659,107347,Entire home/apt,Santa Maria Maior,63,5.0,3,1.0,,69.0,
29248,125768,Entire home/apt,Santa Maria Maior,225,4.5,4,1.0,,58.0,


4 - Select rows by specifying its positions with a list slice

In [33]:
df.iloc[:7]

Unnamed: 0_level_0,host_id,room_type,neighborhood,reviews,overall_satisfaction,accommodates,bedrooms,bathrooms,price,minstay
room_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6499,14455,Entire home/apt,Belém,8,5.0,2,1.0,,57.0,
17031,66015,Entire home/apt,Alvalade,0,0.0,2,1.0,,46.0,
25659,107347,Entire home/apt,Santa Maria Maior,63,5.0,3,1.0,,69.0,
29248,125768,Entire home/apt,Santa Maria Maior,225,4.5,4,1.0,,58.0,
29396,126415,Entire home/apt,Santa Maria Maior,132,5.0,4,1.0,,67.0,
29720,128075,Entire home/apt,Estrela,14,5.0,16,9.0,,1154.0,
29872,128698,Entire home/apt,Alcântara,25,5.0,2,1.0,,75.0,


### Selecting rows by index name - loc

1 - Select the row that corresponds to room 17031 and get it as a pandas Series

In [34]:
df.loc[17031]

host_id                           66015
room_type               Entire home/apt
neighborhood                   Alvalade
reviews                               0
overall_satisfaction                  0
accommodates                          2
bedrooms                              1
bathrooms                           NaN
price                                46
minstay                             NaN
Name: 17031, dtype: object

2 - Select the row that corresponds to room 17031 and get it as a pandas DataFrame

In [35]:
df.loc[[17031]]

Unnamed: 0_level_0,host_id,room_type,neighborhood,reviews,overall_satisfaction,accommodates,bedrooms,bathrooms,price,minstay
room_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
17031,66015,Entire home/apt,Alvalade,0,0.0,2,1.0,,46.0,


3 - Select the row that correponds to room 1 and get a KeyError because room 1 doesn't exist

In [36]:
df.loc[[1]]

KeyError: 'None of [[1]] are in the [index]'

4 - Select the rows that correspond to rooms 17031 and 25659

In [37]:
df.loc[[17031, 25659]]

Unnamed: 0_level_0,host_id,room_type,neighborhood,reviews,overall_satisfaction,accommodates,bedrooms,bathrooms,price,minstay
room_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
17031,66015,Entire home/apt,Alvalade,0,0.0,2,1.0,,46.0,
25659,107347,Entire home/apt,Santa Maria Maior,63,5.0,3,1.0,,69.0,


## Column selection

### Selecting columns by name - dot, brackets, loc

1 - Select column room_type using dot notation, always returns a pandas Series

In [38]:
df.room_type

room_id
6499     Entire home/apt
17031    Entire home/apt
25659    Entire home/apt
29248    Entire home/apt
29396    Entire home/apt
29720    Entire home/apt
29872    Entire home/apt
Name: room_type, dtype: object

2 - Select column room_type using brackets and get a pandas Series

In [39]:
df['room_type']

room_id
6499     Entire home/apt
17031    Entire home/apt
25659    Entire home/apt
29248    Entire home/apt
29396    Entire home/apt
29720    Entire home/apt
29872    Entire home/apt
Name: room_type, dtype: object

3 - Select column room_type using brackets and get a pandas DataFrame

In [40]:
df[['room_type']]

Unnamed: 0_level_0,room_type
room_id,Unnamed: 1_level_1
6499,Entire home/apt
17031,Entire home/apt
25659,Entire home/apt
29248,Entire home/apt
29396,Entire home/apt
29720,Entire home/apt
29872,Entire home/apt


4 - Select columns room_type and neighborhood using brackets

In [41]:
df[['room_type', 'neighborhood']]

Unnamed: 0_level_0,room_type,neighborhood
room_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6499,Entire home/apt,Belém
17031,Entire home/apt,Alvalade
25659,Entire home/apt,Santa Maria Maior
29248,Entire home/apt,Santa Maria Maior
29396,Entire home/apt,Santa Maria Maior
29720,Entire home/apt,Estrela
29872,Entire home/apt,Alcântara


5 - Select column room_type using loc

In [42]:
df.loc[:, 'room_type']

room_id
6499     Entire home/apt
17031    Entire home/apt
25659    Entire home/apt
29248    Entire home/apt
29396    Entire home/apt
29720    Entire home/apt
29872    Entire home/apt
Name: room_type, dtype: object

6 - Select columns room_type and neighborhood using loc

In [43]:
df.loc[:, ['room_type', 'neighborhood']]

Unnamed: 0_level_0,room_type,neighborhood
room_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6499,Entire home/apt,Belém
17031,Entire home/apt,Alvalade
25659,Entire home/apt,Santa Maria Maior
29248,Entire home/apt,Santa Maria Maior
29396,Entire home/apt,Santa Maria Maior
29720,Entire home/apt,Estrela
29872,Entire home/apt,Alcântara


## Mask function

Hiding the rows that verify a certain condition

In [44]:
df.mask(df.overall_satisfaction == 5.0)

Unnamed: 0_level_0,host_id,room_type,neighborhood,reviews,overall_satisfaction,accommodates,bedrooms,bathrooms,price,minstay
room_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6499,,,,,,,,,,
17031,66015.0,Entire home/apt,Alvalade,0.0,0.0,2.0,1.0,,46.0,
25659,,,,,,,,,,
29248,125768.0,Entire home/apt,Santa Maria Maior,225.0,4.5,4.0,1.0,,58.0,
29396,,,,,,,,,,
29720,,,,,,,,,,
29872,,,,,,,,,,


## Where function

Hiding the rows that __don't__ verify a certain condition

In [45]:
df.mask(df.overall_satisfaction == 5.0)

Unnamed: 0_level_0,host_id,room_type,neighborhood,reviews,overall_satisfaction,accommodates,bedrooms,bathrooms,price,minstay
room_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6499,,,,,,,,,,
17031,66015.0,Entire home/apt,Alvalade,0.0,0.0,2.0,1.0,,46.0,
25659,,,,,,,,,,
29248,125768.0,Entire home/apt,Santa Maria Maior,225.0,4.5,4.0,1.0,,58.0,
29396,,,,,,,,,,
29720,,,,,,,,,,
29872,,,,,,,,,,


## Filter data

Select the rows that verify a certain condition

In [46]:
df[df.neighborhood == 'Alvalade']

Unnamed: 0_level_0,host_id,room_type,neighborhood,reviews,overall_satisfaction,accommodates,bedrooms,bathrooms,price,minstay
room_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
17031,66015,Entire home/apt,Alvalade,0,0.0,2,1.0,,46.0,


## Selecting data types

Check DataFrame's data types by using the class attribute dtypes

In [47]:
df.dtypes

host_id                   int64
room_type                object
neighborhood             object
reviews                   int64
overall_satisfaction    float64
accommodates              int64
bedrooms                float64
bathrooms               float64
price                   float64
minstay                 float64
dtype: object

To select columns based on their data type, use the function select_dtypes

In [48]:
df.select_dtypes(exclude=['float64'])

Unnamed: 0_level_0,host_id,room_type,neighborhood,reviews,accommodates
room_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6499,14455,Entire home/apt,Belém,8,2
17031,66015,Entire home/apt,Alvalade,0,2
25659,107347,Entire home/apt,Santa Maria Maior,63,3
29248,125768,Entire home/apt,Santa Maria Maior,225,4
29396,126415,Entire home/apt,Santa Maria Maior,132,4
29720,128075,Entire home/apt,Estrela,14,16
29872,128698,Entire home/apt,Alcântara,25,2


## Write data to file

In [51]:
df.to_csv('../data/airbnb_output.csv')