# SLU2 - Subsetting data in Pandas: Examples notebook

In this example notebook we will be working with Airbnb's listings dataset to go over the concepts learned in this unit:



In [1]:
import pandas as pd

# Cap how many rows pandas renders per DataFrame so cell outputs stay short
pd.set_option('display.max_rows', 10)

## 1 - Using the index 

### Read airbnb_input data with neighborhood as index

In [2]:
# Load the listings CSV into a DataFrame, using the neighborhood column
# as the row index instead of the default 0..n-1 integer index.
df = pd.read_csv(
    'data/airbnb_input.csv',
    index_col='neighborhood',
)

# Show the first five rows as a quick look at the loaded data.
df.head(n=5)

Unnamed: 0_level_0,room_id,host_id,room_type,reviews,overall_satisfaction,accommodates,bedrooms,price
neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Belém,6499,14455,Entire home/apt,8,5.0,2,1.0,57.0
Alvalade,17031,66015,Entire home/apt,0,0.0,2,1.0,46.0
Santa Maria Maior,25659,107347,Entire home/apt,63,5.0,3,1.0,69.0
Santa Maria Maior,29248,125768,Entire home/apt,225,4.5,4,1.0,58.0
Santa Maria Maior,29396,126415,Entire home/apt,132,5.0,4,1.0,67.0


### Sort index alphabetically (descending)

In [3]:
# Order the rows by their index labels (the neighborhood names) from Z to A.
# sort_index returns a new frame, so the result is assigned back to df.
df = df.sort_index(ascending=False)
# Bare expression: lets the notebook render the frame (truncated by display.max_rows)
df

Unnamed: 0_level_0,room_id,host_id,room_type,reviews,overall_satisfaction,accommodates,bedrooms,price
neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
São Vicente,1198570,3987199,Entire home/apt,226,4.5,2,1.0,58.0
São Vicente,9106791,47467727,Entire home/apt,7,4.5,4,2.0,69.0
São Vicente,1373231,2781047,Entire home/apt,171,4.5,5,3.0,69.0
São Vicente,17968508,1756107,Entire home/apt,3,5.0,4,2.0,55.0
São Vicente,9036986,2372087,Entire home/apt,18,5.0,16,6.0,346.0
...,...,...,...,...,...,...,...,...
Ajuda,17251426,109024374,Entire home/apt,0,0.0,1,1.0,58.0
Ajuda,8094751,42524269,Entire home/apt,6,4.5,5,2.0,68.0
Ajuda,1612717,3680667,Entire home/apt,5,5.0,4,2.0,58.0
Ajuda,13078189,72524317,Shared room,7,5.0,2,1.0,21.0


### Reset neighborhood index, keeping it as a column, and setting host_id as index

In [4]:
# Swap the index from neighborhood to host_id, then order rows by the new index.
df = (
    df.reset_index()           # default drop=False: neighborhood is kept as a regular column
      .set_index('host_id')    # host_id becomes the row labels
      .sort_index()            # ascending host_id order
)
df

Unnamed: 0_level_0,neighborhood,room_id,room_type,reviews,overall_satisfaction,accommodates,bedrooms,price
host_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
14455,Belém,6499,Entire home/apt,8,5.0,2,1.0,57.0
17096,Misericórdia,18644258,Entire home/apt,0,0.0,2,1.0,80.0
37768,Avenidas Novas,14302692,Private room,1,0.0,1,1.0,16.0
51461,Santa Maria Maior,10879374,Entire home/apt,68,4.5,2,0.0,56.0
60717,Misericórdia,792377,Entire home/apt,78,5.0,4,1.0,150.0
...,...,...,...,...,...,...,...,...
135651991,São Vicente,19356644,Entire home/apt,0,0.0,6,4.0,173.0
135784989,Santa Maria Maior,19370278,Entire home/apt,0,0.0,4,2.0,184.0
135787783,Santa Maria Maior,19370769,Entire home/apt,0,0.0,4,2.0,176.0
135895465,São Domingos de Benfica,19385945,Entire home/apt,0,0.0,4,2.0,64.0


### Add host with id 1  

In [5]:
# Setting with enlargement: assigning to a label that is not yet in the index
# makes .loc add a new row. The values are matched to columns by position, so
# they must follow the frame's current column order.
df.loc[1] = [
    'Alvalade',      # neighborhood
    567,             # room_id
    'Private room',  # room_type
    2,               # reviews
    4.5,             # overall_satisfaction
    2,               # accommodates
    1,               # bedrooms
    24,              # price
]
df

Unnamed: 0_level_0,neighborhood,room_id,room_type,reviews,overall_satisfaction,accommodates,bedrooms,price
host_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
14455,Belém,6499,Entire home/apt,8,5.0,2,1.0,57.0
17096,Misericórdia,18644258,Entire home/apt,0,0.0,2,1.0,80.0
37768,Avenidas Novas,14302692,Private room,1,0.0,1,1.0,16.0
51461,Santa Maria Maior,10879374,Entire home/apt,68,4.5,2,0.0,56.0
60717,Misericórdia,792377,Entire home/apt,78,5.0,4,1.0,150.0
...,...,...,...,...,...,...,...,...
135784989,Santa Maria Maior,19370278,Entire home/apt,0,0.0,4,2.0,184.0
135787783,Santa Maria Maior,19370769,Entire home/apt,0,0.0,4,2.0,176.0
135895465,São Domingos de Benfica,19385945,Entire home/apt,0,0.0,4,2.0,64.0
135915593,São Vicente,19388006,Entire home/apt,0,0.0,6,3.0,415.0


As we can see, our new host is in the last position of our DataFrame. This means that the DataFrame is no longer sorted along the index, so we sort it again.

In [6]:
# Put the rows back in ascending host_id order (the row added with .loc was placed at the end)
df = df.sort_index()

### Select last 7 rows

In [7]:
# Plain [] with an integer slice selects rows by position: here, the last seven rows
df[-7:]

Unnamed: 0_level_0,neighborhood,room_id,room_type,reviews,overall_satisfaction,accommodates,bedrooms,price
host_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
135565767,Alcântara,19347166,Entire home/apt,0,0.0,2,1.0,75.0
135570136,Arroios,19347531,Private room,0,0.0,5,4.0,58.0
135651991,São Vicente,19356644,Entire home/apt,0,0.0,6,4.0,173.0
135784989,Santa Maria Maior,19370278,Entire home/apt,0,0.0,4,2.0,184.0
135787783,Santa Maria Maior,19370769,Entire home/apt,0,0.0,4,2.0,176.0
135895465,São Domingos de Benfica,19385945,Entire home/apt,0,0.0,4,2.0,64.0
135915593,São Vicente,19388006,Entire home/apt,0,0.0,6,3.0,415.0


In [8]:
# Same selection written explicitly as position-based indexing with .iloc
df.iloc[-7:]

Unnamed: 0_level_0,neighborhood,room_id,room_type,reviews,overall_satisfaction,accommodates,bedrooms,price
host_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
135565767,Alcântara,19347166,Entire home/apt,0,0.0,2,1.0,75.0
135570136,Arroios,19347531,Private room,0,0.0,5,4.0,58.0
135651991,São Vicente,19356644,Entire home/apt,0,0.0,6,4.0,173.0
135784989,Santa Maria Maior,19370278,Entire home/apt,0,0.0,4,2.0,184.0
135787783,Santa Maria Maior,19370769,Entire home/apt,0,0.0,4,2.0,176.0
135895465,São Domingos de Benfica,19385945,Entire home/apt,0,0.0,4,2.0,64.0
135915593,São Vicente,19388006,Entire home/apt,0,0.0,6,3.0,415.0


### Select between positions 25 and 33 (end position excluded)

In [9]:
# Positional row slice: start is inclusive, stop is exclusive, so this returns positions 25..32
df[25:33]

Unnamed: 0_level_0,neighborhood,room_id,room_type,reviews,overall_satisfaction,accommodates,bedrooms,price
host_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
136230,Misericórdia,304902,Entire home/apt,90,4.5,4,1.0,45.0
136230,Misericórdia,5454120,Entire home/apt,28,4.5,4,1.0,93.0
136230,Misericórdia,16037005,Entire home/apt,4,4.5,4,1.0,67.0
136230,Misericórdia,1549780,Entire home/apt,23,4.5,4,1.0,75.0
136230,Misericórdia,2368850,Entire home/apt,2,0.0,8,4.0,196.0
144398,Misericórdia,33312,Entire home/apt,24,4.5,4,1.0,66.0
144484,Penha de França,196619,Entire home/apt,0,0.0,4,1.0,75.0
144484,Lumiar,33348,Private room,2,0.0,6,1.0,46.0


In [10]:
# Same positional slice via .iloc (stop position 33 is excluded)
df.iloc[25:33]

Unnamed: 0_level_0,neighborhood,room_id,room_type,reviews,overall_satisfaction,accommodates,bedrooms,price
host_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
136230,Misericórdia,304902,Entire home/apt,90,4.5,4,1.0,45.0
136230,Misericórdia,5454120,Entire home/apt,28,4.5,4,1.0,93.0
136230,Misericórdia,16037005,Entire home/apt,4,4.5,4,1.0,67.0
136230,Misericórdia,1549780,Entire home/apt,23,4.5,4,1.0,75.0
136230,Misericórdia,2368850,Entire home/apt,2,0.0,8,4.0,196.0
144398,Misericórdia,33312,Entire home/apt,24,4.5,4,1.0,66.0
144484,Penha de França,196619,Entire home/apt,0,0.0,4,1.0,75.0
144484,Lumiar,33348,Private room,2,0.0,6,1.0,46.0


### Select between host 1 and 15000

In [11]:
# Label-based slice on the host_id index; with .loc both endpoints are inclusive.
# 15000 does not have to be an existing label, but that requires the index to be sorted.
df.loc[1:15000]

Unnamed: 0_level_0,neighborhood,room_id,room_type,reviews,overall_satisfaction,accommodates,bedrooms,price
host_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Alvalade,567,Private room,2,4.5,2,1.0,24.0
14455,Belém,6499,Entire home/apt,8,5.0,2,1.0,57.0


### Select columns reviews and price for hosts with id between 15000 and 60000

In [12]:
# .loc[rows, columns]: label slice over host_id (both ends inclusive) plus a list of column names
df.loc[15000:60000,['reviews','price']]

Unnamed: 0_level_0,reviews,price
host_id,Unnamed: 1_level_1,Unnamed: 2_level_1
17096,0,80.0
37768,1,16.0
51461,68,56.0


### Update reviews to value 5 for room_id 33348

In [13]:
# Boolean-mask assignment through .loc: set reviews to 5 on every row whose room_id is 33348.
# Going through .loc[mask, column] writes into df itself rather than into a temporary copy.
df.loc[df['room_id'].eq(33348), 'reviews'] = 5
# Display the matching row(s) to confirm the new value
df.loc[df['room_id'].eq(33348)]

Unnamed: 0_level_0,neighborhood,room_id,room_type,reviews,overall_satisfaction,accommodates,bedrooms,price
host_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
144484,Lumiar,33348,Private room,5,0.0,6,1.0,46.0


### Drop *host_id* 1 and column *accommodates*

In [14]:
# Remove the row labelled 1 (host_id) and the accommodates column in one call.
# drop returns a new DataFrame, so df itself is left unchanged.
df_new = df.drop(index=1, columns='accommodates')
# Preview the first five rows of the result
df_new.head()

Unnamed: 0_level_0,neighborhood,room_id,room_type,reviews,overall_satisfaction,bedrooms,price
host_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
14455,Belém,6499,Entire home/apt,8,5.0,1.0,57.0
17096,Misericórdia,18644258,Entire home/apt,0,0.0,1.0,80.0
37768,Avenidas Novas,14302692,Private room,1,0.0,1.0,16.0
51461,Santa Maria Maior,10879374,Entire home/apt,68,4.5,0.0,56.0
60717,Misericórdia,792377,Entire home/apt,78,5.0,1.0,150.0
