In [1]:
import pandas as pd
import petl as etl

Dataset source: New York City Airbnb Open Data (Kaggle) — https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data

In [7]:
# Sample listings as headerless rows: id, name, price, number_of_reviews.
# Every value is kept as a string so the later cells can demonstrate type conversion.
airbnb_data = [
    ['2539', 'Clean & quiet apt home by the park', '149', '9'],
    ['2595', 'Skylit Midtown Castle', '225', '45'],
    ['3647', 'THE VILLAGE OF HARLEM....NEW YORK !', '150', '0'],
    ['3831', 'Cozy Entire Floor of Brownstone', '89', '270'],
    ['5022', 'Entire Apt: Spacious Studio/Loft by central park', '80', '9'],
]

#### Push rows down and prepend a header row

In [8]:
# The raw rows carry no header, so prepend one; later cells refer to fields by these names.
data = etl.pushheader(
    airbnb_data,
    ['id', 'name', 'price', 'number_of_reviews'],
)

In [9]:
# Render the table with its newly added header row.
etl.display(data)

id,name,price,number_of_reviews
2539,Clean & quiet apt home by the park,149,9
2595,Skylit Midtown Castle,225,45
3647,THE VILLAGE OF HARLEM....NEW YORK !,150,0
3831,Cozy Entire Floor of Brownstone,89,270
5022,Entire Apt: Spacious Studio/Loft by central park,80,9


#### Check the type of the price column

In [10]:
# Set of value types found in the 'price' field before any conversion.
data.typeset('price')

{'str'}

In [12]:
# Apply float() to every value of the 'price' field.
updated_data = data.convert('price', float)

In [6]:
# Show the table after the price conversion.
etl.display(updated_data)

id,name,price,number_of_reviews
2539,Clean & quiet apt home by the park,149.0,9
2595,Skylit Midtown Castle,225.0,45
3647,THE VILLAGE OF HARLEM....NEW YORK !,150.0,0
3831,Cozy Entire Floor of Brownstone,89.0,270
5022,Entire Apt: Spacious Studio/Loft by central park,80.0,9


In [13]:
# Confirm the 'price' field now holds floats.
updated_data.typeset('price')

{'float'}

#### Inspect each column's type, then convert all field values to numbers where possible with the `convertnumbers` convenience function

In [18]:
# Report the set of value types present in each field.
# Only the field names are needed here, so read them from the header with
# `fieldnames`; `columns()` would build a dict holding every value of every
# column just to iterate over its keys.
for column in etl.fieldnames(updated_data):
    print(f'Type of {column} is {etl.typeset(updated_data, column)}')

Type of id is {'str'}
Type of name is {'str'}
Type of price is {'float'}
Type of number_of_reviews is {'str'}


In [19]:
# Convert every value that can be parsed as a number, across all fields.
# Note: the float prices from the earlier step are reported as int after this
# call (compare the type listings printed before and after this cell).
updated_data = updated_data.convertnumbers()

etl.display(updated_data)

id,name,price,number_of_reviews
2539,Clean & quiet apt home by the park,149,9
2595,Skylit Midtown Castle,225,45
3647,THE VILLAGE OF HARLEM....NEW YORK !,150,0
3831,Cozy Entire Floor of Brownstone,89,270
5022,Entire Apt: Spacious Studio/Loft by central park,80,9


In [20]:
# Re-check the value types of each field after the numeric conversion.
# Only the field names are needed here, so read them from the header with
# `fieldnames`; `columns()` would build a dict holding every value of every
# column just to iterate over its keys.
for column in etl.fieldnames(updated_data):
    print(f'Type of {column} is {etl.typeset(updated_data, column)}')

Type of id is {'int'}
Type of name is {'str'}
Type of price is {'int'}
Type of number_of_reviews is {'int'}


In [21]:
# Passing a method name as a string calls that method on each value,
# here str.lower() on every 'name'.
lower_case_conversion = updated_data.convert('name', 'lower')

etl.display(lower_case_conversion)

id,name,price,number_of_reviews
2539,clean & quiet apt home by the park,149,9
2595,skylit midtown castle,225,45
3647,the village of harlem....new york !,150,0
3831,cozy entire floor of brownstone,89,270
5022,entire apt: spacious studio/loft by central park,80,9


In [22]:
# Method-name conversion with extra arguments: calls
# value.replace(<old text>, <new text>) on every 'name'.
# This rebinds `updated_data` to the lower-cased table with the Harlem listing renamed.
updated_data = lower_case_conversion.convert(
    'name',
    'replace',
    'the village of harlem....new york !',
    'The Villiage of Harlem',
)

etl.display(updated_data)

id,name,price,number_of_reviews
2539,clean & quiet apt home by the park,149,9
2595,skylit midtown castle,225,45
3647,The Villiage of Harlem,150,0
3831,cozy entire floor of brownstone,89,270
5022,entire apt: spacious studio/loft by central park,80,9


In [23]:
# Dictionary conversion: each 'name' found as a key is swapped for the mapped value.
# Names without an entry (e.g. the Harlem listing) pass through unchanged,
# as the rendered table below this cell shows.
update_name = updated_data.convert(
    'name',
    {
        'clean & quiet apt home by the park': 'Apt Home by The Park in Brooklyn',
        'skylit midtown castle': 'Skylit Midtown Castle',
        'cozy entire floor of brownstone': 'Cozy Entire Floor of Brownstone in Brooklyn',
    },
)

In [24]:
# Show the table after the dictionary-based renames.
etl.display(update_name)

id,name,price,number_of_reviews
2539,Apt Home by The Park in Brooklyn,149,9
2595,Skylit Midtown Castle,225,45
3647,The Villiage of Harlem,150,0
3831,Cozy Entire Floor of Brownstone in Brooklyn,89,270
5022,entire apt: spacious studio/loft by central park,80,9


In [25]:
# One room type per listing, in the same order as the table rows.
room_type = [
    'Private room',
    'Entire home/apt',
    'Private room',
    'Entire home/apt',
    'Entire home/apt',
]

In [26]:
# Append the room types as a new 'room_type' field; values are matched to rows by position.
data_new = update_name.addcolumn('room_type', room_type)

etl.display(data_new)

id,name,price,number_of_reviews,room_type
2539,Apt Home by The Park in Brooklyn,149,9,Private room
2595,Skylit Midtown Castle,225,45,Entire home/apt
3647,The Villiage of Harlem,150,0,Private room
3831,Cozy Entire Floor of Brownstone in Brooklyn,89,270,Entire home/apt
5022,entire apt: spacious studio/loft by central park,80,9,Entire home/apt


#### Conversion can be conditional (restricted to matching rows with the `where` argument)

In [27]:
# Multiply the price by 1.2, but only on rows whose room type is
# 'Entire home/apt'; the `where` predicate receives the whole row.
update_prices = data_new.convert(
    'price',
    lambda price: price * 1.2,
    where=lambda row: row.room_type == 'Entire home/apt',
)

In [28]:
# Show the table after the conditional price change.
etl.display(update_prices)

id,name,price,number_of_reviews,room_type
2539,Apt Home by The Park in Brooklyn,149.0,9,Private room
2595,Skylit Midtown Castle,270.0,45,Entire home/apt
3647,The Villiage of Harlem,150.0,0,Private room
3831,Cozy Entire Floor of Brownstone in Brooklyn,106.8,270,Entire home/apt
5022,entire apt: spacious studio/loft by central park,96.0,9,Entire home/apt


In [29]:
# Swap every 'Entire home/apt' value in the 'room_type' field for 'Entire Home'.
update_room_type = update_prices.replace(
    'room_type',
    'Entire home/apt',
    'Entire Home',
)

In [30]:
# Show the table after the room type relabelling.
etl.display(update_room_type)

id,name,price,number_of_reviews,room_type
2539,Apt Home by The Park in Brooklyn,149.0,9,Private room
2595,Skylit Midtown Castle,270.0,45,Entire Home
3647,The Villiage of Harlem,150.0,0,Private room
3831,Cozy Entire Floor of Brownstone in Brooklyn,106.8,270,Entire Home
5022,entire apt: spacious studio/loft by central park,96.0,9,Entire Home


In [31]:
# Format 'number_of_reviews' on rows where the count is 0.
# The format string 'None' has no placeholder, so each matching value is
# replaced by the literal text 'None'.
format_reviews = etl.format(
    update_room_type,
    'number_of_reviews',
    'None',
    where=lambda row: row.number_of_reviews == 0,
)

In [32]:
# Show the table after formatting the zero-review rows.
etl.display(format_reviews)

id,name,price,number_of_reviews,room_type
2539,Apt Home by The Park in Brooklyn,149.0,9.0,Private room
2595,Skylit Midtown Castle,270.0,45.0,Entire Home
3647,The Villiage of Harlem,150.0,,Private room
3831,Cozy Entire Floor of Brownstone in Brooklyn,106.8,270.0,Entire Home
5022,entire apt: spacious studio/loft by central park,96.0,9.0,Entire Home


In [33]:
# Load the Airbnb CSV into a pandas DataFrame (path is relative to the notebook).
data_nyc = pd.read_csv(filepath_or_buffer='./datasets/AB_NYC_2019.csv')

In [34]:
# Preview the first five rows of the DataFrame.
data_nyc.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,19/10/18,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,21/05/19,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,05/07/19,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,19/11/18,0.1,1,0


In [35]:
# Wrap the DataFrame as a petl table, leaving the DataFrame index out.
# NOTE: this rebinds `data_nyc` from a pandas DataFrame to a petl table, so the
# cell is meant to run once, right after the CSV is loaded.
data_nyc = etl.fromdataframe(data_nyc, include_index=False)

In [36]:
# Render the petl view of the dataset.
etl.display(data_nyc)

id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.647490000000005,-73.97237,Private room,149,1,9,19/10/18,0.21,6,365
2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98376999999999,Entire home/apt,225,1,45,21/05/19,0.38,2,355
3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,05/07/19,4.64,1,194
5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,19/11/18,0.1,1,0


In [37]:
# Set 'minimum_nights' to 1 on every row where it is currently 3.
# `etl.interpolate` evaluates `fmt % value` and expects a format string; with the
# integer 1 that expression is the modulo `1 % 3`, which equals 1 only by
# coincidence. `etl.update` assigns the fixed value directly and states the intent.
update_minimum_nights = etl.update(
    data_nyc,
    'minimum_nights',
    1,
    where=lambda r: r.minimum_nights == 3,
)

In [39]:
# Show the table after the minimum_nights change.
etl.display(update_minimum_nights)

id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.647490000000005,-73.97237,Private room,149,1,9,19/10/18,0.21,6,365
2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98376999999999,Entire home/apt,225,1,45,21/05/19,0.38,2,355
3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,1,0,,,1,365
3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,05/07/19,4.64,1,194
5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,19/11/18,0.1,1,0


In [40]:
# Replace an availability of 0 with the placeholder '-' on the matching rows.
update_data = update_minimum_nights.update(
    'availability_365',
    '-',
    where=lambda row: row.availability_365 == 0,
)

In [41]:
# Bare last expression: the notebook renders the table's rich representation as the cell output.
update_data

id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.647490000000005,-73.97237,Private room,149,1,9,19/10/18,0.21,6,365
2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98376999999999,Entire home/apt,225,1,45,21/05/19,0.38,2,355
3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,1,0,,,1,365
3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,05/07/19,4.64,1,194
5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,19/11/18,0.1,1,-
