In [81]:
import torch
from collections import defaultdict
import csv
import numpy
import pandas as pd 
import random
import matplotlib.pyplot as plt
import math

In [82]:
print(torch.__version__)

1.1.0


### Extract dataset

In [83]:
data = pd.read_csv("../datasets/airbnb_nyc/AB_NYC_2019.csv")

In [84]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
id                                48895 non-null int64
name                              48879 non-null object
host_id                           48895 non-null int64
host_name                         48874 non-null object
neighbourhood_group               48895 non-null object
neighbourhood                     48895 non-null object
latitude                          48895 non-null float64
longitude                         48895 non-null float64
room_type                         48895 non-null object
price                             48895 non-null int64
minimum_nights                    48895 non-null int64
number_of_reviews                 48895 non-null int64
last_review                       38843 non-null object
reviews_per_month                 38843 non-null float64
calculated_host_listings_count    48895 non-null int64
availability_365                  48895 non-null int64

### Headers and Sample Datapoint

In [85]:
data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


### Identify Problems

(1) Some of the price is zero  
(2) There are some missing "name" and "hostnames"  
(3) "Last review" and "reviews per month" have lots of missing data  

In [86]:
data.isnull().sum()

id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

## Detailed Analysis - Pricing

In [87]:
data['price'].describe()

count    48895.000000
mean       152.720687
std        240.154170
min          0.000000
25%         69.000000
50%        106.000000
75%        175.000000
max      10000.000000
Name: price, dtype: float64

In [88]:
data['neighbourhood_group'].unique()

array(['Brooklyn', 'Manhattan', 'Queens', 'Staten Island', 'Bronx'],
      dtype=object)

In [89]:
data.groupby('neighbourhood_group')['price'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
neighbourhood_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Bronx,1091.0,87.496792,106.709349,0.0,45.0,65.0,99.0,2500.0
Brooklyn,20104.0,124.383207,186.873538,0.0,60.0,90.0,150.0,10000.0
Manhattan,21661.0,196.875814,291.383183,0.0,95.0,150.0,220.0,10000.0
Queens,5666.0,99.517649,167.102155,10.0,50.0,75.0,110.0,10000.0
Staten Island,373.0,114.812332,277.620403,13.0,50.0,75.0,110.0,5000.0


In [90]:
data['room_type'].unique()

array(['Private room', 'Entire home/apt', 'Shared room'], dtype=object)

In [91]:
data.groupby('room_type')['price'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
room_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Entire home/apt,25409.0,211.794246,284.041611,0.0,120.0,160.0,229.0,10000.0
Private room,22326.0,89.780973,160.205262,0.0,50.0,70.0,95.0,10000.0
Shared room,1160.0,70.127586,101.725252,0.0,33.0,45.0,75.0,1800.0


In [92]:
data['calculated_host_listings_count'].describe()

count    48895.000000
mean         7.143982
std         32.952519
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max        327.000000
Name: calculated_host_listings_count, dtype: float64

## AirBnB Hosts with >5 listings

In [101]:
prolific=data[data['calculated_host_listings_count']>5]

In [102]:
prolific['host_name'].unique()

array(['John', 'Lisel', 'Erica', 'Lissette', 'Carol Gloria', 'Vie',
       'The Box House Hotel', 'Pam', 'Henry', 'Jason', 'Jessica', 'Ariel',
       'Shai', 'Majar', 'Adam', 'Lior', 'Ollie', 'Mike', 'George Steven',
       'Izi', 'Petya', 'Karen', 'Gen', 'Randy', 'Michael', 'Vida', 'Bobi',
       'Host', 'Caroline', 'Lilia', 'Amaya', 'Anthony', 'AFI Apartments',
       'Cecile', 'Ofer', 'Benjamin', 'Richard', 'Mor', 'Mat',
       'Alex And Zeena', 'Chadanut', 'Raanan', 'Veronique Camille', 'Ota',
       'Seun And Marie', 'Jeremy & Laura', 'Nina', 'Yohan', 'Shahana',
       'Amy', 'Eddie&Vlad', 'Fatou', 'Elem', 'Freda', 'Juliana', 'Sophie',
       'Stat', 'Ann', 'Katie Graham', 'Graham And Ben', 'Ksenia & Masha',
       'Yukee', 'Alec', 'Dave', 'Laramie', 'Yasu & Akiko', 'Luis', 'Ira',
       'Brooklyn&   Breakfast    -Len-', 'Kara', 'Ikkyukim', 'Brady',
       'Diana', 'Max', 'Jeff', 'Alex', 'Hiroki', 'Tny', 'Lolita', 'Ruchi',
       'Joyell', 'CRNY Monthly Rentals', 'Marlon', 'Jeniff

In [103]:
prolific.groupby('host_name')['neighbourhood_group'].describe()

Unnamed: 0_level_0,count,unique,top,freq
host_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AFI Apartments,29,1,Manhattan,29
Abby,10,1,Brooklyn,10
Abraham,18,2,Manhattan,9
Ada,7,1,Brooklyn,7
Adam,29,1,Manhattan,29
Adeyemi,6,1,Brooklyn,6
Adonis,7,1,Manhattan,7
Ahmet,7,1,Manhattan,7
Alan,15,1,Queens,15
Alberto,23,2,Manhattan,17


In [106]:
prolific.pivot_table(index='host_name',columns='neighbourhood_group',values='price',aggfunc='mean')


neighbourhood_group,Bronx,Brooklyn,Manhattan,Queens,Staten Island
host_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AFI Apartments,,,121.965517,,
Abby,,30.400000,,,
Abraham,,190.888889,53.777778,,
Ada,,96.571429,,,
Adam,,,220.896552,,
Adeyemi,,26.666667,,,
Adonis,,,207.142857,,
Ahmet,,,88.142857,,
Alan,,,,121.066667,
Alberto,,,141.470588,90.833333,


<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x7f2919ec53c8>