# Exploratory Data Analysis of NYC Rentals

## Imports

In [1]:
import wandb
import pandas as pd 
import pandas_profiling

## Loading Data

In [2]:
# Open a W&B run in the 'nyc_airbnb' project, grouped under 'eda', and display it.
run = wandb.init(
    project='nyc_airbnb',
    group='eda',
    save_code=True,
)
run

[34m[1mwandb[0m: Currently logged in as: [33mmgris[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# Declare the latest sample CSV artifact as an input of this run, then resolve
# it to a file path on the local disk.
sample_artifact = run.use_artifact("sample.csv:latest")
sample_local_path = sample_artifact.file()
sample_local_path

'./artifacts/sample.csv:v0/sample1.csv'

In [9]:
# Read the sample CSV into a DataFrame, parsing 'last_review' as datetimes.
rentals_data = pd.read_csv(
    sample_local_path,
    parse_dates=['last_review'],
)

In [10]:
# Preview the first five rows.
rentals_data.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,9138664,Private Lg Room 15 min to Manhattan,47594947,Iris,Queens,Sunnyside,40.74271,-73.92493,Private room,74,2,6,2019-05-26,0.13,1,5
1,31444015,TIME SQUARE CHARMING ONE BED IN HELL'S KITCHEN...,8523790,Johlex,Manhattan,Hell's Kitchen,40.76682,-73.98878,Entire home/apt,170,3,0,NaT,,1,188
2,8741020,Voted #1 Location Quintessential 1BR W Village...,45854238,John,Manhattan,West Village,40.73631,-74.00611,Entire home/apt,245,3,51,2018-09-19,1.12,1,0
3,34602077,Spacious 1 bedroom apartment 15min from Manhattan,261055465,Regan,Queens,Astoria,40.76424,-73.92351,Entire home/apt,125,3,1,2019-05-24,0.65,1,13
4,23203149,Big beautiful bedroom in huge Bushwick apartment,143460,Megan,Brooklyn,Bushwick,40.69839,-73.92044,Private room,65,2,8,2019-06-23,0.52,2,8


## Exploring Data

In [12]:
# Build the profiling report for the whole rentals frame; it is rendered in the next cell.
profile = pandas_profiling.ProfileReport(df=rentals_data)

In [13]:
# Render the profiling report as interactive notebook widgets.
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

wandb: Network error (TransientError), entering retry loop.


## Observations

#### Missing values (about 2.6%)
- To be handled in the inference pipeline.

#### id & host_id
- Should be cast to `str` to reflect their non-numerical nature (they are identifiers, not quantities).

#### price 
- Significant outliers on the upside, with a maximum of 10,000 (vs. a median of 105).
- Some prices are below 10, going as low as 0.

#### minimum nights 
- Significant outliers, with a maximum of 1,250 (vs. a median of 2).

#### reviews 
- Strong right skew, but these high values seem plausible for very popular rentals.



## Data Corrections

In [16]:
# Both id columns are labels rather than quantities, so store them as strings.
for id_column in ('id', 'host_id'):
    rentals_data[id_column] = rentals_data[id_column].astype(str)

In [19]:
# Price distribution at fine-grained percentiles, used to locate the outlier tails.
# NOTE(review): the saved output below appears to come from a run made after the
# price filter (execution count 19 vs. 18, min/max equal to the thresholds); a
# top-to-bottom run will show the unfiltered prices here instead.
detailed_percentiles = [
    .01, .05, .1, .15, .25,
    .5,
    .75, .85, .9, .95, .99,
]
price_series = rentals_data['price']
price_series.describe(percentiles=detailed_percentiles)

count    19001.000000
mean       122.340456
std         71.530346
min         10.000000
1%          30.000000
5%          40.000000
10%         48.000000
15%         54.000000
25%         66.000000
50%        100.000000
75%        160.000000
85%        200.000000
90%        225.000000
95%        270.000000
99%        340.000000
max        350.000000
Name: price, dtype: float64

In [18]:
# Price bounds agreed with stakeholders; both ends are kept (inclusive range).
min_price = 10
max_price = 350

# True for every row whose price lies inside [min_price, max_price].
is_not_outlier_price = rentals_data['price'].between(min_price, max_price)

# Keep only the in-range rows. The explicit .copy() makes the result an
# independent frame, so assigning columns on it later (e.g. re-running the
# id conversion cell) does not raise SettingWithCopyWarning.
n_rows_before = rentals_data.shape[0]
rentals_data = rentals_data[is_not_outlier_price].copy()
n_rows_after = rentals_data.shape[0]
rows_dropped = n_rows_before - n_rows_after
# Guard the ratio so an empty input frame reports 0% instead of raising
# ZeroDivisionError.
pct_rows_dropped = (
    round((rows_dropped / n_rows_before) * 100, 2) if n_rows_before else 0.0
)

print(f"{rows_dropped=}, i.e {pct_rows_dropped=}%")

rows_dropped=999, i.e pct_rows_dropped=5.0%


In [21]:
# Same percentile breakdown for the minimum-stay requirement. The result is kept
# in a variable (name spelled as the later threshold cell expects it) because the
# outlier cut-off is read from it.
minimum_nights_series = rentals_data['minimum_nights']
minimun_nights_stats = minimum_nights_series.describe(
    percentiles=detailed_percentiles,
)
minimun_nights_stats

count    19001.000000
mean         6.906900
std         21.456544
min          1.000000
1%           1.000000
5%           1.000000
10%          1.000000
15%          1.000000
25%          1.000000
50%          2.000000
75%          5.000000
85%          7.000000
90%         28.000000
95%         30.000000
99%         39.000000
max       1250.000000
Name: minimum_nights, dtype: float64

In [22]:
# Cap agreed with stakeholders: the 95th percentile of minimum_nights,
# looked up by label in the summary statistics computed above.
minimum_nights_max = minimun_nights_stats.loc['95%']
minimum_nights_max


30.0

In [23]:
# True for every row whose minimum stay does not exceed the agreed cap.
is_not_outlier_minimum_nights = rentals_data['minimum_nights'] <= minimum_nights_max

# Keep only the rows under the cap. The explicit .copy() makes the result an
# independent frame, so assigning columns on it later does not raise
# SettingWithCopyWarning.
n_rows_before = rentals_data.shape[0]
rentals_data = rentals_data[is_not_outlier_minimum_nights].copy()
n_rows_after = rentals_data.shape[0]
rows_dropped = n_rows_before - n_rows_after
# Guard the ratio so an empty input frame reports 0% instead of raising
# ZeroDivisionError.
pct_rows_dropped = (
    round((rows_dropped / n_rows_before) * 100, 2) if n_rows_before else 0.0
)

print(f"{rows_dropped=}, i.e {pct_rows_dropped=}%")

rows_dropped=269, i.e pct_rows_dropped=1.42%




In [None]:
# Close the W&B run opened at the top of the notebook.
run.finish()

[34m[1mwandb[0m: Network error (ConnectionError), entering retry loop.
