In [1]:
# !pip install ydata-profiling

import wandb
import pandas as pd
from ydata_profiling import ProfileReport

In [2]:
# Start a W&B run for the EDA step. save_code=True is passed to wandb.init
# so the notebook is uploaded and versioned by W&B.
run = wandb.init(
    project="nyc_airbnb",
    group="eda",
    save_code=True,
)

# Fetch the data artifact from W&B and keep the path of its local file
local_path = wandb.use_artifact("sample.csv:latest").file()
print(local_path)

[34m[1mwandb[0m: Currently logged in as: [33madisongoh[0m ([33madisongoh-national-university-of-singapore[0m). Use [1m`wandb login --relogin`[0m to force relogin


./artifacts/sample.csv:latest/sample1.csv


In [3]:
# Read the CSV at the artifact's local path into a DataFrame and print
# its (rows, columns) shape as a quick sanity check.
df = pd.read_csv(local_path)
print(df.shape)

(20000, 16)


In [8]:
# Build a ydata-profiling report object for df; it is written to an HTML
# file by a separate to_file() call.
profile = ProfileReport(df, title="Profiling Report")

print("Profile ready")

Profile ready


In [10]:
# Export the report to an HTML file. The path is relative, so the file is
# written to the current working directory; "before" marks it as the
# pre-cleaning snapshot.
profile.to_file("profile_report_EDA_before.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                | 0/16 [00:00<?, ?it/s][A
100%|███████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 57.15it/s][A
(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'putmask: first argument must be an array')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

What do you notice? Look around and see what you can find. 

- For example, there are missing values in a few columns, and the column `last_review` contains dates but is stored as strings.
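- Look also at the `price` column and note the outliers: there are some zeros and some very high prices. After talking to your stakeholders, you decide to keep only listings priced between 10 and 350 per night (inclusive).
- The cleaning cell below additionally keeps only listings whose `minimum_nights` is between 1 and 14.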

In [19]:
# Drop outliers - keep only listings with a price between 10 and 350
min_price = 10
max_price = 350
# Accepted range for minimum_nights, named like the price bounds above
# instead of being passed as inline literals.
min_nights = 1
max_nights = 14

# Series.between() is inclusive on both ends by default and is False for
# NaN, so rows with a missing price or minimum_nights are dropped as well.
idx = (
    df['price'].between(min_price, max_price)
    & df['minimum_nights'].between(min_nights, max_nights)
)
# One combined mask selects the same rows as filtering twice in sequence,
# but copies the frame only once. The original index labels are kept.
df = df[idx].copy()

# Convert last_review from string to datetime (missing values become NaT)
df['last_review'] = pd.to_datetime(df['last_review'])

Note that we did not impute missing values here. We will do that in the inference pipeline, so that missing values can also be handled in production.



In [20]:
# Print the index range, per-column non-null counts and dtypes of df
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16747 entries, 0 to 19999
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              16747 non-null  int64         
 1   name                            16740 non-null  object        
 2   host_id                         16747 non-null  int64         
 3   host_name                       16740 non-null  object        
 4   neighbourhood_group             16747 non-null  object        
 5   neighbourhood                   16747 non-null  object        
 6   latitude                        16747 non-null  float64       
 7   longitude                       16747 non-null  float64       
 8   room_type                       16747 non-null  object        
 9   price                           16747 non-null  int64         
 10  minimum_nights                  16747 non-null  int64         
 11  number_

In [21]:
# Summary statistics (count, mean, min, quartiles, max, std) per column;
# left as the last expression so the notebook renders the resulting table.
df.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
count,16747.0,16747.0,16747.0,16747.0,16747.0,16747.0,16747.0,13842,13842.0,16747.0,16747.0
mean,18733160.0,66001280.0,40.727028,-73.949359,121.082642,2.860572,25.856392,2018-10-04 03:07:21.612483584,1.469779,3.105153,97.520571
min,2539.0,2571.0,40.50873,-74.23914,10.0,1.0,0.0,2011-05-12 00:00:00,0.01,1.0,0.0
25%,9287530.0,7748101.0,40.687595,-73.98123,67.0,1.0,1.0,2018-07-01 00:00:00,0.21,1.0,0.0
50%,19317830.0,30523480.0,40.72007,-73.95365,100.0,2.0,7.0,2019-05-26 00:00:00,0.83,1.0,25.0
75%,28616560.0,101980400.0,40.76298,-73.932665,159.0,3.0,27.0,2019-06-24 00:00:00,2.18,2.0,179.0
max,36485610.0,274273300.0,40.91306,-73.71795,350.0,14.0,607.0,2019-07-08 00:00:00,27.95,327.0,365.0
std,10903900.0,77701220.0,0.056214,0.04792,70.664779,2.291717,47.30526,,1.73422,20.105021,124.838908


In [22]:
# Build a second ydata-profiling report object from the current df, kept
# under a separate name so the first report object is not overwritten.
profile2 = ProfileReport(df, title="Profiling Report")

print("Profile ready")

Profile ready


In [24]:
# Export the second report to an HTML file. The path is relative, so the
# file is written to the current working directory; "after" marks it as
# the post-cleaning snapshot.
profile2.to_file("profile_report_EDA_after.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                | 0/16 [00:00<?, ?it/s][A
100%|███████████████████████████████████████████████████████████████████████| 16/16 [00:01<00:00,  8.29it/s][A
(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'putmask: first argument must be an array')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [25]:
# Terminate the W&B run that was opened with wandb.init()
run.finish()

VBox(children=(Label(value='9.619 MB of 9.619 MB uploaded (0.021 MB deduped)\r'), FloatProgress(value=1.0, max…