In [38]:
import wandb
import yaml
import pandas as pd
import pandas_profiling
from datetime import datetime, timedelta
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the shared project configuration (path is relative to this notebook).
with open('../../config.yaml') as config_file:
    config = yaml.safe_load(config_file)

In [3]:
# Start a W&B run for this notebook; the project name comes from the shared
# config, the run is grouped under 'EDA', and the notebook code is saved with it.
run = wandb.init(
    project=config['main']['project_name'],
    group='EDA',
    save_code=True,
)

[34m[1mwandb[0m: Currently logged in as: [33mhiep_pham[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Declare the input artifact on the run, fetch its file locally, then load it.
artifact = run.use_artifact('sample.csv:latest')
local_path = artifact.file()
df = pd.read_csv(local_path)

In [13]:
# (rows, columns) of the loaded sample.
df.shape

(20000, 16)

In [12]:
# First five records, transposed so each column is shown as a row.
df.head().T

Unnamed: 0,0,1,2,3,4
id,9138664,31444015,8741020,34602077,23203149
name,Private Lg Room 15 min to Manhattan,TIME SQUARE CHARMING ONE BED IN HELL'S KITCHEN...,Voted #1 Location Quintessential 1BR W Village...,Spacious 1 bedroom apartment 15min from Manhattan,Big beautiful bedroom in huge Bushwick apartment
host_id,47594947,8523790,45854238,261055465,143460
host_name,Iris,Johlex,John,Regan,Megan
neighbourhood_group,Queens,Manhattan,Manhattan,Queens,Brooklyn
neighbourhood,Sunnyside,Hell's Kitchen,West Village,Astoria,Bushwick
latitude,40.74271,40.76682,40.73631,40.76424,40.69839
longitude,-73.92493,-73.98878,-74.00611,-73.92351,-73.92044
room_type,Private room,Entire home/apt,Entire home/apt,Entire home/apt,Private room
price,74,170,245,125,65


In [14]:
# Column dtypes as inferred by read_csv (last_review is still a plain object
# column at this point, not a datetime).
df.dtypes

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

In [17]:
# Parse the review dates. Plain column assignment is used rather than
# `df.loc[:, col] = ...`: on pandas 2.x the `.loc` form writes into the
# existing object column in place, so the column would not become datetime64.
df['last_review'] = pd.to_datetime(df['last_review'])

# Newest review date in the sample, used as the reference point below.
most_recent_date = df['last_review'].max()

# Whole days between each listing's last review and the reference date.
# Vectorised instead of a row-wise apply; listings without a review (NaT)
# end up as NaN.
df['time_since_last_review'] = (most_recent_date - df['last_review']).dt.days

Timestamp('2019-07-08 00:00:00')

In [20]:
# Preview the frame, which now carries the time_since_last_review column.
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,time_since_last_review
0,9138664,Private Lg Room 15 min to Manhattan,47594947,Iris,Queens,Sunnyside,40.74271,-73.92493,Private room,74,2,6,2019-05-26,0.13,1,5,43.0
1,31444015,TIME SQUARE CHARMING ONE BED IN HELL'S KITCHEN...,8523790,Johlex,Manhattan,Hell's Kitchen,40.76682,-73.98878,Entire home/apt,170,3,0,NaT,,1,188,
2,8741020,Voted #1 Location Quintessential 1BR W Village...,45854238,John,Manhattan,West Village,40.73631,-74.00611,Entire home/apt,245,3,51,2018-09-19,1.12,1,0,292.0
3,34602077,Spacious 1 bedroom apartment 15min from Manhattan,261055465,Regan,Queens,Astoria,40.76424,-73.92351,Entire home/apt,125,3,1,2019-05-24,0.65,1,13,45.0
4,23203149,Big beautiful bedroom in huge Bushwick apartment,143460,Megan,Brooklyn,Bushwick,40.69839,-73.92044,Private room,65,2,8,2019-06-23,0.52,2,8,15.0


In [32]:
# Column that the summaries in the cells below are computed against. It is
# kept out of `num_vars` and appended explicitly where it is needed.
y = 'price'

# Categorical columns used for cross-tabulation.
categorical_vars = [
    'neighbourhood_group',
    'room_type',
]

# Numerical columns summarised below.
num_vars = [
    'minimum_nights',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
    'latitude',
    'longitude',
    'time_since_last_review',
]

In [24]:
# Missing-value count per column. `y` is included explicitly: `num_vars` does
# not contain the price column, so without it a fresh top-to-bottom run would
# not report nulls for price.
df[[y] + num_vars].isnull().sum()

price                                0
minimum_nights                       0
reviews_per_month                 4123
calculated_host_listings_count       0
availability_365                     0
latitude                             0
longitude                            0
time_since_last_review            4123
dtype: int64

## Numerical variables

In [33]:
# Summary statistics for the numerical columns plus `y`, one row per column.
df[[*num_vars, y]].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
minimum_nights,20000.0,6.9921,21.645449,1.0,1.0,2.0,5.0,1250.0
reviews_per_month,15877.0,1.377446,1.683006,0.01,0.19,0.72,2.01,27.95
calculated_host_listings_count,20000.0,6.95545,32.433831,1.0,1.0,1.0,2.0,327.0
availability_365,20000.0,112.9012,131.762226,0.0,0.0,44.0,229.0,365.0
latitude,20000.0,40.728455,0.054755,40.50873,40.68942,40.72273,40.76299,40.91306
longitude,20000.0,-73.952125,0.046559,-74.23914,-73.98303,-73.95564,-73.93638,-73.71795
time_since_last_review,15877.0,278.448951,413.881928,0.0,15.0,50.0,372.0,2979.0
price,20000.0,153.26905,243.325609,0.0,69.0,105.0,175.0,10000.0


In [43]:
# Pairwise Pearson correlations between the numerical columns and `y`.
df.loc[:, [*num_vars, y]].corr(method='pearson')

Unnamed: 0,minimum_nights,reviews_per_month,calculated_host_listings_count,availability_365,latitude,longitude,time_since_last_review,price
minimum_nights,1.0,-0.113449,0.121679,0.13476,0.023391,-0.058709,0.055031,0.045317
reviews_per_month,-0.113449,1.0,-0.004417,0.18291,-0.015011,0.162698,-0.444577,-0.035664
calculated_host_listings_count,0.121679,-0.004417,1.0,0.222085,0.0193,-0.112995,-0.049968,0.052515
availability_365,0.13476,0.18291,0.222085,1.0,-0.012504,0.074925,-0.319842,0.086792
latitude,0.023391,-0.015011,0.0193,-0.012504,1.0,0.085655,0.028578,0.039562
longitude,-0.058709,0.162698,-0.112995,0.074925,0.085655,1.0,-0.100389,-0.153573
time_since_last_review,0.055031,-0.444577,-0.049968,-0.319842,0.028578,-0.100389,1.0,-0.004817
price,0.045317,-0.035664,0.052515,0.086792,0.039562,-0.153573,-0.004817,1.0


## Categorical variables

In [30]:
# Joint distribution of the two categorical columns as a percentage of all
# rows, with row/column totals in the 'All' margins.
pd.crosstab(
    index=df[categorical_vars[0]],
    columns=df[categorical_vars[1]],
    margins=True,
    normalize=True,
).mul(100)

room_type,Entire home/apt,Private room,Shared room,All
neighbourhood_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bronx,0.75,1.32,0.135,2.205
Brooklyn,19.73,20.825,0.77,41.325
Manhattan,26.675,16.285,0.91,43.87
Queens,4.395,6.995,0.385,11.775
Staten Island,0.37,0.435,0.02,0.825
All,51.92,45.86,2.22,100.0


In [34]:
# Mean, median and row count of `y` for each neighbourhood group.
df.groupby('neighbourhood_group')[[y]].agg(['mean', 'median', 'count'])

Unnamed: 0_level_0,price,price,price
Unnamed: 0_level_1,mean,median,count
neighbourhood_group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Bronx,90.780045,65.0,441
Brooklyn,122.964549,90.0,8265
Manhattan,200.456348,150.0,8774
Queens,98.288323,75.0,2355
Staten Island,113.769697,71.0,165


In [35]:
# Mean, median and row count of `y` for each room type.
df.groupby('room_type')[[y]].agg(['mean', 'median', 'count'])

Unnamed: 0_level_0,price,price,price
Unnamed: 0_level_1,mean,median,count
room_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Entire home/apt,213.236807,160.0,10384
Private room,89.386393,70.0,9172
Shared room,70.445946,49.0,444


In [44]:
# Close the W&B run opened at the top of the notebook.
run.finish()

VBox(children=(Label(value='0.015 MB of 0.015 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…