# Data Analysis of Airbnb Sydney Dataset 

## 读入数据集

In [1]:
# Import libraries
import geopandas
import pandas as pd
import matplotlib.pyplot as plt
import plotly as py
import plotly.express as px
import json
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import warnings
import seaborn as sns
import numpy as np
from params import *
warnings.filterwarnings("ignore")


In [67]:
# Load the training split from the local dataset folder and preview the first rows.
train_csv_path = "./dataset/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,description,neighbourhood,latitude,longitude,type,accommodates,bathrooms,bedrooms,amenities,reviews,review_rating,review_scores_A,review_scores_B,review_scores_C,review_scores_D,instant_bookable,target
0,🚘 FREE CAR SPACE AVAILABLE ON THE BUILDING😊<br...,Waverley,-33.88882,151.27456,Entire home/apt,2,1 bath,1.0,"[""Gym"", ""Bed linens"", ""Shampoo"", ""Coffee maker...",42,96.0,10.0,10.0,10.0,10.0,f,1
1,Really quite area and very clean,Rockdale,-33.95701,151.14604,Entire home/apt,4,1 bath,2.0,"[""Washer"", ""Free parking on premises"", ""Long t...",1,80.0,6.0,10.0,8.0,10.0,t,1
2,It is a very cozy apartment walk distance to C...,Warringah,-33.77454,151.28556,Private room,1,1 bath,1.0,"[""Washer"", ""Smoke alarm"", ""Hot water"", ""Essent...",0,,,,,,t,0
3,Beautiful studio apartment on Sydney's Norther...,Warringah,-33.7094,151.30038,Entire home/apt,2,1 bath,,"[""Portable fans"", ""Washer \u2013\u00a0In unit""...",15,99.0,10.0,10.0,10.0,10.0,f,0
4,是一栋豪华别墅，里面空间大，房间大，干净。离开riverwood火车站走路10分钟，大型超市...,Hurstville,-33.95146,151.06089,Private room,2,1 private bath,1.0,"[""Washer"", ""Iron"", ""Free parking on premises"",...",2,50.0,7.0,6.0,9.0,9.0,t,0


In [68]:
# Summarise column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15063 entries, 0 to 15062
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   description       14582 non-null  object 
 1   neighbourhood     15063 non-null  object 
 2   latitude          15063 non-null  float64
 3   longitude         15063 non-null  float64
 4   type              15063 non-null  object 
 5   accommodates      15063 non-null  int64  
 6   bathrooms         15051 non-null  object 
 7   bedrooms          14051 non-null  float64
 8   amenities         15063 non-null  object 
 9   reviews           15063 non-null  int64  
 10  review_rating     10289 non-null  float64
 11  review_scores_A   10264 non-null  float64
 12  review_scores_B   10270 non-null  float64
 13  review_scores_C   10259 non-null  float64
 14  review_scores_D   10268 non-null  float64
 15  instant_bookable  15063 non-null  object 
 16  target            15063 non-null  int64 

In [69]:
# Missing-value check: number of nulls per column, largest first.
df.isna().sum().sort_values(ascending=False)

review_scores_C     4804
review_scores_A     4799
review_scores_D     4795
review_scores_B     4793
review_rating       4774
bedrooms            1012
description          481
bathrooms             12
instant_bookable       0
amenities              0
reviews                0
neighbourhood          0
accommodates           0
type                   0
longitude              0
latitude               0
target                 0
dtype: int64

In [70]:
# Build the working frame from the raw data, filling the sparsely-missing columns
# with fixed defaults. `fillna` with a mapping returns a new DataFrame, so `df`
# stays untouched and re-running this cell is idempotent.
#
# The previous form `df_new[col][mask] = value` is chained-indexing assignment:
# it triggers SettingWithCopyWarning (hidden here by warnings.filterwarnings)
# and does not modify `df_new` at all when pandas Copy-on-Write is enabled.
df_new = df.fillna({
    'bathrooms': '1 bath',
    'bedrooms': 1.0,
    'description': '',
})
# Empty placeholder column; NOTE(review): its consumer is not visible in this notebook section.
df_new['id'] = ''


In [71]:
# Re-check missing values after filling: only the review score columns should remain.
df_new.isna().sum().sort_values(ascending=False)

review_scores_C     4804
review_scores_A     4799
review_scores_D     4795
review_scores_B     4793
review_rating       4774
description            0
neighbourhood          0
target                 0
instant_bookable       0
reviews                0
amenities              0
bedrooms               0
bathrooms              0
accommodates           0
type                   0
longitude              0
latitude               0
id                     0
dtype: int64

## 类别型属性

### 单独分析
- type

In [72]:
# Number of listings per room type (input for the pie chart below).
room_type_sizes = df_new.groupby('type').size()
df_room = room_type_sizes.reset_index(name='count')
df_room

Unnamed: 0,type,count
0,Entire home/apt,10348
1,Hotel room,118
2,Private room,4509
3,Shared room,88


In [73]:
# Pie chart of listing counts by room type.
fig = px.pie(df_room, names='type', values='count')
fig.update_traces(
    textinfo='percent+value+label',
    insidetextorientation='radial',
)
fig.update_layout(
    width=800,
    height=600,
    uniformtext_minsize=30,
    title_x=0.5,
    font=dict(size=18),
)
fig.show()
# To export this chart, plotly figures use fig.write_image(...) rather than matplotlib's savefig.

- neighbourhood

In [74]:
# Listing count per neighbourhood, displayed from the least to the most populated.
neighbourhood_sizes = df_new.groupby('neighbourhood').size()
df_neighbourhood = neighbourhood_sizes.reset_index(name="count")
df_neighbourhood.sort_values(by='count')

Unnamed: 0,neighbourhood,count
14,Hunters Hill,22
6,Camden,22
11,Fairfield,32
12,Holroyd,36
7,Campbelltown,38
19,Liverpool,55
30,Strathfield,57
15,Hurstville,64
10,City Of Kogarah,73
25,Penrith,81


In [75]:
# Pie chart of listing share per neighbourhood; labels are drawn inside the slices.
fig = px.pie(df_neighbourhood, names='neighbourhood', values='count')
fig.update_traces(
    textposition='inside',
    textinfo='percent+label',
    insidetextorientation='radial',
)
fig.update_layout(
    width=800,
    height=600,
    uniformtext_minsize=12,
    title_x=0.5,
    font=dict(size=18),
    legend=dict(font=dict(size=12)),  # legend text smaller than the global font
)
fig.show()

- target

In [85]:
# Number of listings in each `target` class.
df_target = df_new.groupby('target').size().reset_index(name='count')
df_target

Unnamed: 0,target,count
0,0,4628
1,1,4742
2,2,1763
3,3,2242
4,4,1302
5,5,386


In [86]:
# Pie chart of listing counts per target class.
fig = px.pie(
    df_target,
    names='target',
    values='count',
    title='Percentage of Airbnb in Sydney by price range',
)
fig.update_traces(
    textinfo='percent+value+label',
    insidetextorientation='radial',
)
fig.update_layout(width=800, height=600, uniformtext_minsize=12, title_x=0.5)
fig.show()

### 组合分析

- neighbourhood vs target

In [76]:
# Listing count per (neighbourhood, target) pair.
# Note: this rebinds `df_neighbourhood`, which previously held the per-neighbourhood totals.
neighbourhood_target_sizes = df_new.groupby(['neighbourhood', 'target']).size()
df_neighbourhood = neighbourhood_target_sizes.reset_index(name="count")
df_neighbourhood

Unnamed: 0,neighbourhood,target,count
0,Ashfield,0,45
1,Ashfield,1,30
2,Ashfield,2,11
3,Ashfield,3,8
4,Ashfield,4,2
...,...,...,...
208,Woollahra,1,181
209,Woollahra,2,87
210,Woollahra,3,124
211,Woollahra,4,69


In [77]:
# Stacked bars: listings per neighbourhood, coloured by target class,
# with neighbourhoods ordered by total count (largest first).
fig = px.bar(
    df_neighbourhood,
    x="neighbourhood",
    y="count",
    color="target",
    text='count',
    barmode='stack',
)
fig.update_layout(
    height=700,
    title_x=0.5,
    xaxis=dict(categoryorder='total descending'),
    plot_bgcolor='rgba(0,0,0,0)',  # transparent plot background
    font=dict(size=15),
)
fig.show()

- type vs target

In [87]:
# Distribution of the target class for each room type, one violin per type.
fig = px.violin(df_new, x="type", y="target", color="type")
fig.update_layout(
    showlegend=False,  # the x axis already names each room type
    yaxis=dict(title=dict(standoff=0.5, font=dict(size=18))),
    xaxis=dict(
        title=dict(standoff=3, font=dict(size=18)),
        tickfont=dict(size=18),
    ),
)
fig.show()

- neighbourhood vs type

In [78]:
# Listing count per (neighbourhood, room type) pair.
neighbourhood_type_sizes = df_new.groupby(['neighbourhood', 'type']).size()
df_neighbourhood1 = neighbourhood_type_sizes.reset_index(name="count")
df_neighbourhood1

Unnamed: 0,neighbourhood,type,count
0,Ashfield,Entire home/apt,64
1,Ashfield,Private room,31
2,Ashfield,Shared room,1
3,Auburn,Entire home/apt,87
4,Auburn,Hotel room,3
...,...,...,...
112,Willoughby,Private room,59
113,Willoughby,Shared room,2
114,Woollahra,Entire home/apt,484
115,Woollahra,Hotel room,2


In [79]:
# Stacked bars: listings per neighbourhood, coloured by room type,
# with neighbourhoods ordered by total count (largest first).
fig = px.bar(
    df_neighbourhood1,
    x="neighbourhood",
    y="count",
    color="type",
    text='count',
    barmode='stack',
    title='Airbnb distribution by neighbourhood in Sydney',
)
fig.update_layout(
    height=650,
    title_x=0.5,
    xaxis=dict(categoryorder='total descending'),
    plot_bgcolor='rgba(0,0,0,0)',  # transparent plot background
)
fig.show()

## 地缘因素

In [88]:
# Density heat map of listings over Sydney, weighted by the target class.
# NOTE(review): "stamen-terrain" tiles may need a Stadia Maps API key in recent
# plotly releases — confirm the basemap still renders.
map_center = dict(lat=-33.918732, lon=151.242035)
fig = px.density_mapbox(
    df_new,
    lat='latitude',
    lon='longitude',
    z=df_new['target'],
    center=map_center,
    zoom=9,
    mapbox_style="stamen-terrain",
    radius=20,
    opacity=0.5,
)
fig.update_layout(
    width=1000,
    height=1000,
    title_x=1,  # the later of the two original title_x settings, which is the one that took effect
    title_font=dict(size=32),
    coloraxis_showscale=True,
    font=dict(size=18),
)
fig.show()

## 文本类属性

- description：count

In [90]:
# Word frequencies over all listing descriptions.
# The description column is overwritten in place with its cleaned form
# (lowercase, every character outside a-z replaced by a space), so later
# cells that read df_new['description'] see the cleaned text.
description_noise_words = [
    'in', 'to', 'with', 'the', 'and', 'a', 'of', 'br', 'b', 'for', 'you',
    'from', 'this', 'is', 'are', 'on', 'it', 's', 'your', 'all', 'has', 'or',
    'have', 'our', 'we', 'will', 'one',
]
df_new['description'] = (
    df_new['description']
    .str.lower()
    .str.replace('[^a-z]', ' ', regex=True)
)
# One row per token, then count; the running index becomes the frequency rank.
df_descriptionword = (
    df_new['description']
    .str.split()
    .explode()
    .value_counts()
    .rename_axis('word')
    .reset_index(name='count')
)
is_noise_word = df_descriptionword['word'].isin(description_noise_words)
df_descriptionword = df_descriptionword[~is_noise_word]
df_descriptionword.head(20)

Unnamed: 0,word,count
12,apartment,15057
13,space,14972
14,bedroom,11386
15,access,10438
17,walk,10062
18,beach,10029
19,room,9663
21,sydney,9299
22,kitchen,9267
24,bed,8264


In [91]:
# Bar chart of the ten most frequent description words.
top_description_words = df_descriptionword.head(10)
fig = px.bar(top_description_words, x='word', y='count', color='count')
fig.update_layout(
    height=600,
    width=850,
    title_x=0.5,
    yaxis=dict(title=dict(standoff=0, font=dict(size=18))),
    xaxis=dict(
        title=dict(standoff=2, font=dict(size=18)),
        tickfont=dict(size=12),
    ),
    coloraxis_showscale=False,  # bar height already encodes the count
)
fig.show()

- amenities：count

In [92]:

# Word frequencies over all amenity lists.
# The amenities column is overwritten in place with its cleaned form
# (lowercase, every character outside a-z replaced by a space), so later
# cells that read df_new['amenities'] see the cleaned text.
amenities_noise_words = [
    'in', 'to', 'with', 'the', 'and', 'a', 'of', 'br', 'b', 'for', 'you',
    'from', 'this', 'is', 'are', 'on', 'it', 's', 'your', 'all', 'has', 'or',
    'have', 'our', 'we', 'will', 'one',
]
df_new['amenities'] = (
    df_new['amenities']
    .str.lower()
    .str.replace('[^a-z]', ' ', regex=True)
)
# One row per token, then count; the running index becomes the frequency rank.
df_amenitiesword = (
    df_new['amenities']
    .str.split()
    .explode()
    .value_counts()
    .rename_axis('word')
    .reset_index(name='count')
)
is_noise_word = df_amenitiesword['word'].isin(amenities_noise_words)
df_amenitiesword = df_amenitiesword[~is_noise_word]
df_amenitiesword.head(20)

Unnamed: 0,word,count
0,dryer,19632
1,alarm,16693
2,tv,15197
3,essentials,14753
4,wifi,14090
5,kitchen,14078
6,allowed,14036
7,smoke,13431
8,washer,13428
9,parking,12310


In [93]:
# Bar chart of the ten most frequent amenity words.
top_amenity_words = df_amenitiesword.head(10)
fig = px.bar(top_amenity_words, x='word', y='count', color='count')
fig.update_layout(
    height=600,
    width=850,
    title_x=0.5,
    yaxis=dict(title=dict(standoff=0, font=dict(size=18))),
    xaxis=dict(
        title=dict(standoff=2, font=dict(size=18)),
        tickfont=dict(size=12),
    ),
)
fig.show()

- TFIDF

In [95]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re


def _vocabulary_terms(vectorizer):
    """Return the fitted vocabulary of `vectorizer` as a NumPy array of str.

    Uses `get_feature_names_out()` when available and falls back to the
    older `get_feature_names()`, which newer scikit-learn no longer provides.
    """
    getter = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    return np.array(getter(), dtype=str)


# --- description ---
all_des = df_new['description'].values

# Count descriptions containing at least one CJK character.
# This must read the raw `df`: df_new['description'] was already reduced to
# a-z and spaces by the word-count cell, so it can no longer contain any.
# `search` looks anywhere in the text; `match` only tested the first character.
hanzi = re.compile(r'[\u4e00-\u9fa5]')
chinese_des = sum(1 for text in df['description'].dropna() if hanzi.search(text))

# `stop_words` is passed as a list: newer scikit-learn rejects a frozenset here.
# 'br' is added because it is what remains of the HTML <br> tags after cleaning.
my_stopwords = sorted(ENGLISH_STOP_WORDS.union(['br']))
des_tfm = TfidfVectorizer(stop_words=my_stopwords, min_df=0.1, use_idf=True, smooth_idf=True, norm=None)
tf_des = des_tfm.fit_transform(all_des).toarray()
tf_des_features = _vocabulary_terms(des_tfm)

# --- amenities ---
all_ame = df_new['amenities'].values
ame_tfm = TfidfVectorizer(stop_words=sorted(ENGLISH_STOP_WORDS), min_df=0.1, use_idf=True, smooth_idf=True, norm=None)
tf_ame = ame_tfm.fit_transform(all_ame).toarray()
tf_ame_features = _vocabulary_terms(ame_tfm)

In [100]:
# Mean TF-IDF weight of every vocabulary term, highest first.
# The frames are built from a dict so `tfidf` stays a float column. Stacking the
# term strings and the scores with np.c_ turned the scores into strings, which
# made sort_values order them lexicographically and made plotly treat them as
# categories.
df_des_stat = pd.DataFrame({
    'word': tf_des_features,
    'tfidf': np.round(np.mean(tf_des, axis=0), 4),
})
df_ame_stat = pd.DataFrame({
    'word': tf_ame_features,
    'tfidf': np.mean(tf_ame, axis=0),
})
df_des_stat = df_des_stat.sort_values(by='tfidf', ascending=False)
df_ame_stat = df_ame_stat.sort_values(by='tfidf', ascending=False)
df_des_stat.head()

Unnamed: 0,word,tfidf
2,apartment,1.7272
14,bedroom,1.3372
10,beach,1.3263
87,space,1.3252
80,room,1.2628


In [101]:
# Bar chart of the eleven description terms with the highest mean TF-IDF weight.
top_tfidf_terms = df_des_stat.head(11)
fig = px.bar(
    top_tfidf_terms,
    x='word',
    y='tfidf',
    color='tfidf',
    color_continuous_scale='reds',
)
fig.update_layout(
    yaxis=dict(title=dict(standoff=0, font=dict(size=18))),
    xaxis=dict(
        title=dict(standoff=2, font=dict(size=18)),
        tickfont=dict(size=12),
    ),
    height=600,
    width=850,
    showlegend=False,
    font=dict(size=12),
)
fig.update_xaxes(tickangle=0)  # keep the word labels horizontal
fig.show()