In [1]:
import pickle
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
# from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the listing-level data from 'listings_au_cleaned.csv' (path is relative
# to the notebook's working directory).
df = pd.read_csv("listings_au_cleaned.csv")

In [3]:
# Drop the original categorical variables and keep their binary-encoded versions.
# 'Unnamed: 0' is dropped as well -- presumably a leftover index column from an
# earlier CSV export; confirm.
redundant_cols = ['Unnamed: 0', 'property_type', 'cancellation_policy', 'property_type_cleaned']
df = df.drop(columns=redundant_cols)

In [4]:
# Missing-value count per column. A notebook only echoes the LAST expression of
# a cell, so this summary must be printed explicitly or it is silently discarded.
print(df.isna().sum())
# Column dtypes and non-null counts; df.info() prints its own report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11846 entries, 0 to 11845
Data columns (total 26 columns):
host_is_superhost            11846 non-null int64
host_total_listings_count    11846 non-null float64
host_identity_verified       11846 non-null int64
zipcode                      11846 non-null int64
bathrooms                    11846 non-null float64
bedrooms                     11846 non-null float64
beds                         11846 non-null float64
price                        11846 non-null float64
availability_30              11846 non-null int64
availability_60              11846 non-null int64
availability_90              11846 non-null int64
availability_365             11846 non-null int64
reviews_per_month            11846 non-null float64
review_scores_location       11846 non-null float64
flexible                     11846 non-null int64
moderate                     11846 non-null int64
strict                       11846 non-null int64
Apartment                  

In [5]:
# Numeric variables (later averaged per zipcode).
num = ['host_total_listings_count','bathrooms','bedrooms','beds','price',
       'availability_30','availability_60','availability_90','availability_365',
       'reviews_per_month','review_scores_location']
# Binary/categorical variables: every remaining column except the grouping key
# 'zipcode'. The list is built by filtering df.columns (not via set arithmetic)
# so that its order follows the DataFrame and is identical on every run; this
# order determines the column order of the aggregated per-zipcode table.
cat = [col for col in df.columns if col not in num and col != 'zipcode']

In [6]:
# Keep only listings whose zipcode is >= 70000; anything below is treated as
# not being in AU and is dropped.
zipcode_as_int = df['zipcode'].astype(int)
df = df.assign(zipcode=zipcode_as_int)
df = df.loc[df['zipcode'] >= 70000]

In [7]:
### Number of listings in each zipcode
# 'host_is_superhost' only serves as the column being counted: count() tallies
# its non-null entries within every zipcode group.
zip_count = (
    df.groupby('zipcode')['host_is_superhost']
    .count()
    .reset_index()
)

In [8]:
# Display the per-zipcode listing counts (the count is stored in the
# 'host_is_superhost' column).
zip_count

Unnamed: 0,zipcode,host_is_superhost
0,78613,1
1,78617,2
2,78619,2
3,78620,1
4,78652,2
5,78660,4
6,78669,1
7,78681,2
8,78701,636
9,78702,1565


In [9]:
### Aggregate numeric variables: mean of every column in `num` per zipcode
# Grouping by the column name leaves 'zipcode' as the index of the result,
# which is the key the later joins rely on.
df_zip = df.groupby('zipcode')[num].mean()

In [10]:
# Preview the per-zipcode means of the numeric variables.
df_zip.head()

Unnamed: 0_level_0,host_total_listings_count,bathrooms,bedrooms,beds,price,availability_30,availability_60,availability_90,availability_365,reviews_per_month,review_scores_location
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
78613,1.0,1.0,1.0,1.0,50.0,13.0,43.0,73.0,163.0,0.87,10.0
78617,1.5,1.5,3.0,3.0,34.5,24.0,51.0,79.0,334.5,0.87,10.0
78619,1.0,2.75,3.5,4.0,537.5,11.0,26.0,41.0,210.0,0.87,10.0
78620,3.0,1.0,1.0,3.0,175.0,23.0,53.0,83.0,358.0,0.24,10.0
78652,1.5,1.75,2.0,3.0,78.0,20.5,50.5,77.5,122.5,1.755,10.0


In [11]:
### Aggregate binary variables: share of listings at level 1 in each zipcode
# For every column in `cat`:
#   numerator   = number of rows equal to 1 in the zipcode
#   denominator = number of non-null rows in the zipcode
# Zipcodes without any 1 get a share of 0.0. Computed for all columns at once
# instead of looping column by column (the former loop also built an unused
# frame of 0-level counts on every iteration).
zip_key = df['zipcode']
ones_per_zip = df[cat].eq(1).groupby(zip_key).sum()
rows_per_zip = df[cat].groupby(zip_key).count()
## percentage = frequency / total count; new columns are named '<column>1'
share_of_ones = (ones_per_zip / rows_per_zip).add_suffix('1')
## merge with numeric variables
df_zip = df_zip.join(share_of_ones, how='inner')

In [12]:
### Merged dataset for modeling: per-zipcode numeric means plus the share of
### listings at level 1 for each binary variable (columns suffixed with '1')
df_zip.head()

Unnamed: 0_level_0,host_total_listings_count,bathrooms,bedrooms,beds,price,availability_30,availability_60,availability_90,availability_365,reviews_per_month,...,Bungalow1,flexible1,Loft1,strict1,host_is_superhost1,Townhouse1,Guesthouse1,Apartment1,Condominium1,House1
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
78613,1.0,1.0,1.0,1.0,50.0,13.0,43.0,73.0,163.0,0.87,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
78617,1.5,1.5,3.0,3.0,34.5,24.0,51.0,79.0,334.5,0.87,...,0.0,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5
78619,1.0,2.75,3.5,4.0,537.5,11.0,26.0,41.0,210.0,0.87,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
78620,3.0,1.0,1.0,3.0,175.0,23.0,53.0,83.0,358.0,0.24,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
78652,1.5,1.75,2.0,3.0,78.0,20.5,50.5,77.5,122.5,1.755,...,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,1.0


### Step 2: Add NLP features

In [13]:
# Load the text-derived features from 'textFeatures_AU.csv'; the file carries
# a 'zipcode' column that is used as the join key below.
text = pd.read_csv('textFeatures_AU.csv')

In [14]:
# Preview the text features.
text.head()

Unnamed: 0,zipcode,good,safe,night,walk,unsafe,bad,dangerous
0,78613.0,,,,,,,
1,78617.0,,,,,,,
2,78619.0,,,,,,,
3,78620.0,0.192463,0.134758,0.154177,0.166445,0.074078,0.133736,0.087533
4,78652.0,0.245174,0.185053,0.120877,0.143276,0.1006,0.167012,0.119292


In [15]:
# Index the text features by zipcode, then attach them to the per-zipcode
# listing aggregates. The inner join keeps only zipcodes present in both frames.
# NOTE(review): this cell is not re-runnable as-is -- after the first run
# 'zipcode' is no longer a column of `text`, so set_index('zipcode') fails.
text = text.set_index('zipcode')
df_txt = df_zip.join(text, on = 'zipcode', how='inner')
df_txt.head()

Unnamed: 0_level_0,host_total_listings_count,bathrooms,bedrooms,beds,price,availability_30,availability_60,availability_90,availability_365,reviews_per_month,...,Apartment1,Condominium1,House1,good,safe,night,walk,unsafe,bad,dangerous
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
78613,1.0,1.0,1.0,1.0,50.0,13.0,43.0,73.0,163.0,0.87,...,0.0,0.0,1.0,,,,,,,
78617,1.5,1.5,3.0,3.0,34.5,24.0,51.0,79.0,334.5,0.87,...,0.0,0.0,0.5,,,,,,,
78619,1.0,2.75,3.5,4.0,537.5,11.0,26.0,41.0,210.0,0.87,...,0.0,0.0,1.0,,,,,,,
78620,3.0,1.0,1.0,3.0,175.0,23.0,53.0,83.0,358.0,0.24,...,0.0,0.0,0.0,0.192463,0.134758,0.154177,0.166445,0.074078,0.133736,0.087533
78652,1.5,1.75,2.0,3.0,78.0,20.5,50.5,77.5,122.5,1.755,...,0.0,0.0,1.0,0.245174,0.185053,0.120877,0.143276,0.1006,0.167012,0.119292


### Step 3: Add more NLP features

In [16]:
# Load the three pickled NLP artefacts used below: wordbag, wordrank and
# neighborf. (`pickle` is imported in the notebook's import cell.)
# NOTE(security): pickle.load can execute arbitrary code contained in the file.
# Only load pickles that were produced by this project's own pipeline.
with open('wordbag_AU.pickle', 'rb') as f:
    wordbag = pickle.load(f)

with open('wordrank_final_fromLA.pickle', 'rb') as f:
    wordrank = pickle.load(f)

with open('neighborhoodfeature_au.pickle', 'rb') as f:
    neighborf = pickle.load(f)

In [17]:
# Round-trip the index through a column called 'index' so that the resulting
# index is named 'index' (the join in a later cell refers to it by that name).
# Presumably the pickled frame has an unnamed index, which reset_index turns
# into the 'index' column -- confirm.
wordbag = wordbag.reset_index(drop=False).set_index('index')

In [18]:
# Move neighborf's index into a regular column (the later merge matches on its
# 'zipcode' column), then preview the frame.
neighborf = neighborf.reset_index()
neighborf.head()

Unnamed: 0,zipcode,count,good,safe,peaceful,night,walk,unsafe,bad,dangerous
0,78619.0,88.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,78620.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,78652.0,8.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,78660.0,56.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0
4,78681.0,40.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0


In [19]:
# Assemble the word-level features per zipcode.
# NOTE: despite the name `top_ten`, the first 20 entries of wordrank are used.
top_words = wordrank[0:20]
word_columns = wordbag[top_words]
# The last two columns of wordbag are carried along as well.
trailing_columns = wordbag.iloc[:, -2:]
top_ten = word_columns.join(trailing_columns, on='index', how='inner')
# Expose the index as an 'index' column so it can be matched to neighborf's zipcode.
top_ten = top_ten.reset_index(drop=False)
# Left merge: rows without a matching zipcode in neighborf get NaN in its columns.
top_ten = top_ten.merge(neighborf, left_on='index', right_on='zipcode', how='left')

In [20]:
# Keep only rows that found a zipcode in neighborf, drop the helper 'index'
# column, and index the frame by zipcode.
top_ten = (
    top_ten[top_ten['zipcode'].notna()]
    .drop(columns=['index'])
    .set_index('zipcode')
)

In [21]:
# List the columns of the assembled word-feature table.
top_ten.columns

Index(['young', 'explore', 'fantastic great', 'fantastic', 'famous', 'fairly',
       'fair', 'fabulous', 'extremely responsive', 'extremely helpful',
       'extremely friendly', 'extremely comfortable', 'extremely clean',
       'extremely', 'extra', 'expensive', 'even', 'exceptionally',
       'exceptional', 'excellent great', 'sentimental', 'wordperrev', 'count',
       'good', 'safe', 'peaceful', 'night', 'walk', 'unsafe', 'bad',
       'dangerous'],
      dtype='object')

In [22]:
# Combine the per-zipcode listing/text features (df_txt) with the word features
# (top_ten) on zipcode, keeping only zipcodes present in both. Column names that
# occur on both sides keep their plain name for the df_txt version and get a
# '_y' suffix for the top_ten version.
# NOTE(review): this rebinds `df`, which earlier held the listing-level data.
df = df_txt.merge(top_ten, on = 'zipcode', how='inner', suffixes=('', '_y'))
df.head()

Unnamed: 0_level_0,host_total_listings_count,bathrooms,bedrooms,beds,price,availability_30,availability_60,availability_90,availability_365,reviews_per_month,...,wordperrev,count,good_y,safe_y,peaceful,night_y,walk_y,unsafe_y,bad_y,dangerous_y
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
78619,1.0,2.75,3.5,4.0,537.5,11.0,26.0,41.0,210.0,0.87,...,0.0,88.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
78620,3.0,1.0,1.0,3.0,175.0,23.0,53.0,83.0,358.0,0.24,...,16.242631,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
78652,1.5,1.75,2.0,3.0,78.0,20.5,50.5,77.5,122.5,1.755,...,27.121995,8.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
78660,3.0,1.25,1.0,1.0,50.75,15.5,37.75,60.25,151.5,2.15,...,0.0,56.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0
78681,2.0,1.5,2.0,2.5,72.5,25.5,51.0,70.5,160.5,1.18,...,18.686851,40.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0


In [23]:
# Drop zipcodes with no value in the un-suffixed 'good' column (the df_txt side
# of the merge).
df = df[df['good'].notna()]

In [24]:
# Persist the final per-zipcode feature table. to_csv writes the index by
# default, so it is saved as the first column of the file.
df.to_csv("listings_au_cleaned_zipcode.csv")