In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import re
# from sklearn.neighbors import KNeighborsClassifier

In [2]:
df = pd.read_csv("listings_cleaned.csv")

### Step 1. group listings data by zipcode

In [3]:
#drop orignial categorical variables and keep binary encoded version
df = df.drop(columns=['Unnamed: 0','property_type', 'cancellation_policy', 'property_type_cleaned'])

In [4]:
df.isna().sum()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42584 entries, 0 to 42583
Data columns (total 26 columns):
host_is_superhost            42584 non-null int64
host_total_listings_count    42584 non-null float64
host_identity_verified       42584 non-null int64
zipcode                      42584 non-null int64
bathrooms                    42584 non-null float64
bedrooms                     42584 non-null float64
beds                         42584 non-null float64
price                        42584 non-null float64
availability_30              42584 non-null float64
availability_60              42584 non-null float64
availability_90              42584 non-null float64
availability_365             42584 non-null float64
reviews_per_month            42584 non-null float64
review_scores_location       42584 non-null float64
flexible                     42584 non-null int64
moderate                     42584 non-null int64
strict                       42584 non-null int64
Apartment          

In [5]:
#numeric variables
num = ['host_total_listings_count','bathrooms','bedrooms','beds','price',
       'availability_30','availability_60','availability_90','availability_365',
       'reviews_per_month','review_scores_location']
#binary/categorical variables
cat = list(set(list(df.columns.values))-set(num))
cat = list(set(cat) - set(['zipcode'])) 

In [6]:
#drop zipcode not in LA
df['zipcode'] = df['zipcode'].astype(int)
df = df[df['zipcode'] >= 90000]

In [7]:
###check number of listings for each zipcode 
# df_la.groupby('zipcode').count().reset_index()
zip_count = df['host_is_superhost'].groupby(df['zipcode']).count().reset_index()

In [8]:
zip_count

Unnamed: 0,zipcode,host_is_superhost
0,90001,11
1,90002,9
2,90003,23
3,90004,520
4,90005,350
5,90006,499
6,90007,303
7,90008,127
8,90010,66
9,90011,39


In [9]:
###aggregate numeric variables by taking averages in each zipcode
df_zip = df[num].groupby(df['zipcode']).mean().reset_index()
df_zip = df_zip.set_index('zipcode')

In [10]:
###aggregate binary variables by counting 0/1 level's percentage in each zipcode
for i in range(len(cat)):
    ## count number of listings for each zipcode
    a = pd.DataFrame(df[cat[i]].groupby(df['zipcode']).count())
    ## count frequency of '1' levels
    a1 = pd.DataFrame(df.loc[df[cat[i]] == 1, 
                                cat[i]].groupby(df['zipcode']).count())
    ## count frequency of '0' levels
    a0 = pd.DataFrame(df.loc[df[cat[i]] == 0, 
                                cat[i]].groupby(df['zipcode']).count())
    ## creat new column names
    a1.columns = [cat[i] + '1']
    # a0.columns = [cat[i] + '0']
    ## join columns
    #m = (a.join(a0.join(a1,  how = 'outer'), how = 'inner')).fillna(0)
    m = (a.join(a1, how='outer')).fillna(0)
    ## calculate percentage = frequency/totalcount
    m.iloc[:,1] = m.iloc[:,1]/m.iloc[:,0]
    #m.iloc[:,2] = m.iloc[:,2]/m.iloc[:,0]
    m = m.iloc[:,1:2]
    ## merge with numeric variables
    df_zip = df_zip.join(m, how = 'inner')

In [11]:
###merged dataset for modeling
df_zip.head()

Unnamed: 0_level_0,host_total_listings_count,bathrooms,bedrooms,beds,price,availability_30,availability_60,availability_90,availability_365,reviews_per_month,...,Apartment1,moderate1,Guest suite1,Bungalow1,Condominium1,Guesthouse1,flexible1,host_is_superhost1,Loft1,strict1
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
90001,7.090909,1.045455,1.0,1.727273,69.181818,14.909091,33.363636,51.0,213.181818,1.342727,...,0.090909,0.181818,0.0,0.090909,0.454545,0.090909,0.272727,0.0,0.0,0.545455
90002,1.444444,1.0,1.111111,1.444444,75.555556,18.555556,40.777778,64.888889,247.555556,2.413333,...,0.0,0.333333,0.0,0.0,0.0,0.0,0.333333,0.333333,0.0,0.333333
90003,2.391304,1.217391,1.130435,1.695652,67.652174,14.652174,32.26087,51.695652,171.347826,2.061739,...,0.26087,0.347826,0.0,0.0,0.0,0.043478,0.173913,0.521739,0.0,0.478261
90004,5.148077,1.391346,1.205769,1.923077,153.842308,12.180769,27.986538,45.863462,147.35,1.455731,...,0.5,0.290385,0.013462,0.011538,0.013462,0.044231,0.288462,0.2,0.021154,0.421154
90005,10.917143,1.254286,0.971429,1.662857,87.171429,14.545714,33.634286,53.357143,172.0,2.106829,...,0.537143,0.208571,0.0,0.005714,0.037143,0.008571,0.254286,0.168571,0.005714,0.537143


### Step 2, add in NLP features (1st Method: word2vec)

In [12]:
text = pd.read_csv('textFeatures.csv')

In [13]:
text.head()

Unnamed: 0,zipcode,good,safe,peaceful,night,walk,unsafe,bad,dangerous
0,10019,,,,,,,,
1,10023,,,,,,,,
2,37738,,,,,,,,
3,60601,,,,,,,,
4,90001,0.232158,0.156485,0.083805,0.115957,0.131038,0.108312,0.179478,0.123318


In [14]:
text = text.set_index('zipcode')
df_txt = df_zip.join(text, on = 'zipcode', how='inner')
df_txt.head()

Unnamed: 0_level_0,host_total_listings_count,bathrooms,bedrooms,beds,price,availability_30,availability_60,availability_90,availability_365,reviews_per_month,...,Loft1,strict1,good,safe,peaceful,night,walk,unsafe,bad,dangerous
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
90001,7.090909,1.045455,1.0,1.727273,69.181818,14.909091,33.363636,51.0,213.181818,1.342727,...,0.0,0.545455,0.232158,0.156485,0.083805,0.115957,0.131038,0.108312,0.179478,0.123318
90002,1.444444,1.0,1.111111,1.444444,75.555556,18.555556,40.777778,64.888889,247.555556,2.413333,...,0.0,0.333333,0.174719,0.129185,0.077965,0.101681,0.114704,0.079193,0.129388,0.090907
90003,2.391304,1.217391,1.130435,1.695652,67.652174,14.652174,32.26087,51.695652,171.347826,2.061739,...,0.0,0.478261,0.199368,0.142892,0.089643,0.117684,0.135803,0.085547,0.143283,0.100415
90004,5.148077,1.391346,1.205769,1.923077,153.842308,12.180769,27.986538,45.863462,147.35,1.455731,...,0.021154,0.421154,0.19969,0.146995,0.095235,0.113894,0.136349,0.087915,0.141293,0.10269
90005,10.917143,1.254286,0.971429,1.662857,87.171429,14.545714,33.634286,53.357143,172.0,2.106829,...,0.005714,0.537143,0.203206,0.14909,0.091597,0.115445,0.135272,0.090355,0.144487,0.103797


### Step 3, add more NLP features (2nd Method: bag of words)

In [15]:
import pickle
with open('wordbag_final.pickle', 'rb') as f:
    wordbag = pickle.load(f)

with open('wordrank_final.pickle', 'rb') as f:
    wordrank = pickle.load(f)
    
with open('neighborhoodfeature.pickle', 'rb') as f:
    neighborf = pickle.load(f)

In [16]:
wordbag.head()

Unnamed: 0_level_0,abbot,able,absolute,absolutely,absolutely beautiful,absolutely great,absolutely perfect,absolutely wonderful,ac,accessible,...,wonderful nice,wonderfully,worth,wrong,yard,yet,young,level2,sentimental,wordperrev
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
90001,0.0,0.219051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.219229,0.0,0.0,0.0,0.0,3.0,7.450386,31.947962
90002,0.0,0.025709,0.0,0.01156,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.032162,0.0,0.0,0.061705,0.0,3.0,2.687272,36.027004
90003,0.0,0.0,0.0,0.030165,0.0,0.0,0.0,0.0,0.0,0.098486,...,0.0,0.0,0.083923,0.0,0.0,0.0,0.0,2.0,3.168464,32.300323
90004,0.0,0.048483,0.05493,0.05874,0.029149,0.022464,0.043467,0.041146,0.043031,0.053382,...,0.008844,0.075401,0.040435,0.044596,0.031295,0.056566,0.032397,1.0,5.892534,40.439221
90005,0.0,0.053237,0.018247,0.032387,0.009037,0.023216,0.016846,0.019933,0.020846,0.047507,...,0.006855,0.013487,0.043094,0.025139,0.012129,0.022548,0.040805,1.0,5.285061,35.063114


In [17]:
#top_ten_idx = [i for i,x in enumerate(wordrank) if x in range(10)]
#top_ten_idx
neighborf.head()

Unnamed: 0_level_0,count,good,safe,peaceful,night,walk,unsafe,bad,dangerous
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
90001,80.555556,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0
90002,28.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
90003,58.266667,0.066667,0.133333,0.0,0.0,0.666667,0.0,0.066667,0.0
90004,54.334311,0.026393,0.123167,0.002933,0.082111,0.246334,0.0,0.0,0.0
90005,54.121212,0.070707,0.10101,0.025253,0.136364,0.176768,0.0,0.015152,0.0


In [18]:
neighborf = neighborf.reset_index(drop=False)
neighborf['len']=[len(neighborf['zipcode'][i]) for i in range(neighborf.shape[0])]
neighborf['len'].unique()
#neighborf['zipcode'] = [i[:5] for i in neighborf['zipcode']]

array([ 5,  6, 10, 12,  4,  7])

In [19]:
#Case 1: with 'CA', len=7
neighborf['zipcode']=[neighborf['zipcode'][i][2:] if neighborf['len'][i]==7 else neighborf['zipcode'][i] for i in range(neighborf.shape[0])]
#Case 2: with '-', len=10
neighborf['zipcode']=[neighborf['zipcode'][i][0:5] if re.search(r'-',neighborf['zipcode'][i]) else neighborf['zipcode'][i] for i in range(neighborf.shape[0])]
#Case 3: with 'Near' or '/n/n'
neighborf['zipcode']=[re.sub("[^0-9]", "", neighborf['zipcode'][i]) for i in range(neighborf.shape[0])]
#Case 4: len>6, take first 5 digits
neighborf['zipcode']=[neighborf['zipcode'][i][0:5] if len(neighborf['zipcode'][i])>5 else neighborf['zipcode'][i] for i in range(neighborf.shape[0])]
#Case 5: len<5 drop it
neighborf['len2']=[len(neighborf['zipcode'][i]) for i in range(neighborf.shape[0])]
neighborf = neighborf[neighborf['len2']==5].reset_index(drop=True)

#drop len & len2
neighborf = neighborf.drop(['len','len2'],axis=1)

In [20]:
top_ten = wordbag[wordrank[0:20]].join(wordbag.iloc[:,-1],on='zipcode',how='inner')
neighborf['zipcode'] = neighborf['zipcode'].astype(int)
neighborf = neighborf.set_index('zipcode')
top_ten = top_ten.merge(neighborf, on='zipcode', how = 'inner', suffixes=('', '_f'))
top_ten

Unnamed: 0_level_0,young,explore,fantastic great,fantastic,famous,fairly,fair,fabulous,extremely responsive,extremely helpful,...,wordperrev,count,good,safe,peaceful,night,walk,unsafe,bad,dangerous
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
90001,0.000000,0.000000,0.000000,0.055456,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,31.947962,80.555556,0.000000,0.111111,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
90002,0.000000,0.000000,0.090036,0.024407,0.000000,0.060558,0.000000,0.000000,0.000000,0.110578,...,36.027004,28.400000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
90003,0.000000,0.000000,0.000000,0.031844,0.000000,0.000000,0.180036,0.000000,0.000000,0.000000,...,32.300323,58.266667,0.066667,0.133333,0.000000,0.000000,0.666667,0.000000,0.066667,0.000000
90004,0.032397,0.045227,0.033015,0.048585,0.056317,0.034895,0.028914,0.029299,0.052052,0.075302,...,40.439221,54.334311,0.026393,0.123167,0.002933,0.082111,0.246334,0.000000,0.000000,0.000000
90005,0.040805,0.050080,0.025590,0.022463,0.080027,0.056553,0.028014,0.020645,0.040346,0.053877,...,35.063114,54.121212,0.070707,0.101010,0.025253,0.136364,0.176768,0.000000,0.015152,0.000000
90006,0.044097,0.022762,0.030211,0.014742,0.006012,0.052831,0.064823,0.005118,0.020839,0.033393,...,29.859746,43.175862,0.106897,0.113793,0.003448,0.100000,0.144828,0.000000,0.000000,0.000000
90007,0.039062,0.015581,0.030330,0.036999,0.000000,0.050999,0.034863,0.025693,0.020921,0.000000,...,33.899629,47.636364,0.012987,0.181818,0.000000,0.006494,0.077922,0.000000,0.000000,0.000000
90008,0.000000,0.057924,0.016913,0.032095,0.033659,0.034128,0.051844,0.009552,0.023333,0.083089,...,36.750789,57.884615,0.025641,0.076923,0.025641,0.012821,0.192308,0.000000,0.000000,0.000000
90010,0.000000,0.064692,0.113337,0.071689,0.000000,0.012705,0.028951,0.032003,0.104235,0.000000,...,36.319616,63.878049,0.000000,0.000000,0.000000,0.024390,0.000000,0.000000,0.000000,0.000000
90011,0.000000,0.000000,0.000000,0.006379,0.000000,0.047480,0.108191,0.039866,0.097384,0.000000,...,34.176500,47.277778,0.055556,0.055556,0.000000,0.111111,0.000000,0.000000,0.000000,0.000000


In [21]:
top_ten.columns

Index(['young', 'explore', 'fantastic great', 'fantastic', 'famous', 'fairly',
       'fair', 'fabulous', 'extremely responsive', 'extremely helpful',
       'extremely friendly', 'extremely comfortable', 'extremely clean',
       'extremely', 'extra', 'expensive', 'even', 'exceptionally',
       'exceptional', 'excellent great', 'wordperrev', 'count', 'good', 'safe',
       'peaceful', 'night', 'walk', 'unsafe', 'bad', 'dangerous'],
      dtype='object')

In [22]:
# df_txt = df_txt.join(top_ten, on = 'zipcode', how='inner')
# df_txt.head()
df = df_txt.merge(top_ten, on = 'zipcode', how='inner', suffixes=('', '_y'))
df.head()

Unnamed: 0_level_0,host_total_listings_count,bathrooms,bedrooms,beds,price,availability_30,availability_60,availability_90,availability_365,reviews_per_month,...,wordperrev,count,good_y,safe_y,peaceful_y,night_y,walk_y,unsafe_y,bad_y,dangerous_y
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
90001,7.090909,1.045455,1.0,1.727273,69.181818,14.909091,33.363636,51.0,213.181818,1.342727,...,31.947962,80.555556,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0
90002,1.444444,1.0,1.111111,1.444444,75.555556,18.555556,40.777778,64.888889,247.555556,2.413333,...,36.027004,28.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
90003,2.391304,1.217391,1.130435,1.695652,67.652174,14.652174,32.26087,51.695652,171.347826,2.061739,...,32.300323,58.266667,0.066667,0.133333,0.0,0.0,0.666667,0.0,0.066667,0.0
90004,5.148077,1.391346,1.205769,1.923077,153.842308,12.180769,27.986538,45.863462,147.35,1.455731,...,40.439221,54.334311,0.026393,0.123167,0.002933,0.082111,0.246334,0.0,0.0,0.0
90005,10.917143,1.254286,0.971429,1.662857,87.171429,14.545714,33.634286,53.357143,172.0,2.106829,...,35.063114,54.121212,0.070707,0.10101,0.025253,0.136364,0.176768,0.0,0.015152,0.0


In [23]:
df.shape

(268, 63)

In [24]:
df.to_csv("listings_cleaned_zipcode.csv")