## Import Libraries

In [1]:
pip install -r "../../requirements.txt"

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import shapefile as shp
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import os
import pickle

sns.set(style= "whitegrid", palette="pastel", color_codes=True)
sns.mpl.rc("figure", figsize=(10,6))
%matplotlib inline

## SF raw listing 

In [3]:
listing_df = pd.read_csv("../../May Myo/raw_data/listings.csv.gz")
listing_df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'description',
       'neighborhood_overview', 'picture_url', 'host_id', 'host_url',
       'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'calendar_upd

### Filter variables we want

In [4]:
df= listing_df[['id','neighbourhood_cleansed','amenities','accommodates']]
df

Unnamed: 0,id,neighbourhood_cleansed,amenities,accommodates
0,958,Western Addition,"[""Heating"", ""Hot water"", ""Stove"", ""Iron"", ""Dry...",3
1,5858,Bernal Heights,"[""Smoke alarm"", ""Heating"", ""Kitchen"", ""First a...",5
2,7918,Haight Ashbury,"[""Host greets you"", ""Heating"", ""Hot water"", ""K...",2
3,8142,Haight Ashbury,"[""Host greets you"", ""Heating"", ""Hot water"", ""K...",2
4,8339,Western Addition,"[""Heating"", ""Hot water"", ""Stove"", ""Iron"", ""Car...",4
...,...,...,...,...
7269,44758668,Mission,"[""Smoke alarm"", ""Long term stays allowed"", ""Re...",2
7270,44799193,Downtown/Civic Center,"[""Smoke alarm"", ""Long term stays allowed"", ""Re...",2
7271,44805385,Lakeshore,"[""Gym"", ""Heating"", ""Air conditioning"", ""Iron"",...",6
7272,44810526,South of Market,"[""Gym"", ""Heating"", ""Hot water"", ""Stove"", ""Air ...",1


### Get unique amentities

In [5]:
amentities_col = df.amenities
amentities_dict = {}
id = 0
for amentities in amentities_col:
    x_striped = amentities[1:-1]
    x_arr = x_striped.split(',')
    
    for amentity in x_arr:
        name = "{}".format((amentity.strip())[1:-1].lower().replace("\\u2019","'"))
        
        if name not in amentities_dict:
            amentities_dict[name] = 1
        else:
            amentities_dict[name] += 1
sorted_amentities = dict(sorted(amentities_dict.items(), key=lambda x: x[1], reverse=True))
sorted_amentities = {k: v for k, v in sorted_amentities.items() if v >= 30}
filter_amentities = list(sorted_amentities.keys())
filter_amentities

['wifi',
 'smoke alarm',
 'heating',
 'essentials',
 'hangers',
 'carbon monoxide alarm',
 'hair dryer',
 'iron',
 'tv',
 'shampoo',
 'kitchen',
 'laptop-friendly workspace',
 'washer',
 'dryer',
 'fire extinguisher',
 'hot water',
 'refrigerator',
 'coffee maker',
 'microwave',
 'dishes and silverware',
 'first aid kit',
 'bed linens',
 'oven',
 'free street parking',
 'cooking basics',
 'private entrance',
 'stove',
 'dishwasher',
 'cable tv',
 'extra pillows and blankets',
 'long term stays allowed',
 'luggage dropoff allowed',
 'garden or backyard',
 'patio or balcony',
 'lock on bedroom door',
 'lockbox',
 'elevator',
 'free parking on premises',
 'bathtub',
 'indoor fireplace',
 'air conditioning',
 'keypad',
 'gym',
 'paid parking off premises',
 'bbq grill',
 'breakfast',
 "pack 'n play/travel crib",
 'host greets you',
 'paid parking on premises',
 'shower gel',
 'room-darkening shades',
 "children's books and toys",
 'building staff',
 'single level home',
 'ethernet connecti

### Manually create encoding columns

In [6]:
for item in filter_amentities:
    df[item] = 0
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[item] = 0


Unnamed: 0,id,neighbourhood_cleansed,amenities,accommodates,wifi,smoke alarm,heating,essentials,hangers,carbon monoxide alarm,...,baby bath,pocket wifi,changing table,window guards,beach essentials,ev charger,beachfront,table corner guards,bread maker,waterfront
0,958,Western Addition,"[""Heating"", ""Hot water"", ""Stove"", ""Iron"", ""Dry...",3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5858,Bernal Heights,"[""Smoke alarm"", ""Heating"", ""Kitchen"", ""First a...",5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,7918,Haight Ashbury,"[""Host greets you"", ""Heating"", ""Hot water"", ""K...",2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8142,Haight Ashbury,"[""Host greets you"", ""Heating"", ""Hot water"", ""K...",2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8339,Western Addition,"[""Heating"", ""Hot water"", ""Stove"", ""Iron"", ""Car...",4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7269,44758668,Mission,"[""Smoke alarm"", ""Long term stays allowed"", ""Re...",2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7270,44799193,Downtown/Civic Center,"[""Smoke alarm"", ""Long term stays allowed"", ""Re...",2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7271,44805385,Lakeshore,"[""Gym"", ""Heating"", ""Air conditioning"", ""Iron"",...",6,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7272,44810526,South of Market,"[""Gym"", ""Heating"", ""Hot water"", ""Stove"", ""Air ...",1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Manual one hot encoding

In [7]:
for index, rows in df.iterrows():
    tmp = rows['amenities']
    tmp = (tmp.strip())[1:-1].lower().replace("\\u2019","'").split(",")
    for item in tmp:
        item = item.replace('"', "").strip()
        if item in filter_amentities:
            df.loc[index, item] = 1

### store encoding columns with ID into pickle

In [8]:
Filepath = "encoded_amentities.pkl"

if os.path.exists(Filepath):
    encoded_Df = pd.read_pickle(Filepath)
    print("read success!")
else:
    filter_amentities.insert(0,"id")
    encoded_Df = df[[x for x in filter_amentities]]
    encoded_Df.to_pickle("encoded_amentities.pkl")