In [14]:
import os
import pickle
import pandas as pd
import numpy as np

## Load filtered DF

In [2]:
Cleaned_df = pd.read_pickle("cleaned_df_without_encode.pkl")
Cleaned_df.shape

(6813, 16)

## Merge listing and safe index score

In [3]:
safe_df = pd.read_pickle("../../SafeIndex/safeIndex_normalized.pkl")
safe_df = safe_df[safe_df['Category'] == 'overall']
safe_df.rename(columns={'Normalized_score':'SafeIndex_Score'}, inplace=True)

In [4]:
df = pd.merge(left=Cleaned_df , right=safe_df, left_on='neighbourhood_cleansed', right_on='Area').drop(columns=['Area', 'Category'])
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,neighbourhood_cleansed,property_type,room_type,accommodates,bathrooms,amenities,price,minimum_nights,review_scores_rating,reviews_per_month,price_log,SafeIndex_Score
0,within an hour,1.00,0.99,1,1,Western Addition,Entire apartment,Entire home/apt,3,1.0,"[""Heating"", ""Hot water"", ""Stove"", ""Iron"", ""Dry...",132.0,2,97.0,1.91,4.882802,0.400000
1,within a few hours,0.97,0.79,1,38,Western Addition,Entire apartment,Entire home/apt,5,2.0,"[""Heating"", ""Hot water"", ""Stove"", ""Iron"", ""Dry...",166.0,30,91.0,0.15,5.111988,0.400000
2,within a day,0.50,0.76,1,3,Western Addition,Private room in townhouse,Private room,2,1.0,"[""Heating"", ""Hot water"", ""Stove"", ""Iron"", ""Car...",130.0,2,97.0,3.92,4.867534,0.400000
3,within a day,0.50,0.76,1,3,Western Addition,Private room in townhouse,Private room,2,1.0,"[""Heating"", ""Hot water"", ""Stove"", ""Iron"", ""Car...",110.0,2,98.0,3.73,4.700480,0.400000
4,within an hour,1.00,1.00,0,2,Western Addition,Entire apartment,Entire home/apt,10,2.0,"[""Heating"", ""Children\u2019s books and toys"", ...",480.0,3,91.0,0.32,6.173786,0.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6804,-,0.00,1.00,0,3,Visitacion Valley,Entire guesthouse,Entire home/apt,2,1.0,"[""Heating"", ""Iron"", ""Dryer"", ""Coffee maker"", ""...",179.0,2,100.0,0.47,5.187386,0.581538
6805,-,0.00,1.00,0,0,Visitacion Valley,Entire guest suite,Entire home/apt,2,1.0,"[""Heating"", ""Hot water"", ""Iron"", ""Dryer"", ""Cof...",190.0,30,100.0,0.48,5.247024,0.581538
6806,within an hour,1.00,1.00,0,0,Visitacion Valley,Private room in apartment,Private room,4,1.0,"[""Smoke alarm"", ""Heating"", ""Hot water"", ""Free ...",85.0,1,100.0,0.34,4.442651,0.581538
6807,within an hour,1.00,0.97,1,6,Visitacion Valley,Entire apartment,Entire home/apt,10,3.0,"[""Heating"", ""Hot water"", ""Stove"", ""Iron"", ""Dry...",451.0,2,0.0,0.00,6.111467,0.581538


###  Remove unncessary features

In [5]:
Filepath = "remove_features_without_encoding.pkl"

if os.path.exists(Filepath):
    df = pd.read_pickle(Filepath)
    print("read success!")
else:
    amentities_col = df.amenities
    amentities_dict = {}
    id = 0
    for amentities in amentities_col:
        x_striped = amentities[1:-1]
        x_arr = x_striped.split(',')

        for amentity in x_arr:
            name = "{}".format((amentity.strip())[1:-1].lower().replace("\\u2019","'"))

            if name not in amentities_dict:
                amentities_dict[name] = 1
            else:
                amentities_dict[name] += 1
    sorted_amentities = dict(sorted(amentities_dict.items(), key=lambda x: x[1], reverse=True))
    sorted_amentities = {k: v for k, v in sorted_amentities.items() if v >= 30}
    filter_amentities = list(sorted_amentities.keys())

    for item in filter_amentities:
        df[item] = 0

    for index, rows in df.iterrows():
        tmp = rows['amenities']
        tmp = (tmp.strip())[1:-1].lower().replace("\\u2019","'").split(",")
        for item in tmp:
            item = item.replace('"', "").strip()
            if item in filter_amentities:
                df.loc[index, item] = 1

    print("Before",df.shape)
    drop_columns = ['hangers', 'carbon monoxide alarm', 'iron', 'shampoo',
                    'laptop-friendly workspace', 'dryer', 'microwave',
                    'dishes and silverware', 'cooking basics', 'stove',
                    'extra pillows and blankets', 'gym', 'bedroom comforts',
                    "children's dinnerware", 'full kitchen', 'self check-in',
                    'outlet covers', 'barbecue utensils', 'baby bath',
                    'table corner guards']
    df.drop(columns =drop_columns, inplace=True)
    print("After",df.shape)
    df.to_pickle(Filepath)
    df

Before (6809, 100)
After (6809, 80)


## Encoding

In [6]:
from sklearn.preprocessing import OneHotEncoder

In [7]:
OHE_features = ['host_response_time','property_type','property_type','room_type','neighbourhood_cleansed']
# OHE_features = ['host_response_time','property_type','property_type','room_type']
OHE_df = df[OHE_features]
OHE_df

Unnamed: 0,host_response_time,property_type,property_type.1,room_type,neighbourhood_cleansed
0,within an hour,Entire apartment,Entire apartment,Entire home/apt,Western Addition
1,within a few hours,Entire apartment,Entire apartment,Entire home/apt,Western Addition
2,within a day,Private room in townhouse,Private room in townhouse,Private room,Western Addition
3,within a day,Private room in townhouse,Private room in townhouse,Private room,Western Addition
4,within an hour,Entire apartment,Entire apartment,Entire home/apt,Western Addition
...,...,...,...,...,...
6804,-,Entire guesthouse,Entire guesthouse,Entire home/apt,Visitacion Valley
6805,-,Entire guest suite,Entire guest suite,Entire home/apt,Visitacion Valley
6806,within an hour,Private room in apartment,Private room in apartment,Private room,Visitacion Valley
6807,within an hour,Entire apartment,Entire apartment,Entire home/apt,Visitacion Valley


In [8]:
enc = OneHotEncoder()
enc.fit(OHE_df)
onehotlabels = pd.DataFrame(enc.transform(OHE_df).toarray(), columns =  enc.get_feature_names(OHE_df.columns))
onehotlabels

Unnamed: 0,host_response_time_-,host_response_time_a few days or more,host_response_time_within a day,host_response_time_within a few hours,host_response_time_within an hour,property_type_Casa particular,property_type_Earth house,property_type_Entire apartment,property_type_Entire bungalow,property_type_Entire cabin,...,neighbourhood_cleansed_Parkside,neighbourhood_cleansed_Potrero Hill,neighbourhood_cleansed_Presidio Heights,neighbourhood_cleansed_Russian Hill,neighbourhood_cleansed_Seacliff,neighbourhood_cleansed_South of Market,neighbourhood_cleansed_Twin Peaks,neighbourhood_cleansed_Visitacion Valley,neighbourhood_cleansed_West of Twin Peaks,neighbourhood_cleansed_Western Addition
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6804,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6805,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6806,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6807,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [16]:
import pickle
Filepath = "remove_features_with_encoding.pkl"
if os.path.exists(Filepath):
#     mergedDf = pd.read_pickle(Filepath)
    with open(Filepath, 'rb') as f:
        mergedDf = pickle.load(f)
    print("read success!")
else:
    mergedDf = onehotlabels.merge(df, left_index=True, right_index=True)
    mergedDf.to_pickle(Filepath)

read success!
