In [4]:
# import packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the training data; the first CSV column is used as the row index.
df = pd.read_csv('train_data.csv',index_col=0)
# Display (n_rows, n_columns) as the cell output.
df.shape

## Data Preparation 

In [None]:
def preparation(df):
    """Drop unused columns and encode categorical/boolean columns as numeric flags.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings frame containing the unused columns dropped below as well as
        ``property_type``, ``room_type``, ``cancellation_policy``,
        ``cleaning_fee``, ``instant_bookable``, ``host_identity_verified``
        and ``city``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the indicator columns added and the encoded source
        columns removed. The frame passed in is left untouched.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    # Drop unused columns (returns a new frame, so the caller's frame is not mutated).
    df = df.drop(columns=['bed_type', 'first_review', 'host_has_profile_pic', 'latitude',
                          'longitude', 'name', 'neighbourhood', 'thumbnail_url', 'zipcode'])

    def _flag(column, value):
        # 1 where the column equals `value`, else 0; missing values compare unequal -> 0.
        return df[column].eq(value).astype('int64')

    # Dummies for 'property_type': Apartment / House / everything else.
    df['is_apt'] = _flag('property_type', 'Apartment')
    df['is_house'] = _flag('property_type', 'House')
    # "Other" means neither Apartment nor House. The previous condition
    # (x != 'Apartment') | (x != 'House') was true for every value.
    df['is_other_ppt_type'] = (~df['property_type'].isin(['Apartment', 'House'])).astype('int64')

    # Dummies for 'room_type'.
    df['is_entireRoom'] = _flag('room_type', 'Entire home/apt')
    df['is_privateRoom'] = _flag('room_type', 'Private room')
    df['is_sharedRoom'] = _flag('room_type', 'Shared room')

    # Ordinal encoding: the strict variants -> 1, moderate -> 2, anything else -> 3.
    policy_rank = {'strict': 1, 'super_strict_30': 1, 'super_strict_60': 1, 'moderate': 2}
    df['is_cancellation_policy'] = (
        df['cancellation_policy'].map(policy_rank).fillna(3).astype('int64')
    )

    # Binary variables.
    df['is_cleaning_fee'] = df['cleaning_fee'].apply(lambda x: 1 if x is True else 0)
    df['is_instantBookable'] = _flag('instant_bookable', 't')
    df['host_identity_verified'] = _flag('host_identity_verified', 't')

    # One indicator per city code (is_city_dc previously tested 'SF' by mistake).
    city_codes = {'nyc': 'NYC', 'la': 'LA', 'sf': 'SF', 'dc': 'DC', 'chicago': 'Chicago'}
    for suffix, code in city_codes.items():
        df['is_city_' + suffix] = _flag('city', code)

    # The raw columns are now fully represented by the indicators above.
    df = df.drop(columns=['property_type', 'room_type', 'cancellation_policy',
                          'cleaning_fee', 'instant_bookable'])

    return df

In [None]:
# Run the column drop / flag encoding from preparation() and rebind df to the result.
df = preparation(df)

### Missing Values

In [None]:
def fill_na(df):
    """Impute missing values in the response-rate and room-count columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``host_response_rate`` (percentage strings such as ``'95%'``,
        or already-numeric fractions), ``beds``, ``bedrooms`` and ``bathrooms``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which ``host_response_rate`` is a float fraction
        (gaps linearly interpolated, rounded to 2 decimals, remaining gaps set
        to 0) and the three count columns are gap-filled ``int32`` values.

    Raises
    ------
    KeyError
        If one of the four columns is missing.
    """
    # Work on a copy so the caller's frame is not modified and the function
    # can safely be re-run.
    df = df.copy()

    rate = df['host_response_rate']
    if not pd.api.types.is_numeric_dtype(rate):
        # '95%' -> 0.95. Skipped when the column is already numeric, where the
        # .str accessor would raise.
        rate = rate.str.rstrip('%').astype('float') / 100.0
    # Leading gaps are not reached by forward interpolation; they become 0.
    df['host_response_rate'] = rate.interpolate().round(2).fillna(0)

    # The column is named 'bathrooms' everywhere else in the notebook; the
    # previous 'bathroom' lookup raised KeyError.
    # NOTE(review): the int32 cast truncates fractional values (e.g. 1.5 -> 1),
    # as the original code did for beds/bedrooms; confirm this is wanted for bathrooms.
    for column in ('beds', 'bedrooms', 'bathrooms'):
        # limit_direction='both' also fills leading gaps, which would otherwise
        # stay NaN and make the integer cast fail; an all-NaN column falls back to 0.
        filled = df[column].interpolate(limit_direction='both').fillna(0)
        df[column] = filled.round(2).astype('int32')

    return df

In [None]:
# Impute missing values with fill_na() and rebind df to the result.
df = fill_na(df)

### Feature Engineering

In [None]:
# Create 'host_days': number of days between host_since and the last review.
df['host_since'] = pd.to_datetime(df['host_since'])
df['last_review'] = pd.to_datetime(df['last_review'])

# Rows with a missing date yield NaT; they are assigned 0 days. Using .dt.days
# avoids round-tripping the timedelta through its string representation.
df['host_days'] = (df['last_review'] - df['host_since']).dt.days.fillna(0).astype(int)

# A last review dated before host_since is inconsistent, so those rows are
# dropped. This generalizes the earlier hard-coded removal of the single
# '-592 days' row.
df = df.loc[df['host_days'] >= 0].copy()

# (A disabled experiment deriving year/month dummies from last_review was removed here.)

# Bathrooms per bedroom. Listings with 0 bedrooms keep the raw bathroom count
# instead of dividing by zero.
df['bath_to_bedroom'] = (df['bathrooms'] / df['bedrooms']).where(
    df['bedrooms'] != 0, df['bathrooms']
)

# Guests per bed, with the same zero guard.
# NOTE(review): the original read 'accomodates'; 'accommodates' is the usual
# spelling, so whichever of the two is present is used. Confirm against the CSV header.
guests_col = 'accommodates' if 'accommodates' in df.columns else 'accomodates'
df['ppl_to_beds'] = (df[guests_col] / df['beds']).where(df['beds'] != 0, df[guests_col])

# The source columns are no longer needed once the ratios exist.
# ('bathrooms' matches the column read above; 'bathroom' did not exist.)
df = df.drop(columns=['beds', 'bathrooms', 'host_since', 'last_review'])

In [None]:
# convert to uint for saving storage
# Downcast the 0/1 indicator columns to uint8 to save memory.
# The cleaning-fee flag is created as 'is_cleaning_fee' in preparation();
# the previous 'is_clean_fee' reference raised AttributeError.
flag_columns = [
    'is_instantBookable',
    'host_identity_verified',
    'is_apt',
    'is_house',
    'is_other_ppt_type',
    'is_entireRoom',
    'is_privateRoom',
    'is_sharedRoom',
    'is_cleaning_fee',
]
df = df.astype({column: 'uint8' for column in flag_columns})

## Text Data Engineering

#### Amenities

In [None]:
# !pip install mlxtend

In [None]:
from mlxtend.preprocessing import TransactionEncoder

In [None]:
pd.set_option('display.max_columns', None)

# NOTE(review): this binds the third-party `regex` package to the name `re`.
# The stdlib `re` would handle the pattern below; the import is kept so the
# name stays bound exactly as before for any later cell that uses it.
import regex as re


def _parse_amenities(raw):
    """Split one raw amenities string (e.g. '{"A b",C}') into a list of clean names."""
    if not isinstance(raw, str):
        # Missing values (NaN) carry no amenities instead of crashing on .split.
        return []
    # Remove the braces and quotes from each comma-separated piece, trim
    # whitespace, and skip pieces that end up empty.
    cleaned = (re.sub(r'["{}]', '', part).strip() for part in raw.split(','))
    return [name for name in cleaned if name != '']


amenities = df.amenities

# One list of amenity names per listing, in row order.
lst = [_parse_amenities(raw) for raw in amenities]

# One-hot encode: one column per distinct amenity, one row per listing.
te = TransactionEncoder()
te_ary = te.fit(lst).transform(lst)

# Boolean matrix -> 0/1 integers (the earlier bare `te_ary.astype("int")`
# discarded its result and had no effect).
amnt_df = pd.DataFrame(te_ary, columns=te.columns_).astype(int)

In [None]:
# Collapse the individual amenity indicators into broader groups.
# Key: output column name; value: the raw amenity columns that belong to it.
AMENITY_GROUPS = {
    'baby_friendly': ['Children’s books and toys', 'Pack ’n Play/travel crib', 'Family/kid friendly'],
    'garden_patio': ['Garden or backyard', 'Patio or balcony'],
    'household_appliances': ['Coffee maker', 'Oven', 'Microwave', 'Stove', 'Refrigerator',
                             'Dryer', 'Washer', 'Iron', 'Hair dryer'],
    'good_security_system': ['Doorman', 'Lockbox', 'Private entrance', 'Safety card',
                             'Self Check-In', 'Lock on bedroom door', 'Buzzer/wireless intercom',
                             '24-hour check-in', 'First aid kit', 'Fire extinguisher'],
    'bedroom_essentials': ['Extra pillows and blankets', 'Bed linens'],
    'Wheelchair_accessible': ['Wheelchair accessible'],
    'good_working_environment': ['Keypad', 'Laptop friendly workspace', 'Internet'],
    'kitchen_essentials': ['Dishwasher', 'Cooking basics', 'Hot water', 'Dishes and silverware'],
    'pets_allowed': ['Cat(s)', 'Dog(s)', 'Pets allowed', 'Pets live on this property'],
    'gym_pool': ['Pool', 'Gym'],
    'elevator': ['Elevator in building', 'Elevator'],
    'bathroom_essentials': ['Bathtub', 'Hot tub', 'Shampoo'],
    'breakfast': ['Breakfast'],
    'fireplace': ['Indoor fireplace'],
    'free_parking': ['Free parking on premises'],
    'hangers': ['Hangers'],
    'air_conditioning': ['Air conditioning'],
}

grouped_flags = {}
for group_name, members in AMENITY_GROUPS.items():
    # Only use member amenities that actually occur in the data. A group with
    # no occurring member becomes an all-zero column instead of raising KeyError.
    present = [column for column in members if column in amnt_df.columns]
    # A listing has the group if it has at least one of its member amenities.
    grouped_flags[group_name] = amnt_df[present].any(axis=1).astype(int)

# Keep only the grouped columns, in the order declared above. Building them
# directly avoids duplicate column names and the deprecated groupby(axis=1).
amnt_df = pd.DataFrame(grouped_flags, index=amnt_df.index)

In [None]:
# Move the current index into a regular column so df gets a fresh 0..n-1 index,
# then append the amenity flag columns side by side (rows are matched on index).
df = pd.concat([df.reset_index(), amnt_df], axis=1)

#### Description

## Model Engineering