# Intro

In [114]:
#Read in libraries
import pandas as pd
import swifter

import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import sklearn

In [115]:
#Set style for matplotlib
plt.style.use('ggplot')

In [116]:
#Set options for pandas
pd.options.display.max_columns =1_000
pd.set_option('display.max_rows', 1_000)
pd.set_option('display.float_format', '{:.2f}'.format)


In [117]:
#Set path to listings data
path = r'C:\Users\kishe\Documents\Data Science\Projects\Python Projects\In Progress\Air BnB - SF\Data\03_Processed'

#Set list of date columns to parse
dates = ['calendar_last_scraped', 'first_review', 'host_since', 'last_review']

#Read in file
listings = pd.read_csv(path + '/01_16_2020_Listings_Post_EDA.csv',index_col=0, parse_dates=dates)

## Preview Data

In [118]:
listings.head(1)

Unnamed: 0,accommodates,amenities,availability_30,availability_365,bathrooms,bed_type,bedrooms,beds,chlc,chlc_private_rooms,chlc_shared_rooms,calendar_last_scraped,calendar_updated,cancellation_policy,city,cleaning_fee,description,extra_people,first_review,guests_included,host_about,host_id,host_identity_verified,host_is_superhost,host_listings_count,host_location,host_name,host_neighbourhood,host_response_rate,host_response_time,host_since,host_verifications,house_rules,id,instant_bookable,is_location_exact,last_review,latitude,longitude,market,maximum_maximum_nights,name,neighborhood_overview,neighbourhood_cleansed,number_of_reviews,number_of_reviews_ltm,price,property_type,require_guest_phone_verification,require_guest_profile_picture,requires_license,review_scores_accuracy,review_scores_checkin,review_scores_cleanliness,review_scores_communication,review_scores_location,review_scores_rating,review_scores_value,reviews_per_month,room_type,scrape_id,security_deposit,space,summary,transit,zipcode,weekly_price,monthly_price
0,3,TV Cable TV Internet Wifi Kitchen Pets liv...,0,77,1.0,Real Bed,1.0,2.0,1,0.0,0.0,2019-04-03,1 week ago,moderate,San Francisco,100.0,New update: the house next door is under const...,25.0,2009-07-23,2,We are a family with 2 boys born in 2009 and 2...,1169,True,True,1.0,"San Francisco, California, United States",Holly,Duboce Triangle,100.0,within an hour,2008-07-31,email phone facebook reviews kba,* No Pets - even visiting guests for a short t...,958,True,True,2019-03-16,37.77,-122.43,San Francisco,30.0,"Bright, Modern Garden Unit - 1BR/1B",*Quiet cul de sac in friendly neighborhood *St...,Western Addition,183,51.0,170.0,Apartment,False,False,True,10.0,10.0,10.0,10.0,10.0,97.0,10.0,1.55,Entire home/apt,20190403130253.0,100.0,"Newly remodeled, modern, and bright garden uni...",New update: the house next door is under const...,*Public Transportation is 1/2 block away. *Ce...,94117.0,1120.0,4200.0


In [119]:
#Print Shape
print('Listings shape: ', listings.shape)

#Check data types
listings.dtypes

Listings shape:  (88771, 68)


accommodates                                 int64
amenities                                   object
availability_30                              int64
availability_365                             int64
bathrooms                                  float64
bed_type                                    object
bedrooms                                   float64
beds                                       float64
chlc                                         int64
chlc_private_rooms                         float64
chlc_shared_rooms                          float64
calendar_last_scraped               datetime64[ns]
calendar_updated                            object
cancellation_policy                         object
city                                        object
cleaning_fee                               float64
description                                 object
extra_people                               float64
first_review                        datetime64[ns]
guests_included                

# Removing irrelevant features

In [120]:
#Removing columns that do not pertain to price
to_drop = ['calendar_last_scraped','calendar_updated', 'description','first_review','host_about','host_id',
          'host_listings_count','host_location','host_name', 'host_neighbourhood','house_rules','host_since','name',
          'neighborhood_overview', 'scrape_id','weekly_price','monthly_price', 'id','last_review',
           'longitude', 'latitude','space', 'summary', 'transit']

#Remove to_drop cols
listings.drop(columns=to_drop, inplace = True)
print('Current shape of listings: ', listings.shape)

Current shape of listings:  (88771, 44)


# Update Data Types

In [121]:
#Ordinal variables to convert to category
ordinals = ['review_scores_accuracy', 'review_scores_checkin','review_scores_cleanliness', 
            'review_scores_communication','review_scores_location','review_scores_rating',
            'review_scores_value']

#Numerics that are categorical variables
objects = ['zipcode']

#Convert 
listings[ordinals] = listings[ordinals].astype('category')
listings[objects] = listings[objects].astype('object')

# Missing Values

## Isolate columns with missing data
Create a dataframe that captures counts and % missing of listings columns missing data. Additionally, capture their data types and assign to data frame. 

In [122]:
#Write function that creates a df that returns stats regarding missing values in listings
def missing_updater(df):
    #capture count of missing values
    missing = pd.DataFrame(df.isna().sum())
    missing.rename(columns = {0:'Missing_Values'},inplace = True)
    #Calculate % of missing values
    missing['%_Missing'] = (missing['Missing_Values']/len(listings)) * 100
    #Capture data type of each column
    missing['Data_Types'] = pd.Series(listings.dtypes)
    #Keep rows where Missing_Values > 0 and sort by # of missing values
    missing = missing[missing['Missing_Values'] > 0]
    missing.sort_values(by = 'Missing_Values', ascending = False, inplace = True)
    return missing

#View
display(missing_updater(listings))

Unnamed: 0,Missing_Values,%_Missing,Data_Types
security_deposit,18573,20.92,float64
review_scores_value,18244,20.55,category
review_scores_location,18240,20.55,category
review_scores_checkin,18240,20.55,category
review_scores_accuracy,18225,20.53,category
review_scores_cleanliness,18216,20.52,category
review_scores_communication,18201,20.5,category
review_scores_rating,18174,20.47,category
reviews_per_month,17714,19.95,float64
host_response_rate,13260,14.94,float64


## Resolving missing values by data type

### Float64

#### Outlier Removal

In [123]:
#Print listings original shape
print('Listings data original shape:', listings.shape)

#Get list of float64s from missing.
floats= missing[missing.Data_Types == 'float64'].index.tolist()

#Listings outlier removal 
def remove_outlier(df, columns):
#Function to remove outliers from dataframe
    for col in columns:
        #Loop iterates over each column in df and returns df with outliers removed
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3-q1 #Interquartile range
        fence_low  = q1-1.5*iqr
        fence_high = q3+1.5*iqr
        df_out = df.loc[(df[col] > fence_low) & (df[col] < fence_high)]
        #Return df with outliers removed
        return df_out

#Remove outliers from listings and reassign to listings
listings = remove_outlier(listings, floats)

#Print listings original shape
print('Listings data new shape:', listings.shape)

Listings data original shape: (88771, 44)
Listings data new shape: (65767, 44)


Detect and remove outliers

In [124]:
#Replacing missing values in floatswith mean)
for col in listings[floats]:
    listings[col].fillna(listings[col].mean(), inplace = True)

#View remaining missing data from listings
display(missing_updater(listings))

Unnamed: 0,Missing_Values,%_Missing,Data_Types
review_scores_value,11469,17.44,category
review_scores_checkin,11465,17.43,category
review_scores_location,11465,17.43,category
review_scores_accuracy,11463,17.43,category
review_scores_cleanliness,11463,17.43,category
review_scores_communication,11463,17.43,category
review_scores_rating,11449,17.41,category
host_response_time,8849,13.46,object
zipcode,2908,4.42,object
market,202,0.31,object


### Objects

In [126]:
#Capture list of index for objects
objects = missing[missing['Data_Types']== 'object'].index.tolist()

#Apply forward and back fill on missing values
listings[objects]=listings[objects].ffill().bfill()

#View remaining missing data from listings
display(missing_updater(listings))

Unnamed: 0,Missing_Values,%_Missing,Data_Types
review_scores_value,11469,17.44,category
review_scores_checkin,11465,17.43,category
review_scores_location,11465,17.43,category
review_scores_accuracy,11463,17.43,category
review_scores_cleanliness,11463,17.43,category
review_scores_communication,11463,17.43,category
review_scores_rating,11449,17.41,category


### Category

Since we know that on average, the vast majority of reviews are relatively positive, we will populate review scores with the mode

In [128]:
#Get a list of the category variables from missing
categories = list(missing[missing['Data_Types'] == 'category'].index)

#Iterates each column in categories and replaces each column's NA's with the column's mode
for category in listings[categories]:
    listings[category].fillna(listings[category].mode()[0], inplace = True)
    
#View remaining missing data from listings
display(missing_updater(listings))

Unnamed: 0,Missing_Values,%_Missing,Data_Types


# Feature Engineering

## Transform numeric columns

In [None]:
#Subset data to visualize distributions of numeric variables
numerics = listings.select_dtypes(exclude=['category','bool','object','datetime64'])

fig, ax = plt.subplots(figsize = (20,20))
numerics.hist(ax = ax, bins = 20)
fig.subplots_adjust(wspace=.75);
#chlc stands for calculated host listings count

In [None]:
#Create heatmap of numeric data
corr = numerics.corr()

#Setup mask to hide upper triangle of heatmap
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)]= True

f, ax = plt.subplots(figsize=(11, 15)) 
heatmap = sns.heatmap(corr, 
                      mask = mask,
                      square = True,
                      linewidths = .35,
                      cmap = 'coolwarm',
                      cbar_kws = {'shrink': .5, 
                                'ticks' : [-1, -.5, 0, 0.5, 1]},
                      vmin = -1, 
                      vmax = 1,
                      annot = True,
                      annot_kws = {'size': 7},
                      fmt='.1%')

#add the column names as labels
ax.set_title('Heatmap of Continuous Variables in Listings Data');

## Adding new features

In [None]:
#Convert amenities into a count of amenities offered by airbnb
listings['total_amenities']= listings['amenities'].str.count(' ') + 1

#Check
listings[['total_amenities', 'amenities']].head(5)

Split amenities and host_verifications and turn into categorical variables

In [None]:
#Split amenities, host_verifications and encode as dummy variables
temp1 = listings['amenities'].str.get_dummies(sep=' ')
temp2 = listings['host_verifications'].str.get_dummies(sep=' ')

temp = pd.concat([temp1, temp2], axis=1)

#Remove column headers shorter than 3 characters in length
for col in temp.columns: 
    if len(col) < 3:
        temp.drop(columns= col, inplace=True)

#Combine listings and temp
listings = pd.concat([listings, temp], axis=1)

#Drop amenities column
listings.drop(columns = ['amenities','host_verifications'], inplace = True)

#View Shape
print('Shape of data frame: ', listings.shape)

# Encode Dummy Variables

In [None]:
#Subset Boolean values
bools = list(listings.select_dtypes(include=['bool']).columns)

#Convert True and False into ints
listings[bools] = listings[bools].astype(int)

#Convert encode data type object
temp=pd.get_dummies(listings.select_dtypes(include=['int32','object']))

#Combine listings and temp
listings = pd.concat([listings, temp], axis=1)

#Subset original object cols
objects = list(listings.select_dtypes(include=['object']).columns)

#Drop objects
listings.drop(columns= objects, inplace=True)
                      
#View shape of and preview data
print('Shape of data frame: ', listings.shape)
listings.head(3)

# Split Data into Training and Test Sets

In [None]:
#Prepare data for splitting into train and test sets
X= listings.drop(columns='price').values
y= listings['price'].values

#Check shapes
print('X shape: ', X.shape)
print('y shape: ', y.shape)

In [None]:
#Import models
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

#Split data
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                   random_state=42)

#Check shapes
print('Training data shape: {}'.format(X_train.shape))
print('Test data shape: {}'.format(X_test.shape))

# Model Based Feature Selection

In [None]:
#Instantiate Selectore with RandomForestRegressor
select = SelectFromModel(RandomForestRegressor(random_state=42,
                                              n_estimators=100), threshold='median')

#Fit and transform training data
select.fit(X_train, y_train)
X_train_t = select.transform(X_train)

#Check shapes
print('Original training data shape: {}'.format(X_train.shape))
print('Transformed training data shape: {}'.format(X_train_t.shape))