In [1]:
from IPython.display import HTML

def hide_code():
    """Return an HTML snippet that toggles the notebook's input cells.

    The embedded script hides every ``div.input`` element once the document
    is ready and renders a submit button that flips them between hidden and
    shown on each click. The script uses ``$``, so it relies on jQuery being
    available on the page.
    """
    toggle_markup = '''<script>
    code_show=true; 
    function code_toggle() {
     if (code_show){
     $('div.input').hide();
     } else {
     $('div.input').show();
     }
     code_show = !code_show
    } 
    $( document ).ready(code_toggle);
    </script>
    <form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>'''
    return HTML(toggle_markup)

hide_code()


# Airbnb Property Price Finder

Data Analysts:
    Justin Frank,
    Leo Ramirez,
    Tari Okoya-Koren,
    Veohnti Afokpa,
    Araz Ohanessian,
    Milton Dimas

## The Goal: 

### To create a form for prospective Airbnb hosts to accurately price their potential listings 

## The Process:

#### 1. Clean the provided dataset (a detailed list of Airbnb listings in the New York area)

#### 2. Find the features which most impact a listing's price for use in a machine learning model

#### 3. Use the relevant features and data to train a ML model which will be used to predict price

#### 4. Create a method of taking in user input and spitting out a listing price

In [2]:
import ipywidgets as widgets
from ipywidgets import Layout, Button, Box

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import time

In [4]:
# Importing required libraries
from numpy.random import seed
seed(123)
from datetime import datetime
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
import xgboost as xgb
from xgboost import plot_importance

## The data was first collected, cleaned, and exported

#### The cleaning process:
- Since there were 106 columns, we dropped quite a few irrelevant columns.
- Cleaned up missing values using median values
- Changed to logarithmic form for later use in machine learning models

In [5]:
# Location of the cleaned listings export, one directory above the notebook.
# os.path.join only does something when given separate components; with a
# single argument it returns that argument unchanged.
filepath = os.path.join('..', 'df.csv')

In [6]:
# Load the cleaned listings. The column count was previously computed as a
# bare mid-cell expression, so it was never shown; print it explicitly.
df = pd.read_csv(filepath)
print(f"{len(df.columns)} columns")
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,host_response_time,host_response_rate,host_is_superhost,host_listings_count,host_identity_verified,neighbourhood_cleansed,property_type,room_type,accommodates,...,child_friendly,parking,host_greeting,internet,long_term_stays,pets_allowed,private_entrance,self_check_in,time_since_first_review,time_since_last_review
0,2060,a few days or more,0-49%,0.0,0.0,0.0,Washington Heights,Other,Private room,2,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4+ years,3851.0
1,2595,within a few hours,50-89%,0.0,6.0,1.0,Midtown,Apartment,Entire home/apt,2,...,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,4+ years,-209.0
2,3831,within an hour,90-99%,0.0,1.0,1.0,Clinton Hill,Other,Entire home/apt,3,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,4+ years,-425.0
3,5099,unknown,unknown,0.0,1.0,0.0,Murray Hill,Apartment,Entire home/apt,2,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,4+ years,-187.0
4,5121,unknown,unknown,0.0,1.0,0.0,Bedford-Stuyvesant,Apartment,Private room,2,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,4+ years,-237.0


In [7]:
# Location of the dummy-encoded export, one directory above the notebook.
# os.path.join only does something when given separate components; with a
# single argument it returns that argument unchanged.
dummy_filepath = os.path.join('..', 'dummy_df.csv')

In [8]:
# Load the dummy-encoded frame used for modelling. The column count was
# previously a bare mid-cell expression and never shown; print it explicitly.
dummy_df = pd.read_csv(dummy_filepath)
print(f"{len(dummy_df.columns)} columns")
dummy_df.head()

Unnamed: 0,id,host_is_superhost,host_listings_count,host_identity_verified,accommodates,bathrooms,price,security_deposit,cleaning_fee,extra_people,...,review_scores_value_10/10,review_scores_value_9/10,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict_14_with_grace_period,time_since_first_review_0-6 months,time_since_first_review_1-2 years,time_since_first_review_2-3 years,time_since_first_review_4+ years,time_since_first_review_6-12 months
0,2060,0.0,-4.60517,0.0,0.693147,0.0,4.60517,-4.60517,-4.60517,-4.60517,...,0,0,1,0,0,0,0,0,1,0
1,2595,0.0,1.791759,1.0,0.693147,0.0,5.4161,5.857933,4.553877,-4.60517,...,0,1,0,0,1,0,0,0,1,0
2,3831,0.0,0.0,1.0,1.098612,0.0,4.488636,6.214608,-4.60517,-4.60517,...,0,0,1,0,0,0,0,0,1,0
3,5099,0.0,0.0,0.0,0.693147,0.0,5.298317,5.703782,4.828314,4.60517,...,0,1,0,1,0,0,0,0,1,0
4,5121,0.0,0.0,0.0,0.693147,0.0,4.094345,6.109248,-4.60517,3.401197,...,0,1,0,0,1,0,0,0,1,0


## With irrelevant data removed, desired features cleaned, and data points parsed for easy comparisons, we can proceed with determining which features are most highly associated with respective listing price

### Using Tableau's data exploration capabilities, we selected several features and created visuals to confirm their correlation with a listing's price:

#### The Boroughs visualization tells the obvious story that location impacts price

In [9]:
%%HTML
<div class='tableauPlaceholder' id='viz1594007253040' style='position: relative'><noscript><a href='#'><img alt=' ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Ai&#47;AirBnBFeatures-HypothesizedImpactfulFeatures&#47;NYCBoroughs&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='AirBnBFeatures-HypothesizedImpactfulFeatures&#47;NYCBoroughs' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Ai&#47;AirBnBFeatures-HypothesizedImpactfulFeatures&#47;NYCBoroughs&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1594007253040');                    var vizElement = divElement.getElementsByTagName('object')[0];                    vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.75)+'px';                    var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>

#### Zip Code mapped out reinforces the story told with Boroughs, but with increased granularity

In [10]:
%%HTML
<div class='tableauPlaceholder' id='viz1594003857060' style='position: relative'><noscript><a href='#'><img alt=' ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Ai&#47;AirBnBFeatures-HypothesizedImpactfulFeatures&#47;ZipCode&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='path' value='views&#47;AirBnBFeatures-HypothesizedImpactfulFeatures&#47;ZipCode?:language=en&amp;:embed=y&amp;:display_count=y' /> <param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Ai&#47;AirBnBFeatures-HypothesizedImpactfulFeatures&#47;ZipCode&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1594003857060');                    var vizElement = divElement.getElementsByTagName('object')[0];                    vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.75)+'px';                    var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>

#### Certain bed types are also associated with higher prices

In [11]:
%%HTML
<div class='tableauPlaceholder' id='viz1594006192778' style='position: relative'><noscript><a href='#'><img alt=' ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Ai&#47;AirBnBFeatures-HypothesizedImpactfulFeatures&#47;BedType&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='AirBnBFeatures-HypothesizedImpactfulFeatures&#47;BedType' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Ai&#47;AirBnBFeatures-HypothesizedImpactfulFeatures&#47;BedType&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1594006192778');                    var vizElement = divElement.getElementsByTagName('object')[0];                    vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.75)+'px';                    var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>

#### An obvious trend to be seen is the more guests that can stay in a listing, the higher the price tends to be

In [12]:
%%HTML
<div class='tableauPlaceholder' id='viz1594006270910' style='position: relative'><noscript><a href='#'><img alt=' ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Ai&#47;AirBnBFeatures-HypothesizedImpactfulFeatures&#47;Accomodates&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='AirBnBFeatures-HypothesizedImpactfulFeatures&#47;Accomodates' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Ai&#47;AirBnBFeatures-HypothesizedImpactfulFeatures&#47;Accomodates&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1594006270910');                    var vizElement = divElement.getElementsByTagName('object')[0];                    vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.75)+'px';                    var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>

### After determining the features of importance through Tableau and reviewing the format of data we were provided, we went about the process of selecting the best machine learning model for our purposes

In [13]:
# Read the model-selection flow chart. The context manager closes the file
# handle as soon as the bytes are read (it was previously left open).
with open("../images/ml_models_flow.png", "rb") as file:
    image = file.read()
widgets.Image(
    value=image,
    format='png',
    width=1100,
    height=750,
)

Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\t\x0c\x00\x00\x04\x8c\x08\x06\x00\x00\x00\xc7\xc7R\x…

## Following the algorithm selection flow chart, RidgeRegression and SVR models seemed like the way to go

### These, however, yielded less than ideal results

#### The Ridge Method provided a 0.0653 model score, a dismal prediction accuracy of only 6.5%

In [14]:
# Read the Ridge results image. The context manager closes the file handle
# as soon as the bytes are read (it was previously left open).
with open("../images/ridge.png", "rb") as file:
    image = file.read()
widgets.Image(
    value=image,
    format='png',
    width=6000,
    height=2000,
)

Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x08\xc4\x00\x00\x02\x84\x08\x06\x00\x00\x00\xe2\xa2t…

#### And the SVR Method was even worse, with only 2.7% as the overall model's accuracy

In [15]:
# Read the SVR results image. The context manager closes the file handle
# as soon as the bytes are read (it was previously left open).
with open("../images/svr.png", "rb") as file:
    image = file.read()
widgets.Image(
    value=image,
    format='png',
    width=6000,
    height=2000,
)

Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x08\xc4\x00\x00\x02\x9e\x08\x06\x00\x00\x00@\xefU\x9…

## While, ultimately, the models chosen were inaccurate, they shed light on 2 major flaws in our approach: 

#### 1. Our feature selection was narrow and limited to our presupposed notion of what would impact price. With such an expansive dataset, it was important for us to explore the possibility of seemingly inconsequential features affecting price. 

#### 2. The visualizations we created only showed the correlation between price and a given feature. What they didn't tell us was the *weight* each feature carried in determining price. That is to say, we could conclude which features were affecting price, but not which were affecting price *the most*.

### Learning from our mistakes, we flipped our approach. Using a new Machine Learning Model, XGBoost, we could determine the weight of each feature first which would show us all the relevant features. From there we could input those features into Tableau to view the relation between median price and the newly discovered relevant features.

In [16]:
# Separating X and y
# Target is the price column; every other column of the encoded frame is a feature.
# NOTE(review): the frame preview shows 'Unnamed: 0' and 'id' columns, which
# therefore stay in X as features — confirm that is intended.
y = dummy_df['price']
X = dummy_df.drop(columns='price')

# Standardise each feature column while keeping the original column labels.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns.tolist())

In [17]:
# Splitting into train and test sets
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

### First, we determined the efficacy of our model. After training the model with the cleaned dataset, we used testing data to find the accuracy score

In [18]:
from sklearn.metrics import explained_variance_score, mean_squared_error, r2_score

# Fit a default XGBoost regressor; the timer covers fitting plus both
# prediction passes. (A stray bare `val_preds_xgb_reg` expression that did
# nothing inside the timed section has been removed.)
xgb_reg_start = time.time()
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train, y_train)
training_preds_xgb_reg = xgb_reg.predict(X_train)
val_preds_xgb_reg = xgb_reg.predict(X_test)
xgb_reg_end = time.time()

# Report fit quality on the training split and on the held-out split.
print(f"Time taken to run: {round((xgb_reg_end - xgb_reg_start)/60,1)} minutes")
print("\nTraining MSE:", round(mean_squared_error(y_train, training_preds_xgb_reg),4))
print("Validation MSE:", round(mean_squared_error(y_test, val_preds_xgb_reg),4))
print("\nTraining r2:", round(r2_score(y_train, training_preds_xgb_reg),4))
print("Validation r2:", round(r2_score(y_test, val_preds_xgb_reg),4))

Time taken to run: 0.8 minutes

Training MSE: 0.1044
Validation MSE: 0.1444

Training r2: 0.7815
Validation r2: 0.7012


### From there, we weighed each feature and sorted by weight percentage

In [19]:
# One row per feature with its importance from the fitted regressor, largest first.
ft_weights_xgb_reg = (
    pd.DataFrame(
        xgb_reg.feature_importances_,
        index=X_train.columns,
        columns=['weight'],
    )
    .sort_values('weight', ascending=False)
)

# Same weights rendered as percentage strings with two decimals.
ft_weights_xgb_reg['weight_%'] = pd.Series(
    ["{0:.2f}%".format(val * 100) for val in ft_weights_xgb_reg['weight']],
    index=ft_weights_xgb_reg.index,
)
ft_weights_xgb_reg.head(20)

Unnamed: 0,weight,weight_%
room_type_Entire home/apt,0.307146,30.71%
neighbourhood_cleansed_Midtown,0.04163,4.16%
property_type_Other,0.032436,3.24%
bathrooms,0.031539,3.15%
neighbourhood_cleansed_Bushwick,0.021077,2.11%
neighbourhood_cleansed_Hell's Kitchen,0.016428,1.64%
neighbourhood_cleansed_Bedford-Stuyvesant,0.015751,1.58%
neighbourhood_cleansed_West Village,0.013546,1.35%
neighbourhood_cleansed_East Village,0.01269,1.27%
gym,0.011349,1.13%


## With several new features learned to be important, we created a new set of Tableau visualizations to show the respective correlations clearly

#### The machine learning model taught us that specific amenities held much higher importance than others and that it was apt to parse them out individually and see each respective impact on price. As is supported by the weight calculations, you can see that the presence of a gym and/or an elevator can significantly affect listing price.

In [20]:
%%HTML
<div class='tableauPlaceholder' id='viz1594012285245' style='position: relative'><noscript><a href='#'><img alt=' ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Ai&#47;AirBnB-Amentities&#47;AmenitiesvsMedianPriceoverlap&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='AirBnB-Amentities&#47;AmenitiesvsMedianPriceoverlap' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Ai&#47;AirBnB-Amentities&#47;AmenitiesvsMedianPriceoverlap&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1594012285245');                    var vizElement = divElement.getElementsByTagName('object')[0];                    vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.75)+'px';                    var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>

#### While bedroom count seemed like a feature that would hold a high importance, the weight calculator top 20 didn't include it at all. Plotting the bedroom count values reconfirmed the lack of weight and the fact that it shouldn't be included in the input data

In [21]:
%%HTML
<div class='tableauPlaceholder' id='viz1594198219233' style='position: relative'><noscript><a href='#'><img alt=' ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Ro&#47;RoomType_15934540785170&#47;Bedroom&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='RoomType_15934540785170&#47;Bedroom' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Ro&#47;RoomType_15934540785170&#47;Bedroom&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1594198219233');                    var vizElement = divElement.getElementsByTagName('object')[0];                    vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.75)+'px';                    var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>

## Before moving on to the final stage of collecting user input, we had to ensure a 70% prediction accuracy is the highest that can be achieved with our dataset

### Using Neural Net Modeling, we were able to draft 3 more models to ensure we found the best fit

In [22]:
# Read the neural-net comparison image. The context manager closes the file
# handle as soon as the bytes are read (it was previously left open).
with open("../images/neural_net_models.png", "rb") as file:
    image = file.read()
widgets.Image(
    value=image,
    format='png',
    width=2000,
    height=2000,
)

Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x07\xcc\x00\x00\x048\x08\x06\x00\x00\x00lU&m\x00\x00…

In [23]:
def listMaker(column):
    """Return the distinct values of one column of the global ``df``.

    ``column`` is the column name as a string. Values come back as a plain
    list, in the order ``Series.unique()`` reports them.
    """
    distinct_values = df[column].unique()
    return [*distinct_values]

In [24]:
# Largest distinct value in the listings frame's 'bathrooms' column.
max(listMaker('bathrooms'))

15.5

In [25]:
# Shared widget style passed as `style=` to the input widgets below.
style = dict(description_width='initial')

In [26]:
# List box of the room types found in the data, pre-selecting the first one.
room_types = listMaker('room_type')
room_type = widgets.Select(
    description='Room Type:',
    options=room_types,
    value=room_types[0],
    disabled=False,
)

In [27]:
# Dropdown of the neighbourhoods found in the data, pre-selecting the first one.
neighborhoods = listMaker('neighbourhood_cleansed')
neighborhood = widgets.Dropdown(
    description='Neighborhood:',
    options=neighborhoods,
    value=neighborhoods[0],
    style=style,
    disabled=False,
)

In [28]:
# Numeric input for the bathroom count, moving in half-bath steps.
bathroom_count = widgets.BoundedFloatText(
    description='Bathrooms:',
    value=2.5,
    step=0.5,
    style=style,
    disabled=False,
)
bathroom_count

BoundedFloatText(value=2.5, description='Bathrooms:', step=0.5, style=DescriptionStyle(description_width='init…

In [29]:
# List box of the property types found in the data, starting on 'Other'.
prop_types = listMaker('property_type')
property_type = widgets.Select(
    description='Property Type:',
    options=prop_types,
    value='Other',
    style=style,
    disabled=False,
)

In [30]:
# Columns 32..46 of the listings frame, selected by position.
# NOTE(review): presumably these are exactly the amenity flag columns — the
# slice depends on the CSV's column order, so confirm it if the export changes.
amenities = df.columns[32:47].tolist()

# Checkbox labels: underscores become spaces and the first letter is upper-cased.
spaced_names = (name.replace('_', ' ') for name in amenities)
new_strings = [label[0].upper() + label[1:] for label in spaced_names]

# One unchecked checkbox per label, arranged in a grid.
items_amenities = []
for label in new_strings:
    items_amenities.append(
        widgets.Checkbox(description=label, value=False, disabled=False)
    )
amen_checklist = widgets.GridBox(
    items_amenities,
    layout=Layout(
        width='80%',
        grid_template_rows='auto auto auto auto',
        grid_template_columns='20% 20% 20% 20%',
        grid_template_areas='''
                                        "header header header header"
                                        "main main . sidebar "
                                        "footer footer footer footer"
                                        ''',
    ),
)
amen_checklist

GridBox(children=(Checkbox(value=False, description='Air conditioning'), Checkbox(value=False, description='Be…

In [31]:
# Whole-number input for how many guests the listing accommodates.
accommodation_count = widgets.BoundedIntText(
    description='Accommodates:',
    value=3,
    step=1,
    style=style,
    disabled=False,
)

In [32]:
def _yes_no_radio(description):
    """Build a Yes/No radio group with the given label and the shared style."""
    return widgets.RadioButtons(
        options=['Yes', 'No'],
        description=description,
        disabled=False,
        style=style,
    )


layout_fees = Layout(width='85%')

# Three yes/no questions about optional fees.
cleaning_fee = _yes_no_radio('Cleaning Fee?')
security_deposit = _yes_no_radio('Security Deposit?')
extra_ppl_cost = _yes_no_radio('Extra Persons Fee?')

# Slider for the number of guests included, from 1 to 16.
guests_included = widgets.IntSlider(
    description='Guests Included:',
    value=1,
    min=1,
    max=16,
    step=1,
    orientation='horizontal',
    readout=True,
    readout_format='d',
    continuous_update=False,
    disabled=False,
    style=style,
    layout=layout_fees,
)

In [33]:
def _bounded_int_input(description, lowest, highest):
    """Build a whole-number input starting at 1, limited to [lowest, highest]."""
    return widgets.BoundedIntText(
        value=1,
        min=lowest,
        max=highest,
        step=1,
        description=description,
        disabled=False,
        style=style,
    )


# Stay-length limits share the same 1..1500 range; availability is 0..90 days.
minimum_nights = _bounded_int_input('Minimum Night Stay:', 1, 1500)
maximum_nights = _bounded_int_input('Maximum Nights:', 1, 1500)
availability_90 = _bounded_int_input('Available Days out of 90:', 0, 90)

In [34]:
# Stack the three yes/no fee questions vertically.
fees_vbox = widgets.VBox(
    children=[cleaning_fee, extra_ppl_cost, security_deposit],
)

In [35]:
# (section title, widget) pairs in display order.
property_sections = [
    ('Neighborhood', neighborhood),
    ('Property Type', property_type),
    ('Room Type', room_type),
    ('Bathrooms', bathroom_count),
    ('Total Accommodation', accommodation_count),
]
accordion_property = widgets.Accordion(
    children=[section_widget for section_title, section_widget in property_sections]
)
for section_position, (section_title, section_widget) in enumerate(property_sections):
    accordion_property.set_title(section_position, section_title)

# Start with every section collapsed.
accordion_property.selected_index = None
accordion_property

Accordion(children=(Dropdown(description='Neighborhood:', options=('Washington Heights', 'Midtown', 'Clinton H…

In [36]:
# (section title, widget) pairs in display order.
dimension_sections = [
    ('Availability 90', availability_90),
    ('Minimum Nights', minimum_nights),
    ('Maximum Nights', maximum_nights),
    ('Optional Fees', fees_vbox),
]
accordion_dimensions = widgets.Accordion(
    children=[section_widget for section_title, section_widget in dimension_sections]
)
for section_position, (section_title, section_widget) in enumerate(dimension_sections):
    accordion_dimensions.set_title(section_position, section_title)

# Start with every section collapsed.
accordion_dimensions.selected_index = None
accordion_dimensions

Accordion(children=(BoundedIntText(value=1, description='Available Days out of 90:', max=90, style=Description…

In [37]:
# Three-tab form: property details, availability/fees, and the amenity checklist.
UserInput = widgets.Tab()
UserInput.children = [accordion_property, accordion_dimensions, amen_checklist]
for tab_position, tab_title in enumerate(
    ['Property Details', 'Availability and Fees', 'Amenities Offered']
):
    UserInput.set_title(tab_position, tab_title)

## Now that we know the 'heaviest' features, we can collect those features from a user and funnel them through our machine learning model

### Please input your property's features:

In [38]:
# Render the tabbed input form as this cell's output.
UserInput

Tab(children=(Accordion(children=(Dropdown(description='Neighborhood:', options=('Washington Heights', 'Midtow…

### Click to generate your listing price!

In [39]:
import itertools

In [40]:
def _widget_value_to_number(value):
    """Map one widget value to the number stored in the feature row.

    Checkbox booleans become 0/1, numeric inputs pass through unchanged, the
    string 'No' becomes 0 and any other string (a 'Yes' answer or a selected
    category name) becomes 1.
    """
    # bool is tested first because it is a subclass of int.
    if isinstance(value, bool):
        return int(value)
    if isinstance(value, (int, float)):
        return value
    return 0 if value == 'No' else 1


def _user_input_row(feature_columns):
    """Build the one-row, log-transformed (not yet scaled) frame of form inputs.

    ``feature_columns`` is the ordered list of model feature names. Features
    the form does not supply are filled with 0.
    """
    # Feature names for the form values, in the same order as the values below.
    # The three selected categories point at their one-hot columns.
    titles = [
        f'neighbourhood_cleansed_{neighborhood.value}',
        f'property_type_{property_type.value}',
        f'room_type_{room_type.value}',
        'bathrooms', 'accommodates',
        'availability_90', 'minimum_nights', 'maximum_nights',
        'cleaning_fee', 'extra_people', 'security_deposit',
    ]
    titles.extend(amenities)

    # NOTE(review): a 'Yes' fee answer is encoded as 1 and the log transform
    # below turns that into 0, so the model sees a fee amount of 1 rather than
    # a typical fee — confirm this is the intended encoding.
    raw_values = [
        neighborhood.value, property_type.value, room_type.value,
        bathroom_count.value, accommodation_count.value,
        availability_90.value, minimum_nights.value, maximum_nights.value,
        cleaning_fee.value, extra_ppl_cost.value, security_deposit.value,
    ]
    raw_values.extend(checkbox.value for checkbox in amen_checklist.children)

    # First occurrence wins if a title repeats, matching list.index() semantics.
    value_by_title = {}
    for title, raw in zip(titles, raw_values):
        value_by_title.setdefault(title, _widget_value_to_number(raw))

    row = [value_by_title.get(column, 0) for column in feature_columns]
    user_frame = pd.DataFrame([row], columns=feature_columns)

    # Numeric features are log-transformed, with zeros replaced by 0.01 first.
    # Columns missing from the model's features are skipped instead of raising.
    log_columns = [
        'accommodates', 'bathrooms', 'cleaning_fee', 'extra_people',
        'host_listings_count', 'maximum_nights', 'minimum_nights',
        'number_of_reviews', 'security_deposit',
    ]
    for column in log_columns:
        if column in user_frame.columns:
            positive = user_frame[column].astype('float64').replace(0.0, 0.01)
            user_frame[column] = np.log(positive)
    return user_frame


def predict_price(b):
    """Button callback: print an estimated nightly price for the form inputs.

    Rebuilds the scaled feature matrix from ``dummy_df``, fits a fresh
    ``XGBRegressor`` on the training split, encodes the current widget values
    as a single feature row and prints the prediction into the ``output``
    widget.

    Parameters
    ----------
    b : the clicked button, as passed by ``Button.on_click``; not used.
    """
    with output:
        # Remove text left by an earlier click so only the latest estimate shows.
        output.clear_output()
        print('Estimated waiting time: 37 seconds')

        # Scale every feature, then train on the 80% split only (fixed seed).
        # NOTE(review): dummy_df's preview shows 'Unnamed: 0' and 'id' columns;
        # they remain features here and are 0 in the user's row — confirm intended.
        features = dummy_df.drop('price', axis=1)
        target = dummy_df.price
        scaler = StandardScaler()
        features = pd.DataFrame(scaler.fit_transform(features), columns=list(features.columns))
        X_train, _held_out_X, y_train, _held_out_y = train_test_split(
            features, target, test_size=0.2, random_state=123
        )

        # Encode the form, then apply the scaling fitted on the training data.
        X_userinput = _user_input_row(list(features.columns))
        X_userinput = pd.DataFrame(scaler.transform(X_userinput), columns=list(X_userinput.columns))

        xgb_reg = xgb.XGBRegressor()
        xgb_reg.fit(X_train, y_train)
        log_predict = xgb_reg.predict(X_userinput)

        # Undo the log transform with np.exp. The previewed log prices
        # (4.60517, 5.4161, 4.488636, ...) exponentiate to round amounts
        # (100, 225, 89, ...), so np.expm1 (the inverse of log1p) would
        # report $1 too little.
        predicted_price = np.exp(log_predict)[0].round(2)

        print('Your estimated listing price is: $' + str(predicted_price) + ' per night.')

In [41]:
from IPython.display import display

# Output area that predict_price prints into, plus the green trigger button.
output = widgets.Output()
button = widgets.Button(description="Click Me!", button_style='success')
button.on_click(predict_price)

display(button, output)

Button(button_style='success', description='Click Me!', style=ButtonStyle())

Output()

# What's next?

### Hyperparameter tuning the XGBoost model in an attempt to increase accuracy

### Web Scraping for a constantly updating data set

### Including non-static features to allow for an ever-updating listing price 

### Employ JavaScript and HTML for app/website adaptation

### Initialize a Natural Language Processor (NLP) model for reviews and description analyses