# Introduction

In the following notebook, I will model Airbnb listings data to build a price predictor using a Deep Learning model.

**Read in libraries**

In [147]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

**Set notebook preferences**

In [148]:
#Pandas display options: show wide/long frames in full and print floats with 3 decimals
pandas_display_options = {
    'display.max_columns': 1000,
    'display.max_rows': 500,
    'display.max_colwidth': 200,
    'display.float_format': lambda value: '%.3f' % value,
}
for option_name, option_value in pandas_display_options.items():
    pd.set_option(option_name, option_value)

#Numpy: print small numbers in fixed-point rather than scientific notation
np.set_printoptions(suppress=True)

#Matplotlib: use the fivethirtyeight theme for every figure
plt.style.use('fivethirtyeight')

#Suppress warnings (note: this hides ALL warnings raised after this cell)
import warnings
warnings.simplefilter('ignore')

**Read in data**

In [149]:
#Set path to training data.
#The directory defaults to the author's local folder; on any other machine set the
#AIRBNB_DATA_DIR environment variable instead of editing this cell.
import os

path = os.environ.get(
    'AIRBNB_DATA_DIR',
    r'C:\Users\kishe\Documents\Data Science\Projects\Python Projects\In Progress\Airbnb - San Francisco Listings Analysis\Data\03_Processed',
)

#Read data (the first CSV column holds the saved row index)
df = pd.read_csv(os.path.join(path, '2020_0608_Listings_Processed.csv'), index_col=0)



**Preview Data**

In [150]:
#Report the frame's dimensions, then render its first five rows
n_rows_n_cols = df.shape
print('Data shape:', n_rows_n_cols)
display(df.head())

Data shape: (82433, 42)


Unnamed: 0,accommodates,amenities_count,availability_30,availability_365,availability_60,availability_90,bathrooms,bed_type,bedrooms,beds,calculated_host_listings_count,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,cancellation_policy,cleaning_fee,extra_people,guests_included,holiday,host_has_profile_pic,host_identity_verified,host_is_superhost,host_response_rate,host_response_time,instant_bookable,is_location_exact,last_review_month_encoded,last_review_weekday_encoded,last_review_year_encoded,maximum_nights,minimum_nights,neighbourhood_cleansed_encoded,number_of_reviews,price,property_type,require_guest_phone_verification,require_guest_profile_picture,requires_license,review_scores_rating,reviews_per_month,room_type,security_deposit,weekend
0,3,36,25,106,43,58,1.0,Real Bed,1.0,2.0,1,0,0,moderate,100.0,25.0,2,False,1.0,1.0,1.0,90.0,within an hour,1,1,203.282,241.67,217.141,1125,1,248.227,240,170.0,Apartment,0,0,1,97.0,1.84,Entire home/apt,100.0,False
1,5,17,0,0,0,0,1.0,Real Bed,2.0,3.0,1,0,0,strict 14 with grace period,100.0,0.0,2,False,1.0,1.0,0.0,100.0,within a day,0,1,223.91,217.53,215.357,60,30,185.379,111,235.0,Apartment,0,0,1,98.0,0.83,Entire home/apt,2.0,True
2,2,18,30,365,60,90,4.0,Real Bed,1.0,1.0,9,9,0,strict 14 with grace period,50.0,12.0,1,False,1.0,1.0,0.0,100.0,within an hour,0,1,203.282,241.67,217.141,60,32,211.965,19,65.0,Apartment,0,0,1,84.0,0.15,Private room,200.0,False
3,2,16,30,365,60,90,4.0,Real Bed,1.0,1.0,9,9,0,strict 14 with grace period,50.0,12.0,1,False,1.0,1.0,0.0,100.0,within an hour,0,1,202.584,209.425,223.098,90,32,211.965,8,65.0,Apartment,0,0,1,93.0,0.12,Private room,200.0,False
5,3,33,30,173,56,84,1.0,Real Bed,1.0,1.0,2,2,0,strict 14 with grace period,50.0,60.0,2,False,1.0,0.0,1.0,100.0,within an hour,1,1,203.282,208.033,217.141,14,1,220.275,736,139.0,Condominium,0,0,1,98.0,5.66,Private room,0.0,False


# Model Development

### Prepare Data

**Split data into X and y**

In [151]:
#Predictors: every column except the target
X = df.drop(columns=['price'])

#Target: listing price as a numpy array
y = df['price'].to_numpy()

**One-Hot Encode**

In [153]:
#Read in Libraries
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

#Subset categorical variables (columns stored with object dtype)
cat_features = X.select_dtypes('object').columns

#Init ColumnTransformer: one-hot encode the categorical columns and pass all other columns through unchanged.
#sparse_threshold=0 forces a dense array. With the default (0.3) the result can become a scipy
#sparse matrix when the encoded data is mostly zeros, which the mean-centering StandardScaler
#used later in this notebook cannot handle.
ct = ColumnTransformer(
    [('onehot', OneHotEncoder(handle_unknown='ignore'), cat_features)],
    remainder='passthrough',
    sparse_threshold=0,
)

#One-hot encode categorical variables
#NOTE(review): the encoder is fit on the full data set, i.e. before the train/test split,
#so the category vocabulary is learned from test rows too - consider fitting on the training split only.
X_encoded = ct.fit_transform(X)

**Split data and scale**

In [154]:
#Import libraries
from sklearn.model_selection import train_test_split

#Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=24
)

#Check that features and labels line up in both splits
print('Training data:{} | Training labels:{}'.format(X_train.shape, y_train.shape))
print('Test data:{} | Test labels:{}'.format(X_test.shape, y_test.shape))

Training data:(65946, 85) | Training labels:(65946,)
Test data:(16487, 85) | Test labels:(16487,)


In [155]:
#Learn each feature's mean and standard deviation from the training split only
scaler = StandardScaler().fit(X_train)

#Standardize both splits using the training statistics
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)

## Base model

In [156]:
#Read in libraries
import tensorflow as tf
from tensorflow import keras

#Seed TensorFlow's global random generator so the random weight initialisation is
#repeatable after a kernel restart (same seed value as the train/test split)
tf.random.set_seed(24)

#Init base model
base_model = tf.keras.models.Sequential()

#Add Layers: two hidden ReLU layers of 64 units each, then a single linear unit
#that outputs the predicted price
base_model.add(tf.keras.layers.Dense(units=64, activation='relu'))
base_model.add(tf.keras.layers.Dense(units=64, activation='relu'))
base_model.add(tf.keras.layers.Dense(units=1))

#Compile: minimise mean squared error with the Adam optimizer (default settings)
base_model.compile(loss='mean_squared_error', optimizer='Adam')

In [157]:
#Fit the base model on the scaled training data: 200 epochs, mini-batches of 100 rows
history = base_model.fit(
    x=X_train_transformed,
    y=y_train,
    batch_size=100,
    epochs=200,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

**Evaluate Training Loss**

In [None]:
#Store training loss as a data frame.
#fit() returns a History object; the per-epoch values live in its .history dict
#(key 'loss'), and `columns` must be list-like rather than a bare string.
training_loss = pd.DataFrame(history.history, columns=['loss'])

#Plot training loss per epoch (the row index is the zero-based epoch number)
ax = training_loss.plot(title='Base model training loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Mean squared error');

**Evaluate base model on test data**

In [None]:
#Evaluate performance on test data
predictions = 