Import Dependencies and Data

In [1]:
# import dependencies
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.linear_model import LinearRegression
import tensorflow as tf
import seaborn as sns
sns.set_theme(style="darkgrid")

In [2]:
# additional dependencies
# library for regular expressions aka Regex
import re
import psycopg2
from sqlalchemy import create_engine
from config import db_password

In [3]:
# create the database engine
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/AirBnb"
engine = create_engine(db_string)

In [4]:
# create the dataframe
resultset = engine.execute("SELECT * FROM properties")
properties_df = pd.DataFrame(resultset.fetchall())
properties_df.columns = resultset.keys()
properties_df

Unnamed: 0,id,airbnb_id,night_price_native,night_price,weekly_price,monthly_price,cleaning_fee_native_price,num_of_baths,num_of_rooms,occupancy,...,airbnb_city,capacity_of_people,zip,property_type,room_type,reviews_count,num_of_beds,lat,lon,star_rating
0,57,33456771.0,149,182,0,0,0,1.0,1.0,64,...,Orlando,2,3,Condominium,Entire home/apt,233,1.0,28.5408,-81.3805,5
1,85,33451533.0,91,98,0,0,0,1.0,1.0,47,...,Orlando,4,3,Tiny house,Entire home/apt,368,3.0,28.581,-81.4061,5
2,89,33451593.0,129,103,0,0,0,1.0,2.0,94,...,Orlando,5,3,Tiny house,Entire home/apt,455,3.0,28.5809,-81.4078,5
3,91,33451552.0,116,104,0,0,0,1.0,1.0,71,...,Orlando,3,3,Tiny house,Entire home/apt,418,1.0,28.5821,-81.4067,5
4,119,33451504.0,81,91,0,0,0,1.0,1.0,77,...,Orlando,4,3,Tiny house,Entire home/apt,34,2.0,28.582,-81.4075,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4715,4455,33343016.0,125,97,0,0,13,2.0,3.0,66,...,Kissimmee,6,3,House,Entire home/apt,4,0.0,28.2979,-81.4592,5
4716,4506,33345902.0,365,241,0,0,16,3.5,5.0,11,...,Kissimmee,1,3,House,Entire home/apt,3,0.0,28.2735,-81.4678,5
4717,4539,33345246.0,18,194,0,0,15,3.0,4.0,16,...,Kissimmee,1,3,Townhouse,Entire home/apt,5,0.0,28.3397,-81.6475,5
4718,794,33465909.0,98,99,0,0,0,1.0,1.0,34,...,Orlando,2,3,Apartment,Entire home/apt,9,0.0,28.4813,-81.2494,5


Analyzing the data for preprocessing

In [5]:
# Preprocessing of the data
properties_df.count()

id                           4720
airbnb_id                    4720
night_price_native           4720
night_price                  4720
weekly_price                 4720
monthly_price                4720
cleaning_fee_native_price    4720
num_of_baths                 4720
num_of_rooms                 4717
occupancy                    4720
nights_booked                4720
rental_income                4720
airbnb_neighborhood_id       4720
airbnb_city                  4720
capacity_of_people           4720
zip                          4706
property_type                4720
room_type                    4720
reviews_count                4720
num_of_beds                  4715
lat                          4720
lon                          4720
star_rating                  4720
dtype: int64

In [6]:
# Preprocessing of the data
dfClean = properties_df.dropna()

In [7]:
# Preprocessing of the data
dfClean.count()

id                           4698
airbnb_id                    4698
night_price_native           4698
night_price                  4698
weekly_price                 4698
monthly_price                4698
cleaning_fee_native_price    4698
num_of_baths                 4698
num_of_rooms                 4698
occupancy                    4698
nights_booked                4698
rental_income                4698
airbnb_neighborhood_id       4698
airbnb_city                  4698
capacity_of_people           4698
zip                          4698
property_type                4698
room_type                    4698
reviews_count                4698
num_of_beds                  4698
lat                          4698
lon                          4698
star_rating                  4698
dtype: int64

In [8]:
# Preprocessing of the data
dfClean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4698 entries, 0 to 4719
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         4698 non-null   int64  
 1   airbnb_id                  4698 non-null   object 
 2   night_price_native         4698 non-null   int64  
 3   night_price                4698 non-null   int64  
 4   weekly_price               4698 non-null   int64  
 5   monthly_price              4698 non-null   int64  
 6   cleaning_fee_native_price  4698 non-null   int64  
 7   num_of_baths               4698 non-null   object 
 8   num_of_rooms               4698 non-null   float64
 9   occupancy                  4698 non-null   int64  
 10  nights_booked              4698 non-null   int64  
 11  rental_income              4698 non-null   int64  
 12  airbnb_neighborhood_id     4698 non-null   object 
 13  airbnb_city                4698 non-null   objec

In [9]:
dfClean.shape

(4698, 23)

In [10]:
dfClean.nunique()

id                           4698
airbnb_id                    4698
night_price_native            358
night_price                   402
weekly_price                   42
monthly_price                  36
cleaning_fee_native_price     175
num_of_baths                   20
num_of_rooms                   10
occupancy                      91
nights_booked                 330
rental_income                2542
airbnb_neighborhood_id        113
airbnb_city                     3
capacity_of_people             19
zip                             1
property_type                  17
room_type                       1
reviews_count                 308
num_of_beds                    16
lat                          2379
lon                          2387
star_rating                     5
dtype: int64

Preprocessing the Data

In [11]:
dfClean["zip"]

0       3
1       3
2       3
3       3
4       3
       ..
4715    3
4716    3
4717    3
4718    3
4719    3
Name: zip, Length: 4698, dtype: object

In [12]:
dfClean["room_type"]

0       Entire home/apt
1       Entire home/apt
2       Entire home/apt
3       Entire home/apt
4       Entire home/apt
             ...       
4715    Entire home/apt
4716    Entire home/apt
4717    Entire home/apt
4718    Entire home/apt
4719    Entire home/apt
Name: room_type, Length: 4698, dtype: object

In [13]:
dfClean["airbnb_neighborhood_id"]

0       268890.0
1       125976.0
2       125976.0
3       125976.0
4       125976.0
          ...   
4715     18847.0
4716     18847.0
4717     18847.0
4718    269222.0
4719    124197.0
Name: airbnb_neighborhood_id, Length: 4698, dtype: object

In [14]:
# Drop the non-beneficial ID columns, 'id', 'airbnb_id', "lat", "lon", "room_type", "zip", "airbnb_neighborhood_id".
dfClean = dfClean.drop(["id", "airbnb_id", "lat", "lon", "room_type", "zip", "airbnb_neighborhood_id"], 1)
dfClean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4698 entries, 0 to 4719
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   night_price_native         4698 non-null   int64  
 1   night_price                4698 non-null   int64  
 2   weekly_price               4698 non-null   int64  
 3   monthly_price              4698 non-null   int64  
 4   cleaning_fee_native_price  4698 non-null   int64  
 5   num_of_baths               4698 non-null   object 
 6   num_of_rooms               4698 non-null   float64
 7   occupancy                  4698 non-null   int64  
 8   nights_booked              4698 non-null   int64  
 9   rental_income              4698 non-null   int64  
 10  airbnb_city                4698 non-null   object 
 11  capacity_of_people         4698 non-null   int64  
 12  property_type              4698 non-null   object 
 13  reviews_count              4698 non-null   int64

  


In [15]:
dfClean.shape

(4698, 16)

In [16]:
dfClean["num_of_baths"] = pd.to_numeric(dfClean["num_of_baths"])
dfClean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4698 entries, 0 to 4719
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   night_price_native         4698 non-null   int64  
 1   night_price                4698 non-null   int64  
 2   weekly_price               4698 non-null   int64  
 3   monthly_price              4698 non-null   int64  
 4   cleaning_fee_native_price  4698 non-null   int64  
 5   num_of_baths               4698 non-null   float64
 6   num_of_rooms               4698 non-null   float64
 7   occupancy                  4698 non-null   int64  
 8   nights_booked              4698 non-null   int64  
 9   rental_income              4698 non-null   int64  
 10  airbnb_city                4698 non-null   object 
 11  capacity_of_people         4698 non-null   int64  
 12  property_type              4698 non-null   object 
 13  reviews_count              4698 non-null   int64

In [17]:
# Generate our categorical variable lists
properties_cat = dfClean.dtypes[dfClean.dtypes == "object"].index.tolist()
properties_cat

['airbnb_city', 'property_type']

Use OneHotEncoder to change the remaining 2 object columns

In [18]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(dfClean[properties_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(properties_cat)
encode_df.head()



Unnamed: 0,airbnb_city_Kissimmee,airbnb_city_Orlando,airbnb_city_Tampa,property_type_Apartment,property_type_Bungalow,property_type_Cabin,property_type_Chalet,property_type_Condominium,property_type_Cottage,property_type_Farm stay,property_type_Guest suite,property_type_Guesthouse,property_type_House,property_type_Loft,property_type_Other,property_type_Serviced apartment,property_type_Tiny house,property_type_Townhouse,property_type_Vacation home,property_type_Villa
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [20]:
# Merge one-hot encoded features and drop the originals
dfClean = dfClean.merge(encode_df,left_index=True, right_index=True)
dfClean = dfClean.drop(properties_cat,1)
dfClean.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,night_price_native,night_price,weekly_price,monthly_price,cleaning_fee_native_price,num_of_baths,num_of_rooms,occupancy,nights_booked,rental_income,...,property_type_Guest suite,property_type_Guesthouse,property_type_House,property_type_Loft,property_type_Other,property_type_Serviced apartment,property_type_Tiny house,property_type_Townhouse,property_type_Vacation home,property_type_Villa
0,149,182,0,0,0,1.0,1.0,64,234,3549,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,91,98,0,0,0,1.0,1.0,47,17,1388,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,129,103,0,0,0,1.0,2.0,94,343,2944,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,116,104,0,0,0,1.0,1.0,71,259,2245,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,81,91,0,0,0,1.0,1.0,77,282,2139,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [21]:
dfClean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4676 entries, 0 to 4697
Data columns (total 34 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   night_price_native                4676 non-null   int64  
 1   night_price                       4676 non-null   int64  
 2   weekly_price                      4676 non-null   int64  
 3   monthly_price                     4676 non-null   int64  
 4   cleaning_fee_native_price         4676 non-null   int64  
 5   num_of_baths                      4676 non-null   float64
 6   num_of_rooms                      4676 non-null   float64
 7   occupancy                         4676 non-null   int64  
 8   nights_booked                     4676 non-null   int64  
 9   rental_income                     4676 non-null   int64  
 10  capacity_of_people                4676 non-null   int64  
 11  reviews_count                     4676 non-null   int64  
 12  num_of