## 1.0 Import and install python libraries

In [285]:
# import numpy and pandas libraries
import numpy as np
import pandas as pd
import summarytools
from summarytools import dfSummary
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

In [286]:
# fix the seed of NumPy's global random generator so that every run gives the same results
RANDOM_SEED = 1
np.random.seed(RANDOM_SEED)

## 3.0 Load data 

In [287]:
# read the listings file (looked up relative to the current working directory) into a DataFrame
airbnb_csv_path = "airbnb.csv"
airbnb = pd.read_csv(airbnb_csv_path)

## Data Exploration

In [288]:
# preview the first three rows of the raw data
airbnb.head(n=3)

Unnamed: 0,host_is_superhost,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,...,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price,price_gte_150,price_category
0,0,0,Roslindale,42.282619,-71.133068,House,Entire home/apt,4,1.5,2.0,...,1,0,2,0,0,,moderate,250,1,gte_226
1,0,1,Roslindale,42.286241,-71.134374,Apartment,Private room,2,1.0,1.0,...,0,0,2,36,804,94.0,moderate,65,0,lte_$75
2,1,1,Roslindale,42.292438,-71.135765,Apartment,Private room,2,1.0,1.0,...,1,20,3,41,2574,98.0,moderate,65,0,lte_$75


In [289]:
# information of the dataframe: column names, non-null counts and dtypes
airbnb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3555 entries, 0 to 3554
Data columns (total 23 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   host_is_superhost                  3555 non-null   int64  
 1   host_identity_verified             3555 non-null   int64  
 2   neighbourhood_cleansed             3555 non-null   object 
 3   latitude                           3555 non-null   float64
 4   longitude                          3555 non-null   float64
 5   property_type                      3552 non-null   object 
 6   room_type                          3555 non-null   object 
 7   accommodates                       3555 non-null   int64  
 8   bathrooms                          3541 non-null   float64
 9   bedrooms                           3545 non-null   float64
 10  beds                               3546 non-null   float64
 11  bed_type                           3555 non-null   objec

In [290]:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
airbnb.describe()

Unnamed: 0,host_is_superhost,host_identity_verified,latitude,longitude,accommodates,bathrooms,bedrooms,beds,Number of amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,price,price_gte_150
count,3555.0,3555.0,3555.0,3555.0,3555.0,3541.0,3545.0,3546.0,3555.0,3555.0,3555.0,3555.0,3555.0,3555.0,2755.0,3555.0,3555.0
mean,0.11308,0.727989,42.339973,-71.084874,3.023629,1.215899,1.246544,1.597293,14.85879,1.427004,10.886639,3.116737,19.126582,279.052602,91.89147,166.060478,0.500422
std,0.316735,0.445058,0.024464,0.031614,1.754808,0.492656,0.73844,0.995467,4.82126,1.050204,19.092755,8.273949,35.666178,408.686952,9.548381,103.378456,0.50007
min,0.0,0.0,42.235942,-71.171789,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,20.0,10.0,0.0
25%,0.0,0.0,42.329875,-71.105183,2.0,1.0,1.0,1.0,12.0,1.0,0.0,1.0,1.0,0.0,89.0,85.0,0.0
50%,0.0,1.0,42.345191,-71.078487,2.0,1.0,1.0,1.0,15.0,1.0,0.0,2.0,5.0,92.0,94.0,150.0,1.0
75%,0.0,1.0,42.354672,-71.062142,4.0,1.0,2.0,2.0,18.0,1.0,20.0,3.0,21.0,402.0,98.0,219.0,1.0
max,1.0,1.0,42.389982,-71.0001,16.0,6.0,5.0,16.0,30.0,14.0,200.0,300.0,404.0,2680.0,100.0,650.0,1.0


In [291]:
# per-variable overview from summarytools: values/statistics, frequencies and missing counts
dfSummary(airbnb)

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,host_is_superhost [int64],1. Jamaica Plain 2. South End 3. Back Bay 4. Fenway 5. Dorchester 6. Allston 7. Beacon Hill 8. Brighton 9. Downtown 10. South Boston 11. other,"341 (9.6%) 323 (9.1%) 298 (8.4%) 285 (8.0%) 269 (7.6%) 260 (7.3%) 188 (5.3%) 184 (5.2%) 172 (4.8%) 171 (4.8%) 1,064 (29.9%)",,0 (0.0%)
2,host_identity_verified [int64],Mean (sd) : 42.3 (0.0) min < med < max: 42.2 < 42.3 < 42.4 IQR (CV) : 0.0 (1730.7),"3,554 distinct values",,0 (0.0%)
3,neighbourhood_cleansed [object],Mean (sd) : -71.1 (0.0) min < med < max: -71.2 < -71.1 < -71.0 IQR (CV) : 0.0 (-2248.5),"3,554 distinct values",,0 (0.0%)
4,latitude [float64],1. Apartment 2. House 3. Condominium 4. Townhouse 5. Bed & Breakfast 6. Loft 7. Other 8. Boat 9. Villa 10. Entire Floor 11. other,"2,593 (72.9%) 555 (15.6%) 228 (6.4%) 53 (1.5%) 41 (1.2%) 39 (1.1%) 17 (0.5%) 12 (0.3%) 6 (0.2%) 4 (0.1%) 7 (0.2%)",,0 (0.0%)
5,longitude [float64],1. Entire home/apt 2. Private room 3. Shared room,"2,103 (59.2%) 1,373 (38.6%) 79 (2.2%)",,0 (0.0%)
6,property_type [object],Mean (sd) : 1.2 (0.5) min < med < max: 0.0 < 1.0 < 6.0 IQR (CV) : 0.0 (2.5),11 distinct values,,3 (0.1%)
7,room_type [object],1. 1.0 2. 2.0 3. 0.0 4. 3.0 5. 4.0 6. 5.0 7. nan,"2,367 (66.6%) 688 (19.4%) 287 (8.1%) 149 (4.2%) 41 (1.2%) 13 (0.4%) 10 (0.3%)",,0 (0.0%)
8,accommodates [int64],Mean (sd) : 1.6 (1.0) min < med < max: 0.0 < 1.0 < 16.0 IQR (CV) : 1.0 (1.6),11 distinct values,,0 (0.0%)
9,bathrooms [float64],1. Real Bed 2. Futon 3. Airbed 4. Pull-out Sofa 5. Couch,"3,423 (96.3%) 51 (1.4%) 40 (1.1%) 31 (0.9%) 10 (0.3%)",,14 (0.4%)
10,bedrooms [float64],Mean (sd) : 91.9 (9.5) min < med < max: 20.0 < 94.0 < 100.0 IQR (CV) : 9.0 (9.6),48 distinct values,,10 (0.3%)


In [292]:
# count the missing values in every column
airbnb.isnull().sum()

host_is_superhost                      0
host_identity_verified                 0
neighbourhood_cleansed                 0
latitude                               0
longitude                              0
property_type                          3
room_type                              0
accommodates                           0
bathrooms                             14
bedrooms                              10
beds                                   9
bed_type                               0
Number of amenities                    0
guests_included                        0
price_per_extra_person                 0
minimum_nights                         0
number_of_reviews                      0
number_days_btw_first_last_review      0
review_scores_rating                 800
cancellation_policy                    0
price                                  0
price_gte_150                          0
price_category                         0
dtype: int64

In [293]:
# collect the names of the categorical (object-dtype) columns
category_var_list = airbnb.select_dtypes(include=['object']).columns.tolist()
category_var_list

['neighbourhood_cleansed',
 'property_type',
 'room_type',
 'bed_type',
 'cancellation_policy',
 'price_category']

In [294]:
# list the distinct values (including NaN, if present) of each categorical column
for cat in category_var_list:
    values = airbnb[cat].unique()
    print(f"Category: {cat} Values: {values}")

Category: neighbourhood_cleansed Values: ['Roslindale' 'Jamaica Plain' 'Mission Hill' 'Longwood Medical Area'
 'Bay Village' 'Leather District' 'Chinatown' 'North End' 'Roxbury'
 'South End' 'Back Bay' 'East Boston' 'Charlestown' 'West End'
 'Beacon Hill' 'Downtown' 'Fenway' 'Brighton' 'West Roxbury' 'Hyde Park'
 'Mattapan' 'Dorchester' 'South Boston Waterfront' 'South Boston'
 'Allston']
Category: property_type Values: ['House' 'Apartment' 'Condominium' 'Villa' 'Bed & Breakfast' 'Townhouse'
 'Entire Floor' 'Loft' 'Guesthouse' 'Boat' 'Dorm' 'Other' nan 'Camper/RV']
Category: room_type Values: ['Entire home/apt' 'Private room' 'Shared room']
Category: bed_type Values: ['Real Bed' 'Pull-out Sofa' 'Futon' 'Airbed' 'Couch']
Category: cancellation_policy Values: ['moderate' 'flexible' 'strict' 'super_strict_30']
Category: price_category Values: ['gte_226' 'lte_$75' 'btw_$75-$150' 'btw_$151-$225']


## Process the data

#### Drop the columns that will not be used

In [295]:
# The modelling target is `price`. `price_gte_150` and `price_category` are two other
# price-related columns, so both are removed from the dataframe (in place).
redundant_price_cols = ['price_category', 'price_gte_150']
airbnb.drop(columns=redundant_price_cols, inplace=True)

#### Encode the categorical variables

In [296]:
# number of missing values in the property_type column
airbnb['property_type'].isnull().sum()

3

In [297]:
# Give the 3 listings without a property_type their own placeholder category.
# The result is assigned back to the column instead of calling fillna(inplace=True)
# on the selected column: that chained in-place form is deprecated in pandas 2.x
# and does not update the dataframe once Copy-on-Write is enabled.
# NOTE: the placeholder is spelled "unkown" (sic); it is left unchanged because it
# becomes part of a one-hot column name when property_type is encoded below.
airbnb["property_type"] = airbnb["property_type"].fillna("unkown")

In [298]:
# number of missing values in the neighbourhood_cleansed column
airbnb['neighbourhood_cleansed'].isnull().sum()

0

In [299]:
# number of missing values in the room_type column
airbnb['room_type'].isnull().sum()

0

In [300]:
# number of missing values in the bed_type column
airbnb['bed_type'].isnull().sum()

0

In [301]:
# number of missing values in the cancellation_policy column
airbnb['cancellation_policy'].isnull().sum()

0

We need to one-hot encode the neighbourhood_cleansed, property_type and cancellation_policy columns, and label encode the room_type and bed_type columns.

In [302]:
# Replace the text labels of room_type and bed_type with integer codes.
# One encoder object is re-fitted for each column in turn, so after this cell it
# holds the classes of the last column only (bed_type).
labelencoder = LabelEncoder()
for col in ('room_type', 'bed_type'):
    airbnb[col] = labelencoder.fit_transform(airbnb[col])

In [303]:
##The argument sparse=False (renamed sparse_output=False in scikit-learn 1.2 and later) specifies that the 
##encoded data should be returned as a dense numpy array rather than a sparse matrix. A sparse matrix stores 
##only the non-zero entries, which saves memory when most values are zero, whereas a dense array stores every value

In [304]:
from sklearn.preprocessing import OneHotEncoder

# Build a one-hot encoder that returns a dense NumPy array instead of a sparse matrix.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4 removed the
# old name, so the new keyword is tried first with a fallback for older releases.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    ohe = OneHotEncoder(sparse=False)

In [305]:
##The ColumnTransformer is a class in scikit-learn that allows you to apply different preprocessing 
##techniques to different subsets of the data, based on the column or feature. 
##The ColumnTransformer allows you to standardize the numerical data and one-hot encode the categorical data

In [306]:
from sklearn.compose import ColumnTransformer

In [307]:
# nominal columns that will be one-hot encoded
categorical_features = [
    'neighbourhood_cleansed',
    'property_type',
    'cancellation_policy',
]

In [308]:
##The remainder='passthrough' argument can be useful when you have a mixture of features in your data, 
##some of which you want to transform and others that you want to keep unchanged. 

In [309]:
# one-hot encode the categorical columns and pass every other column through unchanged
transformer = ColumnTransformer(
    transformers=[('Ohe', ohe, categorical_features)],
    remainder='passthrough',
)

In [310]:
# fit the transformer on the dataframe and encode it in one step
transformedColumns = transformer.fit_transform(X=airbnb)

In [311]:
# inspect the array returned by the transformer
transformedColumns

array([[   0.,    0.,    0., ...,    0.,   nan,  250.],
       [   0.,    0.,    0., ...,  804.,   94.,   65.],
       [   0.,    0.,    0., ..., 2574.,   98.,   65.],
       ...,
       [   0.,    0.,    0., ...,    0.,   nan,  198.],
       [   0.,    0.,    0., ...,    8.,   90.,   65.],
       [   0.,    0.,    0., ...,    0.,   nan,   65.]])

In [312]:
##The get_feature_names_out() method returns an array of strings, where each string is the name of one 
##output column of the transformer: the new binary features created by the encoding, followed by the passthrough columns. 

In [313]:
# names of the transformer's output columns (used as the dataframe header below)
transformedColumnNames = transformer.get_feature_names_out()

In [314]:
# wrap the encoded array in a DataFrame again; note that this rebinds `airbnb`
# from the raw data to the encoded data
airbnb = pd.DataFrame(data=transformedColumns, columns=transformedColumnNames)

In [315]:
# inspect the generated column names
transformedColumnNames

array(['Ohe__neighbourhood_cleansed_Allston',
       'Ohe__neighbourhood_cleansed_Back Bay',
       'Ohe__neighbourhood_cleansed_Bay Village',
       'Ohe__neighbourhood_cleansed_Beacon Hill',
       'Ohe__neighbourhood_cleansed_Brighton',
       'Ohe__neighbourhood_cleansed_Charlestown',
       'Ohe__neighbourhood_cleansed_Chinatown',
       'Ohe__neighbourhood_cleansed_Dorchester',
       'Ohe__neighbourhood_cleansed_Downtown',
       'Ohe__neighbourhood_cleansed_East Boston',
       'Ohe__neighbourhood_cleansed_Fenway',
       'Ohe__neighbourhood_cleansed_Hyde Park',
       'Ohe__neighbourhood_cleansed_Jamaica Plain',
       'Ohe__neighbourhood_cleansed_Leather District',
       'Ohe__neighbourhood_cleansed_Longwood Medical Area',
       'Ohe__neighbourhood_cleansed_Mattapan',
       'Ohe__neighbourhood_cleansed_Mission Hill',
       'Ohe__neighbourhood_cleansed_North End',
       'Ohe__neighbourhood_cleansed_Roslindale',
       'Ohe__neighbourhood_cleansed_Roxbury',
       'Ohe__ne

In [316]:
# preview the first five rows of the encoded dataframe
airbnb.head(n=5)

Unnamed: 0,Ohe__neighbourhood_cleansed_Allston,Ohe__neighbourhood_cleansed_Back Bay,Ohe__neighbourhood_cleansed_Bay Village,Ohe__neighbourhood_cleansed_Beacon Hill,Ohe__neighbourhood_cleansed_Brighton,Ohe__neighbourhood_cleansed_Charlestown,Ohe__neighbourhood_cleansed_Chinatown,Ohe__neighbourhood_cleansed_Dorchester,Ohe__neighbourhood_cleansed_Downtown,Ohe__neighbourhood_cleansed_East Boston,...,remainder__beds,remainder__bed_type,remainder__Number of amenities,remainder__guests_included,remainder__price_per_extra_person,remainder__minimum_nights,remainder__number_of_reviews,remainder__number_days_btw_first_last_review,remainder__review_scores_rating,remainder__price
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,4.0,14.0,1.0,0.0,2.0,0.0,0.0,,250.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,4.0,20.0,0.0,0.0,2.0,36.0,804.0,94.0,65.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,4.0,17.0,1.0,20.0,3.0,41.0,2574.0,98.0,65.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,4.0,22.0,2.0,25.0,1.0,1.0,0.0,100.0,75.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,4.0,13.0,1.0,0.0,2.0,29.0,380.0,99.0,79.0


In [317]:
# remove the 'remainder__' and 'Ohe__' prefixes that ColumnTransformer put in front of the column names
for prefix in ('remainder__', 'Ohe__'):
    airbnb.columns = airbnb.columns.str.replace(prefix, '')

In [318]:
# preview the first five rows to confirm the prefixes are gone
airbnb.head(n=5)

Unnamed: 0,neighbourhood_cleansed_Allston,neighbourhood_cleansed_Back Bay,neighbourhood_cleansed_Bay Village,neighbourhood_cleansed_Beacon Hill,neighbourhood_cleansed_Brighton,neighbourhood_cleansed_Charlestown,neighbourhood_cleansed_Chinatown,neighbourhood_cleansed_Dorchester,neighbourhood_cleansed_Downtown,neighbourhood_cleansed_East Boston,...,beds,bed_type,Number of amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,price
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,4.0,14.0,1.0,0.0,2.0,0.0,0.0,,250.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,4.0,20.0,0.0,0.0,2.0,36.0,804.0,94.0,65.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,4.0,17.0,1.0,20.0,3.0,41.0,2574.0,98.0,65.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,4.0,22.0,2.0,25.0,1.0,1.0,0.0,100.0,75.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,4.0,13.0,1.0,0.0,2.0,29.0,380.0,99.0,79.0


In [319]:
# check the dataframe columns to verify that the one-hot columns were added
# and that the dropped price columns are gone
airbnb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3555 entries, 0 to 3554
Data columns (total 61 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   neighbourhood_cleansed_Allston                  3555 non-null   float64
 1   neighbourhood_cleansed_Back Bay                 3555 non-null   float64
 2   neighbourhood_cleansed_Bay Village              3555 non-null   float64
 3   neighbourhood_cleansed_Beacon Hill              3555 non-null   float64
 4   neighbourhood_cleansed_Brighton                 3555 non-null   float64
 5   neighbourhood_cleansed_Charlestown              3555 non-null   float64
 6   neighbourhood_cleansed_Chinatown                3555 non-null   float64
 7   neighbourhood_cleansed_Dorchester               3555 non-null   float64
 8   neighbourhood_cleansed_Downtown                 3555 non-null   float64
 9   neighbourhood_cleansed_East Boston       

In [320]:
# display the processed dataframe (pandas shows a truncated view of the rows and columns)
airbnb

Unnamed: 0,neighbourhood_cleansed_Allston,neighbourhood_cleansed_Back Bay,neighbourhood_cleansed_Bay Village,neighbourhood_cleansed_Beacon Hill,neighbourhood_cleansed_Brighton,neighbourhood_cleansed_Charlestown,neighbourhood_cleansed_Chinatown,neighbourhood_cleansed_Dorchester,neighbourhood_cleansed_Downtown,neighbourhood_cleansed_East Boston,...,beds,bed_type,Number of amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,price
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,4.0,14.0,1.0,0.0,2.0,0.0,0.0,,250.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,4.0,20.0,0.0,0.0,2.0,36.0,804.0,94.0,65.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,4.0,17.0,1.0,20.0,3.0,41.0,2574.0,98.0,65.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,4.0,22.0,2.0,25.0,1.0,1.0,0.0,100.0,75.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,4.0,13.0,1.0,0.0,2.0,29.0,380.0,99.0,79.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3550,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,4.0,17.0,1.0,24.0,2.0,4.0,344.0,90.0,69.0
3551,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,4.0,23.0,1.0,0.0,3.0,0.0,0.0,,150.0
3552,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,4.0,1.0,1.0,0.0,3.0,0.0,0.0,,198.0
3553,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,4.0,7.0,1.0,30.0,1.0,2.0,8.0,90.0,65.0


### Split data into train and test sets

In [321]:
# Hold out 30% of the rows as the test set; the remaining rows form the training set.
# No random_state is passed here, so repeatability relies on the NumPy seed set
# near the top of the notebook.
train_df, test_df = train_test_split(airbnb, test_size=0.3)

# Name the target column once and treat every other column as a predictor,
# so that later cells do not have to repeat the column lists.
target = 'price'
predictors = [col for col in airbnb.columns if col != target]

#### Impute missing values

In [322]:
# count the missing values per column once, then keep the names of the columns that have any
na_counts = train_df.isna().sum()
numeric_cols_with_nas = na_counts[na_counts > 0].index.tolist()
numeric_cols_with_nas

['bathrooms', 'bedrooms', 'beds', 'review_scores_rating']

We can see from the output above that there are 4 variables (columns) that contain missing numeric values (we've already taken care of any missing values in the categorical variables earlier).

In [323]:
# Fill the missing values with each column's median. The medians are learned from
# the training rows only and then reused for the test rows, so that no information
# from the test set is used during fitting.
imputer = SimpleImputer(strategy="median")
imputer.fit(train_df[numeric_cols_with_nas])

train_df[numeric_cols_with_nas] = imputer.transform(train_df[numeric_cols_with_nas])
test_df[numeric_cols_with_nas] = imputer.transform(test_df[numeric_cols_with_nas])

#### Standardize numeric values

Now, let's create a common scale between the numeric columns by standardizing each numeric column

In [324]:
# columns to put on a common scale (zero mean, unit variance)
cols_to_stdize = [
    'latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
    'Number of amenities', 'guests_included', 'price_per_extra_person',
    'minimum_nights', 'number_of_reviews', 'number_days_btw_first_last_review',
    'review_scores_rating',
]

# learn each column's mean and standard deviation from the training rows only
scaler = preprocessing.StandardScaler()
scaler.fit(train_df[cols_to_stdize])

# apply the same scaling to the training and test sets
train_df[cols_to_stdize] = scaler.transform(train_df[cols_to_stdize])
test_df[cols_to_stdize] = scaler.transform(test_df[cols_to_stdize])


## Save the data

In [325]:
# Separate the predictors (X) from the target (y) for each split.
train_X = train_df[predictors]
train_y = train_df[target]  # a Series, because `target` is a single column name
test_X = test_df[predictors]  # must come from test_df so that it matches test_y row for row
test_y = test_df[target]  # a Series, like train_y

# Guard against X and y being taken from different splits before anything is written.
assert train_X.index.equals(train_y.index)
assert test_X.index.equals(test_y.index)

# Write the full frames and their X / y parts as CSV files, without the row index.
train_df.to_csv('airbnb_train_df.csv', index=False)
train_X.to_csv('airbnb_train_X.csv', index=False)
train_y.to_csv('airbnb_train_y.csv', index=False)
test_df.to_csv('airbnb_test_df.csv', index=False)
test_X.to_csv('airbnb_test_X.csv', index=False)
test_y.to_csv('airbnb_test_y.csv', index=False)