In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [80]:
# 'neighbourhood' holds both zip codes (e.g. 28804) and place names (e.g. Bedford-Stuyvesant),
# so read it as text up front instead of letting pandas infer mixed types chunk by chunk.
df = pd.read_csv('AB_US_2020.csv', dtype={'neighbourhood': str})
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,city
0,38585,Charming Victorian home - twin beds + breakfast,165529,Evelyne,,28804,35.65146,-82.62792,Private room,60,1,138,16/02/20,1.14,1,0,Asheville
1,80905,French Chic Loft,427027,Celeste,,28801,35.59779,-82.5554,Entire home/apt,470,1,114,07/09/20,1.03,11,288,Asheville
2,108061,Walk to stores/parks/downtown. Fenced yard/Pet...,320564,Lisa,,28801,35.6067,-82.55563,Entire home/apt,75,30,89,30/11/19,0.81,2,298,Asheville
3,155305,Cottage! BonPaul + Sharky's Hostel,746673,BonPaul,,28806,35.57864,-82.59578,Entire home/apt,90,1,267,22/09/20,2.39,5,0,Asheville
4,160594,Historic Grove Park,769252,Elizabeth,,28801,35.61442,-82.54127,Private room,125,30,58,19/10/15,0.52,1,0,Asheville


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226030 entries, 0 to 226029
Data columns (total 17 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   id                              226030 non-null  int64  
 1   name                            226002 non-null  object 
 2   host_id                         226030 non-null  int64  
 3   host_name                       225997 non-null  object 
 4   neighbourhood_group             110185 non-null  object 
 5   neighbourhood                   226030 non-null  object 
 6   latitude                        226030 non-null  float64
 7   longitude                       226030 non-null  float64
 8   room_type                       226030 non-null  object 
 9   price                           226030 non-null  int64  
 10  minimum_nights                  226030 non-null  int64  
 11  number_of_reviews               226030 non-null  int64  
 12  last_review     

In [4]:
# Several columns appear to have missing values, so I will create a function that reports, for each column,
# how many rows are missing and what % of the total number of rows that is
def missing_rows(df):
    """Summarise the missing values in every column of ``df``.

    Returns a new DataFrame indexed by the column names of ``df`` with:
      - ``num_rows_missing``: number of null entries in that column
      - ``pct_rows_missing``: that number as a percentage of ``len(df)``
    """
    # Count the nulls per column once and derive both statistics from it
    null_counts = df.isnull().sum()
    # Percentage of the frame's rows that are null in each column
    null_share = (null_counts / len(df)) * 100
    summary = pd.DataFrame({'num_rows_missing': null_counts, 'pct_rows_missing': null_share})
    return summary

In [5]:
missing_rows(df)

Unnamed: 0,num_rows_missing,pct_rows_missing
id,0,0.0
name,28,0.012388
host_id,0,0.0
host_name,33,0.0146
neighbourhood_group,115845,51.252046
neighbourhood,0,0.0
latitude,0,0.0
longitude,0,0.0
room_type,0,0.0
price,0,0.0


Only three columns really stand out for missing values: neighbourhood_group, last_review, and reviews_per_month.

I think it's worth dropping neighbourhood_group for now as it has 51% of its values missing.
However, last_review and reviews_per_month both have a decent number of non-null rows, so I will keep those for now.

In [16]:
airbnb_data = df.drop(columns='neighbourhood_group')

In [17]:
airbnb_data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,city
0,38585,Charming Victorian home - twin beds + breakfast,165529,Evelyne,28804,35.65146,-82.62792,Private room,60,1,138,16/02/20,1.14,1,0,Asheville
1,80905,French Chic Loft,427027,Celeste,28801,35.59779,-82.5554,Entire home/apt,470,1,114,07/09/20,1.03,11,288,Asheville
2,108061,Walk to stores/parks/downtown. Fenced yard/Pet...,320564,Lisa,28801,35.6067,-82.55563,Entire home/apt,75,30,89,30/11/19,0.81,2,298,Asheville
3,155305,Cottage! BonPaul + Sharky's Hostel,746673,BonPaul,28806,35.57864,-82.59578,Entire home/apt,90,1,267,22/09/20,2.39,5,0,Asheville
4,160594,Historic Grove Park,769252,Elizabeth,28801,35.61442,-82.54127,Private room,125,30,58,19/10/15,0.52,1,0,Asheville


In [20]:
# Save the frame without neighbourhood_group to disk.
# NOTE: to_csv is called with its default index setting, so the row index is written as an extra leading column.
airbnb_data.to_csv('airbnb_data.csv')

In order to properly explore and model the data I will need to split it. Here is a function for splitting the data into train, validate, and test sets.

In [29]:
def df_split(df, test_size=.25, validate_size=.4, random_state=123):
    """Split ``df`` into train, validate and test DataFrames.

    Parameters
    ----------
    df : DataFrame to split.
    test_size : share of ``df`` held out as the test set (default .25).
    validate_size : share of the remaining (non-test) rows used as the
        validate set (default .4).
    random_state : seed passed to both ``train_test_split`` calls so the
        split is reproducible (default 123).

    Returns
    -------
    (train, validate, test) tuple of DataFrames.
    """
    # First carve off the test set; what is left holds train and validate combined
    train_validate, test = train_test_split(df, test_size=test_size, random_state=random_state)
    # Then divide the remainder into the separate train and validate sets
    train, validate = train_test_split(train_validate, test_size=validate_size, random_state=random_state)
    return train, validate, test

In [30]:
train, validate, test = df_split(airbnb_data)

In [31]:
train.shape, validate.shape, test.shape

((101713, 16), (67809, 16), (56508, 16))

In [32]:
train.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,city
161859,44427665,Stylish 5BR apt in Bed-Stuy,358356356,Alex And Vlada,Bedford-Stuyvesant,40.69,-73.95134,Entire home/apt,209,30,0,,,10,365,New York City
76638,6745994,Los Angeles Studio Walk Downtown LA,35242887,Nicole,Elysian Park,34.07105,-118.23237,Entire home/apt,77,30,441,15/01/20,6.97,1,262,Los Angeles
50109,6483167,LUXURY REMODELED POIPU SANDS CONDO WITH A/C,17600641,Bruce,Koloa-Poipu,21.87433,-159.44223,Entire home/apt,239,5,21,20/03/20,0.5,1,261,Hawaii
50250,6991307,Pali Ke Kua Oceanview. Relax. Restore. Revive!,35827700,Sally,North Shore Kauai,22.22527,-159.49065,Entire home/apt,288,4,111,12/03/20,1.86,1,323,Hawaii
210290,32476765,Green Lake Getaway,41673708,Stephen,Wallingford,47.67108,-122.32533,Entire home/apt,62,2,25,14/06/20,1.71,1,59,Seattle


In [34]:
train.room_type.unique()

array(['Entire home/apt', 'Private room', 'Hotel room', 'Shared room'],
      dtype=object)

Since there are only 4 unique values for the room type, this is a good time to create dummy variables for later exploration.

In [75]:
def airbnb_dummies(df):
    """Append one indicator (dummy) column per room type to ``df``.

    The four known room types always produce the columns ``entire_home``,
    ``hotel_room``, ``private_room`` and ``shared_room`` (in that order),
    even when a room type does not occur in ``df``. Any other label found in
    ``room_type`` gets an indicator column named after the label itself.

    Returns a new DataFrame: the input columns followed by the indicator columns.
    """
    # Map each label to its column name explicitly, so the naming does not
    # depend on which labels happen to be present or on their sort order
    room_type_columns = {
        'Entire home/apt': 'entire_home',
        'Hotel room': 'hotel_room',
        'Private room': 'private_room',
        'Shared room': 'shared_room',
    }
    # Labels outside the mapping keep a column of their own instead of being lost
    unexpected = [label for label in df['room_type'].dropna().unique() if label not in room_type_columns]
    categories = list(room_type_columns) + unexpected
    # A categorical with fixed categories makes get_dummies emit a column for
    # every category, including ones with no rows in this frame
    room_type = df['room_type'].astype(pd.CategoricalDtype(categories=categories))
    type_dummies = pd.get_dummies(room_type, drop_first=False)
    # Rename the dummies to their snake_case column names
    type_dummies.columns = [room_type_columns.get(label, label) for label in categories]
    df = pd.concat([df, type_dummies], axis=1)
    return df

In [77]:
# NOTE: this rebinds df to the frame with dummy columns appended, so running the cell
# a second time concatenates another set of the same dummy columns onto df.
df = airbnb_dummies(df)

In [78]:
df[df.private_room == 1]

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,city,entire_home,hotel_room,private_room,shared_room
0,38585,Charming Victorian home - twin beds + breakfast,165529,Evelyne,,28804,35.65146,-82.62792,Private room,60,...,138,16/02/20,1.14,1,0,Asheville,0,0,1,0
4,160594,Historic Grove Park,769252,Elizabeth,,28801,35.61442,-82.54127,Private room,125,...,58,19/10/15,0.52,1,0,Asheville,0,0,1,0
6,213006,Blue Gate West,1098412,Susanne,,28806,35.58345,-82.59713,Private room,48,...,137,30/11/19,1.35,1,0,Asheville,0,0,1,0
7,246315,Asheville Dreamer's Cabin,1292070,Annie,,28805,35.59635,-82.50655,Private room,65,...,57,30/10/19,0.53,2,106,Asheville,0,0,1,0
15,495111,Walk Downtown private bath peaceful,12874214,David,,28801,35.60371,-82.55621,Private room,85,...,338,13/09/20,3.36,3,0,Asheville,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226011,45481892,Room at Capitol Hill. Amazing location,324097761,Ali,,"Union Station, Stanton Park, Kingman Park",38.89861,-76.99560,Private room,60,...,0,,,4,360,Washington D.C.,0,0,1,0
226012,45482278,Room in a beautiful house in Rock Creek Park,6785598,Adrien,,"Hawthorne, Barnaby Woods, Chevy Chase",38.98385,-77.05902,Private room,76,...,0,,,1,267,Washington D.C.,0,0,1,0
226014,45492667,Single room at Capitol Hill location.,324097761,Ali,,"Union Station, Stanton Park, Kingman Park",38.89803,-76.99563,Private room,41,...,0,,,4,357,Washington D.C.,0,0,1,0
226019,45495668,Private Bedroom/ Parking Available,35906256,Melina,,"Brightwood Park, Crestwood, Petworth",38.93829,-77.02613,Private room,37,...,0,,,1,0,Washington D.C.,0,0,1,0


Last but not least, some of the columns will need to be scaled, specifically minimum_nights, number_of_reviews, reviews_per_month, availability_365, and calculated_host_listings_count.

In [48]:
def airbnb_scaler(train, validate, test):
    """Min-max scale the numeric feature columns of the three splits.

    The scaler is fitted on ``train`` only and then applied unchanged to
    ``validate`` and ``test``, so all three sets share one scale and no
    statistics from the held-out sets leak into the transformation. As a
    consequence, scaled values in ``validate``/``test`` can fall outside
    [0, 1] when they exceed the range seen in ``train``.

    Parameters
    ----------
    train, validate, test : DataFrames containing the columns listed in
        ``columns_to_scale`` below.

    Returns
    -------
    (train_scaled, validate_scaled, test_scaled) tuple of new DataFrames;
    the inputs are not modified.
    """
    # The columns that get scaled; every other column is passed through as-is
    columns_to_scale = ['minimum_nights', 'number_of_reviews', 'reviews_per_month', 'availability_365', 'calculated_host_listings_count']
    # Creating copies of the data frames so that we don't modify the originals
    train_scaled = train.copy()
    validate_scaled = validate.copy()
    test_scaled = test.copy()
    # Creating MinMaxScaler
    scaler = MinMaxScaler()
    # Learn the min/max from the training data only...
    train_scaled[columns_to_scale] = scaler.fit_transform(train_scaled[columns_to_scale])
    # ...and reuse those same parameters for the held-out sets
    validate_scaled[columns_to_scale] = scaler.transform(validate_scaled[columns_to_scale])
    test_scaled[columns_to_scale] = scaler.transform(test_scaled[columns_to_scale])
    return train_scaled, validate_scaled, test_scaled

In [37]:
train_scaled, validate_scaled, test_scaled = airbnb_scaler(train, validate, test)

In [38]:
train_scaled.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,city
161859,44427665,Stylish 5BR apt in Bed-Stuy,358356356,Alex And Vlada,Bedford-Stuyvesant,40.69,-73.95134,Entire home/apt,209,0.025801,0.0,,,10,1.0,New York City
76638,6745994,Los Angeles Studio Walk Downtown LA,35242887,Nicole,Elysian Park,34.07105,-118.23237,Entire home/apt,77,0.025801,0.462749,15/01/20,0.158002,1,0.717808,Los Angeles
50109,6483167,LUXURY REMODELED POIPU SANDS CONDO WITH A/C,17600641,Bruce,Koloa-Poipu,21.87433,-159.44223,Entire home/apt,239,0.003559,0.022036,20/03/20,0.011124,1,0.715068,Hawaii
50250,6991307,Pali Ke Kua Oceanview. Relax. Restore. Revive!,35827700,Sally,North Shore Kauai,22.22527,-159.49065,Entire home/apt,288,0.002669,0.116474,12/03/20,0.041998,1,0.884932,Hawaii
210290,32476765,Green Lake Getaway,41673708,Stephen,Wallingford,47.67108,-122.32533,Entire home/apt,62,0.00089,0.026233,14/06/20,0.038593,1,0.161644,Seattle


In [39]:
train_scaled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101713 entries, 161859 to 40446
Data columns (total 16 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   id                              101713 non-null  int64  
 1   name                            101701 non-null  object 
 2   host_id                         101713 non-null  int64  
 3   host_name                       101696 non-null  object 
 4   neighbourhood                   101713 non-null  object 
 5   latitude                        101713 non-null  float64
 6   longitude                       101713 non-null  float64
 7   room_type                       101713 non-null  object 
 8   price                           101713 non-null  int64  
 9   minimum_nights                  101713 non-null  float64
 10  number_of_reviews               101713 non-null  float64
 11  last_review                     79747 non-null   object 
 12  reviews_per_

In [40]:
missing_rows(train_scaled)

Unnamed: 0,num_rows_missing,pct_rows_missing
id,0,0.0
name,12,0.011798
host_id,0,0.0
host_name,17,0.016714
neighbourhood,0,0.0
latitude,0,0.0
longitude,0,0.0
room_type,0,0.0
price,0,0.0
minimum_nights,0,0.0


Now I would like a function that combines all the previous work together in order to get a cleaned, split, and scaled set of data frames.

In [79]:
def prep_data(df):
    """Run the full preparation pipeline on the raw listings frame.

    Drops ``neighbourhood_group``, appends the room-type dummy columns,
    splits the result into train/validate/test and scales each split.

    Returns the six frames
    ``(train, validate, test, train_scaled, validate_scaled, test_scaled)``.
    """
    # Clean: remove the dropped column, then add the room-type indicators
    prepared = airbnb_dummies(df.drop(columns='neighbourhood_group'))
    # Split into (train, validate, test)
    splits = df_split(prepared)
    # Scale each split, keeping the unscaled versions alongside
    scaled = airbnb_scaler(*splits)
    return (*splits, *scaled)

In [81]:
train, validate, test, train_scaled, validate_scaled, test_scaled = prep_data(df)

In [82]:
train.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,city,entire_home,hotel_room,private_room,shared_room
161859,44427665,Stylish 5BR apt in Bed-Stuy,358356356,Alex And Vlada,Bedford-Stuyvesant,40.69,-73.95134,Entire home/apt,209,30,0,,,10,365,New York City,1,0,0,0
76638,6745994,Los Angeles Studio Walk Downtown LA,35242887,Nicole,Elysian Park,34.07105,-118.23237,Entire home/apt,77,30,441,15/01/20,6.97,1,262,Los Angeles,1,0,0,0
50109,6483167,LUXURY REMODELED POIPU SANDS CONDO WITH A/C,17600641,Bruce,Koloa-Poipu,21.87433,-159.44223,Entire home/apt,239,5,21,20/03/20,0.5,1,261,Hawaii,1,0,0,0
50250,6991307,Pali Ke Kua Oceanview. Relax. Restore. Revive!,35827700,Sally,North Shore Kauai,22.22527,-159.49065,Entire home/apt,288,4,111,12/03/20,1.86,1,323,Hawaii,1,0,0,0
210290,32476765,Green Lake Getaway,41673708,Stephen,Wallingford,47.67108,-122.32533,Entire home/apt,62,2,25,14/06/20,1.71,1,59,Seattle,1,0,0,0


In [53]:
train_scaled.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,101713.0,101713.0,101713.0,101713.0,101713.0,101713.0,101713.0,79747.0,101713.0,101713.0
mean,25504440.0,93685590.0,35.661438,-103.248353,221.246085,0.008126,0.036302,0.032417,0.026691,0.435798
std,13174260.0,98396660.0,6.845288,26.238556,594.474476,0.022483,0.066797,0.038399,0.087289,0.383782
min,109.0,23.0,18.92099,-159.71428,0.0,0.0,0.0,0.0,0.0,0.0
25%,15234560.0,14104110.0,32.76167,-118.6028,75.0,0.0,0.001049,0.004994,0.0,0.0
50%,25917250.0,51223760.0,37.25685,-97.83271,121.0,0.00089,0.008395,0.018161,0.001689,0.380822
75%,37773810.0,149880800.0,40.72366,-76.93446,202.0,0.005338,0.040923,0.046765,0.008446,0.852055
max,45549230.0,367907100.0,47.73385,-71.00268,24999.0,1.0,1.0,1.0,1.0,1.0
