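### Link to the Airbnb data
http://insideairbnb.com/get-the-data/

This notebook cleans the `listings.csv` file read from `data/uncleaned/` and writes the result to `data/cleaned/airbnb-cleaned.csv`.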

In [1]:
import os
import pandas as pd

# Location of the raw listings CSV, given relative to the notebook's working directory.
file_path = 'data/uncleaned/listings.csv'

In [2]:
# Read the raw listings file. The first row holds the column names and the
# first CSV column becomes the index (no empty placeholder frame is needed
# beforehand, since read_csv returns a fresh DataFrame).
df = pd.read_csv(file_path, header=0, index_col=0)
# Show every available column name so the unwanted ones can be picked out.
print(list(df.columns))


['listing_url', 'scrape_id', 'last_scraped', 'source', 'name', 'description', 'neighborhood_overview', 'picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated', 'has_availability', 'availability_30', 'availability_60', 'availability_90', 'availability_36

In [3]:
# Labels of the columns removed from the listings frame before cleaning.
# The order of the labels is kept exactly as originally written; the comments
# only group them by the prefix/topic visible in their names.
columns_to_drop = [
    # listing metadata and free-text fields
    'listing_url', 'scrape_id', 'last_scraped', 'source',
    'name', 'description', 'neighborhood_overview', 'picture_url',
    # host_* profile fields
    'host_id', 'host_url', 'host_name', 'host_since', 'host_about',
    'host_response_time', 'host_response_rate', 'host_acceptance_rate',
    'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
    'host_listings_count', 'host_total_listings_count',
    'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
    # neighbourhood variants, room counts and amenities
    'neighbourhood', 'neighbourhood_group_cleansed',
    'bathrooms', 'bedrooms', 'amenities',
    # minimum / maximum night settings
    'minimum_nights', 'maximum_nights',
    'minimum_minimum_nights', 'maximum_minimum_nights',
    'minimum_maximum_nights', 'maximum_maximum_nights',
    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
    # calendar and shorter availability windows
    'calendar_updated', 'has_availability',
    'availability_30', 'availability_60', 'availability_90',
    'calendar_last_scraped',
    # review counts, dates and scores
    'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
    'first_review', 'last_review',
    'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value',
    # licensing, booking flag and calculated host listing counts
    'license', 'instant_bookable',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    'reviews_per_month',
    # columns labelled 'Unnamed: N' (presumably header-less trailing CSV columns - TODO confirm)
    'Unnamed: 75', 'Unnamed: 76', 'Unnamed: 77',
]

In [4]:
# Remove every column listed in columns_to_drop, then preview the first rows
# of what remains.
df = df.drop(labels=columns_to_drop, axis='columns')
print(df.head(n=5))

             host_location        host_neighbourhood  \
id                                                     
13188.0  Vancouver, Canada                Riley Park   
13221.0  Vancouver, Canada                Riley Park   
13358.0  Vancouver, Canada        Downtown Vancouver   
13490.0  Vancouver, Canada  Kensington-Cedar Cottage   
14267.0  Vancouver, Canada  Kensington-Cedar Cottage   

           neighbourhood_cleansed          latitude          longitude  \
id                                                                       
13188.0                Riley Park          49.24773         -123.10509   
13221.0                Riley Park          49.25489         -123.09708   
13358.0                  Downtown  49.2811737060547  -123.125930786133   
13490.0  Kensington-Cedar Cottage          49.25622         -123.06607   
14267.0  Kensington-Cedar Cottage          49.24922         -123.08139   

              property_type        room_type accommodates bathrooms_text beds  \
id     

In [5]:
# Row count before filtering.
print(len(df))
# Keep only rows whose 'availability_365' value is greater than 10, i.e. drop
# places that are hardly ever offered
# (presumably the value counts available days per year - TODO confirm).
df = df.loc[df['availability_365'].gt(10)]
# Row count after filtering.
print(len(df))

6691
4927


In [6]:
# List the distinct raw 'property_type' labels to see what needs cleaning.
print(df.loc[:, 'property_type'].unique())

['Entire rental unit' 'Entire home' 'Private room in condo'
 'Private room in home' 'Entire guest suite' 'Entire condo' 'Entire loft'
 'Private room in rental unit' 'Entire serviced apartment'
 'Entire guesthouse' 'Private room in guest suite'
 'Room in boutique hotel' 'Entire townhouse' '49.25107' 'Entire place'
 'Tiny home' 'Entire cottage' 'Private room in loft'
 'Private room in townhouse' 'Camper/RV' 'Floor' 'Private room in boat'
 'Shared room in rental unit' '-123.11172' 'Private room in bungalow'
 'Private room in guesthouse' 'Entire bungalow' 'Entire villa'
 '-123.11667' 'Private room in bed and breakfast' 'Private room in villa'
 'Private room in tiny home' 'Room in aparthotel' '-123.12454'
 'Entire vacation home' 'Boat' 'Shared room in villa'
 'Private room in serviced apartment' 'Shared room in loft'
 'Private room in casa particular' nan 'Casa particular'
 'Shared room in home' 'Room in hotel' '-123.107266374824'
 '-123.179117536209' 'Shared room in condo' 'Cave'
 'Private

In [7]:
import numpy as np

def simplify_property_type(value):
    """Collapse a raw ``property_type`` label into a broader category.

    The label is lower-cased once and the rules below are tried in order;
    the first rule that matches decides the result:

    1. contains ``'shared room'``  -> ``'Shared Room'``
    2. contains ``'private room'`` -> ``'Private Room'``
    3. contains ``'private condo'``, ``'private apartment'``,
       ``'private loft'`` or ``'private place'`` -> ``'Private Apartment'``
    4. contains ``'private guest suite'`` or ``'guesthouse'``
       -> ``'Private Guest Suite'``
    5. contains ``'entire home'``, ``'entire house'``, ``'entire villa'``,
       ``'entire place'``, ``'entire cottage'`` or ``'entire townhouse'``
       -> ``'Private House'``
    6. contains any digit -> ``NaN`` (not a real property type)
    7. anything else is returned unchanged.

    Parameters
    ----------
    value : str or missing value
        One cell of the ``property_type`` column.

    Returns
    -------
    str or float
        The simplified label, or ``np.nan`` when the input is missing, is not
        a string, or contains a digit.
    """
    if pd.isnull(value):
        return np.nan
    if not isinstance(value, str):
        # A non-text cell (e.g. a bare number) cannot be a property type.
        # Treat it like the digit-containing strings below instead of
        # failing on ``.lower()``.
        return np.nan

    lowered = value.lower()  # computed once and reused by every rule

    if 'shared room' in lowered:
        return 'Shared Room'
    if 'private room' in lowered:
        return 'Private Room'
    if any(term in lowered for term in ['private condo', 'private apartment', 'private loft', 'private place']):
        return 'Private Apartment'
    if 'private guest suite' in lowered or 'guesthouse' in lowered:
        return 'Private Guest Suite'
    if any(term in lowered for term in ['entire home', 'entire house', 'entire villa', 'entire place', 'entire cottage', 'entire townhouse']):
        return 'Private House'
    if any(char.isdigit() for char in value):
        # Labels containing digits (such as stray coordinates) are invalid.
        return np.nan
    return value

# Apply the function to the 'property_type' column.
# assign() builds a new frame instead of writing a column into the frame that
# came out of an earlier row filter, which avoids pandas'
# SettingWithCopyWarning; the resulting data is the same.
df = df.assign(property_type=df['property_type'].apply(simplify_property_type))

# Drop any rows with NaN in 'property_type' column (which includes improper accommodation names)
df = df.dropna(subset=['property_type'])
print(df['property_type'].unique())

['Entire rental unit' 'Private House' 'Private Room' 'Entire guest suite'
 'Entire condo' 'Entire loft' 'Entire serviced apartment'
 'Private Guest Suite' 'Room in boutique hotel' 'Tiny home' 'Camper/RV'
 'Floor' 'Shared Room' 'Entire bungalow' 'Room in aparthotel'
 'Entire vacation home' 'Boat' 'Casa particular' 'Room in hotel' 'Cave']


In [8]:
def condense_property_type(value):
    """Merge already-simplified property labels into the final categories.

    The label is compared against each group below, in order; the first group
    containing it supplies the result. 'Floor' and 'Casa particular' map to
    ``None`` so that those rows can be dropped afterwards. A label that is in
    no group is returned unchanged.

    Parameters
    ----------
    value : str
        A property label.

    Returns
    -------
    str or None
        The condensed category, ``None`` for discarded categories, or the
        input itself when no group matches.
    """
    category_groups = (
        (('Entire condo', 'Entire loft', 'Entire serviced apartment', 'Entire bungalow'),
         'Private Apartment'),
        (('Room in boutique hotel', 'Room in aparthotel', 'Room in hotel'),
         'Hotel Room'),
        (('Entire vacation home', 'Entire rental unit', 'Private House', 'Tiny home'),
         'Private House'),
        (('Private Guest Suite', 'Entire guest suite'),
         'Private Guest Suite'),
        # Categories that are removed from the data set altogether.
        (('Floor', 'Casa particular'),
         None),
    )
    for members, category in category_groups:
        if value in members:
            return category
    return value

# Apply the function to the 'property_type' column.
# assign() returns a new frame rather than setting a column on the result of
# the previous dropna(), which avoids pandas' SettingWithCopyWarning; the
# resulting data is the same.
df = df.assign(property_type=df['property_type'].apply(condense_property_type))

# Drop any rows with None in 'property_type' column (which includes the dropped categories)
df = df.dropna(subset=['property_type'])

print(df['property_type'].unique())

['Private House' 'Private Room' 'Private Guest Suite' 'Private Apartment'
 'Hotel Room' 'Camper/RV' 'Shared Room' 'Boat' 'Cave']


In [9]:
# Number of rows left after the property-type clean-up.
print(len(df.index))

4918


In [10]:
# Preview the first five rows of the frame in its current state.
print(df.head(n=5))

             host_location        host_neighbourhood  \
id                                                     
13221.0  Vancouver, Canada                Riley Park   
13490.0  Vancouver, Canada  Kensington-Cedar Cottage   
14267.0  Vancouver, Canada  Kensington-Cedar Cottage   
16611.0                NaN          Commercial Drive   
18270.0  Vancouver, Canada            Mount Pleasant   

           neighbourhood_cleansed  latitude   longitude  property_type  \
id                                                                       
13221.0                Riley Park  49.25489  -123.09708  Private House   
13490.0  Kensington-Cedar Cottage  49.25622  -123.06607  Private House   
14267.0  Kensington-Cedar Cottage  49.24922  -123.08139  Private House   
16611.0        Grandview-Woodland  49.26339  -123.07145  Private House   
18270.0            Mount Pleasant  49.26557    -123.096   Private Room   

               room_type accommodates bathrooms_text beds    price  \
id                

In [11]:
# List the distinct raw 'bathrooms_text' values before condensing them.
print(df.loc[:, 'bathrooms_text'].unique())

['1 bath' '1 shared bath' '1 private bath' '2 baths' '1.5 baths'
 '2.5 baths' '3 baths' '4.5 baths' '4 baths' '1.5 shared baths' '0 baths'
 '2 shared baths' '3.5 baths' '2.5 shared baths' '6 baths'
 '3 shared baths' '5 baths' '5.5 baths' 'Private half-bath' '8 baths' nan
 '6.5 baths' 'Shared half-bath' '10 baths' 'Half-bath' '7 baths']


In [12]:
def condense_bath_info(bath_info):
    """Shorten a ``bathrooms_text`` value to a count, optionally tagged 'shared'.

    Examples: ``'2.5 baths' -> '2.5'``, ``'1 private bath' -> '1'``,
    ``'1.5 shared baths' -> '1.5 shared'``, ``'Half-bath' -> '.5'``,
    ``'Shared half-bath' -> '.5 shared'``.

    Parameters
    ----------
    bath_info : str or missing value
        One cell of the ``bathrooms_text`` column.

    Returns
    -------
    str or None
        The condensed text, or ``None`` when the input is missing.
    """
    if pd.isna(bath_info):
        # None becomes NaN once applied to the column, so the row can be dropped later.
        return None

    # Substitutions are applied strictly in this order: remove the bath/private
    # words and turn a "half-" prefix into ".5 ".
    substitutions = (
        ('baths', ''),
        ('Private', ''),
        ('Half-', '.5 '),
        ('half-', '.5 '),
        ('private', ''),
        ('bath', ''),
    )
    condensed = bath_info
    for old_text, new_text in substitutions:
        condensed = condensed.replace(old_text, new_text)
    condensed = condensed.strip()

    if condensed.startswith('Shared'):
        # A leading 'Shared' is moved to the end in lower case, e.g. '.5 shared'.
        condensed = condensed.replace('Shared ', '') + ' shared'
    elif 'shared' in condensed:
        # NOTE(review): this removes 'shared ' only when it is followed by more
        # text; a trailing 'shared' (as in '1 shared') is left as it is.
        # Confirm whether moving the word to the end was intended instead.
        condensed = condensed.replace('shared ', '')
    return condensed.strip()

# Apply the function to the 'bathrooms_text' column.
# assign() returns a new frame rather than setting a column on the result of
# the previous dropna(), which avoids pandas' SettingWithCopyWarning; the
# resulting data is the same.
df = df.assign(bathrooms_text=df['bathrooms_text'].apply(condense_bath_info))

# Drop any rows with a missing 'bathrooms_text' value (the function returns None for those)
df = df.dropna(subset=['bathrooms_text'])


print(df['bathrooms_text'].unique())

['1' '1 shared' '2' '1.5' '2.5' '3' '4.5' '4' '1.5 shared' '0' '2 shared'
 '3.5' '2.5 shared' '6' '3 shared' '5' '5.5' '.5' '8' '6.5' '.5 shared'
 '10' '7']


In [15]:
# Distinct 'beds' values before removing rows where 'beds' is missing ...
print(df['beds'].unique())
df = df.loc[df['beds'].notna()]
# ... and after, followed by the room types that remain.
print(df['beds'].unique())
print(df['room_type'].unique())

['2' '1' '4' '3' '6' '5' '7' '8' '10' '13' '9']
['2' '1' '4' '3' '6' '5' '7' '8' '10' '13' '9']
['Entire home/apt' 'Private room' 'Shared room']


In [16]:
# Create the output folder if it is missing: to_csv does not create
# directories and would otherwise fail on a fresh checkout.
os.makedirs('data/cleaned', exist_ok=True)
# Write the cleaned listings (index included) to disk.
df.to_csv('data/cleaned/airbnb-cleaned.csv')