In [10]:
# Load libraries
import pandas as pd
import os
import re

In [9]:
# Open yvr_listing_data.csv in the data folder
listings_df = pd.read_csv(os.path.join('data', 'yvr_listing_data.csv'))

# Print columns
print(listings_df.columns)

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

In [25]:
%%capture --no-stdout
"""
Create a new column titled "legal_listing" that contains the boolean describing whether or not the listing has a valid license.
The column is True if the listing has a valid license or does not require one and False if the listing does not have a valid license.
To compute the value of the column, we use the following logic:

If the listing has a number in the "license" column with the regex pattern of r'.*?(\d{2}[-\s]?\d{6}).*?' 
OR the listing has a number in the "minimum_nights" column with a value equal to or greater than 30,
THEN the "legal_listing" is True. ELSE the "valid_license" is False.

Note:
The regex pattern r'.*?(\d{2}[-\s]?\d{6}).*?' is used to find a number with 2 digits, followed by a dash or space, 
followed by 6 digits. The number can be surrounded by any number of characters. 
TODO: Verify this is the correct pattern for the license numbers and find any other ways of verifying legitimate license numbers.
"""

regex_pattern = re.compile(r'.*?(\d{2}[-\s]?\d{6}).*?')

# Create the valid_license column using the logic described above
listings_df['legal_listing'] = listings_df['license'].str.contains(regex_pattern) | (listings_df['minimum_nights'] >= 30)

# Print only the columns we are interested in
print(listings_df[['id', 'license', 'minimum_nights', 'legal_listing']])

# Print count of valid and invalid licenses
print(listings_df['legal_listing'].value_counts())

                      id    license  minimum_nights  legal_listing
0                  13188  23-156488               2           True
1                  13358  22-311727               1           True
2                  13490        NaN              30          False
3                  14267  21-156500               3           True
4                  14424  19-162091              30           True
...                  ...        ...             ...            ...
6690  973179206514018336        NaN              30          False
6691  973258393610061991        NaN              30          False
6692  973300488231367035  23-158627               1           True
6693  973307866073484004  23-272980               1           True
6694  973438165463370718  23-212933               2           True

[6695 rows x 4 columns]
legal_listing
True     4596
False    2099
Name: count, dtype: int64


In [26]:
# Save the dataframe to a new csv file
listings_df.to_csv(os.path.join('data', 'yvr_listing_data_cleaned.csv'), index=False)