## Common script for loading all the necessary data
Running this notebook once is enough to generate the necessary CSV/JSON files. The paths below may need to be adjusted if your project structure differs.

### Paths

In [None]:
# Airbnb
# Gzip-compressed listings CSV; read with pandas in the "Loading the data" cell.
# Relative path, so it is resolved against the notebook's working directory.
airbnb_listings_path = "listings.csv.gz"
# Destination of the reduced visualisation table written in the "Save it" cell.
airbnb_output_path = "vis_data.csv"

# Crime


### Imports

In [None]:
import gzip
import pandas as pd
import geopandas as gpd

### Airbnb Data

#### Loading the data

In [None]:
# Load the data
# pandas decompresses the gzip archive on its own, so the file is read exactly
# once here; a separate gzip.open(...).read() pass would only hold a second,
# unused copy of the whole decompressed file in memory.
df_listings = pd.read_csv(
    airbnb_listings_path,
    compression='gzip',
    header=0,        # first row holds the column names
    sep=',',
    quotechar='"',
)

#### Cleaning the data

In [None]:
# get rid of the dollar signs
# Prices are stored as text containing '$' and ',' characters; drop rows
# without a price, strip both characters and cast the remainder to float.
# .assign builds a new frame, so no column is written onto a filtered slice.
df_listings = (
    df_listings
    .dropna(subset=['price'])  # remove missings
    .assign(
        price=lambda listings: (
            listings['price']
            .astype(str)
            # regex=False: '$' is a regex anchor, so it must be matched
            # literally regardless of the pandas version's default
            .str.replace('$', '', regex=False)
            .str.replace(',', '', regex=False)
            .astype(float)
        )
    )
)

# listings data as gdf: points are built from longitude/latitude, declared as
# EPSG:4326 at construction time and then reprojected without in-place mutation
gdf_listings = gpd.GeoDataFrame(
    df_listings,
    geometry=gpd.points_from_xy(df_listings.longitude, df_listings.latitude),
    crs='EPSG:4326',
).to_crs(epsg=26916)  # common projection

# remove outlier values (faulty data?)
# NOTE(review): every listing priced above the cap is dropped, not just a
# single row -- confirm the threshold against the data.
MAX_LISTING_PRICE = 10000
gdf_listings = gdf_listings[gdf_listings['price'] <= MAX_LISTING_PRICE]

gdf_listings.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

#### Filter only what's necessary

In [6]:
# Columns carried over into the visualisation table, grouped by purpose.
interesting_cols = (
    # location fields, kept so the data can be filtered
    ['latitude', 'longitude', 'neighbourhood_cleansed']
    # class-like fields: treemap levels and grouping keys for the averages
    # (review_scores_rating is continuous -- candidate for bucketing into
    # classes such as 4.8-4.9, 4.7-4.8, ...)
    + ['property_type', 'room_type', 'review_scores_rating']
    # numeric fields used for the treemap levels / average calculation
    + ['accommodates', 'bedrooms', 'beds', 'price']
)

# Independent copy restricted to those columns, so later column additions
# do not touch gdf_listings.
vis_data = gdf_listings.loc[:, interesting_cols].copy()

vis_data.head(10)

# Bucket review_scores_rating into ordered classes. The edges get finer
# towards 5.0; each label is "<lower edge>-<upper edge>" of its interval.
bins = [0.0, 1.0, 2.0, 3.0, 3.5, 4.0, 4.25, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0]
labels = [f'{lower}-{upper}' for lower, upper in zip(bins[:-1], bins[1:])]

# optional numeric code per label (position in `labels`) for plotting/encoding
mapping = dict(zip(labels, range(len(labels))))

# Intervals are closed on the right (a rating of 4.9 lands in '4.8-4.9');
# include_lowest=True additionally keeps a rating of exactly 0.0.
# Listings without a rating get their own explicit 'No rating' category.
vis_data['rating_bucket'] = (
    pd.cut(
        vis_data['review_scores_rating'],
        bins=bins,
        labels=labels,
        include_lowest=True,
    )
    .cat.add_categories(['No rating'])
    .fillna('No rating')
)

vis_data.head()

Unnamed: 0,latitude,longitude,neighbourhood_cleansed,property_type,room_type,review_scores_rating,accommodates,bedrooms,beds,price,rating_bucket
0,41.89634,-87.65608,West Town,Entire rental unit,Entire home/apt,4.9,6,3.0,4.0,178.0,4.8-4.9
1,41.94272,-87.68409,North Center,Entire condo,Entire home/apt,4.92,4,2.0,2.0,228.0,4.9-5.0
2,41.96316,-87.69208,Lincoln Square,Entire rental unit,Entire home/apt,4.79,2,1.0,1.0,88.0,4.7-4.8
3,41.90006,-87.68096,West Town,Entire condo,Entire home/apt,4.8,3,1.0,2.0,135.0,4.7-4.8
4,41.88152,-87.69768,East Garfield Park,Private room in townhouse,Private room,4.77,1,1.0,1.0,36.0,4.7-4.8


#### Save it

In [None]:
# Write the reduced table to airbnb_output_path; index=False leaves the
# DataFrame's row index out of the CSV.
vis_data.to_csv(airbnb_output_path, index=False)