In [5]:
import gzip
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd


In [3]:
# Location of the gzip-compressed listings export, relative to this notebook.
airbnb_data_path = "../data/airbnb_data/"
listings_csv_path = airbnb_data_path + "listings.csv.gz"

# pandas decompresses the archive itself, so the file is read exactly once here;
# there is no need to pull the raw text into memory separately beforehand.
df_listings = pd.read_csv(listings_csv_path, compression='gzip', header=0, sep=',', quotechar='"')

In [6]:
# Drop rows without a price, then strip the dollar signs and thousands separators
# from the price text (presumably values like "$1,234.00") and convert to float.
# regex=False treats '$' as a literal character on every pandas version; as a
# regular expression it would be the end-of-string anchor instead.
df_listings = df_listings.dropna(subset=['price']).assign(
    price=lambda frame: (
        frame['price']
        .astype(str)
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )
)

# listings data as gdf: points are built from longitude/latitude declared as
# EPSG:4326, then reprojected to EPSG:26916 (common projection)
gdf_listings = gpd.GeoDataFrame(
    df_listings,
    geometry=gpd.points_from_xy(df_listings.longitude, df_listings.latitude),
    crs='EPSG:4326',
).to_crs(epsg=26916)

# remove one outlier value above the cap (faulty data?)
MAX_PRICE = 10000
gdf_listings = gdf_listings[gdf_listings['price'] <= MAX_PRICE]

# the subset of columns used for the visualisation is selected in a later cell;
# list everything that is available here
gdf_listings.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

In [16]:
# Columns carried into the treemap data, grouped by the role they play.
interesting_cols = (
    # keep for filtering
    ['latitude', 'longitude']
    # classes for levels in treemap
    # (review_scores_rating: maybe bin into classes such as 4.8-4.9, 4.7-4.8, ...?)
    + ['property_type', 'room_type', 'review_scores_rating']
    # figures to visualize
    + ['accommodates', 'bedrooms', 'beds', 'price']
)

# Independent copy so later edits to vis_data do not touch gdf_listings.
vis_data = gdf_listings.loc[:, interesting_cols].copy()

vis_data.head(10)


Unnamed: 0,latitude,longitude,property_type,room_type,review_scores_rating,accommodates,bedrooms,beds,price
0,41.89634,-87.65608,Entire rental unit,Entire home/apt,4.9,6,3.0,4.0,178.0
1,41.94272,-87.68409,Entire condo,Entire home/apt,4.92,4,2.0,2.0,228.0
2,41.96316,-87.69208,Entire rental unit,Entire home/apt,4.79,2,1.0,1.0,88.0
3,41.90006,-87.68096,Entire condo,Entire home/apt,4.8,3,1.0,2.0,135.0
4,41.88152,-87.69768,Private room in townhouse,Private room,4.77,1,1.0,1.0,36.0
5,41.79606,-87.62382,Private room in condo,Private room,4.75,2,1.0,1.0,80.0
6,41.85669,-87.65132,Private room in rental unit,Private room,4.89,2,1.0,1.0,65.0
7,41.85295,-87.70686,Private room in home,Private room,3.67,1,1.0,1.0,45.0
9,41.85361,-87.62434,Entire rental unit,Entire home/apt,4.76,2,1.0,2.0,90.0
10,41.89288,-87.68458,Entire rental unit,Entire home/apt,4.92,2,1.0,1.0,147.0


In [19]:
# Write the visualisation table to disk; the row index is left out of the CSV.
vis_data_csv_path = "./51912112_Individiual_Viualization/vis_data.csv"
vis_data.to_csv(vis_data_csv_path, index=False)