# Reading all csv and json datasets, combine counts for  into one central file 

This notebook is used to load all relevant datasets found on https://data.stadt-zuerich.ch/ as well as the population per zip code data from https://opendata.swiss/en/dataset/bevoelkerung-pro-plz into a central file that will be used for the rest of the analysis.

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
from os import listdir

In [2]:
DATA_PATH = './csv_json_files/'

## Reading and loading the csv files

In [3]:
# create function that reads cleans and appends csv to the general dataframe
def csv_read_clean_append(csv_file, duplicate_check, group_on, new_column, append_target=None):
    
    # load csv into dataframe, drop any duplacte rows, count values for group_on series 
    temp_df = pd.read_csv(DATA_PATH + csv_file).drop_duplicates(duplicate_check)[group_on].value_counts().to_frame(new_column)
    
    if append_target is None:
        return temp_df
    else:
        return append_target.join(temp_df)

In [4]:
# start with addresses and number of hospitality companies
combined_df = csv_read_clean_append('adressen.csv', 'adresse', 'plz', 'addresses')
combined_df = csv_read_clean_append('gastwirtschaftsbetriebe_per_20171231.csv',\
                                    'Betriebsname', 'plz', 'hospitality_companies', combined_df)

We now insert the population per zip code. We have to do some manual touching up, as the population is a sum of number of women and men.

In [5]:
# read population per zip code file
population_df = pd.read_csv(DATA_PATH+'bevoelkerung_proplz.csv',delimiter=';')

# sum the men and women count per zip code to get population per zip code
population_df = population_df[population_df['typ']\
                                .isin(['w', 'm'])]\
                                .groupby('plz')['anzahl']\
                                .agg('sum')\
                                .to_frame('population')

# merge the population data to the rest
combined_df = pd.merge(combined_df, population_df, 'inner', left_index=True,right_index=True)
combined_df.head()

Unnamed: 0,addresses,hospitality_companies,population
8050,4134,153.0,34412
8049,3498,34.0,27853
8048,3492,95.0,33499
8032,3403,50.0,23866
8046,3016,31.0,29139


## Reading and loading the json files

In [6]:
# create function that reads cleans and appends json datasets to the general dataframe
def json_read_clean_append(json_file, duplicate_check, group_on, new_column, append_target=None):
    
    # first read json using geopandas
    temp_df = gpd.read_file(DATA_PATH + json_file)
    
    
    # load csv into dataframe, drop any duplicate rows, count values for group_on series 
    temp_df = pd.DataFrame(temp_df).drop_duplicates(duplicate_check)[group_on].value_counts().to_frame(new_column)
    temp_df.index = pd.to_numeric(temp_df.index)
    
    if append_target is None:
        return temp_df
    else:
        return pd.merge(append_target, temp_df, 'left', left_index=True,right_index=True)

In [7]:
# loop through json files in data folder and add to general df with function above
for file in listdir(DATA_PATH):
    if file.endswith('.json'):
        combined_df = json_read_clean_append(file,'adresse', 'plz', file[:-5], combined_df)

In [8]:
# drop zip codes that are not used (last 5 rows), and fill other nans with zeros
combined_df = combined_df.dropna(subset=['hospitality_companies']).fillna(0)
combined_df

Unnamed: 0,addresses,hospitality_companies,population,indoor_pools,Mobility_rental,outdoor_pools,ice_rinks,police_locations,skate_parks,elementary_schools,nurseries,football_fields,kindergartens,tennis_courts,bikeparks,community_centers,care_centers,beachvolleyball
8050,4134,153.0,34412,1.0,18.0,0.0,1.0,2.0,0.0,8.0,30.0,2.0,19.0,0.0,0.0,1.0,1.0,0.0
8049,3498,34.0,27853,1.0,10.0,0.0,0.0,1.0,0.0,7.0,13.0,0.0,15.0,0.0,0.0,1.0,1.0,1.0
8048,3492,95.0,33499,1.0,11.0,1.0,0.0,1.0,1.0,9.0,16.0,3.0,20.0,1.0,0.0,1.0,1.0,3.0
8032,3403,50.0,23866,1.0,13.0,0.0,0.0,1.0,0.0,5.0,19.0,0.0,7.0,0.0,0.0,1.0,0.0,0.0
8046,3016,31.0,29139,0.0,7.0,0.0,0.0,1.0,1.0,7.0,10.0,1.0,17.0,1.0,0.0,1.0,1.0,0.0
8008,2929,110.0,23162,0.0,12.0,0.0,0.0,2.0,0.0,7.0,23.0,0.0,8.0,1.0,0.0,1.0,1.0,0.0
8006,2894,69.0,22387,0.0,20.0,0.0,0.0,2.0,0.0,5.0,24.0,0.0,8.0,0.0,0.0,1.0,0.0,1.0
8057,2786,35.0,24258,0.0,12.0,1.0,0.0,0.0,0.0,3.0,12.0,0.0,10.0,0.0,0.0,1.0,2.0,0.0
8051,2714,37.0,25792,0.0,7.0,1.0,0.0,1.0,1.0,9.0,12.0,1.0,16.0,1.0,0.0,1.0,1.0,1.0
8004,2663,359.0,34271,0.0,15.0,0.0,0.0,6.0,1.0,6.0,20.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0
