In [None]:
# All import statements needed for the project
import json
import requests
import pandas as pd
import urllib.parse
import requests
import pathlib
from pathlib import Path
import psycopg2
import geoalchemy2 as gdb
import geopandas as gpd
import matplotlib.pyplot as plt
import shapely
import sqlalchemy as db

from sqlalchemy.orm import declarative_base

## 1 Downloading, cleaning & filtering data

In [None]:
def download_nyc_geojson_data(url, file_name, force=False):
    """Download a GeoJSON resource into a local cache file and load it.

    The payload is stored as ``<file_name>.geojson``. When that file already
    exists it is reused instead of being downloaded again, unless ``force``
    is true.

    Parameters
    ----------
    url : str
        Address of the GeoJSON resource to fetch.
    file_name : str
        Path of the cache file without the ``.geojson`` suffix.
    force : bool, default False
        Download again even when the cache file is already present.

    Returns
    -------
    geopandas.GeoDataFrame
        The contents of the cached GeoJSON file.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status; nothing is written to
        the cache file in that case.
    """
    filename = Path(file_name + '.geojson')

    if force or not filename.exists():
        print(f"Downloading {url} to {file_name}...")

        response = requests.get(url)
        # Fail before touching the cache: otherwise an error body would be
        # saved as the .geojson file and reused on every later run.
        response.raise_for_status()

        # Store the raw bytes so the file does not depend on the platform's
        # default text encoding or newline translation.
        filename.write_bytes(response.content)
        print(f"Done downloading {url}.")

    else:
        print(f"Reading from {file_name}...")

    gdf = gpd.read_file(filename, driver='GeoJSON')
    return gdf

complaints

In [None]:
def download_and_clean_311_data():
    """Fetch the 311 complaints sample and return only its complete rows.

    The GeoJSON is loaded through ``download_nyc_geojson_data`` (cached on
    disk as ``complaints_head(1000).geojson``). Only the ``created_date``,
    ``incident_zip``, ``complaint_type`` and ``geometry`` columns are kept,
    and every row with a missing value in any of them is dropped.

    Returns
    -------
    The filtered frame restricted to the four columns above.
    """
    # NOTE(review): the app token is embedded in the URL literal; consider
    # loading it from configuration instead.
    source_url = "https://data.cityofnewyork.us/resource/erm2-nwe9.geojson?$$app_token=RbFfvU4T8a7C7rDHaA9eqAkvZ"
    raw_complaints = download_nyc_geojson_data(source_url, 'complaints_head(1000)', force=False)

    # Keep only the columns that are needed.
    wanted_columns = ['created_date', 'incident_zip', 'complaint_type', 'geometry']
    cleaned = raw_complaints[wanted_columns]

    # Remove NaN: find the columns holding at least one missing value, then
    # drop the affected rows one column at a time, logging each step.
    has_missing = cleaned.isnull().any()
    for column_name in cleaned.columns[has_missing].tolist():
        print("Processing column:", column_name)
        cleaned = cleaned.dropna(subset=[column_name])
        print("Removed rows with NaN value in", column_name)

    return cleaned

trees

In [None]:
def download_and_clean_tree_data():
    """Fetch the tree sample and return only its complete rows.

    The GeoJSON is loaded through ``download_nyc_geojson_data`` (cached on
    disk as ``trees_head(1000).geojson``). The frame is first narrowed to the
    ``tree_id``, ``health``, ``status``, ``latitude``, ``longitude`` and
    ``geometry`` columns; rows with a missing value in any of those columns
    are then dropped.

    Returns
    -------
    The filtered frame restricted to the six columns above.
    """
    url = "https://data.cityofnewyork.us/resource/5rq2-4hqu.geojson?$$app_token=RbFfvU4T8a7C7rDHaA9eqAkvZ"
    file_name = 'trees_head(1000)'
    trees = download_nyc_geojson_data(url, file_name, force=False)

    # Choose the columns we need BEFORE removing NaN, as the 311 loader does.
    # Dropping on the full frame first would discard rows only because some
    # column that is thrown away anyway has a gap.
    new_trees = trees[['tree_id', 'health', 'status', 'latitude', 'longitude', 'geometry']].copy()

    # Remove NaN from the retained columns, logging each affected column.
    columns_with_nan = new_trees.columns[new_trees.isnull().any()].tolist()

    for column_name in columns_with_nan:
        print("Processing column:", column_name)
        new_trees = new_trees.dropna(subset=[column_name])
        print("Removed rows with NaN value in", column_name)

    return new_trees

zipcodes

In [None]:
def load_and_clean_zipcodes(zipcode_datafile):
    """Read the zipcode file and keep only the zip code and its geometry.

    Parameters
    ----------
    zipcode_datafile
        Path handed to ``geopandas.read_file``.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``ZIPCODE`` and ``geometry``.
    """
    wanted_columns = ['ZIPCODE', 'geometry']
    shapes = gpd.read_file(zipcode_datafile)

    # Built through the pandas.DataFrame constructor, so the result is a
    # plain DataFrame rather than a GeoDataFrame.
    # NOTE(review): geometry values are returned as loaded; an earlier draft
    # converted them to WKT text for SQL storage -- confirm whether that is
    # still needed downstream.
    return pd.DataFrame(data=shapes, columns=wanted_columns)

In [None]:
rents

In [None]:
def load_and_clean_zillow_data(rent_datafile):
    """Read the Zillow rent CSV and strip its descriptive region columns.

    Parameters
    ----------
    rent_datafile
        Path (or buffer) handed to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents without the ``RegionID``, ``SizeRank``,
        ``RegionType``, ``State``, ``StateName``, ``City``, ``Metro`` and
        ``CountyName`` columns; all other columns are kept unchanged.

    Raises
    ------
    KeyError
        If any of the columns listed above is absent from the file.
    """
    metadata_columns = [
        'RegionID', 'SizeRank', 'RegionType', 'State',
        'StateName', 'City', 'Metro', 'CountyName',
    ]
    return pd.read_csv(rent_datafile).drop(columns=metadata_columns)

## 1.2 load all data

In [None]:
# Local input files, given as paths relative to the current working directory.
# Zipcode shapefile passed to load_and_clean_zipcodes by load_all_data.
ZIPCODE_DATA_FILE = 'data/zipcodes/nyc_zipcodes.shp'
# Zillow rent CSV passed to load_and_clean_zillow_data by load_all_data.
RENT_DATA_FILE = 'data/zillow_rent_data.csv'

In [None]:
def load_all_data():
    """Load the four project datasets in one call.

    The loaders run in this order: zipcodes (from ``ZIPCODE_DATA_FILE``),
    311 complaints, trees, and Zillow rents (from ``RENT_DATA_FILE``).

    Returns
    -------
    tuple
        ``(zipcodes, complaints_311, trees, zillow_rents)`` -- the results of
        the four loaders, in that order.
    """
    # Tuple elements are evaluated left to right, so the loaders run in the
    # order they are listed.
    return (
        load_and_clean_zipcodes(ZIPCODE_DATA_FILE),
        download_and_clean_311_data(),
        download_and_clean_tree_data(),
        load_and_clean_zillow_data(RENT_DATA_FILE),
    )

In [None]:
# Run every loader once and unpack the four resulting frames.
geodf_zipcode_data, geodf_311_data, geodf_tree_data, df_zillow_data = load_all_data()

In [None]:
# Show basic info (columns, dtypes, non-null counts) for the zipcode frame
geodf_zipcode_data.info()

In [None]:
# Show the first 5 entries of the zipcode frame
geodf_zipcode_data.head()

In [None]:
# Show basic info (columns, dtypes, non-null counts) for the 311 complaints frame
geodf_311_data.info()

In [None]:
# Show the first 5 entries of the 311 complaints frame
geodf_311_data.head()

In [None]:
# Show basic info (columns, dtypes, non-null counts) for the tree frame
geodf_tree_data.info()

In [None]:
# Show the first 5 entries of the tree frame
geodf_tree_data.head()

In [None]:
# Show basic info (columns, dtypes, non-null counts) for the Zillow rent frame
df_zillow_data.info()

In [None]:
# Show the first 5 entries of the Zillow rent frame
df_zillow_data.head()