# NYC Apartment Search

_[Project prompt](https://docs.google.com/document/d/1BYVyFBDcTywdUlanH0ysfOrNWPgl7UkqXA7NeewTzxA/edit#heading=h.bpxu7uvknnbk)_

_This scaffolding notebook may be used to help set up your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add code as you wish._

_**All code below should be considered "pseudo-code" - not functional by itself, and only an idea of a possible approach.**_

## Setup

In [75]:
# All import statements needed for the project, for example:
import math
from datetime import datetime
from datetime import date
import numpy as np
import json
import pathlib
import urllib.parse
import psycopg2
import psycopg2.extras
import geoalchemy2 as gdb
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch

from pyproj import CRS, Transformer
from shapely.ops import transform
import seaborn as sns
import pandas as pd
import requests
import shapely
import sqlalchemy as db
import os
from shapely.geometry import Point
from sqlalchemy.orm import declarative_base

In [3]:
# Any constants you might need; some have been added for you

# Where data files will be read from/written to - this should already exist
DATA_DIR = pathlib.Path("data")
ZIPCODE_DATA_FILE = DATA_DIR / "zipcodes" / "nyc_zipcodes.shp"
ZILLOW_DATA_FILE = DATA_DIR / "zillow_rent_data.csv"

# NYC Open Data (Socrata) access.
# The token can be supplied through the NYC_OPEN_DATA_APP_TOKEN environment
# variable; the literal below is only the backward-compatible fallback.
# NOTE(review): the fallback is a credential committed to source - consider
# rotating it and dropping the literal.
APP_TOKEN = os.environ.get("NYC_OPEN_DATA_APP_TOKEN", "noFU7vdLMu3RKtmWOyOqkOi2x")
BASE_NYC_DATA_URL = "https://data.cityofnewyork.us/resource/"
NYC_DATA_311 = "erm2-nwe9.json"
NYC_DATA_TREES = "5rq2-4hqu.geojson"

DB_NAME = "Final_Project"
DB_USER = "postgres"
# SQLAlchemy names the dialect "postgresql"; the legacy "postgres" alias is
# rejected by SQLAlchemy 1.4 and later.
DB_URL = f"postgresql+psycopg2://{DB_USER}@localhost/{DB_NAME}"
DB_SCHEMA_FILE = "schema.sql"

# Coordinate reference system used for every geometry column (WGS 84).
crs = 'EPSG:4326'

# Filled in by load_and_clean_zillow_data: column name -> zero-based position.
rent_month_dict = {}
# directory where DB queries for Part 3 will be saved
QUERY_DIR = pathlib.Path("queries")

In [3]:
# Make sure QUERY_DIR exists. exist_ok avoids the check-then-create race and
# parents=True also creates any missing parent directories.
QUERY_DIR.mkdir(parents=True, exist_ok=True)

## Part 1: Data Preprocessing

In [14]:
def download_nyc_311_data(url, start, end, limit, force=False):
    """Download a range of NYC 311 rows into DATA_DIR and return the file path.

    Rows are requested page by page through the Socrata ``$limit``/``$offset``
    parameters and the combined result is written as a single JSON list. When
    the target file already exists and ``force`` is false, nothing is fetched.

    Args:
        url: Full resource URL; its last path segment becomes the file name.
        start: Offset of the first row to request.
        end: Offset one past the last row to request.
        limit: Maximum number of rows per request; must be positive.
        force: Re-download even if the file already exists.

    Returns:
        pathlib.Path of the JSON file inside DATA_DIR.

    Raises:
        ValueError: If ``limit`` is not positive.
    """
    if limit <= 0:
        raise ValueError("limit must be a positive number of rows per request")

    url_path = urllib.parse.urlparse(url).path.split('/')[-1]
    filename = DATA_DIR / url_path

    DATA_DIR.mkdir(parents=True, exist_ok=True)

    if not force and filename.exists():
        current_time = datetime.now().strftime("%H:%M:%S")
        print(f"Current Time = {current_time} Reading from {filename}...")
        return filename

    print(f"Downloading {url} to {filename}...")

    selected_columns = ["unique_key", "created_date", "complaint_type", "incident_zip", "location"]
    select_clause = ','.join(selected_columns)
    all_entries = []
    offset = start

    while offset < end:
        # Shrink the final page so a range that is not a multiple of `limit`
        # is still fetched completely (and never overshoots `end`).
        page_size = min(limit, end - offset)
        total = offset + page_size
        print(f"start from {offset} and end is {total}")
        # `$order=:id` keeps row order stable between pages; without an
        # explicit order, offset paging can repeat or skip rows.
        soql_query = (
            f"{url}?$$app_token={APP_TOKEN}&$select={select_clause}"
            f"&$order=:id&$limit={page_size}&$offset={offset}"
        )
        response = requests.get(soql_query)
        if response.status_code != 200:
            # NOTE: best-effort by design - whatever was fetched so far is
            # still written below, so the cached file may be incomplete.
            print(f"cannot access url error code{response.status_code}")
            break
        entries = response.json()
        if not entries:
            # An empty page means the dataset has no more rows.
            break
        all_entries.extend(entries)
        offset += page_size

    with open(filename, "w") as f:
        json.dump(all_entries, f)
    print(f"Done downloading {url}.")

    return filename

In [15]:
def _rows_from_socrata_payload(payload):
    """Return a list of flat row dicts from a JSON-list or GeoJSON response body."""
    if isinstance(payload, dict):
        # GeoJSON FeatureCollection: lift each feature's properties to the top
        # level and keep its geometry under "the_geom", matching the row shape
        # of the plain ".json" endpoint.
        rows = []
        for feature in payload.get("features") or []:
            row = dict(feature.get("properties") or {})
            row["the_geom"] = feature.get("geometry")
            rows.append(row)
        return rows
    return list(payload)


def download_nyc_tree_data(url, force=False):
    """Download the NYC tree dataset into DATA_DIR and return the file path.

    The response is fetched through the Socrata ``$limit``/``$offset``
    parameters and stored as a JSON list of row dicts. A GeoJSON response
    (a dict with a "features" list) is flattened first; extending a list with
    that dict directly would only store its top-level keys.

    Args:
        url: Full resource URL; its last path segment becomes the file name.
        force: Re-download even if the file already exists.

    Returns:
        pathlib.Path of the file inside DATA_DIR.
    """
    url_path = urllib.parse.urlparse(url).path.split('/')[-1]
    filename = DATA_DIR / url_path

    DATA_DIR.mkdir(parents=True, exist_ok=True)

    if not force and filename.exists():
        print(f"Reading from {filename}...")
        return filename

    print(f"Downloading {url} to {filename}...")

    limit = 1_000_000
    end = 1_000_000
    offset = 0
    all_entries = []
    selected_columns = ["created_at", "tree_id", "zipcode", "the_geom", "spc_common", "health", "status"]
    select_clause = ','.join(selected_columns)

    while offset < end:
        total = limit + offset
        print(f"start from {offset} and total is {total}")
        soql_query = f"{url}?$$app_token={APP_TOKEN}&$select={select_clause}&$limit={limit}&$offset={offset}"
        response = requests.get(soql_query)
        if response.status_code != 200:
            # NOTE: best-effort by design - rows fetched so far are still saved.
            print(f"cannot access url error code{response.status_code}")
            break
        entries = _rows_from_socrata_payload(response.json())
        if not entries:
            break
        all_entries.extend(entries)
        offset += limit

    with open(filename, "w") as f:
        json.dump(all_entries, f)
    print(f"Done downloading {url}.")

    return filename

In [16]:
def load_and_clean_zipcodes(zipcode_datafile):
    """Load the zip code shapefile and return one row per ZIPCODE.

    The data is reprojected to EPSG:4326, reduced to the ``ZIPCODE`` and
    ``geometry`` columns, and de-duplicated on ``ZIPCODE`` (keeping the last
    occurrence).

    Side effect: sets the module-level ``unique_zipcodes`` Series, which the
    other loaders use to filter rows.

    Args:
        zipcode_datafile: Path to the zip code shapefile.

    Returns:
        GeoDataFrame with columns ``index`` (the pre-deduplication row index),
        ``ZIPCODE`` and ``geometry``.
    """
    global unique_zipcodes

    shapefile = pathlib.Path(zipcode_datafile)
    if not shapefile.exists():
        # Earlier revisions ignored the argument and always read this
        # location; keep it as a fallback so existing setups still load.
        shapefile = pathlib.Path("data/nyc_zipcodes/nyc_zipcodes.shp")

    gdf = gpd.read_file(shapefile)
    gdf = gdf.to_crs(4326)
    zip_code = gdf[["ZIPCODE", "geometry"]].copy()
    unique_zipcodes_df = zip_code.drop_duplicates(subset=["ZIPCODE"], keep="last").reset_index()
    unique_zipcodes = unique_zipcodes_df["ZIPCODE"]
    return unique_zipcodes_df

In [17]:
def download_and_clean_311_data():
    """Download (or reuse) the 311 data and return it as a cleaned GeoDataFrame.

    Steps: read the cached JSON with every column kept as ``object`` (so zip
    codes stay strings), build a point geometry from each row's ``location``
    dict, drop the raw coordinate columns, keep only rows whose
    ``incident_zip`` is in the module-level ``unique_zipcodes``, parse
    ``created_date`` and de-duplicate on ``unique_key``.

    Requires ``load_and_clean_zipcodes`` to have run first, because it
    populates ``unique_zipcodes``.

    Returns:
        GeoDataFrame with an EPSG:4326 ``geometry`` column.
    """

    def _to_point(loc):
        # Rows without a usable location dict get a missing geometry.
        if not isinstance(loc, dict):
            return None
        lon, lat = loc.get('longitude'), loc.get('latitude')
        if pd.isna(lon) or pd.isna(lat):
            return None
        return Point(float(lon), float(lat))

    service_requests_url = f"{BASE_NYC_DATA_URL}{NYC_DATA_311}"
    start = 0
    end = 100_000_000
    limit = 1_000_000
    filename = download_nyc_311_data(service_requests_url, start, end, limit)

    # dtype must go to read_json: applied afterwards, zip codes would already
    # have been type-inferred and the string comparison below would not match.
    frame = gpd.GeoDataFrame(pd.read_json(filename, dtype='object'))

    current_time = datetime.now().strftime("%H:%M:%S")
    print(f"Current Time = {current_time} reading done from {filename}...")

    if 'location' in frame.columns:
        points = frame['location'].apply(_to_point)
    else:
        # No location data at all: every geometry is missing.
        points = pd.Series([None] * len(frame), index=frame.index, dtype='object')
    geometry = gpd.GeoSeries(points, crs='EPSG:4326')

    raw_columns = [c for c in ('location', 'longitude', 'latitude') if c in frame.columns]
    frame = frame.drop(columns=raw_columns)
    frame = frame.set_geometry(geometry)

    frame = frame.dropna(subset=['incident_zip'])
    # copy() so the column assignment below does not write into a slice.
    frame = frame[frame["incident_zip"].isin(unique_zipcodes)].copy()

    frame['created_date'] = pd.to_datetime(frame['created_date'], format='%Y-%m-%dT%H:%M:%S.%f')

    # Same de-duplication as load_local_311_data.
    frame = frame.drop_duplicates(subset=['unique_key']).reset_index(drop=True)

    current_time = datetime.now().strftime("%H:%M:%S")
    print(f"Current Time = {current_time} done")

    return frame

In [18]:
def load_local_311_data(json_files_directory='complain_data/', file_count=35):
    """Load pre-downloaded 311 JSON files and return one cleaned GeoDataFrame.

    Files named ``1.json`` .. ``<file_count>.json`` are read from
    ``json_files_directory``. Each file gets a point geometry built from its
    ``location`` dicts, is filtered to rows whose ``incident_zip`` is in the
    module-level ``unique_zipcodes``, and has ``created_date`` parsed. The
    per-file frames are concatenated and de-duplicated on ``unique_key``.

    Requires ``load_and_clean_zipcodes`` to have run first, because it
    populates ``unique_zipcodes``.

    Args:
        json_files_directory: Directory holding the numbered JSON files.
        file_count: Number of files to read, starting at ``1.json``.

    Returns:
        GeoDataFrame with an EPSG:4326 ``geometry`` column.
    """

    def _to_point(loc):
        # Rows without a usable location dict get a missing geometry.
        if not isinstance(loc, dict):
            return None
        lon, lat = loc.get('longitude'), loc.get('latitude')
        if pd.isna(lon) or pd.isna(lat):
            return None
        return Point(float(lon), float(lat))

    def _timestamp():
        # Taken fresh for every progress message so the logged times are real.
        return datetime.now().strftime("%H:%M:%S")

    json_files = [f"{i}.json" for i in range(1, file_count + 1)]
    interactions_geo_data_frames = []

    for interactions_input_file in json_files:
        print(f"Current Time = {_timestamp()} input_file {interactions_input_file} begin" )
        file_path = os.path.join(json_files_directory, interactions_input_file)
        frame = gpd.GeoDataFrame(pd.read_json(file_path, dtype='object'))

        if 'location' in frame.columns:
            points = frame['location'].apply(_to_point)
        else:
            # Never reuse the previous file's geometry for a file without one.
            points = pd.Series([None] * len(frame), index=frame.index, dtype='object')
        geometry = gpd.GeoSeries(points, crs='EPSG:4326')

        raw_columns = [c for c in ('location', 'longitude', 'latitude') if c in frame.columns]
        frame = frame.drop(columns=raw_columns)
        frame = frame.set_geometry(geometry)

        frame = frame.dropna(subset=['incident_zip'])
        # copy() so the column assignment below does not write into a slice.
        frame = frame[frame["incident_zip"].isin(unique_zipcodes)].copy()

        frame['created_date'] = pd.to_datetime(frame['created_date'], format='%Y-%m-%dT%H:%M:%S.%f')

        interactions_geo_data_frames.append(frame)

    print(f"Current Time = {_timestamp()} load file done" )
    interactions_geo_df = gpd.GeoDataFrame(pd.concat(interactions_geo_data_frames, ignore_index=True, sort=True))
    print(f"Current Time = {_timestamp()} concat file done" )
    interactions_geo_df.crs = interactions_geo_data_frames[0].crs
    # Release the per-file frames before de-duplicating the combined one.
    interactions_geo_data_frames.clear()
    interactions_geo_df = interactions_geo_df.drop_duplicates(subset=['unique_key']).reset_index(drop=True)

    print(f"Current Time = {_timestamp()} done normalized data 311" )

    return interactions_geo_df

In [19]:
def download_and_clean_tree_data():
    """Download (or reuse) the tree data and return it as a cleaned GeoDataFrame.

    Every column is read as ``object`` so zip codes are not type-inferred
    before being cast to ``str``. ``the_geom`` is converted from a dict with a
    ``coordinates`` entry to a point, the columns are cast to their target
    types, and only rows whose ``zipcode`` is in the module-level
    ``unique_zipcodes`` are kept.

    Requires ``load_and_clean_zipcodes`` to have run first, because it
    populates ``unique_zipcodes``.

    Returns:
        GeoDataFrame whose ``the_geom`` column holds EPSG:4326 points.
    """
    trees_url = f"{BASE_NYC_DATA_URL}{NYC_DATA_TREES}"
    filename = download_nyc_tree_data(trees_url)

    # dtype must go to read_json: applied afterwards, a single missing zipcode
    # would already have turned the column into floats ("10001.0" as str).
    trees_gdf = gpd.GeoDataFrame(pd.read_json(filename, dtype='object'))

    convert_dict = {
        "tree_id": int,
        "spc_common": str,
        "health": str,
        "status": str,
        "the_geom": "geometry",
        "zipcode": str,
    }

    # Rows without a geometry dict keep a missing geometry instead of raising.
    trees_gdf['the_geom'] = gpd.GeoSeries(
        trees_gdf['the_geom'].apply(
            lambda geom: Point(geom['coordinates']) if isinstance(geom, dict) else None
        ),
        crs='EPSG:4326',
    )

    trees_gdf_normalized = gpd.GeoDataFrame(trees_gdf.astype(convert_dict))
    in_known_zipcode = trees_gdf_normalized["zipcode"].isin(unique_zipcodes)
    trees_gdf_normalized = gpd.GeoDataFrame(trees_gdf_normalized[in_known_zipcode])

    return trees_gdf_normalized

In [20]:
def load_and_clean_zillow_data(ZILLOW_DATA_FILE):
    """Read the Zillow rent CSV and return only the New York rows.

    Rows are kept when ``City`` equals "New York", the index is renumbered
    from zero and the descriptive region columns are dropped.

    Side effect: for every remaining column except the first, the module-level
    ``rent_month_dict`` gets an entry mapping the column name to its
    zero-based position among those columns.

    Args:
        ZILLOW_DATA_FILE: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        DataFrame with the remaining columns.
    """
    global rent_month_dict

    unwanted = [
        'RegionID', 'SizeRank', 'RegionType', 'StateName',
        'State', 'City', 'Metro', 'CountyName',
    ]
    raw = pd.read_csv(ZILLOW_DATA_FILE)
    nyc_rents = (
        raw[raw["City"] == "New York"]
        .reset_index(drop=True)
        .drop(columns=unwanted)
    )

    # The first remaining column is skipped; the rest are numbered from 0.
    remaining = nyc_rents.columns.tolist()
    for position, column in enumerate(remaining[1:]):
        rent_month_dict[column] = position

    return nyc_rents

In [21]:
def load_all_data():
    """Load all four datasets and return them as a tuple.

    The zip codes are loaded first because that call sets the module-level
    ``unique_zipcodes`` that the 311 and tree loaders filter on. The 311 data
    comes from the local JSON files (``load_local_311_data``) rather than
    from ``download_and_clean_311_data``.

    Returns:
        Tuple ``(zipcodes, 311 data, trees, zillow rents)``.
    """
    zipcodes = load_and_clean_zipcodes(ZIPCODE_DATA_FILE)
    print("load zipcode data done")

    complaints = load_local_311_data()
    print("load 311 data done")

    trees = download_and_clean_tree_data()
    print("load trees data done")

    rents = load_and_clean_zillow_data(ZILLOW_DATA_FILE)
    print("load rents data done")

    return zipcodes, complaints, trees, rents

In [None]:
# geodf_zipcode_data = load_and_clean_zipcodes(ZIPCODE_DATA_FILE)
# print(f"load zipcode data done")
# geodf_311_data = download_and_clean_311_data()
# print(f"load 311 data done")

load zipcode data done
Current Time = 02:04:45 Reading from data/erm2-nwe9.json...


In [22]:
# Load the Zillow rent table; the call also fills rent_month_dict.
df_zillow_data = load_and_clean_zillow_data(ZILLOW_DATA_FILE)

In [None]:
# geodf_zipcode_data = load_and_clean_zipcodes(ZIPCODE_DATA_FILE)
# print(f"load zipcode data done")
# geodf_tree_data = download_and_clean_tree_data()
# print(f"load trees data done")
# df_zillow_data = load_and_clean_zillow_data(ZILLOW_DATA_FILE)
# print(f"load rents data done")

In [None]:
# geodf_zipcode_data, geodf_311_data, geodf_tree_data, df_zillow_data = load_all_data()

In [13]:
# Show basic info (columns, non-null counts, dtypes) about the zip code dataframe
# NOTE(review): geodf_zipcode_data is only assigned in cells that are commented
# out above - confirm one of them has been run before this cell.
geodf_zipcode_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   index     248 non-null    int64   
 1   ZIPCODE   248 non-null    object  
 2   geometry  248 non-null    geometry
dtypes: geometry(1), int64(1), object(1)
memory usage: 5.9+ KB


In [14]:
# Show the first 5 entries of the zip code dataframe
geodf_zipcode_data.head()

Unnamed: 0,index,ZIPCODE,geometry
0,0,11436,"POLYGON ((-73.80585 40.68291, -73.80569 40.682..."
1,1,11213,"POLYGON ((-73.93740 40.67973, -73.93487 40.679..."
2,2,11212,"POLYGON ((-73.90294 40.67084, -73.90223 40.668..."
3,3,11225,"POLYGON ((-73.95797 40.67066, -73.95576 40.670..."
4,4,11218,"POLYGON ((-73.97208 40.65060, -73.97192 40.650..."


In [15]:
# Show basic info (columns, dtypes, memory usage) about the 311 dataframe
geodf_311_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 33233422 entries, 0 to 33233421
Data columns (total 5 columns):
 #   Column          Dtype         
---  ------          -----         
 0   complaint_type  object        
 1   created_date    datetime64[ns]
 2   geometry        geometry      
 3   incident_zip    object        
 4   unique_key      object        
dtypes: datetime64[ns](1), geometry(1), object(3)
memory usage: 1.2+ GB


In [16]:
# Show the first 5 entries of the 311 dataframe
geodf_311_data.head()

Unnamed: 0,complaint_type,created_date,geometry,incident_zip,unique_key
0,Derelict Vehicles,2023-11-17 12:00:00,POINT (-73.98863 40.77506),10069,59469711
1,Derelict Vehicles,2023-11-17 12:00:00,POINT (-73.90324 40.75259),11377,59468480
2,Illegal Parking,2023-11-17 01:06:19,POINT (-73.97546 40.59379),11223,59463383
3,Rodent,2023-11-17 01:05:09,POINT (-73.86890 40.83262),10472,59463316
4,Non-Emergency Police Matter,2023-11-17 01:03:42,POINT (-73.86991 40.74827),11373,59462918


In [None]:
# Show basic info about the tree dataframe
geodf_tree_data.info()

In [None]:
# Show the first 5 entries of the tree dataframe
geodf_tree_data.head()

In [None]:
# Show basic info about the Zillow rent dataframe
df_zillow_data.info()

In [None]:
# Show the first 5 entries of the Zillow rent dataframe
df_zillow_data.head()