# NYC Apartment Search

_[Project prompt](https://docs.google.com/document/d/1BYVyFBDcTywdUlanH0ysfOrNWPgl7UkqXA7NeewTzxA/edit#heading=h.bpxu7uvknnbk)_

_This scaffolding notebook may be used to help setup your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add code as you wish._

_**All code below should be consider "pseudo-code" - not functional by itself, and only an idea of a possible approach.**_

## Setup

In [2]:
# All import statements needed for the project, for example:

import json
import pathlib
import urllib.parse

import geoalchemy2 as gdb
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import requests
import shapely
import sqlalchemy as db

from sqlalchemy.orm import declarative_base

In [3]:
# Any constants you might need; some have been added for you

# Where data files will be read from/written to - this should already exist
DATA_DIR = pathlib.Path("data")
ZIPCODE_DATA_FILE = "/Users/chining/Documents/GitHub/NYC-Rental-Analysis/data/drive-download-20231124T164450Z-001/nyc_zipcodes.shp"
ZILLOW_DATA_FILE = DATA_DIR / "zillow_rent_data.csv"

NYC_DATA_APP_TOKEN = "IxFjBjShI6cenJ0NOVZ1rnj0W"
BASE_NYC_DATA_URL = "https://data.cityofnewyork.us/"
NYC_DATA_311 = "erm2-nwe9.geojson"
NYC_DATA_TREES = "5rq2-4hqu.geojson"

DB_NAME = "group14project"
DB_USER = "chenruijia"
DB_URL = f"postgres+psycopg2://{DB_USER}@localhost/{DB_NAME}"
DB_SCHEMA_FILE = "schema.sql"
# directory where DB queries for Part 3 will be saved
QUERY_DIR = pathlib.Path("queries")

In [4]:
# Make sure the QUERY_DIRECTORY exists
if not QUERY_DIR.exists():
    QUERY_DIR.mkdir()

## Part 1: Data Preprocessing

In [5]:
def download_nyc_geojson_data(url, force=False):
    parsed_url = urllib.parse.urlparse(url)
    url_path = parsed_url.path.strip("/")
    
    filename = DATA_DIR / url_path
    
    if force or not filename.exists():
        print(f"Downloading {url} to {filename}...")
        
        ...
        
        with open(filename, "w") as f:
            json.dump(..., f)
        print(f"Done downloading {url}.")

    else:
        print(f"Reading from {filename}...")

    return filename

In [6]:
import geopandas as gpd
import pyproj
from functools import partial
from shapely.ops import transform
from shapely.geometry import Polygon

# Assuming the primary .shp file is 'nyc_zipcodes.shp' and it's in the same directory with other related files
shapefile_path = 'data/drive-download-20231124T164450Z-001/nyc_zipcodes.shp'


def load_and_clean_zipcodes(shapefile_path):
    # Reading the shapefile
    geodf_zipcode = gpd.read_file(shapefile_path)

    # Filter out non-Polygon geometries
    geodf_zipcode = geodf_zipcode[geodf_zipcode['geometry'].apply(lambda geom: isinstance(geom, Polygon))]

    # Filter by ZIPCODE pattern (New York City zip codes starting with '1')
    geodf_zipcode = geodf_zipcode[geodf_zipcode['ZIPCODE'].astype(str).str.match(r'^1\d{4}$')]
    
    # Defining the coordinate reference systems
    current_srid = 'EPSG:2263'  # NAD83 / New York Long Island (ftUS)
    desired_srid = 'EPSG:4326'  # WGS 84

    # Updating to use pyproj.Transformer
    transformer = pyproj.Transformer.from_crs(current_srid, desired_srid, always_xy=True)

    # Applying the transformation
    geodf_zipcode['geometry'] = geodf_zipcode['geometry'].apply(lambda geom: transform(transformer.transform, geom))

    return geodf_zipcode


 


In [7]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
def download_and_clean_311_data():
    def fetch_data(offset, limit):
        api_endpoint = f"{base_api_endpoint}?{soql_query}&$select={columns}&$limit={limit}&$offset={offset}"
        response = requests.get(api_endpoint, headers=headers)
        if response.status_code == 200:
            return response.json()
        else:
            return []
    base_api_endpoint = 'https://data.cityofnewyork.us/resource/erm2-nwe9.json'
    columns = 'unique_key,created_date,closed_date,complaint_type,incident_zip,latitude,longitude'
    start_date = '2015-01-01T00:00:00'
    end_date = '2023-09-30T23:59:59'
    headers = {'X-App-Token': NYC_DATA_APP_TOKEN}
    soql_query = f"$where=created_date between '{start_date}' and '{end_date}'"
    limit = 200000
    all_data = []
    offset = 0
    more_data = True

    while more_data:
        with ThreadPoolExecutor(max_workers=10) as executor:

            futures = [executor.submit(fetch_data, off, limit) for off in range(offset, offset + 10 * limit, limit)]
    
            for future in tqdm(as_completed(futures), total=len(futures), desc="Fetching Data"):
                data = future.result()
                if data:
                    all_data.extend(data)
                    print(f"Retrieved records up to offset {offset + limit}...")
                else:
                    more_data = False

            offset += 10 * limit
    
    df = pd.DataFrame(all_data)
    return df
 

In [8]:
def download_and_clean_tree_data():
    base_api_endpoint = 'https://data.cityofnewyork.us/resource/5rq2-4hqu.json' 
    columns = 'tree_id,latitude,longitude,status,health,spc_common,zipcode'
    
    headers = {
        'X-App-Token': NYC_DATA_APP_TOKEN
    }
    
    limit = 50000   
    offset = 0
    all_data = []

    while True:
        api_endpoint = f"{base_api_endpoint}?$select={columns}&$limit={limit}&$offset={offset}"
        response = requests.get(api_endpoint, headers=headers)
        
        if response.status_code == 200:
            page_data = response.json()
    
            if not page_data:
                break
            all_data.extend(page_data)
            offset += limit
            print(f"Retrieved {offset} records so far...")
        else:
            print("Failed to retrieve data:", response.status_code)
            break
    
    geodf_tree_data = pd.DataFrame(all_data)
    geodf_tree_data.dropna(subset=['tree_id', 'latitude', 'longitude', 'status', 'health', 'spc_common', 'zipcode'], inplace=True)
    return geodf_tree_data

In [9]:
def load_and_clean_zillow_data():
    zillow_rent_data = pd.read_csv('data/drive-download-20231124T164450Z-001/zillow_rent_data.csv')

    
    nyc_zip_codes = zillow_rent_data[zillow_rent_data['RegionName'].astype(str).str.match(r'^1\d{4}$')]

    columns_to_delete = ["RegionID", "SizeRank", "RegionType", "StateName", "State", "City", "Metro", "CountyName"]
    df = nyc_zip_codes.drop(columns=columns_to_delete)

    df_long = df.melt(id_vars=["RegionName"], 
                    var_name="Date", 
                    value_name="RentPrice")


    df_long['Date'] = pd.to_datetime(df_long['Date'])

    df_long = df_long.sort_values(by=['RegionName', 'Date'])

    
    df_long['RentPrice'] = df_long.groupby('RegionName')['RentPrice'].ffill()
    df_long['RentPrice'] = df_long.groupby('RegionName')['RentPrice'].bfill()
    df_long['RentPrice'] = df_long.groupby('RegionName')['RentPrice'].ffill().bfill().interpolate(method='linear')

    zillow_data = pd.DataFrame(df_long)
    return zillow_data

 

In [10]:
def load_all_data():
    geodf_zipcode_data = load_and_clean_zipcodes(ZIPCODE_DATA_FILE)
    geodf_311_data = download_and_clean_311_data()
    geodf_tree_data = download_and_clean_tree_data()
    df_zillow_data = load_and_clean_zillow_data()
    return (
        geodf_zipcode_data,
        geodf_311_data,
        geodf_tree_data,
        df_zillow_data
    )


In [11]:
geodf_zipcode_data, geodf_311_data, geodf_tree_data, df_zillow_data = load_all_data()

0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
Fetching Data:  10%|█         | 1/10 [00:48<07:14, 48.26s/it]

Retrieved records up to offset 200000...


Fetching Data:  30%|███       | 3/10 [00:49<01:18, 11.20s/it]

Retrieved records up to offset 200000...
Retrieved records up to offset 200000...


Fetching Data:  40%|████      | 4/10 [00:50<00:42,  7.09s/it]

Retrieved records up to offset 200000...


Fetching Data:  50%|█████     | 5/10 [00:50<00:23,  4.61s/it]

Retrieved records up to offset 200000...


Fetching Data:  60%|██████    | 6/10 [00:50<00:12,  3.21s/it]

Retrieved records up to offset 200000...


Fetching Data:  70%|███████   | 7/10 [00:51<00:06,  2.25s/it]

Retrieved records up to offset 200000...


Fetching Data:  80%|████████  | 8/10 [00:51<00:03,  1.70s/it]

Retrieved records up to offset 200000...


Fetching Data:  90%|█████████ | 9/10 [05:00<01:18, 78.90s/it]

Retrieved records up to offset 200000...


Fetching Data: 100%|██████████| 10/10 [05:00<00:00, 30.06s/it]


Retrieved records up to offset 200000...


Fetching Data:  20%|██        | 2/10 [01:06<03:38, 27.30s/it]

Retrieved records up to offset 2200000...
Retrieved records up to offset 2200000...


Fetching Data:  30%|███       | 3/10 [01:07<01:47, 15.37s/it]

Retrieved records up to offset 2200000...


Fetching Data:  40%|████      | 4/10 [01:07<00:57,  9.51s/it]

Retrieved records up to offset 2200000...


Fetching Data:  50%|█████     | 5/10 [01:11<00:37,  7.43s/it]

Retrieved records up to offset 2200000...


Fetching Data:  60%|██████    | 6/10 [01:57<01:22, 20.57s/it]

Retrieved records up to offset 2200000...


Fetching Data:  70%|███████   | 7/10 [01:58<00:41, 13.93s/it]

Retrieved records up to offset 2200000...
Retrieved records up to offset 2200000...


Fetching Data:  80%|████████  | 8/10 [01:58<00:19,  9.61s/it]

Retrieved records up to offset 2200000...


Fetching Data: 100%|██████████| 10/10 [01:59<00:00, 11.96s/it]


Retrieved records up to offset 2200000...


Fetching Data:  10%|█         | 1/10 [00:33<05:04, 33.83s/it]

Retrieved records up to offset 4200000...


Fetching Data:  20%|██        | 2/10 [00:34<01:54, 14.32s/it]

Retrieved records up to offset 4200000...


Fetching Data:  30%|███       | 3/10 [00:35<00:58,  8.36s/it]

Retrieved records up to offset 4200000...


Fetching Data:  40%|████      | 4/10 [00:36<00:31,  5.31s/it]

Retrieved records up to offset 4200000...


Fetching Data:  60%|██████    | 6/10 [00:40<00:12,  3.20s/it]

Retrieved records up to offset 4200000...
Retrieved records up to offset 4200000...


Fetching Data:  80%|████████  | 8/10 [00:41<00:03,  1.80s/it]

Retrieved records up to offset 4200000...
Retrieved records up to offset 4200000...


Fetching Data: 100%|██████████| 10/10 [00:42<00:00,  4.21s/it]

Retrieved records up to offset 4200000...
Retrieved records up to offset 4200000...



Fetching Data:  10%|█         | 1/10 [00:32<04:51, 32.43s/it]

Retrieved records up to offset 6200000...


Fetching Data:  20%|██        | 2/10 [00:33<01:52, 14.11s/it]

Retrieved records up to offset 6200000...


Fetching Data:  30%|███       | 3/10 [00:33<00:54,  7.78s/it]

Retrieved records up to offset 6200000...


Fetching Data:  40%|████      | 4/10 [00:34<00:29,  4.97s/it]

Retrieved records up to offset 6200000...


Fetching Data:  50%|█████     | 5/10 [00:39<00:24,  4.87s/it]

Retrieved records up to offset 6200000...


Fetching Data:  70%|███████   | 7/10 [00:40<00:07,  2.36s/it]

Retrieved records up to offset 6200000...
Retrieved records up to offset 6200000...


Fetching Data:  80%|████████  | 8/10 [00:40<00:03,  1.67s/it]

Retrieved records up to offset 6200000...


Fetching Data:  90%|█████████ | 9/10 [00:40<00:01,  1.21s/it]

Retrieved records up to offset 6200000...


Fetching Data: 100%|██████████| 10/10 [00:43<00:00,  4.31s/it]


Retrieved records up to offset 6200000...


Fetching Data:  10%|█         | 1/10 [00:33<05:01, 33.45s/it]

Retrieved records up to offset 8200000...


Fetching Data:  20%|██        | 2/10 [00:33<01:51, 13.98s/it]

Retrieved records up to offset 8200000...


Fetching Data:  30%|███       | 3/10 [00:39<01:12, 10.34s/it]

Retrieved records up to offset 8200000...


Fetching Data:  40%|████      | 4/10 [00:40<00:38,  6.42s/it]

Retrieved records up to offset 8200000...
Retrieved records up to offset 8200000...


Fetching Data:  60%|██████    | 6/10 [00:40<00:12,  3.09s/it]

Retrieved records up to offset 8200000...


Fetching Data:  70%|███████   | 7/10 [00:40<00:07,  2.34s/it]

Retrieved records up to offset 8200000...
Retrieved records up to offset 8200000...


Fetching Data:  90%|█████████ | 9/10 [00:41<00:01,  1.39s/it]

Retrieved records up to offset 8200000...


Fetching Data: 100%|██████████| 10/10 [00:41<00:00,  4.14s/it]


Retrieved records up to offset 8200000...


Fetching Data:  10%|█         | 1/10 [00:36<05:29, 36.64s/it]

Retrieved records up to offset 10200000...


Fetching Data:  20%|██        | 2/10 [00:36<02:01, 15.22s/it]

Retrieved records up to offset 10200000...


Fetching Data:  30%|███       | 3/10 [00:37<00:59,  8.45s/it]

Retrieved records up to offset 10200000...


Fetching Data:  40%|████      | 4/10 [00:38<00:34,  5.73s/it]

Retrieved records up to offset 10200000...


Fetching Data:  50%|█████     | 5/10 [00:40<00:20,  4.13s/it]

Retrieved records up to offset 10200000...


Fetching Data:  60%|██████    | 6/10 [00:41<00:12,  3.08s/it]

Retrieved records up to offset 10200000...


Fetching Data:  70%|███████   | 7/10 [00:41<00:06,  2.23s/it]

Retrieved records up to offset 10200000...


Fetching Data:  80%|████████  | 8/10 [00:42<00:03,  1.72s/it]

Retrieved records up to offset 10200000...


Fetching Data:  90%|█████████ | 9/10 [00:46<00:02,  2.55s/it]

Retrieved records up to offset 10200000...


Fetching Data: 100%|██████████| 10/10 [00:47<00:00,  4.77s/it]


Retrieved records up to offset 10200000...


Fetching Data:  10%|█         | 1/10 [00:34<05:11, 34.66s/it]

Retrieved records up to offset 12200000...


Fetching Data:  20%|██        | 2/10 [00:34<01:55, 14.47s/it]

Retrieved records up to offset 12200000...


Fetching Data:  30%|███       | 3/10 [00:35<00:55,  7.96s/it]

Retrieved records up to offset 12200000...


Fetching Data:  40%|████      | 4/10 [00:35<00:30,  5.10s/it]

Retrieved records up to offset 12200000...


Fetching Data:  60%|██████    | 6/10 [00:38<00:11,  2.77s/it]

Retrieved records up to offset 12200000...
Retrieved records up to offset 12200000...


Fetching Data:  80%|████████  | 8/10 [00:38<00:02,  1.38s/it]

Retrieved records up to offset 12200000...
Retrieved records up to offset 12200000...


Fetching Data:  90%|█████████ | 9/10 [00:39<00:01,  1.18s/it]

Retrieved records up to offset 12200000...


Fetching Data: 100%|██████████| 10/10 [00:41<00:00,  4.16s/it]


Retrieved records up to offset 12200000...


Fetching Data:  20%|██        | 2/10 [00:45<02:30, 18.82s/it]

Retrieved records up to offset 14200000...
Retrieved records up to offset 14200000...


Fetching Data:  40%|████      | 4/10 [00:46<00:38,  6.44s/it]

Retrieved records up to offset 14200000...
Retrieved records up to offset 14200000...


Fetching Data:  50%|█████     | 5/10 [00:46<00:20,  4.18s/it]

Retrieved records up to offset 14200000...


Fetching Data:  60%|██████    | 6/10 [00:51<00:17,  4.41s/it]

Retrieved records up to offset 14200000...


Fetching Data:  70%|███████   | 7/10 [02:22<01:37, 32.58s/it]

Retrieved records up to offset 14200000...
Retrieved records up to offset 14200000...


Fetching Data:  90%|█████████ | 9/10 [02:22<00:15, 15.44s/it]

Retrieved records up to offset 14200000...


Fetching Data: 100%|██████████| 10/10 [02:23<00:00, 14.33s/it]


Retrieved records up to offset 14200000...


Fetching Data:  20%|██        | 2/10 [01:12<03:59, 29.90s/it]

Retrieved records up to offset 16200000...
Retrieved records up to offset 16200000...


Fetching Data:  30%|███       | 3/10 [01:12<01:54, 16.33s/it]

Retrieved records up to offset 16200000...


Fetching Data:  40%|████      | 4/10 [01:13<01:00, 10.10s/it]

Retrieved records up to offset 16200000...
Retrieved records up to offset 16200000...
Retrieved records up to offset 16200000...


Fetching Data:  70%|███████   | 7/10 [01:13<00:11,  3.80s/it]

Retrieved records up to offset 16200000...


Fetching Data:  80%|████████  | 8/10 [02:43<00:47, 23.73s/it]

Retrieved records up to offset 16200000...


Fetching Data:  90%|█████████ | 9/10 [02:44<00:18, 18.01s/it]

Retrieved records up to offset 16200000...


Fetching Data: 100%|██████████| 10/10 [02:47<00:00, 16.77s/it]

Retrieved records up to offset 16200000...



Fetching Data:  20%|██        | 2/10 [00:43<02:23, 17.91s/it]

Retrieved records up to offset 18200000...
Retrieved records up to offset 18200000...


Fetching Data:  30%|███       | 3/10 [00:43<01:08,  9.83s/it]

Retrieved records up to offset 18200000...


Fetching Data:  50%|█████     | 5/10 [00:44<00:20,  4.02s/it]

Retrieved records up to offset 18200000...
Retrieved records up to offset 18200000...


Fetching Data:  60%|██████    | 6/10 [00:46<00:13,  3.27s/it]

Retrieved records up to offset 18200000...


Fetching Data:  70%|███████   | 7/10 [00:46<00:07,  2.35s/it]

Retrieved records up to offset 18200000...


Fetching Data:  80%|████████  | 8/10 [01:21<00:25, 12.55s/it]

Retrieved records up to offset 18200000...


Fetching Data: 100%|██████████| 10/10 [01:21<00:00,  8.14s/it]


Retrieved records up to offset 18200000...
Retrieved records up to offset 18200000...


Fetching Data:  20%|██        | 2/10 [00:35<01:56, 14.60s/it]

Retrieved records up to offset 20200000...
Retrieved records up to offset 20200000...


Fetching Data:  30%|███       | 3/10 [00:35<00:57,  8.16s/it]

Retrieved records up to offset 20200000...


Fetching Data:  40%|████      | 4/10 [00:37<00:33,  5.50s/it]

Retrieved records up to offset 20200000...


Fetching Data:  60%|██████    | 6/10 [00:39<00:11,  2.91s/it]

Retrieved records up to offset 20200000...
Retrieved records up to offset 20200000...


Fetching Data:  70%|███████   | 7/10 [00:39<00:06,  2.06s/it]

Retrieved records up to offset 20200000...


Fetching Data:  80%|████████  | 8/10 [00:41<00:03,  1.75s/it]

Retrieved records up to offset 20200000...
Retrieved records up to offset 20200000...


Fetching Data: 100%|██████████| 10/10 [00:42<00:00,  4.22s/it]


Retrieved records up to offset 20200000...


Fetching Data:  10%|█         | 1/10 [00:39<05:54, 39.37s/it]

Retrieved records up to offset 22200000...


Fetching Data:  20%|██        | 2/10 [00:39<02:11, 16.45s/it]

Retrieved records up to offset 22200000...


Fetching Data:  30%|███       | 3/10 [00:40<01:04,  9.22s/it]

Retrieved records up to offset 22200000...


Fetching Data:  40%|████      | 4/10 [00:45<00:44,  7.44s/it]

Retrieved records up to offset 22200000...


Fetching Data:  60%|██████    | 6/10 [00:45<00:13,  3.27s/it]

Retrieved records up to offset 22200000...
Retrieved records up to offset 22200000...


Fetching Data:  70%|███████   | 7/10 [00:45<00:06,  2.29s/it]

Retrieved records up to offset 22200000...


Fetching Data:  80%|████████  | 8/10 [00:46<00:03,  1.64s/it]

Retrieved records up to offset 22200000...


Fetching Data:  90%|█████████ | 9/10 [00:46<00:01,  1.19s/it]

Retrieved records up to offset 22200000...


Fetching Data: 100%|██████████| 10/10 [00:47<00:00,  4.77s/it]


Retrieved records up to offset 22200000...


Fetching Data:  90%|█████████ | 9/10 [00:43<00:02,  2.11s/it]

Retrieved records up to offset 24200000...


Fetching Data: 100%|██████████| 10/10 [00:48<00:00,  4.89s/it]

Retrieved records up to offset 24200000...





Retrieved 50000 records so far...
Retrieved 100000 records so far...
Retrieved 150000 records so far...
Retrieved 200000 records so far...
Retrieved 250000 records so far...
Retrieved 300000 records so far...
Retrieved 350000 records so far...
Retrieved 400000 records so far...
Retrieved 450000 records so far...
Retrieved 500000 records so far...
Retrieved 550000 records so far...
Retrieved 600000 records so far...
Retrieved 650000 records so far...
Retrieved 700000 records so far...


In [12]:
# Show basic info about each dataframe
geodf_zipcode_data.info()


<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 262 entries, 0 to 262
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   ZIPCODE     262 non-null    object  
 1   BLDGZIP     262 non-null    object  
 2   PO_NAME     262 non-null    object  
 3   POPULATION  262 non-null    float64 
 4   AREA        262 non-null    float64 
 5   STATE       262 non-null    object  
 6   COUNTY      262 non-null    object  
 7   ST_FIPS     262 non-null    object  
 8   CTY_FIPS    262 non-null    object  
 9   URL         262 non-null    object  
 10  SHAPE_AREA  262 non-null    float64 
 11  SHAPE_LEN   262 non-null    float64 
 12  geometry    262 non-null    geometry
dtypes: float64(4), geometry(1), object(8)
memory usage: 28.7+ KB


In [13]:
# Show first 5 entries about each dataframe
geodf_zipcode_data.head()

Unnamed: 0,ZIPCODE,BLDGZIP,PO_NAME,POPULATION,AREA,STATE,COUNTY,ST_FIPS,CTY_FIPS,URL,SHAPE_AREA,SHAPE_LEN,geometry
0,11436,0,Jamaica,18681.0,22699300.0,NY,Queens,36,81,http://www.usps.com/,0.0,0.0,"POLYGON ((-73.806 40.683, -73.806 40.683, -73...."
1,11213,0,Brooklyn,62426.0,29631000.0,NY,Kings,36,47,http://www.usps.com/,0.0,0.0,"POLYGON ((-73.937 40.680, -73.935 40.680, -73...."
2,11212,0,Brooklyn,83866.0,41972100.0,NY,Kings,36,47,http://www.usps.com/,0.0,0.0,"POLYGON ((-73.903 40.671, -73.902 40.668, -73...."
3,11225,0,Brooklyn,56527.0,23698630.0,NY,Kings,36,47,http://www.usps.com/,0.0,0.0,"POLYGON ((-73.958 40.671, -73.956 40.670, -73...."
4,11218,0,Brooklyn,72280.0,36868800.0,NY,Kings,36,47,http://www.usps.com/,0.0,0.0,"POLYGON ((-73.972 40.651, -73.972 40.650, -73...."


In [None]:
geodf_311_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24336519 entries, 0 to 24336518
Data columns (total 7 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   unique_key      object
 1   created_date    object
 2   closed_date     object
 3   complaint_type  object
 4   incident_zip    object
 5   latitude        object
 6   longitude       object
dtypes: object(7)
memory usage: 1.3+ GB


In [None]:
geodf_311_data.head()

Unnamed: 0,unique_key,created_date,closed_date,complaint_type,incident_zip,latitude,longitude
0,58337824,2023-07-28T12:00:00.000,2023-08-01T12:00:00.000,Derelict Vehicles,11234.0,40.635140353456194,-73.92947230891184
1,58341879,2023-07-28T12:00:00.000,2023-07-29T12:00:00.000,Derelict Vehicles,10027.0,40.8111347014436,-73.94554917541718
2,58339183,2023-07-28T12:00:00.000,2023-07-28T12:00:00.000,Derelict Vehicles,10452.0,40.84613889606357,-73.91895485824278
3,58337816,2023-07-28T12:00:00.000,2023-07-28T12:00:00.000,Derelict Vehicles,,,
4,58337822,2023-07-28T12:00:00.000,2023-07-29T12:00:00.000,Derelict Vehicles,11418.0,40.69691921183129,-73.84283078706525


In [None]:
geodf_tree_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 652167 entries, 0 to 683787
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   tree_id     652167 non-null  object
 1   latitude    652167 non-null  object
 2   longitude   652167 non-null  object
 3   status      652167 non-null  object
 4   health      652167 non-null  object
 5   spc_common  652167 non-null  object
 6   zipcode     652167 non-null  object
dtypes: object(7)
memory usage: 39.8+ MB


In [None]:
geodf_tree_data.head()

Unnamed: 0,tree_id,latitude,longitude,status,health,spc_common,zipcode
0,180683,40.72309177,-73.84421522,Alive,Fair,red maple,11375
1,200540,40.79411067,-73.81867946,Alive,Fair,pin oak,11357
2,204026,40.71758074,-73.9366077,Alive,Good,honeylocust,11211
3,204337,40.71353749,-73.93445616,Alive,Good,honeylocust,11211
4,189565,40.66677776,-73.97597938,Alive,Good,American linden,11215


In [None]:
df_zillow_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58380 entries, 311 to 58142
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   RegionName  58380 non-null  int64         
 1   Date        58380 non-null  datetime64[ns]
 2   RentPrice   58380 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 1.8 MB


In [None]:
df_zillow_data.head()

Unnamed: 0,RegionName,Date,RentPrice
311,10001,2015-01-31,3807.657462
867,10001,2015-02-28,3851.098684
1423,10001,2015-03-31,3844.716691
1979,10001,2015-04-30,3906.689196
2535,10001,2015-05-31,3960.68987


## Part 2: Storing Data

In [None]:
def setup_new_postgis_database(username, db_name):
    raise NotImplementedError()

In [None]:
setup_new_postgis_database(DB_USER, DB_NAME)

### Creating Tables


These are just a couple of options to creating your tables; you can use one or the other, a different method, or a combination.

In [None]:
engine = db.create_engine(DB_URL)

#### Option 1: SQL

In [None]:
# if using SQL (as opposed to SQLAlchemy), define the SQL statements to create your 4 tables
ZIPCODE_SCHEMA = """
TODO
"""

NYC_311_SCHEMA = """
TODO
"""

NYC_TREE_SCHEMA = """
TODO
"""

ZILLOW_SCHEMA = """
TODO
"""

In [None]:
# create that required schema.sql file
with open(DB_SCHEMA_FILE, "w") as f:
    f.write(ZIPCODE_SCHEMA)
    f.write(NYC_311_SCHEMA)
    f.write(NYC_TREE_SCHEMA)
    f.write(ZILLOW_SCHEMA)

In [None]:
# If using SQL (as opposed to SQLAlchemy), execute the schema files to create tables
with engine.connect() as connection:
    pass

#### Option 2: SQLAlchemy

In [None]:
Base = declarative_base()

class Tree(Base):
    __tablename__ = "trees"

    ...


In [None]:
Base.metadata.create_all(engine)

### Add Data to Database

These are just a couple of options to write data to your tables; you can use one or the other, a different method, or a combination.

#### Option 1: SQL

In [None]:
def write_dataframes_to_table(tablename_to_dataframe):
    # write INSERT statements or use pandas/geopandas to write SQL
    raise NotImplemented()

In [None]:
tablename_to_dataframe = {
    "zipcodes": geodf_zipcode_data,
    "complaints": geodf_311_data,
    "trees": geodf_tree_data,
    "rents": df_zillow_data,
}

In [None]:
write_dataframes_to_table(tablename_to_dataframe)

#### Option 2: SQLAlchemy

In [None]:
Session = db.orm.sessionmaker(bind=engine)
session = Session()

In [None]:
for row in geodf_tree_data.iterrows():
    tree = Tree(...)
    session.add(tree)

In [None]:
session.commit()

## Part 3: Understanding the Data

### Query 1

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    raise NotImplementedError()

In [None]:
QUERY_1_FILENAME = QUERY_DIR / "FILL_ME_IN"

QUERY_1 = """
FILL_ME_IN
"""

In [None]:
with engine.connect() as conn:
    result = conn.execute(db.text(QUERY_1))
    for row in result:
        print(row)

In [None]:
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_1():
    # Query your database for the data needed.
    # You can put the data queried into a pandas/geopandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)