In [56]:
import json
import pandas as pd
from shapely.geometry import shape, Point, MultiPolygon, Polygon
import sys, os
# Project root. Defaults to the original checkout location, but can be
# overridden with the MAS291_PROJECT_DIR environment variable so the notebook
# also runs on machines with a different directory layout.
this_path = os.environ.get('MAS291_PROJECT_DIR', '/home/ibi/Documents/GitHub/mas291-project/')
# Re-running this cell must not keep appending the same entry to sys.path.
if this_path not in sys.path:
    sys.path.append(this_path)
# The relative 'data/...' paths used by later cells resolve against this directory.
os.chdir(this_path)
print(os.getcwd())

/home/ibi/Documents/GitHub/mas291-project


In [57]:
def load_geojson(filepath):
    """Read a GeoJSON file from disk and return its parsed content.

    Args:
        filepath: Path of the GeoJSON file to read.

    Returns:
        The object produced by ``json.load`` for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # GeoJSON is JSON, whose interchange encoding is UTF-8 (RFC 8259). Without
    # an explicit encoding, open() uses the locale default and can mis-decode
    # non-ASCII property values on platforms where that default is not UTF-8.
    with open(filepath, encoding="utf-8") as f:
        return json.load(f)


In [58]:
def extract_geometries(geojson):
    """Collect every polygon found in a GeoJSON feature collection.

    Each feature's geometry is converted with ``shape``. A ``Polygon`` is kept
    as is, a ``MultiPolygon`` contributes each of its member polygons, and any
    other geometry type is ignored.

    Args:
        geojson: Mapping with a ``"features"`` list; each feature must carry a
            ``"geometry"`` entry.

    Returns:
        A flat list of polygons in feature order.
    """
    polygons = []
    for feature in geojson["features"]:
        geometry = shape(feature["geometry"])
        if isinstance(geometry, Polygon):
            polygons.append(geometry)
        elif isinstance(geometry, MultiPolygon):
            # Flatten multi-part geometries into their individual polygons.
            polygons += list(geometry.geoms)
    return polygons


In [59]:
def extract_community_geometries(geojson, community_key):
    """Collect polygons together with the community label of their feature.

    Each feature's geometry is converted with ``shape`` and its label is read
    from ``feature["properties"][community_key]``. A ``MultiPolygon`` is split
    into its member polygons, each paired with the same label; geometries that
    are neither ``Polygon`` nor ``MultiPolygon`` are ignored.

    Args:
        geojson: Mapping with a ``"features"`` list.
        community_key: Property name holding the community label.

    Returns:
        A ``(shapes, names)`` tuple of two parallel lists: ``names[i]`` is the
        label of ``shapes[i]``.
    """
    shapes_out, names_out = [], []
    for feature in geojson["features"]:
        geometry = shape(feature["geometry"])
        label = feature["properties"][community_key]
        if isinstance(geometry, Polygon):
            pieces = [geometry]
        elif isinstance(geometry, MultiPolygon):
            pieces = list(geometry.geoms)
        else:
            continue
        # Keep both lists aligned: one label entry per polygon piece.
        for piece in pieces:
            shapes_out.append(piece)
            names_out.append(label)
    return shapes_out, names_out


In [60]:
def is_within_city(lon, lat, city_shapes):
    """Tell whether a coordinate lies inside any of the given shapes.

    Args:
        lon: Longitude, used as the point's x coordinate.
        lat: Latitude, used as the point's y coordinate.
        city_shapes: Iterable of objects exposing ``contains(point)``.

    Returns:
        ``True`` as soon as one shape contains the point, ``False`` otherwise
        (including when ``city_shapes`` is empty).
    """
    location = Point(lon, lat)
    for boundary in city_shapes:
        if boundary.contains(location):
            return True
    return False


In [61]:
def find_community(lon, lat, community_shapes, community_names):
    """Return the label of the first shape that contains a coordinate.

    Args:
        lon: Longitude, used as the point's x coordinate.
        lat: Latitude, used as the point's y coordinate.
        community_shapes: Sequence of objects exposing ``contains(point)``.
        community_names: Labels parallel to ``community_shapes``.

    Returns:
        The label paired with the first containing shape, or ``None`` when no
        shape contains the point.
    """
    location = Point(lon, lat)
    matching_names = (
        name
        for name, area in zip(community_names, community_shapes)
        if area.contains(location)
    )
    # Lazily stop at the first hit; fall back to None when nothing matches.
    return next(matching_names, None)


In [62]:
def add_within_city_column(city_name, boundary_geojson_path, houses_df):
    """Add an ``is_within_<city_name>`` flag column to ``houses_df``.

    The boundary file is loaded and flattened into polygons, then every row's
    ``longitude``/``latitude`` pair is tested against those polygons.

    Args:
        city_name: Used only to build the new column's name.
        boundary_geojson_path: Path of the boundary GeoJSON file.
        houses_df: DataFrame with ``longitude`` and ``latitude`` columns.

    Returns:
        The same ``houses_df`` object, modified in place with the new column.
    """
    city_shapes = extract_geometries(load_geojson(boundary_geojson_path))

    def _row_is_inside(row):
        return is_within_city(row["longitude"], row["latitude"], city_shapes)

    flags = houses_df.apply(_row_is_inside, axis=1)
    houses_df[f'is_within_{city_name}'] = flags
    return houses_df


In [63]:
def add_community_area_column(city_name, community_geojson_path, houses_df, community_key):
    """Add a ``community_area`` column to ``houses_df``.

    The community file is loaded and turned into parallel lists of polygons
    and labels, then every row's ``longitude``/``latitude`` pair is looked up
    in them.

    Args:
        city_name: Accepted for signature symmetry; not used by this function.
        community_geojson_path: Path of the community-area GeoJSON file.
        houses_df: DataFrame with ``longitude`` and ``latitude`` columns.
        community_key: Feature property that holds the community label.

    Returns:
        The same ``houses_df`` object, modified in place with the new column.
    """
    community_shapes, community_names = extract_community_geometries(
        load_geojson(community_geojson_path), community_key
    )

    def _row_community(row):
        return find_community(
            row['longitude'], row['latitude'], community_shapes, community_names
        )

    labels = houses_df.apply(_row_community, axis=1)
    houses_df['community_area'] = labels
    return houses_df


In [64]:
# Chicago listings: currently for sale, and already sold.
chicago_selling = pd.read_csv(filepath_or_buffer='data/cleaned/chicago_selling_cleaned.csv')
chicago_sold = pd.read_csv(filepath_or_buffer='data/cleaned/chicago_sold_cleaned.csv')

# New York listings: currently for sale, and already sold.
new_york_selling = pd.read_csv(filepath_or_buffer='data/cleaned/new_york_selling_cleaned.csv')
new_york_sold = pd.read_csv(filepath_or_buffer='data/cleaned/new_york_sold_cleaned.csv')

In [65]:
# Flag each Chicago for-sale listing by whether its coordinates fall inside
# the city boundary polygons.
chicago_selling = add_within_city_column(
    city_name='chicago',
    boundary_geojson_path='data/geojson/chicago_boundary.geojson',
    houses_df=chicago_selling,
)
chicago_selling.head()

Unnamed: 0,data_source,id,post_link,list_date,sold_date,days_until_sold,price,address,status,area,bedrooms,bathrooms,latitude,longitude,is_within_chicago
0,https://www.realtor.com/,7109976785,https://www.realtor.com/realestateandhomes-det...,2024-04-18,Not sold yet,Not sold yet,215000.0,"9906 S Seeley Ave, Chicago, Il 60643",FOR_SALE,1567.0,3,2.0,41.713489,-87.673732,True
1,https://www.realtor.com/,8027933003,https://www.realtor.com/realestateandhomes-det...,2024-07-03,Not sold yet,Not sold yet,79000.0,"11549 S Church St, Chicago, Il 60643",FOR_SALE,1386.0,4,2.0,41.683247,-87.669495,True
2,https://www.realtor.com/,8854936353,https://www.realtor.com/realestateandhomes-det...,2024-07-04,Not sold yet,Not sold yet,1200000.0,"3015 N Racine Ave, Chicago, Il 60657",FOR_SALE,3100.0,4,4.0,41.93674,-87.658509,True
3,https://www.realtor.com/,7106444683,https://www.realtor.com/realestateandhomes-det...,2024-07-04,Not sold yet,Not sold yet,1075000.0,"1316 N Campbell Ave, Chicago, Il 60622",FOR_SALE,3976.0,3,3.5,41.905352,-87.689905,True
4,https://www.realtor.com/,7617958997,https://www.realtor.com/realestateandhomes-det...,2024-07-04,Not sold yet,Not sold yet,350000.0,"9737 S Claremont Ave, Chicago, Il 60643",FOR_SALE,20000.0,2,1.0,41.716163,-87.680356,True


In [66]:
# Label each Chicago for-sale listing with the community area that contains it
# (None when no community polygon contains the point).
chicago_selling = add_community_area_column(
    city_name='chicago',
    community_geojson_path='data/geojson/Chicago_community_area.geojson',
    houses_df=chicago_selling,
    community_key='community',
)
chicago_selling.head()

Unnamed: 0,data_source,id,post_link,list_date,sold_date,days_until_sold,price,address,status,area,bedrooms,bathrooms,latitude,longitude,is_within_chicago,community_area
0,https://www.realtor.com/,7109976785,https://www.realtor.com/realestateandhomes-det...,2024-04-18,Not sold yet,Not sold yet,215000.0,"9906 S Seeley Ave, Chicago, Il 60643",FOR_SALE,1567.0,3,2.0,41.713489,-87.673732,True,BEVERLY
1,https://www.realtor.com/,8027933003,https://www.realtor.com/realestateandhomes-det...,2024-07-03,Not sold yet,Not sold yet,79000.0,"11549 S Church St, Chicago, Il 60643",FOR_SALE,1386.0,4,2.0,41.683247,-87.669495,True,MORGAN PARK
2,https://www.realtor.com/,8854936353,https://www.realtor.com/realestateandhomes-det...,2024-07-04,Not sold yet,Not sold yet,1200000.0,"3015 N Racine Ave, Chicago, Il 60657",FOR_SALE,3100.0,4,4.0,41.93674,-87.658509,True,LAKE VIEW
3,https://www.realtor.com/,7106444683,https://www.realtor.com/realestateandhomes-det...,2024-07-04,Not sold yet,Not sold yet,1075000.0,"1316 N Campbell Ave, Chicago, Il 60622",FOR_SALE,3976.0,3,3.5,41.905352,-87.689905,True,WEST TOWN
4,https://www.realtor.com/,7617958997,https://www.realtor.com/realestateandhomes-det...,2024-07-04,Not sold yet,Not sold yet,350000.0,"9737 S Claremont Ave, Chicago, Il 60643",FOR_SALE,20000.0,2,1.0,41.716163,-87.680356,True,BEVERLY


In [67]:
# Keep only the listings flagged as inside Chicago, then discard the helper
# flag. A boolean mask selects rows by position, so it stays correct even if
# index labels repeat (dropping by label would remove every row sharing one).
chicago_selling = (
    chicago_selling
    .loc[chicago_selling['is_within_chicago'].astype(bool)]
    .drop(columns=['is_within_chicago'])
)

In [68]:
# Enrich the sold Chicago listings with the boundary flag and community label.
chicago_sold = add_within_city_column('chicago', 'data/geojson/chicago_boundary.geojson', chicago_sold)
chicago_sold = add_community_area_column('chicago', 'data/geojson/Chicago_community_area.geojson', chicago_sold, 'community')
# Keep only the listings flagged as inside Chicago, then discard the helper
# flag. A boolean mask selects rows by position, so it stays correct even if
# index labels repeat (dropping by label would remove every row sharing one).
chicago_sold = (
    chicago_sold
    .loc[chicago_sold['is_within_chicago'].astype(bool)]
    .drop(columns=['is_within_chicago'])
)
chicago_sold.head()

Unnamed: 0,data_source,id,post_link,list_date,sold_date,days_until_sold,price,address,status,area,bedrooms,bathrooms,latitude,longitude,community_area
0,https://www.realtor.com/,7638203850,https://www.realtor.com/realestateandhomes-det...,2024-05-30,2024-07-03,34.0,250000.0,"10330 S Whipple St, Chicago, Il 60655",SOLD,926.0,2.0,1.0,41.705251,-87.698129,MOUNT GREENWOOD
1,https://www.realtor.com/,7712196342,https://www.realtor.com/realestateandhomes-det...,2024-05-06,2024-07-03,58.0,149999.0,"455 W 126Th Pl, Chicago, Il 60628",SOLD,973.0,2.0,1.0,41.663994,-87.633883,WEST PULLMAN
2,https://www.realtor.com/,9484646484,https://www.realtor.com/realestateandhomes-det...,2024-05-30,2024-07-03,34.0,950000.0,"3915 N Monticello Ave, Chicago, Il 60618",SOLD,3200.0,3.0,3.5,41.952424,-87.718648,IRVING PARK
3,https://www.realtor.com/,7259996874,https://www.realtor.com/realestateandhomes-det...,2024-03-13,2024-07-03,112.0,155000.0,"9425 S Eberhart Ave, Chicago, Il 60619",SOLD,1674.0,4.0,3.0,41.722997,-87.611406,ROSELAND
4,https://www.realtor.com/,8692627061,https://www.realtor.com/realestateandhomes-det...,2024-04-30,2024-07-03,64.0,1450000.0,"1457 W Byron St, Chicago, Il 60613",SOLD,3300.0,3.0,3.5,41.952312,-87.666381,LAKE VIEW


In [69]:
# Enrich the for-sale New York listings with the boundary flag and the
# community label read from the 'boro_cd' property.
new_york_selling = add_within_city_column('new_york', 'data/geojson/NYC_boundaries_merged.geojson', new_york_selling)
new_york_selling = add_community_area_column('new_york', 'data/geojson/NYC_community_area.geojson', new_york_selling, 'boro_cd')
# Keep only the listings flagged as inside New York, then discard the helper
# flag. A boolean mask selects rows by position, so it stays correct even if
# index labels repeat (dropping by label would remove every row sharing one).
new_york_selling = (
    new_york_selling
    .loc[new_york_selling['is_within_new_york'].astype(bool)]
    .drop(columns=['is_within_new_york'])
)
new_york_selling.head()

Unnamed: 0,data_source,id,post_link,list_date,sold_date,days_until_sold,price,address,status,area,bedrooms,bathrooms,latitude,longitude,community_area
0,https://www.realtor.com/,3243883955,https://www.realtor.com/realestateandhomes-det...,2023-12-04,Not sold yet,Not sold yet,260000,"620 Sinclair Ave, Staten Island, Ny 10312",FOR_SALE,2015.0,4,2.0,40.541781,-74.196109,503
1,https://www.realtor.com/,3974591407,https://www.realtor.com/realestateandhomes-det...,2024-03-20,Not sold yet,Not sold yet,99000,"77 City Blvd, Staten Island, Ny 10301",FOR_SALE,1176.0,3,1.0,40.628757,-74.104166,501
2,https://www.realtor.com/,3536803586,https://www.realtor.com/realestateandhomes-det...,2023-11-21,Not sold yet,Not sold yet,250000,"240-05 147 Ave, Rosedale, Ny 11422",FOR_SALE,2304.0,4,2.0,40.657539,-73.743602,413
3,https://www.realtor.com/,3767233034,https://www.realtor.com/realestateandhomes-det...,2024-05-24,Not sold yet,Not sold yet,899000,"182 Benedict Ave, Staten Island, Ny 10314",FOR_SALE,2352.0,4,5.0,40.62244,-74.128401,501
4,https://www.realtor.com/,3307596121,https://www.realtor.com/realestateandhomes-det...,2024-07-04,Not sold yet,Not sold yet,1698000,"363 Howard Ave, Staten Island, Ny 10301",FOR_SALE,3255.0,5,4.5,40.620764,-74.08834,501


In [70]:
# Enrich the sold New York listings with the boundary flag and the community
# label read from the 'boro_cd' property.
new_york_sold = add_within_city_column('new_york', 'data/geojson/NYC_boundaries_merged.geojson', new_york_sold)
new_york_sold = add_community_area_column('new_york', 'data/geojson/NYC_community_area.geojson', new_york_sold, 'boro_cd')
# Keep only the listings flagged as inside New York, then discard the helper
# flag. A boolean mask selects rows by position, so it stays correct even if
# index labels repeat (dropping by label would remove every row sharing one).
new_york_sold = (
    new_york_sold
    .loc[new_york_sold['is_within_new_york'].astype(bool)]
    .drop(columns=['is_within_new_york'])
)
new_york_sold.head()

Unnamed: 0,data_source,id,post_link,list_date,sold_date,days_until_sold,price,address,status,area,bedrooms,bathrooms,latitude,longitude,community_area
0,https://www.realtor.com/,4136891435,https://www.realtor.com/realestateandhomes-det...,2023-09-07,2024-07-03,300.0,1500000.0,"1273 E 10Th St, Brooklyn, Ny 11230",SOLD,2015.0,4.0,1.5,40.618168,-73.964649,312
1,https://www.realtor.com/,3476436735,https://www.realtor.com/realestateandhomes-det...,2024-04-05,2024-07-03,89.0,549999.0,"274 Dixon Ave, Staten Island, Ny 10303",SOLD,1205.0,3.0,1.5,40.628432,-74.151338,501
2,https://www.realtor.com/,4877937449,https://www.realtor.com/realestateandhomes-det...,2023-09-20,2024-07-03,287.0,559000.0,"132 Elm St, Staten Island, Ny 10310",SOLD,1292.0,4.0,2.0,40.63851,-74.114339,501
3,https://www.realtor.com/,4669261316,https://www.realtor.com/realestateandhomes-det...,2023-08-09,2024-07-03,329.0,1099000.0,"1714 E 29Th St, Brooklyn, Ny 11229",SOLD,1660.0,3.0,2.5,40.60929,-73.944877,315
4,https://www.realtor.com/,4395237841,https://www.realtor.com/realestateandhomes-det...,2023-07-27,2024-07-03,342.0,789000.0,"1445 E 69Th St, Brooklyn, Ny 11234",SOLD,1836.0,3.0,1.0,40.621332,-73.911186,318


In [71]:
# Write the enriched tables back to the cleaned CSV paths they were loaded
# from; index=False keeps the row index out of the files.
chicago_selling.to_csv(path_or_buf='data/cleaned/chicago_selling_cleaned.csv', index=False)
chicago_sold.to_csv(path_or_buf='data/cleaned/chicago_sold_cleaned.csv', index=False)
new_york_selling.to_csv(path_or_buf='data/cleaned/new_york_selling_cleaned.csv', index=False)
new_york_sold.to_csv(path_or_buf='data/cleaned/new_york_sold_cleaned.csv', index=False)