In [1]:
import json
import pandas as pd
from shapely.geometry import shape, Point, MultiPolygon, Polygon

In [2]:
def load_geojson(filepath):
    """Read a GeoJSON file and return its parsed contents.

    Args:
        filepath: Path of the GeoJSON file to open.

    Returns:
        The Python object produced by ``json.load`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # GeoJSON is UTF-8 by specification; being explicit keeps non-ASCII
    # property values intact on platforms whose default encoding differs.
    with open(filepath, encoding="utf-8") as f:
        return json.load(f)


In [3]:
def extract_geometries(geojson):
    """Collect every polygon found in a GeoJSON feature collection.

    Polygon features contribute themselves; MultiPolygon features contribute
    each of their member polygons. Features with any other geometry type, or
    with a null geometry, are skipped.

    Args:
        geojson: Parsed GeoJSON mapping with a ``"features"`` list.

    Returns:
        A flat list of shapely ``Polygon`` objects, in feature order.
    """
    shapes = []
    for feature in geojson["features"]:
        geometry = feature["geometry"]
        if geometry is None:
            # GeoJSON allows a feature to carry a null geometry, and shape()
            # cannot build a geometry from None, so skip it up front.
            continue
        geom = shape(geometry)
        if isinstance(geom, Polygon):
            shapes.append(geom)
        elif isinstance(geom, MultiPolygon):
            shapes.extend(geom.geoms)
    return shapes


In [4]:
def extract_city_geometries(geojson, city_key):
    """Collect polygons from a GeoJSON feature collection along with their names.

    Each Polygon feature yields one entry; each MultiPolygon feature yields one
    entry per member polygon, all sharing the feature's name. Features with any
    other geometry type, or with a null geometry, are skipped.

    Args:
        geojson: Parsed GeoJSON mapping with a ``"features"`` list.
        city_key: Key inside each feature's ``"properties"`` holding its name.

    Returns:
        A ``(city_shapes, city_names)`` tuple of two parallel lists:
        ``city_names[i]`` is the name attached to ``city_shapes[i]``.

    Raises:
        KeyError: If a feature with a geometry lacks ``city_key`` in its
            properties.
    """
    city_shapes = []
    city_names = []
    for feature in geojson["features"]:
        geometry = feature["geometry"]
        if geometry is None:
            # GeoJSON allows a feature to carry a null geometry, and shape()
            # cannot build a geometry from None, so skip it up front.
            continue
        geom = shape(geometry)
        city_name = feature["properties"][city_key]
        if isinstance(geom, (MultiPolygon, Polygon)):
            parts = [geom] if isinstance(geom, Polygon) else list(geom.geoms)
            city_shapes.extend(parts)
            # Repeat the name once per part so both lists stay aligned.
            city_names.extend([city_name] * len(parts))
    return city_shapes, city_names


In [5]:
def is_within_city(lon, lat, city_shapes):
    """Tell whether a coordinate lies inside at least one of the given shapes.

    Args:
        lon: X coordinate (longitude) of the location.
        lat: Y coordinate (latitude) of the location.
        city_shapes: Iterable of geometries exposing ``contains``.

    Returns:
        True as soon as one shape contains the point, otherwise False.
    """
    location = Point(lon, lat)
    for polygon in city_shapes:
        if polygon.contains(location):
            return True
    return False


In [6]:
def find_city(lon, lat, city_shapes, city_names):
    """Return the name paired with the first shape that contains a coordinate.

    Args:
        lon: X coordinate (longitude) of the location.
        lat: Y coordinate (latitude) of the location.
        city_shapes: Geometries exposing ``contains``.
        city_names: Names parallel to ``city_shapes``.

    Returns:
        The name of the first matching shape, or None when no shape contains
        the point.
    """
    location = Point(lon, lat)
    # Lazily walk the (name, shape) pairs; next() stops at the first hit.
    matching_names = (
        name
        for name, polygon in zip(city_names, city_shapes)
        if polygon.contains(location)
    )
    return next(matching_names, None)


In [7]:
def add_within_city_column(city_name, boundary_geojson_path, houses_df):
    """Add a boolean ``is_within_<city_name>`` column to ``houses_df``.

    The column is True for rows whose (``long``, ``lat``) point lies inside at
    least one polygon loaded from the boundary GeoJSON file.

    Args:
        city_name: Label used to build the new column's name.
        boundary_geojson_path: Path of the GeoJSON file holding the boundary.
        houses_df: DataFrame with ``long`` and ``lat`` columns. It is modified
            in place.

    Returns:
        The same ``houses_df`` object, with the new column added.
    """
    boundary_geojson = load_geojson(boundary_geojson_path)
    city_shapes = extract_geometries(boundary_geojson)
    # Walk the two coordinate columns directly instead of DataFrame.apply(axis=1):
    # this avoids building a Series for every row and also behaves sensibly
    # when the frame has no rows.
    houses_df[f'is_within_{city_name}'] = [
        is_within_city(lon, lat, city_shapes)
        for lon, lat in zip(houses_df["long"], houses_df["lat"])
    ]
    return houses_df


In [8]:
def add_city_area_column(city_name, city_geojson_path, houses_df, city_key):
    """Add a ``city_area`` column naming the area that contains each house.

    Each row's (``long``, ``lat``) point is matched against the polygons loaded
    from the GeoJSON file; the value is the name of the first polygon that
    contains it, or None when no polygon does.

    Args:
        city_name: Currently unused; the column is always called ``city_area``.
            Kept so existing callers keep working.
        city_geojson_path: Path of the GeoJSON file holding the named areas.
        houses_df: DataFrame with ``long`` and ``lat`` columns. It is modified
            in place.
        city_key: Key inside each feature's properties holding the area name.

    Returns:
        The same ``houses_df`` object, with the new column added.
    """
    city_geojson = load_geojson(city_geojson_path)
    city_shapes, city_names = extract_city_geometries(city_geojson, city_key)
    # Walk the two coordinate columns directly instead of DataFrame.apply(axis=1):
    # this avoids building a Series for every row and also behaves sensibly
    # when the frame has no rows.
    houses_df['city_area'] = [
        find_city(lon, lat, city_shapes, city_names)
        for lon, lat in zip(houses_df['long'], houses_df['lat'])
    ]
    return houses_df


In [9]:
# GeoJSON file whose polygons the later cells use to tag and filter the houses.
json_path = 'media/Health_Reporting_Areas_2020___health_reporting_2020_area.geojson'
# House records to annotate.
data = pd.read_csv("lab1_kc_house_data_cleaned.csv")

In [10]:
# Flag every house with whether its coordinates fall inside the boundary polygons.
data = add_within_city_column(
    city_name='king',
    boundary_geojson_path=json_path,
    houses_df=data,
)
data.head()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_lot,floors,grade,sqft_above,yr_built,lat,long,is_within_king
0,7129300520,221900.0,3,1,5650,1.0,7,1180,1955,47.5112,-122.257,True
1,6414100192,538000.0,3,2,7242,2.0,7,2170,1951,47.721,-122.319,True
2,5631500400,180000.0,2,1,10000,1.0,6,770,1933,47.7379,-122.233,True
3,2487200875,604000.0,4,3,5000,1.0,7,1050,1965,47.5208,-122.393,True
4,1954400510,510000.0,3,2,8080,1.0,8,1680,1987,47.6168,-122.045,True


In [11]:
# Label every house with the 'name' property of the polygon that contains it.
data = add_city_area_column(
    city_name='king',
    city_geojson_path=json_path,
    houses_df=data,
    city_key='name',
)
data.head()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_lot,floors,grade,sqft_above,yr_built,lat,long,is_within_king,city_area
0,7129300520,221900.0,3,1,5650,1.0,7,1180,1955,47.5112,-122.257,True,Seattle - Rainier Valley and Rainier Beach
1,6414100192,538000.0,3,2,7242,2.0,7,2170,1951,47.721,-122.319,True,Seattle - Northgate and Lake City
2,5631500400,180000.0,2,1,10000,1.0,6,770,1933,47.7379,-122.233,True,Kenmore and Lake Forest Park
3,2487200875,604000.0,4,3,5000,1.0,7,1050,1965,47.5208,-122.393,True,Seattle - West Seattle
4,1954400510,510000.0,3,2,8080,1.0,8,1680,1987,47.6168,-122.045,True,Sammamish


In [12]:
# Keep only the houses inside the boundary, then discard the helper flag.
# A boolean mask selects rows positionally, so it stays correct even if index
# labels repeat (dropping by label would remove every row sharing a label).
data = data[data['is_within_king']].drop(columns=['is_within_king'])

In [13]:
# Preview the filtered frame; the helper flag column has been dropped by now.
data.head()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_lot,floors,grade,sqft_above,yr_built,lat,long,city_area
0,7129300520,221900.0,3,1,5650,1.0,7,1180,1955,47.5112,-122.257,Seattle - Rainier Valley and Rainier Beach
1,6414100192,538000.0,3,2,7242,2.0,7,2170,1951,47.721,-122.319,Seattle - Northgate and Lake City
2,5631500400,180000.0,2,1,10000,1.0,6,770,1933,47.7379,-122.233,Kenmore and Lake Forest Park
3,2487200875,604000.0,4,3,5000,1.0,7,1050,1965,47.5208,-122.393,Seattle - West Seattle
4,1954400510,510000.0,3,2,8080,1.0,8,1680,1987,47.6168,-122.045,Sammamish


In [14]:
# NOTE(review): this is the same path the notebook read `data` from, so the
# input file is overwritten with the filtered, annotated frame.
# index=False keeps the DataFrame index out of the written file.
data.to_csv('lab1_kc_house_data_cleaned.csv', index=False)