<a href="https://colab.research.google.com/github/Hyperion8642/datathon_2026_asj/blob/main/datathon_EDA_saeah.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive and load the accessibility dataset stored on it.
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# The CSV is read from the "Colab Notebooks" folder of the mounted Drive.
file_path = '/content/drive/MyDrive/Colab Notebooks/Access_to_Everyday_Life_Dataset.csv'
df = pd.read_csv(file_path)

# Preview the first rows as the cell output.
df.head()

In [None]:
# Structural overview of the raw data, followed by the distribution of severity ratings.
import matplotlib.pyplot as plt
import seaborn as sns

df.info()  # prints its report directly, so no display() call is needed

# A notebook cell only renders its *last* bare expression. These two are followed by
# plotting code, so without display() their output would be silently discarded.
# (display is provided as a builtin by the IPython kernel.)
display(df.describe(include='all'))
display(df.head())

# Check severity distribution
sns.countplot(x='properties/severity', data=df)
plt.title("Severity Distribution")
plt.show()

In [None]:
# Count observations per barrier type; label types go on the y axis (horizontal bars).
ax = sns.countplot(data=df, y='properties/label_type')
ax.set_title("Barrier Type Counts")
plt.show()

In [None]:
# Number of observations recorded for each value of the neighborhood column.
neigh_counts = df['properties/neighborhood'].value_counts()
ax = neigh_counts.plot.bar(figsize=(10, 5))
ax.set_title("Observation Count per Neighborhood")
plt.show()

In [None]:
# Spread of severity ratings within each barrier type.
ax = sns.boxplot(data=df, x='properties/label_type', y='properties/severity')
ax.set_title("Severity by Barrier Type")
plt.show()

Combine with GeoJSON boundary data to correctly label each observation's neighborhood

In [None]:
!pip install geopandas shapely

import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

# Conver df to GeoDataFrame
# longitude = x, latitude = y
df['geometry'] = df.apply(lambda row: Point(row['geometry/coordinates/0'], row['geometry/coordinates/1']), axis=1)
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")

# Load GeoJSON
neighborhoods = gpd.read_file("/content/drive/MyDrive/Colab Notebooks/Neighborhood_Map_Atlas_Neighborhoods.geojson")

# Spatial join
gdf_corrected = gpd.sjoin(gdf, neighborhoods[['L_HOOD','S_HOOD','geometry']], how='left', predicate='within')

# Add a new column: Corrected Neighborhood
gdf_corrected['corrected_neighborhood'] = gdf_corrected['S_HOOD']

gdf_corrected = gdf_corrected.drop(columns=['index_right'])

gdf_corrected[['geometry/coordinates/0', 'geometry/coordinates/1', 'properties/neighborhood', 'corrected_neighborhood']].head(100)

In [None]:
!pip install folium geopandas

import folium
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors

# Set the center as Seattle downtown area
seattle_center = [47.6062, -122.3321]  # lat, lon
m = folium.Map(location=seattle_center, zoom_start=12)

# Color by neighborhood (labeling)
neighborhoods_unique = gdf_corrected['corrected_neighborhood'].dropna().unique()
cmap = plt.get_cmap('Set3', len(neighborhoods_unique))  # Colormap
colors = [mcolors.rgb2hex(cmap(i)) for i in range(len(neighborhoods_unique))]  # RGB to HEX
color_dict = dict(zip(neighborhoods_unique, colors))

# Create map with folium
seattle_center = [47.6062, -122.3321]
m = folium.Map(location=seattle_center, zoom_start=12)

for idx, row in gdf_corrected.iterrows():
    folium.CircleMarker(
        location=[row['geometry/coordinates/1'], row['geometry/coordinates/0']],
        radius=3,
        color=color_dict.get(row['corrected_neighborhood'], 'gray'),
        fill=True,
        fill_opacity=0.7,
        popup=f"{row['properties/neighborhood']} → {row['corrected_neighborhood']}"
    ).add_to(m)

m

# Modeling

In [None]:
# Binarize severity: ratings 4 and 5 become 1 (high); every other value, which
# includes ratings 1-3 as well as missing ratings, becomes 0 (low).
gdf_corrected['severity_binary'] = gdf_corrected['properties/severity'].isin([4, 5]).astype(int)

In [None]:
# Base features: corrected neighborhood, barrier type, and the temporary flag.
feature_cols = ['corrected_neighborhood', 'properties/label_type', 'properties/is_temporary']

# Categorical columns -> one-hot encoding; drop_first removes one level per column.
categorical_cols = ['corrected_neighborhood', 'properties/label_type']
X = pd.get_dummies(
    gdf_corrected[feature_cols],
    columns=categorical_cols,
    drop_first=True,
)

# Target: the binary severity label.
y = gdf_corrected['severity_binary']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation, stratified on y, with a fixed seed
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Fit a logistic regression on the training split.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Per-class precision/recall/F1 on the held-out split.
y_pred = logreg.predict(X_test)
print(classification_report(y_test, y_pred))

# ROC AUC is computed from the predicted probability of class 1, not the hard labels.
logreg_scores = logreg.predict_proba(X_test)[:, 1]
print("ROC AUC:", roc_auc_score(y_test, logreg_scores))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 200-tree random forest with a fixed seed on the training split.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# Per-class precision/recall/F1 on the held-out split.
y_pred_rf = rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))

# ROC AUC is computed from the predicted probability of class 1.
rf_scores = rf.predict_proba(X_test)[:, 1]
print("ROC AUC:", roc_auc_score(y_test, rf_scores))

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees evaluated with log loss, seeded for reproducibility.
# use_label_encoder is intentionally not passed: the option is deprecated in XGBoost
# and recent releases only emit an "unused parameter" warning for it. The target is
# already encoded as 0/1, so no label encoding is required.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)

# Per-class precision/recall/F1 on the held-out split.
y_pred_xgb = xgb.predict(X_test)
print(classification_report(y_test, y_pred_xgb))

# ROC AUC is computed from the predicted probability of class 1.
print("ROC AUC:", roc_auc_score(y_test, xgb.predict_proba(X_test)[:,1]))

# Feature Importance

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Importances of the fitted random forest, aligned with the columns of X.
importances = rf.feature_importances_
feat_names = X.columns

# Indices of the top 5 features, ordered from most to least important.
top_idx = np.argsort(importances)[::-1][:5]

fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(feat_names[top_idx], importances[top_idx], color='skyblue')
ax.set_xlabel("Feature Importance")
ax.set_title("Top 5 Random Forest Feature Importances")
ax.invert_yaxis()  # put the most important feature at the top
plt.show()

# Merge with population data

In [None]:
import pandas as pd
import geopandas as gpd

# Seattle neighborhood ACS data, read as GeoJSON straight from the city's open-data portal.
url = "https://data-seattlecitygis.opendata.arcgis.com/datasets/SeattleCityGIS::seattle-neighborhoods-top-50-american-community-survey-data.geojson"
acs_gdf = gpd.read_file(url)

# Preview the first rows as the cell output.
acs_gdf.head()

In [None]:
# Show every column name of the ACS layer as a plain Python list.
acs_gdf.columns.tolist()

In [None]:
import geopandas as gpd

# ACS attributes to carry over to the observations, plus the polygon geometry
# needed for the spatial join.
acs_cols = [
    'NEIGH_NAME',
    'TOTAL_POPULATION',
    'TOTAL_HOUSEHOLDS',
    'Children_under_5',
    'Children_under_18',
    'Older_Adults_65_over',
    'Median_Age',
    'Male',
    'Female',
    'PEOPLE_OF_COLOR_PERCENT',
    'BACHELOR_HIGHER_PERCENT',
    'PER_CAPITA_INCOME',
    'RENTER_HOUSEHOLDS_PERCENT',
    'DETACHED_1_UNIT_PERCENT',
    'PUBLIC_TRANSPORTATION_PERCENT',
    'POPULATION_DISABILITY_PERC',
    'geometry',
]

# Subset the ACS layer, then reproject it to the CRS of the observation points.
acs_gdf_small = acs_gdf[acs_cols].to_crs(gdf_corrected.crs)

# Left spatial join: every observation point is kept and receives the attributes of
# the ACS polygon it lies within (if any).
merged_gdf = gpd.sjoin(
    left_df=gdf_corrected,
    right_df=acs_gdf_small,
    how="left",
    predicate="within",
)

In [None]:
# Preview the first five rows of the joined observations + ACS attributes.
merged_gdf.head(n=5)

In [None]:
# Persist the joined table to Drive as CSV, without writing the index as a column.
merged_gdf.to_csv(
    path_or_buf="/content/drive/MyDrive/Colab Notebooks/merged_data.csv",
    index=False,
)

In [None]:
# Choropleth: shade each ACS neighborhood polygon by its TOTAL_POPULATION value.
fig, ax = plt.subplots(figsize=(10, 10))

choropleth_style = dict(
    column="TOTAL_POPULATION",
    cmap="OrRd",
    legend=True,
    edgecolor="black",
    linewidth=0.5,
)
acs_gdf.plot(ax=ax, **choropleth_style)

ax.set_title("Total Population by Seattle Neighborhood")
ax.set_axis_off()  # hide axis lines, ticks and labels

plt.show()

In [None]:
# Overlay high-severity observations on the total-population choropleth.
fig, ax = plt.subplots(figsize=(12,12))

# Base layer: neighborhood polygons shaded by TOTAL_POPULATION.
acs_gdf.plot(
    column="TOTAL_POPULATION",
    cmap="OrRd",
    legend=True,
    ax=ax,
    edgecolor="black",
    linewidth=0.5
)

# High severity = rating of 4 or above.
high = gdf_corrected[gdf_corrected["properties/severity"] >= 4]

# The polygons are drawn in acs_gdf's own CRS, so the points must use the same CRS
# to land in the right place. Reproject only when the two actually differ.
if acs_gdf.crs is not None and high.crs != acs_gdf.crs:
    high = high.to_crs(acs_gdf.crs)

ax.scatter(
    high.geometry.x,
    high.geometry.y,
    s=4,
    alpha=0.6
)

# Clip the view to the extent of the polygon layer so stray points do not stretch the map.
xmin, ymin, xmax, ymax = acs_gdf.total_bounds
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)

# The shaded column is TOTAL_POPULATION (a count), so the title names it as such
# rather than as a density.
ax.set_title("High Severity Accessibility Issues over Total Population", fontsize=14)
ax.axis("off")

plt.show()