<a href="https://colab.research.google.com/github/Hyperion8642/datathon_2026_asj/blob/main/datathon_EDA_saeah.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Connect to google drive and read data
from google.colab import drive
import pandas as pd

drive.mount('/content/drive')

file_path = '/content/drive/MyDrive/Colab Notebooks/Access_to_Everyday_Life_Dataset.csv'

df = pd.read_csv(file_path)

df.head()

In [None]:
df.info()
df.describe(include='all')
df.head()

# Check severity distribution
import seaborn as sns
import matplotlib.pyplot as plt

sns.countplot(x='properties/severity', data=df)
plt.title("Severity Distribution")
plt.show()

In [None]:
# Check barrier type
sns.countplot(y='properties/label_type', data=df)
plt.title("Barrier Type Counts")
plt.show()

In [None]:
# Check neighborhoods
neigh_counts = df['properties/neighborhood'].value_counts()
neigh_counts.plot(kind='bar', figsize=(10,5))
plt.title("Observation Count per Neighborhood")
plt.show()

In [None]:
# severity vs barrier type
sns.boxplot(x='properties/label_type', y='properties/severity', data=df)
plt.title("Severity by Barrier Type")
plt.show()

Combine with Geojson data to correctly label neighborhood

In [None]:
!pip install geopandas shapely

import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

# Conver df to GeoDataFrame
# longitude = x, latitude = y
df['geometry'] = df.apply(lambda row: Point(row['geometry/coordinates/0'], row['geometry/coordinates/1']), axis=1)
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")

# Load GeoJSON
neighborhoods = gpd.read_file("/content/drive/MyDrive/Colab Notebooks/Neighborhood_Map_Atlas_Neighborhoods.geojson")

# Spatial join
gdf_corrected = gpd.sjoin(gdf, neighborhoods[['L_HOOD','S_HOOD','geometry']], how='left', predicate='within')

# Add a new column: Corrected Neighborhood
gdf_corrected['corrected_neighborhood'] = gdf_corrected['S_HOOD']

gdf_corrected = gdf_corrected.drop(columns=['index_right'])

gdf_corrected[['geometry/coordinates/0', 'geometry/coordinates/1', 'properties/neighborhood', 'corrected_neighborhood']].head(100)

In [None]:
!pip install folium geopandas

import folium
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors

# Set the center as Seattle downtown area
seattle_center = [47.6062, -122.3321]  # lat, lon
m = folium.Map(location=seattle_center, zoom_start=12)

# Color by neighborhood (labeling)
neighborhoods_unique = gdf_corrected['corrected_neighborhood'].dropna().unique()
cmap = plt.get_cmap('Set3', len(neighborhoods_unique))  # Colormap
colors = [mcolors.rgb2hex(cmap(i)) for i in range(len(neighborhoods_unique))]  # RGB to HEX
color_dict = dict(zip(neighborhoods_unique, colors))

# Create map with folium
seattle_center = [47.6062, -122.3321]
m = folium.Map(location=seattle_center, zoom_start=12)

for idx, row in gdf_corrected.iterrows():
    folium.CircleMarker(
        location=[row['geometry/coordinates/1'], row['geometry/coordinates/0']],
        radius=3,
        color=color_dict.get(row['corrected_neighborhood'], 'gray'),
        fill=True,
        fill_opacity=0.7,
        popup=f"{row['properties/neighborhood']} → {row['corrected_neighborhood']}"
    ).add_to(m)

m

# Modeling

In [77]:
# Convert Severity into binary: 1,2,3 -> 0 (low), 4,5 -> 1 (high)
gdf_corrected['severity_binary'] = gdf_corrected['properties/severity'].apply(lambda x: 1 if x in [4,5] else 0)

In [78]:
# predictors
X = gdf_corrected[['corrected_neighborhood', 'properties/label_type', 'properties/is_temporary']]

# categorical: one-hot encoding
X = pd.get_dummies(X, columns=['corrected_neighborhood', 'properties/label_type'], drop_first=True)

y = gdf_corrected['severity_binary']

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, logreg.predict_proba(X_test)[:,1]))

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))
print("ROC AUC:", roc_auc_score(y_test, rf.predict_proba(X_test)[:,1]))

In [None]:
from xgboost import XGBClassifier

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)

y_pred_xgb = xgb.predict(X_test)
print(classification_report(y_test, y_pred_xgb))
print("ROC AUC:", roc_auc_score(y_test, xgb.predict_proba(X_test)[:,1]))

# Feature Importance

In [None]:
import numpy as np
import matplotlib.pyplot as plt

importances = rf.feature_importances_
feat_names = X.columns

# top 5 feature index
top_idx = np.argsort(importances)[-5:][::-1]

plt.figure(figsize=(8,5))
plt.barh(feat_names[top_idx], importances[top_idx], color='skyblue')
plt.xlabel("Feature Importance")
plt.title("Top 5 Random Forest Feature Importances")
plt.gca().invert_yaxis()
plt.show()

# Merge with population data

In [None]:
import geopandas as gpd
import pandas as pd

# Seattle neighborhood ACS data
url = "https://data-seattlecitygis.opendata.arcgis.com/datasets/SeattleCityGIS::seattle-neighborhoods-top-50-american-community-survey-data.geojson"
acs_gdf = gpd.read_file(url)

acs_gdf.head()

In [None]:
list(acs_gdf.columns)

In [66]:
# Select necessary columns only
acs_cols = [
    'NEIGH_NAME', 'TOTAL_POPULATION', 'TOTAL_HOUSEHOLDS', 'Children_under_5',
    'Children_under_18', 'Older_Adults_65_over', 'Median_Age', 'Male', 'Female',
    'PEOPLE_OF_COLOR_PERCENT', 'BACHELOR_HIGHER_PERCENT', 'PER_CAPITA_INCOME',
    'RENTER_HOUSEHOLDS_PERCENT', 'PUBLIC_TRANSPORTATION_PERCENT', 'POPULATION_DISABILITY_PERC',
    'geometry'
]
acs_gdf_small = acs_gdf[acs_cols]

# Match CRS
acs_gdf_small = acs_gdf_small.to_crs(gdf_corrected.crs)

import geopandas as gpd

merged_gdf = gpd.sjoin(
    gdf_corrected,      # points (LEFT)
    acs_gdf_small,     # polygons (RIGHT)
    how="left",
    predicate="within"
)

In [67]:
merged_gdf.head()

Unnamed: 0,geometry/coordinates/0,geometry/coordinates/1,properties/label_type,properties/neighborhood,properties/is_temporary,properties/severity,geometry,L_HOOD,S_HOOD,corrected_neighborhood,...,Older_Adults_65_over,Median_Age,Male,Female,PEOPLE_OF_COLOR_PERCENT,BACHELOR_HIGHER_PERCENT,PER_CAPITA_INCOME,RENTER_HOUSEHOLDS_PERCENT,PUBLIC_TRANSPORTATION_PERCENT,POPULATION_DISABILITY_PERC
0,-122.298981,47.594616,SurfaceProblem,Atlantic,0,4.0,POINT (-122.29898 47.59462),Central Area,Atlantic,Atlantic,...,12794.0,36.4,56878.0,51134.0,35.9,74.6,99655.0,65.7,17.1,12.4
0,-122.298981,47.594616,SurfaceProblem,Atlantic,0,4.0,POINT (-122.29898 47.59462),Central Area,Atlantic,Atlantic,...,2721.0,36.3,8580.0,8144.0,51.7,61.9,70278.0,58.4,16.3,17.2
1,-122.301071,47.593357,SurfaceProblem,Atlantic,0,3.0,POINT (-122.30107 47.59336),Central Area,Atlantic,Atlantic,...,12794.0,36.4,56878.0,51134.0,35.9,74.6,99655.0,65.7,17.1,12.4
1,-122.301071,47.593357,SurfaceProblem,Atlantic,0,3.0,POINT (-122.30107 47.59336),Central Area,Atlantic,Atlantic,...,2721.0,36.3,8580.0,8144.0,51.7,61.9,70278.0,58.4,16.3,17.2
2,-122.301079,47.596844,SurfaceProblem,Atlantic,0,4.0,POINT (-122.30108 47.59684),Central Area,Atlantic,Atlantic,...,12794.0,36.4,56878.0,51134.0,35.9,74.6,99655.0,65.7,17.1,12.4


In [None]:
merged_gdf.to_csv("/content/drive/MyDrive/Colab Notebooks/merged_data.csv", index=False)

In [70]:
merged_gdf["vulnerability"] = (
    merged_gdf["POPULATION_DISABILITY_PERC"] +
    merged_gdf["Older_Adults_65_over"] +
    merged_gdf["RENTER_HOUSEHOLDS_PERCENT"]
)

vuln = merged_gdf.groupby("corrected_neighborhood")["vulnerability"].mean()
vuln

Unnamed: 0_level_0,vulnerability
corrected_neighborhood,Unnamed: 1_level_1
Alki,37267.650000
Arbor Heights,37267.650000
Atlantic,7896.559019
Ballard,13036.251861
Belltown,8039.250000
...,...
West Woodland,23232.921094
Westlake,25242.685950
Whittier Heights,30715.785124
Windermere,35068.050000


In [73]:
# Correlation
cols = [
    "PER_CAPITA_INCOME",
    "POPULATION_DISABILITY_PERC",
    "RENTER_HOUSEHOLDS_PERCENT",
    "vulnerability"
]

for c in cols:
    print(c, merged_gdf[c].corr(merged_gdf["properties/severity"]))

PER_CAPITA_INCOME -0.105657828136463
POPULATION_DISABILITY_PERC 0.021074748237412164
RENTER_HOUSEHOLDS_PERCENT -0.18726324847260614
vulnerability 0.08123087616471311


In [74]:
merged_gdf["high_sev"] = (merged_gdf["properties/severity"] >= 4).astype(int)

risk = (
    merged_gdf
    .groupby("corrected_neighborhood")
    .agg(
        high_sev_count=("high_sev","sum"),
        total_pop=("TOTAL_POPULATION","mean")
    )
    .dropna()
)

risk["per_1000"] = risk["high_sev_count"] / risk["total_pop"] * 1000

risk = risk.sort_values("per_1000", ascending=False)

risk.head(10)

Unnamed: 0_level_0,high_sev_count,total_pop,per_1000
corrected_neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SODO,3072,55224.798701,55.627183
Industrial District,3134,56913.918296,55.065617
South Park,958,55414.40694,17.287923
Wallingford,3174,198931.698473,15.955225
North Beacon Hill,1990,146176.074939,13.613719
Wedgwood,3182,255892.5,12.434909
Crown Hill,2094,183316.352319,11.422876
Broadway,842,114909.06101,7.327534
Meadowbrook,1700,253556.992366,6.704607
Atlantic,410,61672.904116,6.647976


In [88]:
merged_gdf['severity_binary'] = merged_gdf['properties/severity'].apply(lambda x: 1 if x in [4,5] else 0)

In [111]:
predictors = [
    'corrected_neighborhood',
    'properties/label_type',
    'properties/is_temporary',
    # 'TOTAL_POPULATION',
    'PER_CAPITA_INCOME',
    'RENTER_HOUSEHOLDS_PERCENT'
    # 'POPULATION_DISABILITY_PERC',
    # 'Children_under_5',
    # 'Children_under_18',
    # 'Older_Adults_65_over',
    # 'BACHELOR_HIGHER_PERCENT'
]

# Divide into test and train (no severity -> test)
train_df = merged_gdf[merged_gdf['properties/severity'].notna()]
test_df  = merged_gdf[merged_gdf['properties/severity'].isna()]

# Fillout numeric NAs with median value
train_df[predictors[3:]] = train_df[predictors[3:]].fillna(train_df[predictors[3:]].median())

# Fill categorical NAs with Unknown
cat_cols = ['corrected_neighborhood', 'properties/label_type', 'properties/is_temporary']
train_df[cat_cols] = train_df[cat_cols].fillna('Unknown')

# X = train_df[predictors]


# One-hot encoding
X = pd.get_dummies(train_df[predictors], columns=cat_cols, drop_first=True)

# Target
y = train_df['severity_binary']

In [112]:
# train/val split
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [115]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

logreg = LogisticRegression(max_iter=1000, class_weight='balanced')
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_val)
print(classification_report(y_val, y_pred))
print("ROC AUC:", roc_auc_score(y_test, logreg.predict_proba(X_val)[:,1]))

# Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_val)
print(classification_report(y_val, y_pred_rf))
print("ROC AUC:", roc_auc_score(y_val, rf.predict_proba(X_val)[:,1]))

# XGBoost
from xgboost import XGBClassifier
n_0 = sum(y_train==0)
n_1 = sum(y_train==1)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                    random_state=42, scale_pos_weight=n_0/n_1)
xgb.fit(X_train, y_train)


y_pred_xgb = xgb.predict(X_val)
print(classification_report(y_val, y_pred_xgb))
print("ROC AUC:", roc_auc_score(y_val, xgb.predict_proba(X_val)[:,1]))

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.92      0.79      0.85     22181
           1       0.64      0.85      0.73      9698

    accuracy                           0.81     31879
   macro avg       0.78      0.82      0.79     31879
weighted avg       0.84      0.81      0.81     31879

ROC AUC: 0.9038090172634229
              precision    recall  f1-score   support

           0       0.93      0.85      0.89     22181
           1       0.71      0.85      0.77      9698

    accuracy                           0.85     31879
   macro avg       0.82      0.85      0.83     31879
weighted avg       0.86      0.85      0.85     31879

ROC AUC: 0.9280856269881972


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.93      0.85      0.89     22181
           1       0.71      0.85      0.78      9698

    accuracy                           0.85     31879
   macro avg       0.82      0.85      0.83     31879
weighted avg       0.86      0.85      0.85     31879

ROC AUC: 0.9285327745021046


In [109]:
importances = rf.feature_importances_
feat_names = X.columns
feat_imp = pd.Series(importances, index=feat_names).sort_values(ascending=False)
print(feat_imp.head(10))  # Check top 10 features

properties/label_type_NoSidewalk                 0.304336
properties/label_type_NoCurbRamp                 0.142460
properties/label_type_SurfaceProblem             0.046315
properties/label_type_Obstacle                   0.045605
PER_CAPITA_INCOME                                0.024923
corrected_neighborhood_North Beach/Blue Ridge    0.023358
RENTER_HOUSEHOLDS_PERCENT                        0.020481
corrected_neighborhood_Wallingford               0.017781
BACHELOR_HIGHER_PERCENT                          0.016461
corrected_neighborhood_Wedgwood                  0.015479
dtype: float64


In [116]:
predictors = [
    'corrected_neighborhood',
    'properties/label_type',
    'properties/is_temporary'
    # 'TOTAL_POPULATION',
    # 'PER_CAPITA_INCOME',
    # 'RENTER_HOUSEHOLDS_PERCENT'
    # 'POPULATION_DISABILITY_PERC',
    # 'Children_under_5',
    # 'Children_under_18',
    # 'Older_Adults_65_over',
    # 'BACHELOR_HIGHER_PERCENT'
]

# Divide into test and train (no severity -> test)
train_df = merged_gdf[merged_gdf['properties/severity'].notna()]
test_df  = merged_gdf[merged_gdf['properties/severity'].isna()]

# Fillout numeric NAs with median value
# train_df[predictors[3:]] = train_df[predictors[3:]].fillna(train_df[predictors[3:]].median())

# Fill categorical NAs with Unknown
cat_cols = ['corrected_neighborhood', 'properties/label_type', 'properties/is_temporary']
train_df[cat_cols] = train_df[cat_cols].fillna('Unknown')

# X = train_df[predictors]


# One-hot encoding
X = pd.get_dummies(train_df[predictors], columns=cat_cols, drop_first=True)

# Target
y = train_df['severity_binary']

# train/val split
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [117]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

logreg = LogisticRegression(max_iter=1000, class_weight='balanced')
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_val)
print(classification_report(y_val, y_pred))
print("ROC AUC:", roc_auc_score(y_test, logreg.predict_proba(X_val)[:,1]))

# Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_val)
print(classification_report(y_val, y_pred_rf))
print("ROC AUC:", roc_auc_score(y_val, rf.predict_proba(X_val)[:,1]))

# XGBoost
from xgboost import XGBClassifier
n_0 = sum(y_train==0)
n_1 = sum(y_train==1)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                    random_state=42, scale_pos_weight=n_0/n_1)
xgb.fit(X_train, y_train)


y_pred_xgb = xgb.predict(X_val)
print(classification_report(y_val, y_pred_xgb))
print("ROC AUC:", roc_auc_score(y_val, xgb.predict_proba(X_val)[:,1]))

              precision    recall  f1-score   support

           0       0.92      0.80      0.86     22181
           1       0.65      0.85      0.73      9698

    accuracy                           0.81     31879
   macro avg       0.79      0.82      0.79     31879
weighted avg       0.84      0.81      0.82     31879

ROC AUC: 0.9057128081272964
              precision    recall  f1-score   support

           0       0.93      0.85      0.89     22181
           1       0.71      0.85      0.78      9698

    accuracy                           0.85     31879
   macro avg       0.82      0.85      0.83     31879
weighted avg       0.86      0.85      0.85     31879

ROC AUC: 0.9300533568342176


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.93      0.85      0.89     22181
           1       0.72      0.85      0.78      9698

    accuracy                           0.85     31879
   macro avg       0.82      0.85      0.83     31879
weighted avg       0.86      0.85      0.85     31879

ROC AUC: 0.9284735749261157
