### Set up

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

# Seed reused by the CV splitter and by the LightGBM parameter dicts.
RANDOM_STATE = 42
FAST_MODE = True     # NOTE(review): not referenced in the cells visible here — confirm it is still needed
LEAKAGE_SAFE = False  # forwarded to make_features(leakage_safe=...); True drops current_activity and number_of_potholes_filled_on_block

# Left-hand side of every input path built in the load cell.
DATA_DIR = Path(".")

### Load data

In [2]:

# Every input table is resolved against DATA_DIR so the notebook is portable:
# point DATA_DIR (config cell) at the folder that holds the CSV files.
# Joining a Path with an absolute path discards the left-hand side, so absolute
# literals here would silently bypass DATA_DIR.
train = pd.read_csv(DATA_DIR / "train.csv")
test  = pd.read_csv(DATA_DIR / "test.csv")
acs   = pd.read_csv(DATA_DIR / "community_acs.csv")
cross = pd.read_csv(DATA_DIR / "community_region_crosswalk.csv")
cong  = pd.read_csv(DATA_DIR / "congestion_region.csv")

print("train:", train.shape, "| test:", test.shape)
display(train.head())

train: (447272, 12) | test: (111819, 11)


Unnamed: 0,creation_date,zip,ward,police_district,community_area,latitude,longitude,ssa,current_activity,number_of_potholes_filled_on_block,completed_within_7_days,request_id
0,2012-03-02,60656.0,41.0,16.0,10.0,41.975,-87.795,,Dispatch Crew,5.0,1,c6fb34de-4e79-4e83-ac3b-cbbf31d6d834
1,2016-03-31,60659.0,50.0,24.0,2.0,41.995,-87.685,,Final Outcome,4.0,1,a50411a7-2c2c-439c-8cd1-62cf18fc2d4a
2,2018-02-27,60634.0,36.0,25.0,19.0,41.937,-87.788,,Final Outcome,76.0,1,55eb1b82-ae52-4e1a-9476-169a59293cf0
3,2011-04-06,60632.0,14.0,9.0,63.0,41.794,-87.688,,Dispatch Crew,0.0,1,59a51854-7646-40d5-a30f-04306a6f9d2f
4,2018-03-28,60643.0,34.0,22.0,75.0,41.68,-87.669,,,,1,06dbfc49-3118-4849-ad24-5d2fd3b1760b


### Tidy / join sanity checks

In [3]:
def quick_check_requests(df, name):
    """Assert that ``df`` is keyed by a unique ``request_id`` and print its size.

    Parameters
    ----------
    df : pandas.DataFrame
        Table expected to hold one row per request.
    name : str
        Label used in the assertion messages and in the printed summary.

    Raises
    ------
    AssertionError
        If the ``request_id`` column is absent or contains duplicates.
    """
    has_id_column = "request_id" in df.columns
    assert has_id_column, f"{name}: missing request_id"

    request_ids = df["request_id"]
    assert request_ids.is_unique, f"{name}: request_id not unique!"

    n_rows = len(df)
    n_unique = request_ids.nunique()
    print(f"{name}: rows={n_rows:,}, unique_request_id={n_unique:,}")

TARGET = "completed_within_7_days"

# Both raw tables must be keyed by a unique request_id before any join is attempted.
quick_check_requests(df=train, name="train_raw")
quick_check_requests(df=test, name="test_raw")

# Mean of the target column, followed by the per-class counts as the cell output.
print("Target mean (base rate):", train[TARGET].mean())
train[TARGET].value_counts()

train_raw: rows=447,272, unique_request_id=447,272
test_raw: rows=111,819, unique_request_id=111,819
Target mean (base rate): 0.4853981469888569


completed_within_7_days
0    230167
1    217105
Name: count, dtype: int64

### Feature engineering + joins

In [None]:
def add_time_features(df, date_col="creation_date"):
    """Return a copy of ``df`` with calendar features derived from ``date_col``.

    ``date_col`` is parsed with ``pd.to_datetime(..., errors="coerce")``, so
    unparseable values become NaT. Added float columns: ``year``, ``month``,
    ``dayofweek``, ``weekofyear`` (ISO week), plus sine/cosine encodings of the
    month (period 12) and the day of week (period 7).

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    date_col : str
        Name of the column holding the dates.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``date_col`` converted to datetime and the new columns appended.
    """
    out = df.copy()
    out[date_col] = pd.to_datetime(out[date_col], errors="coerce")

    # Plain calendar parts, stored as float so missing dates can be NaN.
    calendar_parts = (
        ("year", lambda stamps: stamps.dt.year),
        ("month", lambda stamps: stamps.dt.month),
        ("dayofweek", lambda stamps: stamps.dt.dayofweek),
        ("weekofyear", lambda stamps: stamps.dt.isocalendar().week),
    )
    for column, extract in calendar_parts:
        out[column] = extract(out[date_col]).astype("float")

    # Cyclic encoding: map each periodic value onto the unit circle.
    for prefix, source, period in (("month", "month", 12.0), ("dow", "dayofweek", 7.0)):
        out[prefix + "_sin"] = np.sin(2*np.pi*out[source]/period)
        out[prefix + "_cos"] = np.cos(2*np.pi*out[source]/period)

    return out

def add_geo_bins(df, lat="latitude", lon="longitude"):
    """Return a copy of ``df`` with coordinates floored to a 0.001-degree grid.

    When both coordinate columns exist, three columns are added: ``lat_bin`` and
    ``lon_bin`` (the coordinate floored to three decimals) and ``geo_cell``, the
    two bins rendered as strings and joined with ``"_"``. If either coordinate
    column is absent the copy is returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    lat, lon : str
        Names of the latitude and longitude columns.

    Returns
    -------
    pandas.DataFrame
    """
    out = df.copy()
    if lat not in out.columns or lon not in out.columns:
        return out

    for bin_name, source in (("lat_bin", lat), ("lon_bin", lon)):
        out[bin_name] = np.floor(out[source]*1000)/1000

    lat_text = out["lat_bin"].astype(str)
    lon_text = out["lon_bin"].astype(str)
    out["geo_cell"] = lat_text + "_" + lon_text  # object dtype
    return out

def add_missing_indicators(df, cols):
    """Return a copy of ``df`` with a 0/1 ``<col>_missing`` flag per listed column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    cols : iterable of str
        Columns to flag. Names that are not present in the frame are skipped.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with one integer indicator column appended per present column.
    """
    out = df.copy()
    for column in cols:
        if column not in out.columns:
            continue
        is_null = out[column].isna()
        out[column + "_missing"] = is_null.astype(int)
    return out

def make_features(df, cross, acs, cong, leakage_safe=False):
    """Build the model feature table for a request-level frame.

    Steps, in order: calendar features from ``creation_date``, 0.001-degree
    geo bins, missing-value indicators, then three left joins
    (request -> ``cross`` on ``community_area``/``community_area_id``,
    -> ``acs`` on ``community_area_name``/``community_area``,
    -> ``cong`` on ``region_id``). Finally the raw date and join helper
    columns are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Request-level table (train or test); it is not modified.
    cross, acs, cong : pandas.DataFrame
        Lookup tables for the joins described above. Each must be unique on
        its join key.
    leakage_safe : bool, default False
        When True, ``current_activity`` and
        ``number_of_potholes_filled_on_block`` are excluded from the output,
        together with the missing-value indicators derived from them (an
        indicator built from an excluded column would still carry part of its
        information).

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order.

    Raises
    ------
    pandas.errors.MergeError
        If a lookup table has duplicate join keys (``validate="m:1"``).
    """
    # Columns removed when leakage_safe is requested.
    leaky_cols = ["current_activity", "number_of_potholes_filled_on_block"]

    out = df.copy()

    out = add_time_features(out, "creation_date")
    out = add_geo_bins(out, "latitude", "longitude")

    miss_cols = ["latitude","longitude","zip","ward","police_district","community_area","ssa",
                 "current_activity","number_of_potholes_filled_on_block"]
    if leakage_safe:
        # Do not create "<col>_missing" flags for columns that are about to be excluded.
        miss_cols = [c for c in miss_cols if c not in leaky_cols]
    out = add_missing_indicators(out, miss_cols)

    # joins: validate to prevent row duplication
    out = out.merge(cross, how="left",
                    left_on="community_area", right_on="community_area_id",
                    validate="m:1")

    out = out.merge(acs, how="left",
                    left_on="community_area_name", right_on="community_area",
                    validate="m:1", suffixes=("", "_acs"))

    out = out.merge(cong, how="left", on="region_id", validate="m:1")

    if leakage_safe:
        out = out.drop(columns=leaky_cols, errors="ignore")

    # drop raw datetime + helper cols if they exist (errors="ignore" skips absent names)
    helper_cols = ["creation_date","community_area_id","community_area_acs","record_id","region"]
    out = out.drop(columns=helper_cols, errors="ignore")

    return out

# Run the identical feature pipeline on train and test (train first, then test).
train_fe, test_fe = (
    make_features(frame, cross, acs, cong, leakage_safe=LEAKAGE_SAFE)
    for frame in (train, test)
)

# The left joins must not have duplicated or dropped any request.
quick_check_requests(df=train_fe, name="train_after_join")
quick_check_requests(df=test_fe, name="test_after_join")

train_fe.head()

train_after_join: rows=447,272, unique_request_id=447,272
test_after_join: rows=111,819, unique_request_id=111,819


Unnamed: 0,zip,ward,police_district,community_area,latitude,longitude,ssa,current_activity,number_of_potholes_filled_on_block,completed_within_7_days,...,asian,native_hawaiin_or_pacific,other_race,multiracial,white_not_hispanic_or_latino,hispanic_or_latino,min_speed,max_speed,avg_speed,median_speed
0,60656.0,41.0,16.0,10.0,41.975,-87.795,,Dispatch Crew,5.0,1,...,2809.0,0.0,1759.0,3819.0,27986.0,7802.0,0.0,109.43,26.106213,27.27
1,60659.0,50.0,24.0,2.0,41.995,-87.685,,Final Outcome,4.0,1,...,16696.0,17.0,6958.0,8615.0,31486.0,17531.0,0.0,37.88,19.96209,22.5
2,60634.0,36.0,25.0,19.0,41.937,-87.788,,Final Outcome,76.0,1,...,1939.0,23.0,25743.0,14011.0,10782.0,57511.0,0.0,142.5,22.779446,24.55
3,60632.0,14.0,9.0,63.0,41.794,-87.688,,Dispatch Crew,0.0,1,...,264.0,1.0,13063.0,5764.0,995.0,31985.0,0.0,131.42,24.723262,29.32
4,60643.0,34.0,22.0,75.0,41.68,-87.669,,,,1,...,211.0,0.0,335.0,1112.0,6485.0,981.0,0.0,71.93,25.096574,29.32


### Prepare X/y + categorical handling

In [5]:
TARGET = "completed_within_7_days"
y = train_fe[TARGET].astype(int)
X = train_fe.drop(columns=[TARGET]).copy()

X_test = test_fe.copy()
# Keep the ids for the submission file before they are removed from the features.
test_request_id = X_test["request_id"].copy()

# request_id is an identifier, not a feature.
X = X.drop(columns=["request_id"], errors="ignore")
X_test = X_test.drop(columns=["request_id"], errors="ignore")

# The model is fit on X and applied to X_test, so both frames must expose the
# same features in the same order.
assert list(X.columns) == list(X_test.columns), "train/test feature columns differ"

# Convert object/string -> category, align categories
cat_cols = X.select_dtypes(include=["object", "string"]).columns.tolist()

for c in cat_cols:
    X[c] = X[c].astype("category")
    X_test[c] = X_test[c].astype("category")
    # Use the union of levels so train and test share one category set per column.
    all_cats = pd.Index(X[c].cat.categories).union(pd.Index(X_test[c].cat.categories))
    X[c] = X[c].cat.set_categories(all_cats)
    X_test[c] = X_test[c].cat.set_categories(all_cats)

# Anything that is neither int/float/bool nor category is listed here.
bad = X.dtypes[~(X.dtypes.apply(lambda t: t.kind in "ifb") | (X.dtypes.astype(str) == "category"))]
print("Bad dtypes:\n", bad)

Bad dtypes:
 Series([], dtype: object)


### CV helper (K-fold AUC)

In [6]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import numpy as np

def lgb_cv_auc(X, y, params, n_splits=5, seed=42, early_stopping=150, categorical_feature=None):
    """Stratified K-fold cross-validation of a LightGBM model, scored by ROC AUC.

    Each fold trains on the remaining folds with early stopping monitored on
    the held-out fold, then scores that same held-out fold at the best
    iteration. Because the stopping point is chosen on the fold that is
    scored, the reported AUC is not a fully independent estimate.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Binary target aligned with ``X``.
    params : dict
        LightGBM parameters. An optional ``"num_boost_round"`` entry sets the
        maximum number of boosting rounds (default 5000). The dict is not
        modified.
    n_splits : int, default 5
        Number of folds.
    seed : int, default 42
        Random state of the shuffled fold split.
    early_stopping : int, default 150
        Rounds without validation improvement before training stops.
    categorical_feature : list of str, optional
        Columns to treat as categorical. Defaults to every column of ``X``
        with pandas ``category`` dtype, so the function does not rely on a
        notebook-level variable.

    Returns
    -------
    tuple of (float, float, numpy.ndarray)
        Mean fold AUC, standard deviation of fold AUCs, and the out-of-fold
        predictions (one per row of ``X``).
    """
    if categorical_feature is None:
        categorical_feature = X.select_dtypes(include="category").columns.tolist()

    # Pass the round count only as an argument: leaving it in the params handed
    # to LightGBM as well would specify the same setting twice.
    train_params = dict(params)
    num_boost_round = train_params.pop("num_boost_round", 5000)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    aucs = []
    oof = np.zeros(len(y), dtype=float)

    for fold, (tr, va) in enumerate(skf.split(X, y), 1):
        print(f"\n=== Fold {fold}/{n_splits} ===")
        dtr = lgb.Dataset(X.iloc[tr], y.iloc[tr], categorical_feature=categorical_feature, free_raw_data=False)
        dva = lgb.Dataset(X.iloc[va], y.iloc[va], categorical_feature=categorical_feature, free_raw_data=False)

        booster = lgb.train(
            train_params,
            dtr,
            num_boost_round=num_boost_round,
            valid_sets=[dva],
            callbacks=[
                lgb.early_stopping(stopping_rounds=early_stopping, verbose=True),
                lgb.log_evaluation(period=50),
            ],
        )

        pred = booster.predict(X.iloc[va], num_iteration=booster.best_iteration)
        oof[va] = pred
        fold_auc = roc_auc_score(y.iloc[va], pred)
        aucs.append(fold_auc)
        print(f"Fold {fold} AUC: {fold_auc:.5f} | best_iter={booster.best_iteration}")

    return float(np.mean(aucs)), float(np.std(aucs)), oof

### Strong baseline + CV

In [7]:
# Baseline LightGBM configuration; "num_boost_round" is read by lgb_cv_auc as the round cap.
base_params = dict(
    objective="binary",
    metric="auc",
    boosting_type="gbdt",
    learning_rate=0.05,
    num_leaves=63,
    min_data_in_leaf=50,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l2=1.0,
    verbosity=-1,
    seed=RANDOM_STATE,
    force_row_wise=True,
    num_boost_round=800,
)

# Only the mean and spread of the fold AUCs are kept; the out-of-fold predictions are discarded.
mean_auc, std_auc, _ = lgb_cv_auc(
    X,
    y,
    base_params,
    n_splits=3,
    seed=RANDOM_STATE,
    early_stopping=80,
)
print(f"Baseline 3-fold AUC: {mean_auc:.5f} ± {std_auc:.5f}")


=== Fold 1/3 ===
Training until validation scores don't improve for 80 rounds
[50]	valid_0's auc: 0.845935
[100]	valid_0's auc: 0.854558
[150]	valid_0's auc: 0.855745
[200]	valid_0's auc: 0.856394
[250]	valid_0's auc: 0.856939
[300]	valid_0's auc: 0.857408
[350]	valid_0's auc: 0.857667
[400]	valid_0's auc: 0.858876
[450]	valid_0's auc: 0.859145
[500]	valid_0's auc: 0.859874
[550]	valid_0's auc: 0.860404
[600]	valid_0's auc: 0.860869
[650]	valid_0's auc: 0.861361
[700]	valid_0's auc: 0.86184
[750]	valid_0's auc: 0.862423
[800]	valid_0's auc: 0.862711
Did not meet early stopping. Best iteration is:
[800]	valid_0's auc: 0.862711
Fold 1 AUC: 0.86271 | best_iter=800

=== Fold 2/3 ===
Training until validation scores don't improve for 80 rounds
[50]	valid_0's auc: 0.841347
[100]	valid_0's auc: 0.849872
[150]	valid_0's auc: 0.851019
[200]	valid_0's auc: 0.851742
[250]	valid_0's auc: 0.852369
[300]	valid_0's auc: 0.853031
[350]	valid_0's auc: 0.853358
[400]	valid_0's auc: 0.854542
[450]	valid

### Final params 

In [None]:
# Final configuration: the baseline with a higher learning rate and a larger round cap.
best_params = {
    **base_params,
    "learning_rate": 0.08,
    "num_boost_round": 2000,
}

mean_auc, std_auc, _ = lgb_cv_auc(
    X,
    y,
    best_params,
    n_splits=3,
    seed=RANDOM_STATE,
    early_stopping=80,
)
print(f"Final params 3-fold AUC: {mean_auc:.5f} ± {std_auc:.5f}")


=== Fold 1/3 ===
Training until validation scores don't improve for 80 rounds
[50]	valid_0's auc: 0.852721
[100]	valid_0's auc: 0.855819
[150]	valid_0's auc: 0.856669
[200]	valid_0's auc: 0.857794
[250]	valid_0's auc: 0.858752
[300]	valid_0's auc: 0.859508
[350]	valid_0's auc: 0.860061
[400]	valid_0's auc: 0.861327
[450]	valid_0's auc: 0.861662
[500]	valid_0's auc: 0.862396
[550]	valid_0's auc: 0.863005
[600]	valid_0's auc: 0.863341
[650]	valid_0's auc: 0.863828
[700]	valid_0's auc: 0.864492
[750]	valid_0's auc: 0.865055
[800]	valid_0's auc: 0.865174
[850]	valid_0's auc: 0.865439
[900]	valid_0's auc: 0.865895
[950]	valid_0's auc: 0.866206
[1000]	valid_0's auc: 0.86662
[1050]	valid_0's auc: 0.867104
[1100]	valid_0's auc: 0.867607
[1150]	valid_0's auc: 0.867963
[1200]	valid_0's auc: 0.868048
[1250]	valid_0's auc: 0.868371
[1300]	valid_0's auc: 0.868539
[1350]	valid_0's auc: 0.868911
[1400]	valid_0's auc: 0.869084
[1450]	valid_0's auc: 0.869087
[1500]	valid_0's auc: 0.869277
[1550]	valid

In [12]:
# Re-print the most recently computed CV summary (whatever mean_auc / std_auc currently hold).
print(f"Final params 3-fold AUC: {mean_auc:.5f} ± {std_auc:.5f}")

Final params 3-fold AUC: 0.87030 ± 0.00141


### Multi-seed predict test

In [11]:
# train full model(s) and predict test
import lightgbm as lgb
import numpy as np
import pandas as pd

def train_full_and_predict(X, y, X_test, params, seed, categorical_feature=None):
    """Fit one LightGBM model on all training rows and predict the test rows.

    No validation set or early stopping is used: the model is trained for the
    full ``"num_boost_round"`` taken from ``params`` (default 5000).

    Parameters
    ----------
    X : pandas.DataFrame
        Training features.
    y : pandas.Series
        Training target aligned with ``X``.
    X_test : pandas.DataFrame
        Features to predict on.
    params : dict
        LightGBM parameters; the dict is not modified.
    seed : int
        Value written to the ``"seed"`` parameter for this fit.
    categorical_feature : list of str, optional
        Columns to treat as categorical. Defaults to every column of ``X``
        with pandas ``category`` dtype, so the function does not rely on a
        notebook-level variable.

    Returns
    -------
    numpy.ndarray
        Predictions for ``X_test`` using every trained iteration.
    """
    train_params = dict(params)
    train_params["seed"] = seed
    # Pass the round count only as an argument rather than also leaving it in params.
    num_boost_round = train_params.pop("num_boost_round", 5000)

    if categorical_feature is None:
        categorical_feature = X.select_dtypes(include="category").columns.tolist()

    dtrain = lgb.Dataset(X, y, categorical_feature=categorical_feature, free_raw_data=False)

    # No valid_sets here: scoring the training data every round was never
    # logged or used for stopping, so it only added run time.
    booster = lgb.train(
        train_params,
        dtrain,
        num_boost_round=num_boost_round,
    )

    return booster.predict(X_test, num_iteration=booster.current_iteration())


SEEDS = [42, 52]

# One full-data model per seed, each producing a prediction vector for the test rows.
preds = [
    train_full_and_predict(X, y, X_test, best_params, seed=seed_value)
    for seed_value in SEEDS
]

# Average the per-seed predictions row by row.
p_test = np.vstack(preds).mean(axis=0)

pd.Series(p_test).describe()

count    111819.000000
mean          0.485337
std           0.324047
min           0.003055
25%           0.176724
50%           0.443066
75%           0.805718
max           0.999641
dtype: float64

### Export

In [None]:
# Pair each test request id with its averaged prediction (matched by position).
submission = test_request_id.to_frame("request_id").assign(prediction=p_test)
submission.to_csv("submission.csv", index=False)

print("Wrote submission.csv | rows:", len(submission))
submission.head()

Wrote submission.csv | rows: 111819


Unnamed: 0,request_id,prediction
0,b98b4a12-1e86-4634-b7e6-8a8d4c687ef6,0.781123
1,90ad3297-203b-4636-81fa-26e279a323e6,0.940913
2,c5e678b6-5fe5-4bc5-97ed-a9df3ba7f197,0.299925
3,734f19b6-1a03-40f7-9aa0-2a575ff65fcc,0.034271
4,f9cf7e4f-5745-4685-8378-03ee40040cc8,0.883476
