# stage06_data-preprocessing

 - filter the none needed data
 - filling all the NaN

In [44]:
import pandas as pd
import numpy as np
import sys, os, pathlib

parent_dir = os.path.abspath("..")
sys.path.append(parent_dir)
from src import cleaning

from sklearn.preprocessing import MinMaxScaler, StandardScaler

## Load Raw Dataset - citibike_all_trips
- I want to achieve auto load, even I reload the data in the raw file with different time name

In [29]:
folder = "../data/raw"
prefix = "citibike_all_trips"

files = [os.path.join(folder, f) for f in os.listdir(folder) if f.startswith(prefix)]

# we want the latest file that was updated that is started with the prefix
latest_file = max(files, key=os.path.getmtime)
print("read csv:", latest_file)

df = pd.read_csv(latest_file)
print(df.head())

read csv: ../data/raw/citibike_all_trips20250820-203446.csv


  df = pd.read_csv(latest_file)


            ride_id  rideable_type               started_at  \
0  7FFC2F964DC9F335  electric_bike  2025-05-28 07:25:55.506   
1  EF51D1DA2B8F391C  electric_bike  2025-05-16 22:47:38.278   
2  3065F3AABFC64C5D  electric_bike  2025-05-25 15:52:48.010   
3  17F1785E42AF0DFE  electric_bike  2025-05-29 08:29:18.717   
4  4F9FA463022AAD49   classic_bike  2025-05-15 18:37:16.877   

                  ended_at               start_station_name start_station_id  \
0  2025-05-28 07:32:06.123                   E 7 St & Ave C          5545.01   
1  2025-05-16 23:07:01.352                  9 Ave & W 39 St          6644.08   
2  2025-05-25 16:03:29.522        Central Park W & W 103 St          7577.27   
3  2025-05-29 08:54:24.022  Pulaski St & Marcus Garvey Blvd          4656.03   
4  2025-05-15 18:45:37.416         Lafayette St & Jersey St          5561.06   

            end_station_name end_station_id  start_lat  start_lng    end_lat  \
0    Bleecker St & Crosby St        5679.08  40.724129 -73.9

## cleaning the data, throw the one we do not need

In [30]:
# Use Help to drop all NaN values
# Since this is a system data, I am assuming it is missing completely at Random (MCAR)
df = cleaning.drop_missing(df, threshold=1)

In [32]:
import numpy as np
import pandas as pd
import shapely
from shapely.geometry import shape
import requests

# --- 0) 清洗经纬度列：统一成 float ---
for col in ["start_lat", "start_lng", "end_lat", "end_lng"]:
    # 去除空格再转数值；失败的记为 NaN
    df[col] = pd.to_numeric(df[col].astype(str).str.strip(), errors="coerce")

# 可选：粗过滤数值范围（避免把 400.0 之类的脏数据带进来）
# As the approcimate long and lat of NYC is：lat [40, 41]，lng [-75, -72]
# Mega Filter out the obviously wrong data
valid = (
    df["start_lat"].between(40, 41) & df["start_lng"].between(-75, -72) &
    df["end_lat"].between(40, 41)   & df["end_lng"].between(-75, -72)
)
df = df[valid].reset_index(drop=True)

# --- 1) 取 Brooklyn 多边形 ---
url = "https://services5.arcgis.com/GfwWNkhOj9bNBqoJ/arcgis/rest/services/NYC_Borough_Boundary/FeatureServer/0/query?where=1=1&outFields=*&outSR=4326&f=pgeojson"
features = requests.get(url).json()["features"]
bk = shape(next(f["geometry"] for f in features if f["properties"]["BoroName"] == "Brooklyn"))

# --- 2) bbox 粗筛（全是 float 之后就不会再 TypeError）---
# coarse filtering, find the bounding max(long,lat) of Brooklyn
minx, miny, maxx, maxy = bk.bounds

start_bbox = df["start_lng"].between(minx, maxx) & df["start_lat"].between(miny, maxy)
end_bbox   = df["end_lng"].between(minx, maxx)   & df["end_lat"].between(miny, maxy)
bbox_mask  = start_bbox | end_bbox

# --- 3) 精筛（Shapely 2.x 向量化；如果你是 1.x，可改用 GeoPandas）---
# fine filtering using Shapely
cand = df[bbox_mask]

# As we literally have millions of pair 2D coordinates, I learned online to use shapely's vectorized operations storing points
p_start = shapely.points(cand["start_lng"].to_numpy(), cand["start_lat"].to_numpy())
p_end   = shapely.points(cand["end_lng"].to_numpy(),   cand["end_lat"].to_numpy())

# 点在多边形内；若希望“边界也算在内”，用 shapely.covers(bk, p_start/p_end)
# contain would create a mask for points that are strictly inside the polygon
# result in a boolean array
in_bk_start = shapely.contains(bk, p_start)
in_bk_end   = shapely.contains(bk, p_end)
keep_cand   = np.logical_or(in_bk_start, in_bk_end) 

#CAUTION: LENGTH OF keep_cand HERE MAY NOT MATCH THE ORIGINAL df!!!!!!
#KEEP_CAND ONLY WRITES ON WHATEVER bbox_mask IS TRUE

# create a final mask to filter the original DataFrame, assume all will toss out
final_mask = np.zeros(len(df), dtype=bool)
# WE ONLY WANT TO FILL THE POSITIONS WHERE bbox_mask IS TRUE
final_mask[np.where(bbox_mask)[0]] = keep_cand

df_bk = df[final_mask].copy()
print("After filtering, number of trips in Brooklyn:", len(df_bk))

After filtering, number of trips in Brooklyn: 301051


# Data Storage for citibike_all_trips

In [43]:
import datetime as dt
import os, pathlib, datetime as dt
import pandas as pd

import sys 
sys.path.append("..")
from src.config import load_env, get_key
load_env()


try:
    import pyarrow as pa
    import pandas as pd
    print("pyarrow:", pa.__version__)
    print("pandas:", pd.__version__)
except Exception as e:
    print("Import failed:", repr(e))

RAW = pathlib.Path("..") / "data/raw"; RAW.mkdir(parents=True, exist_ok=True)
PROC = pathlib.Path("..") / "data/processed"; PROC.mkdir(parents=True, exist_ok=True)

def ts(): return dt.datetime.now().strftime('%Y%m%d-%H%M%S')

print('RAW ->', RAW.resolve())
print('PROC ->', PROC.resolve())

output_file = PROC / f"Trips_Processed_{ts()}.csv"
df_bk.to_csv(output_file, index=False)

print(f"Processed data saved to {output_file.resolve()}")

# Save as Parquet
pq_path = PROC / f"Trips_Processed_{ts()}.parquet"
try:
    df_bk.to_parquet(pq_path, engine="fastparquet")
except Exception as e:
    print('Parquet engine not available. Install pyarrow or fastparquet to complete this step.')
    pq_path = None
pq_path

Imports OK
PROJECT_ROOT: /Users/kevinjiang/bootcamp_Jianke_Jiang/project/notebooks
DATA_DIR: /Users/kevinjiang/bootcamp_Jianke_Jiang/project/notebooks/data
.env loaded (if present)
pyarrow: 21.0.0
pandas: 2.3.1
RAW -> /Users/kevinjiang/bootcamp_Jianke_Jiang/project/data/raw
PROC -> /Users/kevinjiang/bootcamp_Jianke_Jiang/project/data/processed
Processed data saved to /Users/kevinjiang/bootcamp_Jianke_Jiang/project/data/processed/Trips_Processed_20250820-230826.csv
Parquet engine not available. Install pyarrow or fastparquet to complete this step.


# Find the data for station Lawrence and Willougghby St.(closest to tandon for analysis, data way to big)

In [None]:
import pandas as pd
import pathlib
import datetime as dt  # <-- add this

# Read CSV
df_bk = pd.read_csv("/Users/kevinjiang/bootcamp_Jianke_Jiang/project/data/processed/Trips_Processed_20250820-230826.csv")

# Create timestamp function
def ts():
    return dt.datetime.now().strftime('%Y%m%d-%H%M%S')

PROC = pathlib.Path("..") / "data/processed"
PROC.mkdir(parents=True, exist_ok=True)

# Create mask
tandon_mask = (
    (df_bk['start_station_name'] == 'Lawrence St & Willoughby St') |
    (df_bk['end_station_name'] == 'Lawrence St & Willoughby St')
)

# Filter
df_bk = df_bk[tandon_mask].copy()
print("head ")

# Output file
output_file = PROC / f"Trips_Tandon_Processed_{ts()}.csv"
df_bk.to_csv(output_file, index=False)

def ts():
    return dt.datetime.now().strftime('%Y%m%d-%H%M%S')
PROC = pathlib.Path("..") / "data/processed"
PROC.mkdir(parents=True, exist_ok=True)
output_file = PROC / f"Trips_Tandon_EDA_{ts()}.csv"
df.to_csv(output_file, index=False)

head 


  df_bk = pd.read_csv("/Users/kevinjiang/bootcamp_Jianke_Jiang/project/data/processed/Trips_Processed_20250820-230826.csv")
