In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the DATA_DIR / OUTPUT_DIR paths used
# further down in this notebook live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd
import glob
import os
import numpy as np
from tqdm import tqdm

def resample_station_data(df: pd.DataFrame, station_id_col: str, timestamp_col: str, freq: str = '10min') -> pd.DataFrame:
    """
    Resamples irregular time-series data from bike stations to a fixed frequency.

    Each station is processed independently. Its time axis is cut into
    consecutive ``freq`` bins (the bins ``DataFrame.resample`` would produce)
    and every bin is represented by the earliest observation that falls inside
    it. For left-labelled frequencies such as minutes or hours, that is the
    observation closest to the bin's label. Bins without any observation are
    filled from the previous populated bin and, at the very start of a
    station's series, from the next one.

    Args:
        df: Raw observations, one row per station snapshot. Not modified.
        station_id_col: Name of the column identifying the station.
        timestamp_col: Name of the column holding the observation time; parsed
            with ``pd.to_datetime``. Rows whose timestamp is missing are dropped.
        freq: Pandas offset alias for the output grid. ``'10min'`` is the
            non-deprecated spelling of the former ``'10T'`` default.

    Returns:
        A new DataFrame with columns ``[station_id_col, timestamp_col, ...]``
        (remaining columns in their original order), one row per station and
        bin. An input without usable rows yields an empty frame with the same
        column layout.
    """
    # Work on a copy so the caller's frame keeps its original timestamp dtype.
    work = df.copy()
    work[timestamp_col] = pd.to_datetime(work[timestamp_col])
    work = work.dropna(subset=[timestamp_col])

    value_cols = [c for c in work.columns if c not in (station_id_col, timestamp_col)]
    ordered_cols = [station_id_col, timestamp_col] + value_cols

    # pd.concat([]) raises, so answer the degenerate case explicitly.
    if work.empty:
        return work[ordered_cols].reset_index(drop=True)

    # A stable sort keeps the original row order among identical timestamps,
    # so "first duplicate wins" below is deterministic.
    work = work.sort_values(timestamp_col, kind='stable')

    resampled_dfs = []
    for station_id, station_df in tqdm(work.groupby(station_id_col), desc="Resampling stations"):
        samples = station_df.drop(columns=[station_id_col]).set_index(timestamp_col)
        # A unique index is required for the label-based lookup below.
        samples = samples[~samples.index.duplicated(keep='first')]

        # For every bin: the timestamp of its earliest observation (NaT if the
        # bin is empty). Resampling the index itself keeps whole rows together,
        # unlike Resampler.first(), which picks the first non-null per column.
        earliest = samples.index.to_series().resample(freq).min()

        # Pull the matching rows; NaT look-ups produce all-NaN rows that the
        # forward/backward fill then closes.
        snapped = samples.reindex(pd.DatetimeIndex(earliest))
        snapped.index = earliest.index
        snapped = snapped.rename_axis(timestamp_col).ffill().bfill()

        snapped.insert(0, station_id_col, station_id)
        resampled_dfs.append(snapped)

    final_df = pd.concat(resampled_dfs).reset_index()
    return final_df[ordered_cols]


# --- Configuration ---
DATA_DIR = '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data/'
OUTPUT_DIR = '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Reset the per-run results up front. Checking `'name' in locals()` would also
# accept a stale frame left in the kernel by an earlier execution of this cell,
# so a failed step could silently merge old data.
sites_info_df = None
resampled_slots_df = None

# --- 1. Process and Clean Site Information ---
print("--- Step 1: Consolidating and Cleaning Site Information ---")
# glob returns files in arbitrary order; sorting makes "keep the last record
# per station" reproducible.
# NOTE(review): presumably file names sort chronologically, so the last file is
# the most recent snapshot - confirm against the naming scheme.
site_files = sorted(glob.glob(os.path.join(DATA_DIR, '*_site.csv')))

if not site_files:
    print(f"Error: No site files found in '{DATA_DIR}'.")
else:
    all_sites_df = pd.concat((pd.read_csv(file) for file in site_files), ignore_index=True)
    print(f"Loaded {len(all_sites_df)} records from {len(site_files)} site files.")
    # These two columns are not used downstream; tolerate files that lack them.
    all_sites_df = all_sites_df.drop(columns=['sarea', 'ar'], errors='ignore')
    # The stable sort preserves load order within each station, so keep='last'
    # really selects the record from the last file read.
    sites_info_df = (
        all_sites_df
        .sort_values('sno', kind='stable')
        .drop_duplicates(subset='sno', keep='last')
        .copy()
    )
    print(f"Created a clean lookup table with {len(sites_info_df)} unique stations.")

# --- 2. Load and Resample Time-Series Snapshot Data ---
print("\n--- Step 2: Loading and Resampling Snapshot Data ---")
slot_files = sorted(glob.glob(os.path.join(DATA_DIR, '*_slot.csv')))

if not slot_files:
    print(f"Error: No snapshot/slot files found in '{DATA_DIR}'.")
else:
    all_slots_list = [pd.read_csv(file) for file in slot_files]
    slots_df = pd.concat(all_slots_list, ignore_index=True)
    print(f"Loaded {len(slots_df)} snapshot records from {len(slot_files)} files.")

    # Show the available columns; the resampling below expects the observation
    # time in 'infoTime'.
    print("Columns found in the snapshot data:", slots_df.columns.tolist())

    print("Resampling snapshot data to 10-minute intervals...")
    # '10min' is the non-deprecated spelling of the '10T' offset alias.
    resampled_slots_df = resample_station_data(slots_df, station_id_col='sno', timestamp_col='infoTime', freq='10min')
    print("Resampling complete.")

# --- 3. Perform the Final Merge and Save ---
print("\n--- Step 3: Merging Site Info with Resampled Snapshot Data ---")
if resampled_slots_df is not None and sites_info_df is not None:
    site_info_to_merge = sites_info_df[['sno', 'sna', 'tot', 'lat', 'lng']]

    # The consolidated output names the timestamp column 'mday'.
    resampled_slots_df = resampled_slots_df.rename(columns={'infoTime': 'mday'})

    # The lookup table holds one row per station, so every snapshot row must
    # match at most one site row; validate= turns a violation into an error
    # instead of silently duplicating snapshots.
    final_df = pd.merge(resampled_slots_df, site_info_to_merge, on='sno', how='left', validate='many_to_one')
    final_df = final_df.sort_values(by=['sno', 'mday']).reset_index(drop=True)

    print("Merge complete. Master dataset is ready.")
    print("\nPreview of the final consolidated DataFrame:")
    print(final_df.head())

    output_path = os.path.join(OUTPUT_DIR, 'consolidated_youbike_data_processed.csv')
    final_df.to_csv(output_path, index=False)
    print(f"\nMaster dataset has been saved to: '{output_path}'")
else:
    print("\nHalting script because one or both data sources could not be loaded or processed.")



--- Step 1: Consolidating and Cleaning Site Information ---
Loaded 391601 records from 261 site files.
Created a clean lookup table with 1613 unique stations.

--- Step 2: Loading and Resampling Snapshot Data ---
Loaded 41137128 snapshot records from 416 files.
Resampling snapshot data to 10-minute intervals...


KeyError: 'mday'

In [3]:
!pip install --upgrade translators

Collecting translators
  Downloading translators-6.0.1-py3-none-any.whl.metadata (70 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/70.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.6/70.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting niquests>=3.14.0 (from translators)
  Downloading niquests-3.15.2-py3-none-any.whl.metadata (16 kB)
Collecting exejs>=0.0.4 (from translators)
  Downloading exejs-0.0.6-py3-none-any.whl.metadata (5.1 kB)
Collecting pathos>=0.3.4 (from translators)
  Downloading pathos-0.3.4-py3-none-any.whl.metadata (11 kB)
Collecting cloudscraper>=1.2.71 (from translators)
  Downloading cloudscraper-1.2.71-py2.py3-none-any.whl.metadata (19 kB)
Collecting urllib3-future<3,>=2.13.903 (from niquests>=3.14.0->translators)
  Downloading urllib3_future-2.13.906-py3-none-any.whl.metadata (15 kB)
Collecting wassima<3,>=1.0.1 (from niquests>=3.14.0->translators)
  Downloading wassim