In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import glob
import os
import numpy as np
from tqdm import tqdm

def resample_station_data(df: pd.DataFrame, station_id_col: str, timestamp_col: str, freq: str = '10min') -> pd.DataFrame:
    """
    Resamples irregular time-series data using a robust and efficient pd.merge_asof method.
    """
    df = df.sort_values(timestamp_col)
    all_resampled_dfs = []
    for station_id, station_df in tqdm(df.groupby(station_id_col), desc="Resampling stations"):
        if station_df.empty:
            continue
        start_time = station_df[timestamp_col].min().floor(freq)
        end_time = station_df[timestamp_col].max().ceil(freq)
        time_grid = pd.DataFrame({timestamp_col: pd.date_range(start=start_time, end=end_time, freq=freq)})
        resampled_station = pd.merge_asof(
            left=time_grid,
            right=station_df,
            on=timestamp_col,
            direction='nearest',
            tolerance=pd.Timedelta('15min')
        )
        resampled_station[station_id_col] = station_id
        resampled_station = resampled_station.ffill().bfill()
        resampled_station.dropna(subset=[c for c in resampled_station.columns if c not in [timestamp_col]], inplace=True)
        all_resampled_dfs.append(resampled_station)
    if not all_resampled_dfs:
        return pd.DataFrame()
    final_df = pd.concat(all_resampled_dfs, ignore_index=True)
    return final_df

# --- Configuration ---
DATA_DIR = '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data/'
OUTPUT_DIR = '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/'
BATCH_SIZE = 50
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- 1. Process and Clean Site Information (Dimension Table) ---
print("--- Step 1: Consolidating and Cleaning Site Information ---")
site_files = glob.glob(os.path.join(DATA_DIR, '*_site.csv'))

if not site_files:
    print(f"Error: No site files found in '{DATA_DIR}'.")
    sites_info_df = None
else:
    all_sites_df = pd.concat((pd.read_csv(file) for file in site_files), ignore_index=True)
    print(f"Loaded {len(all_sites_df)} records from {len(site_files)} site files.")
    cols_to_drop_from_sites = ['sarea', 'ar']
    all_sites_df = all_sites_df.drop(columns=[col for col in cols_to_drop_from_sites if col in all_sites_df.columns])
    sites_info_df = all_sites_df.sort_values('sno').drop_duplicates(subset='sno', keep='last').copy()
    print(f"Created a clean lookup table with {len(sites_info_df)} unique stations.")

# --- 2. Process Snapshot Data in Batches to Conserve Memory ---
print("\n--- Step 2: Processing Snapshot Data in Batches ---")
slot_files = sorted(glob.glob(os.path.join(DATA_DIR, '*_slot.csv')))
processed_batch_files = []

if not slot_files or sites_info_df is None:
    print(f"Error: No snapshot/slot files found or site info is missing. Halting.")
else:
    num_batches = int(np.ceil(len(slot_files) / BATCH_SIZE))
    for i in range(num_batches):
        start_index = i * BATCH_SIZE
        end_index = start_index + BATCH_SIZE
        batch_files = slot_files[start_index:end_index]

        print(f"\n--- Processing Batch {i+1}/{num_batches} ---")

        batch_df = pd.concat((pd.read_csv(file) for file in batch_files), ignore_index=True)
        print(f"Loaded {len(batch_df)} records from {len(batch_files)} files.")

        timestamp_col = 'infoTime'
        numeric_cols = ['total', 'available_rent_bikes', 'available_return_bikes']

        batch_df[timestamp_col] = pd.to_datetime(batch_df[timestamp_col], errors='coerce')
        batch_df.dropna(subset=[timestamp_col], inplace=True)

        for col in numeric_cols:
            if col in batch_df.columns:
                batch_df[col] = pd.to_numeric(batch_df[col], errors='coerce')

        batch_df.dropna(subset=[c for c in numeric_cols if c in batch_df.columns], inplace=True)

        resampled_batch_df = resample_station_data(batch_df, station_id_col='sno', timestamp_col=timestamp_col)

        # --- THE FIX IS HERE ---
        # Include 'sareaen' as it's a critical clustering/categorical feature.
        site_info_to_merge = sites_info_df[['sno', 'sna', 'latitude', 'longitude', 'sareaen']].copy()

        # Rename for consistency before merging
        site_info_to_merge = site_info_to_merge.rename(columns={'latitude': 'lat', 'longitude': 'lng'})

        final_batch_df = pd.merge(resampled_batch_df, site_info_to_merge, on='sno', how='left')

        batch_output_path = os.path.join(OUTPUT_DIR, f'temp_batch_{i+1}.csv')
        final_batch_df.to_csv(batch_output_path, index=False)
        processed_batch_files.append(batch_output_path)
        print(f"Processed batch saved to '{batch_output_path}'")

# --- 3. Consolidate Processed Batches into Final Master File (Memory Efficiently) ---
# This part remains the same and will correctly handle the new column.
print("\n--- Step 3: Consolidating all Processed Batches ---")
if processed_batch_files:
    output_path = os.path.join(OUTPUT_DIR, 'consolidated_youbike_data_processed.csv')

    first_batch_df = pd.read_csv(processed_batch_files[0])
    first_batch_df = first_batch_df.rename(columns={'infoTime': 'mday'})
    first_batch_df['mday'] = pd.to_datetime(first_batch_df['mday'])
    first_batch_df.to_csv(output_path, index=False, header=True)

    if len(processed_batch_files) > 1:
        for file in tqdm(processed_batch_files[1:], desc="Appending remaining batches"):
            batch_df = pd.read_csv(file)
            batch_df = batch_df.rename(columns={'infoTime': 'mday'})
            batch_df.to_csv(output_path, mode='a', index=False, header=False)

    print("\nConsolidation of all batches is complete.")
    print("\nPreview of the final consolidated DataFrame (first 5 rows):")
    print(pd.read_csv(output_path, nrows=5))
    print(f"\nMaster dataset has been saved to: '{output_path}'")

    for file in processed_batch_files:
        os.remove(file)
    print("Temporary batch files have been removed.")
else:
    print("No batches were processed. Final file not created.")



--- Step 1: Consolidating and Cleaning Site Information ---
Loaded 391601 records from 261 site files.
Created a clean lookup table with 1613 unique stations.

--- Step 2: Processing Snapshot Data in Batches ---

--- Processing Batch 1/9 ---
Loaded 5248387 records from 50 files.


Resampling stations: 100%|██████████| 1424/1424 [00:07<00:00, 200.78it/s]


Processed batch saved to '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/temp_batch_1.csv'

--- Processing Batch 2/9 ---
Loaded 4886354 records from 50 files.


Resampling stations: 100%|██████████| 1439/1439 [00:08<00:00, 169.93it/s]


Processed batch saved to '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/temp_batch_2.csv'

--- Processing Batch 3/9 ---
Loaded 5580967 records from 50 files.


Resampling stations: 100%|██████████| 1461/1461 [00:11<00:00, 131.37it/s]


Processed batch saved to '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/temp_batch_3.csv'

--- Processing Batch 4/9 ---
Loaded 3899791 records from 50 files.


Resampling stations: 100%|██████████| 1498/1498 [00:09<00:00, 155.13it/s]


Processed batch saved to '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/temp_batch_4.csv'

--- Processing Batch 5/9 ---
Loaded 5562547 records from 50 files.


Resampling stations: 100%|██████████| 1499/1499 [00:13<00:00, 113.21it/s]


Processed batch saved to '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/temp_batch_5.csv'

--- Processing Batch 6/9 ---
Loaded 5502489 records from 50 files.


Resampling stations: 100%|██████████| 1520/1520 [00:16<00:00, 90.53it/s] 


Processed batch saved to '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/temp_batch_6.csv'

--- Processing Batch 7/9 ---
Loaded 5015749 records from 50 files.


Resampling stations: 100%|██████████| 1561/1561 [00:10<00:00, 149.66it/s]


Processed batch saved to '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/temp_batch_7.csv'

--- Processing Batch 8/9 ---
Loaded 3973600 records from 50 files.


Resampling stations: 100%|██████████| 1584/1584 [00:12<00:00, 122.57it/s]


Processed batch saved to '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/temp_batch_8.csv'

--- Processing Batch 9/9 ---
Loaded 1467244 records from 16 files.


Resampling stations: 100%|██████████| 1593/1593 [00:05<00:00, 285.91it/s]


Processed batch saved to '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/temp_batch_9.csv'

--- Step 3: Consolidating all Processed Batches ---


Appending remaining batches: 100%|██████████| 8/8 [19:16<00:00, 144.52s/it]



Consolidation of all batches is complete.

Preview of the final consolidated DataFrame (first 5 rows):
                  mday        sno  total  available_rent_bikes  \
0  2024-05-04 00:00:00  500101001   28.0                   6.0   
1  2024-05-04 00:10:00  500101001   28.0                   6.0   
2  2024-05-04 00:20:00  500101001   28.0                   3.0   
3  2024-05-04 00:30:00  500101001   28.0                   0.0   
4  2024-05-04 00:40:00  500101001   28.0                   1.0   

   available_return_bikes                 sna       lat       lng     sareaen  
0                    22.0  YouBike2.0_捷運科技大樓站  25.02605  121.5436  Daan Dist.  
1                    22.0  YouBike2.0_捷運科技大樓站  25.02605  121.5436  Daan Dist.  
2                    25.0  YouBike2.0_捷運科技大樓站  25.02605  121.5436  Daan Dist.  
3                    28.0  YouBike2.0_捷運科技大樓站  25.02605  121.5436  Daan Dist.  
4                    27.0  YouBike2.0_捷運科技大樓站  25.02605  121.5436  Daan Dist.  

Master dataset has

In [None]:
!pip install --upgrade translators

Collecting translators
  Downloading translators-6.0.1-py3-none-any.whl.metadata (70 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/70.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.6/70.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting niquests>=3.14.0 (from translators)
  Downloading niquests-3.15.2-py3-none-any.whl.metadata (16 kB)
Collecting exejs>=0.0.4 (from translators)
  Downloading exejs-0.0.6-py3-none-any.whl.metadata (5.1 kB)
Collecting pathos>=0.3.4 (from translators)
  Downloading pathos-0.3.4-py3-none-any.whl.metadata (11 kB)
Collecting cloudscraper>=1.2.71 (from translators)
  Downloading cloudscraper-1.2.71-py2.py3-none-any.whl.metadata (19 kB)
Collecting urllib3-future<3,>=2.13.903 (from niquests>=3.14.0->translators)
  Downloading urllib3_future-2.13.906-py3-none-any.whl.metadata (15 kB)
Collecting wassima<3,>=1.0.1 (from niquests>=3.14.0->translators)
  Downloading wassim

In [None]:
import pandas as pd
import glob
import os
from tqdm import tqdm

# --- Configuration ---
# Ensure this path matches the output directory from your previous script
OUTPUT_DIR = '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/'

# --- Step 3: Consolidate Processed Batches into Final Master File (Memory Efficiently) ---
print("--- Step 3: Consolidating all Processed Batches ---")

# Find all the temporary batch files created by the previous step
processed_batch_files = sorted(glob.glob(os.path.join(OUTPUT_DIR, 'temp_batch_*.csv')))

if processed_batch_files:
    output_path = os.path.join(OUTPUT_DIR, 'consolidated_youbike_data_processed.csv')

    # --- Process and write the first batch with a header ---
    print("Processing first batch to create final file with header...")
    first_batch_df = pd.read_csv(processed_batch_files[0])
    first_batch_df = first_batch_df.rename(columns={'infoTime': 'mday'})
    first_batch_df['mday'] = pd.to_datetime(first_batch_df['mday'])
    first_batch_df.to_csv(output_path, index=False, header=True)

    # --- Append the remaining batches without a header ---
    if len(processed_batch_files) > 1:
        for file in tqdm(processed_batch_files[1:], desc="Appending remaining batches"):
            batch_df = pd.read_csv(file)
            batch_df = batch_df.rename(columns={'infoTime': 'mday'})
            # No need to convert mday to datetime here, as it's just being written to CSV
            batch_df.to_csv(output_path, mode='a', index=False, header=False)

    print("\nConsolidation of all batches is complete.")

    print("\nPreview of the final consolidated DataFrame (first 5 rows):")
    # Read just the start of the file for a quick preview
    print(pd.read_csv(output_path, nrows=5))

    print(f"\nMaster dataset has been saved to: '{output_path}'")
    print("NOTE: The final file is not globally sorted to prevent memory crashes.")

    # Clean up temporary batch files
    for file in processed_batch_files:
        os.remove(file)
    print("Temporary batch files have been removed.")
else:
    print("No processed batch files (temp_batch_*.csv) were found. Final file not created.")


--- Step 3: Consolidating all Processed Batches ---
Processing first batch to create final file with header...


Appending remaining batches: 100%|██████████| 8/8 [14:59<00:00, 112.43s/it]


Consolidation of all batches is complete.

Preview of the final consolidated DataFrame (first 5 rows):
                  mday        sno  total  available_rent_bikes  \
0  2024-05-04 00:00:00  500101001   28.0                   6.0   
1  2024-05-04 00:10:00  500101001   28.0                   6.0   
2  2024-05-04 00:20:00  500101001   28.0                   3.0   
3  2024-05-04 00:30:00  500101001   28.0                   0.0   
4  2024-05-04 00:40:00  500101001   28.0                   1.0   

   available_return_bikes                 sna       lat       lng  
0                    22.0  YouBike2.0_捷運科技大樓站  25.02605  121.5436  
1                    22.0  YouBike2.0_捷運科技大樓站  25.02605  121.5436  
2                    25.0  YouBike2.0_捷運科技大樓站  25.02605  121.5436  
3                    28.0  YouBike2.0_捷運科技大樓站  25.02605  121.5436  
4                    27.0  YouBike2.0_捷運科技大樓站  25.02605  121.5436  

Master dataset has been saved to: '/content/drive/MyDrive/Youbike_Master_Project/YouBike_D




In [None]:
import pandas as pd
import glob
import os
from tqdm import tqdm

# --- Configuration ---
RAW_DATA_DIR = '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data/'
CLEAN_DATA_DIR = '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/'
PROCESSED_FILE_PATH = os.path.join(CLEAN_DATA_DIR, 'consolidated_youbike_data_processed.csv')
CHUNK_SIZE = 1_000_000 # Process 1 million rows at a time

print("--- Starting Memory-Efficient Data Validation ---")

# --- 1. File Existence Check ---
print(f"\n[1/7] Checking for final processed file: '{PROCESSED_FILE_PATH}'...")
if not os.path.exists(PROCESSED_FILE_PATH):
    print("... FAIL: Processed file not found. Please run the consolidation script first.")
    exit()
print("... PASS: File exists.")

# --- 2. Verify Station Completeness against Raw Data ---
print(f"\n[2/7] Verifying Station Completeness (this step loads raw site files)...")
try:
    site_files = glob.glob(os.path.join(RAW_DATA_DIR, '*_site.csv'))
    if not site_files:
        raise FileNotFoundError("No raw site files found.")

    raw_sites_df = pd.concat((pd.read_csv(file, usecols=['sno']) for file in site_files), ignore_index=True)
    unique_stations_raw = set(raw_sites_df['sno'].unique())
    print(f"      Found {len(unique_stations_raw):,} unique stations in raw site files.")

    # We will build the processed stations set chunk by chunk
    unique_stations_processed = set()
    for chunk in tqdm(pd.read_csv(PROCESSED_FILE_PATH, usecols=['sno'], chunksize=CHUNK_SIZE), desc="      Scanning for stations"):
        unique_stations_processed.update(chunk['sno'].unique())

    missing_stations = unique_stations_raw - unique_stations_processed

    if not missing_stations:
        print("... PASS: All stations from raw data are present in the final processed file.")
    else:
        print(f"... FAIL: {len(missing_stations)} stations from the raw data are missing in the final file.")
        print(f"      Missing station IDs (sno): {list(missing_stations)}")

except Exception as e:
    print(f"... FAIL: An error occurred: {e}")

# --- Initialize variables for chunk-based validation ---
nan_report = pd.Series(dtype=int)
merge_check_report = pd.Series(dtype=int)
total_incorrect_intervals = 0
last_row_of_chunk = None

# --- Perform Chunk-Based Validations (Checks 3, 4, 5, 6, 7) ---
print("\n[3-7] Performing chunk-based validation on the processed file...")
reader = pd.read_csv(PROCESSED_FILE_PATH, chunksize=CHUNK_SIZE, parse_dates=['mday'])

for i, chunk in tqdm(enumerate(reader), desc="Validating chunks"):
    # First chunk checks
    if i == 0:
        # --- 6. Verify Column Schema ---
        print("\n[6/7] Verifying final column schema (on first chunk)...")
        final_cols = set(chunk.columns)
        expected_cols = {'sno', 'mday', 'total', 'available_rent_bikes', 'available_return_bikes', 'sna', 'lat', 'lng', 'sareaen'}
        missing_expected = expected_cols - final_cols
        if not missing_expected:
            print("... PASS: Final column schema is correct.")
        else:
            print(f"... FAIL: Missing expected columns: {missing_expected}")

        # --- 7. Check Data Types ---
        print("\n[7/7] Verifying column data types (on first chunk)...")
        print("      Data types of final DataFrame:")
        print(chunk.dtypes)

    # --- 4. Check for Missing Values (NaNs) ---
    nan_report = nan_report.add(chunk.isnull().sum(), fill_value=0)

    # --- 5. Validate Successful Merging ---
    merge_check_cols = ['sna', 'lat', 'lng', 'sareaen']
    merge_check_report = merge_check_report.add(chunk[merge_check_cols].isnull().sum(), fill_value=0)

    # --- 3. Check for Correct 10-Minute Resampling Interval ---
    chunk = chunk.sort_values(by=['sno', 'mday'])

    # Check intervals *within* the chunk
    chunk['time_diff'] = chunk.groupby('sno')['mday'].diff()
    incorrect_in_chunk = chunk[chunk['time_diff'].notna() & (chunk['time_diff'] != pd.Timedelta('10 minutes'))]
    total_incorrect_intervals += len(incorrect_in_chunk)

    # Check interval *between* the last chunk and this one
    if last_row_of_chunk is not None:
        first_row_of_chunk = chunk.iloc[0]
        if last_row_of_chunk['sno'] == first_row_of_chunk['sno']:
            between_chunk_diff = first_row_of_chunk['mday'] - last_row_of_chunk['mday']
            if between_chunk_diff != pd.Timedelta('10 minutes'):
                total_incorrect_intervals += 1

    last_row_of_chunk = chunk.iloc[-1]

# --- Final Reports for Chunk-Based Checks ---
print("\n--- Final Validation Reports ---")

# Report for Check 4
print("\n[4/7] Final Report: Missing Values (NaNs)...")
nan_in_key_cols = nan_report[nan_report > 0]
if nan_in_key_cols.empty:
    print("... PASS: No missing values found in any column across the entire dataset.")
else:
    print("... FAIL: Missing values were found in the following columns:")
    print(nan_in_key_cols)

# Report for Check 5
print("\n[5/7] Final Report: Merge Success...")
if merge_check_report.sum() == 0:
    print("... PASS: Station name, coordinates, and district were successfully merged for all records.")
else:
    print("... FAIL: Some records are missing site information, indicating a merge issue.")
    print(merge_check_report[merge_check_report > 0])

# Report for Check 3
print("\n[3/7] Final Report: Resampling Interval...")
if total_incorrect_intervals == 0:
    print("... PASS: All time intervals between records for each station are exactly 10 minutes.")
else:
    print(f"... FAIL: Found {total_incorrect_intervals} records with incorrect time intervals.")

print("\n--- Validation Complete ---")



--- Starting Memory-Efficient Data Validation ---

[1/7] Checking for final processed file: '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/consolidated_youbike_data_processed.csv'...
... PASS: File exists.

[2/7] Verifying Station Completeness (this step loads raw site files)...
      Found 1,613 unique stations in raw site files.


      Scanning for stations: 89it [01:13,  1.21it/s]


... PASS: All stations from raw data are present in the final processed file.

[3-7] Performing chunk-based validation on the processed file...


Validating chunks: 0it [00:00, ?it/s]


[6/7] Verifying final column schema (on first chunk)...
... PASS: Final column schema is correct.

[7/7] Verifying column data types (on first chunk)...
      Data types of final DataFrame:
mday                      datetime64[ns]
sno                                int64
total                            float64
available_rent_bikes             float64
available_return_bikes           float64
sna                               object
lat                              float64
lng                              float64
sareaen                           object
dtype: object


Validating chunks: 89it [02:17,  1.54s/it]


--- Final Validation Reports ---

[4/7] Final Report: Missing Values (NaNs)...
... PASS: No missing values found in any column across the entire dataset.

[5/7] Final Report: Merge Success...
... PASS: Station name, coordinates, and district were successfully merged for all records.

[3/7] Final Report: Resampling Interval...
... PASS: All time intervals between records for each station are exactly 10 minutes.

--- Validation Complete ---





In [1]:
from google.colab import drive
import pandas as pd

# Mount Google Drive
drive.mount('/content/drive')

# Define the file path
FILE_PATH = '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/consolidated_youbike_data_processed.csv'

try:
    # Use the 'usecols' parameter to load only the 'sareaen' column
    print("Reading only the 'sareaen' column to save memory...")
    df_column = pd.read_csv(FILE_PATH, usecols=['sareaen'])

    # Get the unique values from that single column
    unique_districts = df_column['sareaen'].unique()

    print("\n✅ Success! Here are the unique districts found in your file:")
    for district in sorted(unique_districts): # sorted() makes the list alphabetical
        print(district)

except FileNotFoundError:
    print(f"\n❌ Error: The file was not found at '{FILE_PATH}'.")
    print("Please double-check the path, folder names, and file name are correct.")
except ValueError as e:
    # This error often occurs if 'sareaen' is not in the CSV's header
    print(f"\n❌ Error: {e}. Is 'sareaen' definitely the correct column name?")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Reading only the 'sareaen' column to save memory...

✅ Success! Here are the unique districts found in your file:
Beitou Dist
Daan Dist.
Datong Dist
NTU Dist
Nangang Dist
Neihu Dist
Shilin Dist
Songshan Dist
Wanhua Dist
Wenshan Dist
Xinyi Dist
Zhongshan Dist
Zhongzheng Dist


In [4]:
import requests
import pandas as pd
import os
from google.colab import drive

# --- 1. Define Paths and Mount Drive ---
print("Connecting to Google Drive...")
drive.mount('/content/drive')

# The specific folder you want to save the file in
CLEAN_DATA_DIR = '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/'
# The name for your new file
output_filename = 'youbike_station_names.csv'
# Combine the folder and filename into a full path
full_output_path = os.path.join(CLEAN_DATA_DIR, output_filename)

# Create the directory if it doesn't exist
os.makedirs(CLEAN_DATA_DIR, exist_ok=True)


# --- 2. Fetch and Process Data ---
url = "https://tcgbusfs.blob.core.windows.net/dotapp/youbike/v2/youbike_immediate.json"

try:
    print("\nFetching and processing data...")
    response = requests.get(url)
    response.raise_for_status()
    data = response.json()
    df = pd.DataFrame(data)

    # Clean the 'snaen' column by removing the prefix
    df['snaen'] = df['snaen'].str.removeprefix('YouBike2.0_')

    # Select only the 'sno' and the now-cleaned 'snaen' columns
    df_to_save = df[['sno', 'snaen']]
    print("✓ Data processed and filtered successfully.")

    print("\nPreview of the data to be saved:")
    print(df_to_save.head())


    # --- 3. Save the Filtered DataFrame to CSV ---
    df_to_save.to_csv(full_output_path, index=False, encoding='utf-8-sig')
    print(f"\n✅ Success! Data has been saved to:\n{full_output_path}")

except Exception as e:
    print(f"An error occurred: {e}")

Connecting to Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Fetching and processing data...
✓ Data processed and filtered successfully.

Preview of the data to be saved:
         sno                                       snaen
0  500101001                   MRT Technology Bldg. Sta.
1  500101002               No.273， Sec. 2， Fuxing S. Rd.
2  500101003    NTUE Experiment Elementary School (East)
3  500101004                          Heping Park (East)
4  500101005  Xinhai Fuxing Rd. Intersection (Northwest)

✅ Success! Data has been saved to:
/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/youbike_station_names.csv


In [6]:
df_to_save.head()

Unnamed: 0,sno,snaen
0,500101001,MRT Technology Bldg. Sta.
1,500101002,No.273， Sec. 2， Fuxing S. Rd.
2,500101003,NTUE Experiment Elementary School (East)
3,500101004,Heping Park (East)
4,500101005,Xinhai Fuxing Rd. Intersection (Northwest)


In [8]:
import requests
import pandas as pd
import os
from google.colab import drive

# --- 1. Setup Paths and Mount Drive ---
print("Connecting to Google Drive...")
drive.mount('/content/drive')

CLEAN_DATA_DIR = '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/'
input_filename = 'consolidated_youbike_data_processed.csv'
output_filename = 'consolidated_data_with_snaen_corrected.csv' # New output file

input_path = os.path.join(CLEAN_DATA_DIR, input_filename)
output_path = os.path.join(CLEAN_DATA_DIR, output_filename)


# --- 2. Fetch Master Station List and Standardize Data Type ---
url = "https://tcgbusfs.blob.core.windows.net/dotapp/youbike/v2/youbike_immediate.json"
print("\nFetching master station list from the YouBike API...")

try:
    response = requests.get(url)
    response.raise_for_status()
    api_data = response.json()

    station_map_df = pd.DataFrame(api_data)

    # **FIX 1: Convert API 'sno' to string before creating the lookup**
    station_map_df['sno'] = station_map_df['sno'].astype(str)

    station_map_df['snaen'] = station_map_df['snaen'].str.removeprefix('YouBike2.0_')

    station_lookup = station_map_df.set_index('sno')['snaen'].to_dict()
    print(f"✓ Created a lookup map for {len(station_lookup)} unique stations.")

except Exception as e:
    print(f"❌ Error fetching or processing API data: {e}")
    exit()


# --- 3. Process the Large CSV in Chunks with Standardized Data Type ---
print(f"\nProcessing your large dataset from: {input_path}")
try:
    chunk_num = 1

    chunk_iterator = pd.read_csv(input_path, chunksize=500000, low_memory=False)

    for chunk in chunk_iterator:
        print(f"  -> Processing chunk {chunk_num}...")

        # **FIX 2: Convert 'sno' in your data chunk to string before mapping**
        chunk['sno'] = chunk['sno'].astype(str)

        # Add the 'snaen' column using the now-compatible lookup
        chunk['snaen'] = chunk['sno'].map(station_lookup)

        # Fill any remaining missing values for stations that are truly decommissioned
        chunk['snaen'].fillna('Decommissioned Station', inplace=True)

        # Save the processed chunk
        if chunk_num == 1:
            chunk.to_csv(output_path, mode='w', index=False, header=True, encoding='utf-8-sig')
        else:
            chunk.to_csv(output_path, mode='a', index=False, header=False, encoding='utf-8-sig')

        chunk_num += 1

    print("\n✓ All chunks processed and saved.")
    print(f"\n✅ Success! The updated data has been saved to:\n{output_path}")

except FileNotFoundError:
    print(f"❌ Error: The file '{input_path}' was not found.")
except Exception as e:
    print(f"An error occurred during chunk processing: {e}")

Connecting to Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Fetching master station list from the YouBike API...
✓ Created a lookup map for 1647 unique stations.

Processing your large dataset from: /content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/consolidated_youbike_data_processed.csv
  -> Processing chunk 1...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 2...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 3...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 4...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 5...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 6...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 7...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 8...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 9...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 10...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 11...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 12...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 13...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 14...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 15...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 16...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 17...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 18...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 19...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 20...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 21...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 22...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 23...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 24...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 25...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 26...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 27...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 28...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 29...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 30...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 31...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 32...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 33...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 34...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 35...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 36...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 37...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 38...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 39...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 40...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 41...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 42...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 43...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 44...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 45...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 46...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 47...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 48...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 49...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 50...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 51...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 52...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 53...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 54...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 55...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 56...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 57...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 58...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 59...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 60...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 61...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 62...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 63...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 64...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 65...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


  -> Processing chunk 66...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk['snaen'].fillna('Decommissioned Station', inplace=True)


KeyboardInterrupt: 

In [10]:
import requests
import pandas as pd
import os
from google.colab import drive

# --- 1. Setup Paths and Mount Drive ---
print("Connecting to Google Drive...")
drive.mount('/content/drive')

CLEAN_DATA_DIR = '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/'
input_filename = 'consolidated_youbike_data_processed.csv'
output_filename = 'consolidated_data_with_snaen_corrected.csv'

input_path = os.path.join(CLEAN_DATA_DIR, input_filename)
output_path = os.path.join(CLEAN_DATA_DIR, output_filename)


# --- 2. Fetch Master Station List and Standardize Data Type ---
url = "https://tcgbusfs.blob.core.windows.net/dotapp/youbike/v2/youbike_immediate.json"
print("\nFetching master station list from the YouBike API...")

try:
    response = requests.get(url)
    response.raise_for_status()
    api_data = response.json()

    station_map_df = pd.DataFrame(api_data)

    # FIX 1: Convert API 'sno' to string
    station_map_df['sno'] = station_map_df['sno'].astype(str)

    station_map_df['snaen'] = station_map_df['snaen'].str.removeprefix('YouBike2.0_')

    station_lookup = station_map_df.set_index('sno')['snaen'].to_dict()
    print(f"✓ Created a lookup map for {len(station_lookup)} unique stations.")

except Exception as e:
    print(f"❌ Error fetching or processing API data: {e}")
    exit()


# --- 3. Process the ENTIRE Large CSV in Chunks ---
print(f"\nProcessing your large dataset...")
try:
    chunk_num = 1
    chunk_iterator = pd.read_csv(input_path, chunksize=500000, low_memory=False)

    for chunk in chunk_iterator:
        print(f"  -> Processing chunk {chunk_num}...")

        # FIX 2: Convert 'sno' in your data chunk to string before mapping
        chunk['sno'] = chunk['sno'].astype(str)

        # Add the 'snaen' column using the now-compatible lookup
        chunk['snaen'] = chunk['sno'].map(station_lookup)

        # Fill any remaining missing values for stations that are truly decommissioned
        chunk['snaen'] = chunk['snaen'].fillna('Decommissioned Station')

        # Save the processed chunk
        if chunk_num == 1:
            chunk.to_csv(output_path, mode='w', index=False, header=True, encoding='utf-8-sig')
        else:
            chunk.to_csv(output_path, mode='a', index=False, header=False, encoding='utf-8-sig')

        chunk_num += 1

    print("\n✓ All chunks processed and saved.")
    print(f"\n✅ Success! The updated data has been saved to:\n{output_path}")

except FileNotFoundError:
    print(f"❌ Error: The file '{input_path}' was not found.")
except Exception as e:
    print(f"An error occurred during chunk processing: {e}")

Connecting to Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Fetching master station list from the YouBike API...
✓ Created a lookup map for 1647 unique stations.

Processing your large dataset...
  -> Processing chunk 1...
  -> Processing chunk 2...
  -> Processing chunk 3...
  -> Processing chunk 4...
  -> Processing chunk 5...
  -> Processing chunk 6...
  -> Processing chunk 7...
  -> Processing chunk 8...
  -> Processing chunk 9...
  -> Processing chunk 10...
  -> Processing chunk 11...
  -> Processing chunk 12...
  -> Processing chunk 13...
  -> Processing chunk 14...
  -> Processing chunk 15...
  -> Processing chunk 16...
  -> Processing chunk 17...
  -> Processing chunk 18...
  -> Processing chunk 19...
  -> Processing chunk 20...
  -> Processing chunk 21...
  -> Processing chunk 22...
  -> Processing chunk 23...
  -> Processing chunk 24...
  -> Processing chunk 25...
  -> Processi

In [11]:
import pandas as pd
import os
from google.colab import drive

# --- 1. Setup Paths and Mount Drive ---
print("Connecting to Google Drive...")
drive.mount('/content/drive')

CLEAN_DATA_DIR = '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/'
# The file you just created
file_to_check = 'consolidated_data_with_snaen_corrected.csv'
file_path = os.path.join(CLEAN_DATA_DIR, file_to_check)

# --- 2. Load a Sample of the New File ---
print(f"\nLoading a sample from your new file:\n{file_path}")
try:
    # Read just the first 10,000 rows to check
    sample_df = pd.read_csv(file_path, nrows=10000)
    print("✓ Sample loaded successfully.")

    # --- 3. Run Verification Checks ---
    print("\n--- Verification Report ---")

    # Check 1: Does the 'snaen' column exist?
    if 'snaen' in sample_df.columns:
        print("✅ Success: The 'snaen' column exists in the new file.")
    else:
        print("❌ Failed: The 'snaen' column was NOT found.")
        exit() # Stop if the column isn't even there

    # Check 2: How many stations were successfully matched in this sample?
    total_rows = len(sample_df)
    decommissioned_count = (sample_df['snaen'] == 'Decommissioned Station').sum()
    successful_matches = total_rows - decommissioned_count

    print(f"✅ Success: Found {successful_matches} matched station names in the first {total_rows} rows.")
    if decommissioned_count > 0:
        print(f"   - Found {decommissioned_count} rows corresponding to decommissioned stations.")

    # Check 3: Show a visual preview
    print("\n--- Data Preview ---")
    print("Here are the first 10 rows of your new data:")
    print(sample_df[['sno', 'sna', 'snaen']].head(10))

except FileNotFoundError:
    print(f"❌ Error: The file '{file_path}' was not found. Please make sure the previous script ran successfully and the filename is correct.")
except Exception as e:
    print(f"An error occurred: {e}")

Connecting to Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Loading a sample from your new file:
/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/consolidated_data_with_snaen_corrected.csv
✓ Sample loaded successfully.

--- Verification Report ---
✅ Success: The 'snaen' column exists in the new file.
✅ Success: Found 10000 matched station names in the first 10000 rows.

--- Data Preview ---
Here are the first 10 rows of your new data:
         sno                 sna                      snaen
0  500101001  YouBike2.0_捷運科技大樓站  MRT Technology Bldg. Sta.
1  500101001  YouBike2.0_捷運科技大樓站  MRT Technology Bldg. Sta.
2  500101001  YouBike2.0_捷運科技大樓站  MRT Technology Bldg. Sta.
3  500101001  YouBike2.0_捷運科技大樓站  MRT Technology Bldg. Sta.
4  500101001  YouBike2.0_捷運科技大樓站  MRT Technology Bldg. Sta.
5  500101001  YouBike2.0_捷運科技大樓站  MRT Technology Bldg. Sta.
6  500101

In [None]:
import requests
import pandas as pd
import os
from google.colab import drive

# --- 1. Setup Paths and Mount Drive ---
print("Connecting to Google Drive...")
drive.mount('/content/drive')

CLEAN_DATA_DIR = '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/'
input_filename = 'consolidated_youbike_data_processed.csv'
output_filename = 'consolidated_data_with_snaen_corrected.csv'

input_path = os.path.join(CLEAN_DATA_DIR, input_filename)
output_path = os.path.join(CLEAN_DATA_DIR, output_filename)


# --- 2. Fetch Master Station List and Standardize Data Type ---
url = "https://tcgbusfs.blob.core.windows.net/dotapp/youbike/v2/youbike_immediate.json"
print("\nFetching master station list from the YouBike API...")

try:
    response = requests.get(url)
    response.raise_for_status()
    api_data = response.json()

    station_map_df = pd.DataFrame(api_data)
    station_map_df['sno'] = station_map_df['sno'].astype(str)
    station_map_df['snaen'] = station_map_df['snaen'].str.removeprefix('YouBike2.0_')

    station_lookup = station_map_df.set_index('sno')['snaen'].to_dict()
    print(f"✓ Created a lookup map for {len(station_lookup)} unique stations.")

except Exception as e:
    print(f"❌ Error fetching or processing API data: {e}")
    exit()


# --- 3. Process the Large CSV in Larger Chunks ---
print(f"\nProcessing your large dataset with an increased chunk size...")
try:
    chunk_num = 1
    # **OPTIMIZATION: Increased chunksize from 500,000 to 2,000,000**
    chunk_iterator = pd.read_csv(input_path, chunksize=2000000, low_memory=False)

    for chunk in chunk_iterator:
        print(f"  -> Processing chunk {chunk_num}...")

        chunk['sno'] = chunk['sno'].astype(str)
        chunk['snaen'] = chunk['sno'].map(station_lookup)
        chunk['snaen'] = chunk['snaen'].fillna('Decommissioned Station')

        if chunk_num == 1:
            chunk.to_csv(output_path, mode='w', index=False, header=True, encoding='utf-8-sig')
        else:
            chunk.to_csv(output_path, mode='a', index=False, header=False, encoding='utf-8-sig')

        chunk_num += 1

    print("\n✓ All chunks processed and saved.")
    print(f"\n✅ Success! The updated data has been saved to:\n{output_path}")

except FileNotFoundError:
    print(f"❌ Error: The file '{input_path}' was not found.")
except Exception as e:
    print(f"An error occurred during chunk processing: {e}")

In [13]:
import pandas as pd
import os
from google.colab import drive

# --- 1. Setup Paths and Mount Drive ---
print("Connecting to Google Drive...")
drive.mount('/content/drive')

CLEAN_DATA_DIR = '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/'
file_to_check = 'consolidated_data_with_snaen_corrected.csv'
file_path = os.path.join(CLEAN_DATA_DIR, file_to_check)

# --- 2. Process the ENTIRE File in Chunks for Verification ---
print(f"\nStarting full verification of file:\n{file_path}")
try:
    # Initialize counters for the final report
    total_rows_processed = 0
    total_decommissioned = 0
    chunk_num = 1

    # Create an iterator to read the CSV in chunks of 2,000,000 rows
    chunk_iterator = pd.read_csv(file_path, chunksize=2000000, low_memory=False)

    for chunk in chunk_iterator:
        print(f"  -> Verifying chunk {chunk_num}...")

        # Check if 'snaen' column exists in this chunk
        if 'snaen' not in chunk.columns:
            print("❌ Critical Error: 'snaen' column is missing in this chunk. Stopping.")
            break

        # Update the total counts
        total_rows_processed += len(chunk)
        total_decommissioned += (chunk['snaen'] == 'Decommissioned Station').sum()

        chunk_num += 1

    # Calculate the final numbers
    total_successful = total_rows_processed - total_decommissioned

    # --- 3. Display the Final, Aggregated Report ---
    print("\n--- Full File Verification Report ---")
    print(f"✅ Total rows processed: {total_rows_processed:,}")
    print(f"✅ Total successful matches found: {total_successful:,}")
    print(f"ℹ️  Total rows for decommissioned stations: {total_decommissioned:,}")

    if total_rows_processed > 0:
        success_rate = (total_successful / total_rows_processed) * 100
        print(f"   -> Match Rate: {success_rate:.4f}%")

    print("\n✅ Verification of the entire file is complete.")


except FileNotFoundError:
    print(f"❌ Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

Connecting to Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Starting full verification of file:
/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/consolidated_data_with_snaen_corrected.csv
  -> Verifying chunk 1...
  -> Verifying chunk 2...
  -> Verifying chunk 3...
  -> Verifying chunk 4...
  -> Verifying chunk 5...
  -> Verifying chunk 6...
  -> Verifying chunk 7...
  -> Verifying chunk 8...
  -> Verifying chunk 9...
  -> Verifying chunk 10...
  -> Verifying chunk 11...
  -> Verifying chunk 12...
  -> Verifying chunk 13...
  -> Verifying chunk 14...
  -> Verifying chunk 15...
  -> Verifying chunk 16...
  -> Verifying chunk 17...
  -> Verifying chunk 18...
  -> Verifying chunk 19...
  -> Verifying chunk 20...
  -> Verifying chunk 21...
  -> Verifying chunk 22...
  -> Verifying chunk 23...
  -> Verifying chunk 24...
  -> Verifying chunk 25...
  -> Verifying

In [14]:
import pandas as pd
import os
from google.colab import drive

# --- 1. Setup Paths and Mount Drive ---
print("Connecting to Google Drive...")
drive.mount('/content/drive')

CLEAN_DATA_DIR = '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/'
file_to_analyze = 'consolidated_data_with_snaen_corrected.csv'
file_path = os.path.join(CLEAN_DATA_DIR, file_to_analyze)

# --- 2. Process the Entire File in Chunks to Find Decommissioned Stations ---
print(f"\nScanning the entire file for decommissioned stations...")
try:
    # Use a dictionary to store the unique sno -> sna mapping for decommissioned stations
    decommissioned_stations = {}
    chunk_num = 1

    # Create an iterator to read the CSV in chunks
    chunk_iterator = pd.read_csv(file_path, chunksize=6000000, low_memory=False)

    for chunk in chunk_iterator:
        print(f"  -> Scanning chunk {chunk_num}...")

        # Filter the chunk to find rows where 'snaen' is 'Decommissioned Station'
        decommissioned_chunk = chunk[chunk['snaen'] == 'Decommissioned Station']

        # If any are found, add their unique 'sno' and 'sna' to our dictionary
        if not decommissioned_chunk.empty:
            # Drop duplicates to only get unique sno/sna pairs from this chunk
            unique_in_chunk = decommissioned_chunk[['sno', 'sna']].drop_duplicates()
            # Update the master dictionary
            for index, row in unique_in_chunk.iterrows():
                if row['sno'] not in decommissioned_stations:
                    decommissioned_stations[row['sno']] = row['sna']

        chunk_num += 1

    # --- 3. Display the Final Report ---
    print("\n--- Decommissioned Stations Report ---")
    if decommissioned_stations:
        print(f"✅ Found {len(decommissioned_stations)} unique decommissioned stations in the entire file.")

        # Convert the dictionary to a DataFrame for nice printing
        decommissioned_df = pd.DataFrame(list(decommissioned_stations.items()), columns=['SNO', 'Original SNA (Chinese Name)'])

        print("\nHere is the complete list:")
        print(decommissioned_df.to_string())
    else:
        print("✅ No decommissioned stations found in the entire file.")

except FileNotFoundError:
    print(f"❌ Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

Connecting to Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Scanning the entire file for decommissioned stations...
  -> Scanning chunk 1...
  -> Scanning chunk 2...
  -> Scanning chunk 3...
  -> Scanning chunk 4...
  -> Scanning chunk 5...
  -> Scanning chunk 6...
  -> Scanning chunk 7...
  -> Scanning chunk 8...
  -> Scanning chunk 9...
  -> Scanning chunk 10...
  -> Scanning chunk 11...
  -> Scanning chunk 12...
  -> Scanning chunk 13...
  -> Scanning chunk 14...
  -> Scanning chunk 15...

--- Decommissioned Stations Report ---
✅ Found 6 unique decommissioned stations in the entire file.

Here is the complete list:
         SNO Original SNA (Chinese Name)
0  500101183      YouBike2.0_忠孝東路三段217巷口
1  500105052      YouBike2.0_國立政治大學(萬壽路)
2  500107057           YouBike2.0_中山公民會館
3  500107074           YouBike2.0_中原民生路口
4  500108130          YouBike2.0_行善路383巷
5  500106141        YouBike2

In [16]:
import requests
import json
import sys

def check_youbike_stations():
    """
    Downloads the YouBike station data and checks if a list of
    specified station IDs are present in the dataset.
    """
    # URL for the YouBike 2.0 immediate data API
    url = "https://tcgbusfs.blob.core.windows.net/dotapp/youbike/v2/youbike_immediate.json"

    # The list of station IDs provided by the user to check
    # Note: These are converted to strings to match the data type in the JSON.
    stations_to_check = [
        "500101183",
        "500105052",
        "500107057",
        "500107074",
        "500108130",
        "500106141"
    ]

    print("Fetching YouBike station data from the API...")

    try:
        # Send a GET request to the URL
        response = requests.get(url, timeout=10)

        # Raise an HTTPError for bad responses (4xx or 5xx)
        response.raise_for_status()

        # Parse the JSON content from the response
        data = response.json()

        # Determine if the data is a dictionary or a list
        if isinstance(data, dict) and 'retVal' in data:
            stations = data['retVal']
        elif isinstance(data, list):
            stations = data
        else:
            print("The API response is in an unexpected format.", file=sys.stderr)
            return

        # A set is used for efficient lookup (O(1) average time complexity)
        # We extract the 'sno' (station number) from each station entry
        existing_station_ids = {station['sno'] for station in stations}

        print("\n--- Checking Stations ---\n")

        # Check each station ID from the list
        for station_id in stations_to_check:
            if station_id in existing_station_ids:
                print(f"✅ Station ID '{station_id}' was found in the data.")
            else:
                print(f"❌ Station ID '{station_id}' was NOT found in the data.")

    except requests.exceptions.RequestException as e:
        print(f"An error occurred while fetching the data: {e}", file=sys.stderr)
        print("Please check your internet connection or the API URL.", file=sys.stderr)

    except json.JSONDecodeError as e:
        print(f"An error occurred while parsing the JSON data: {e}", file=sys.stderr)
        print("The data received from the API may be in an incorrect format.", file=sys.stderr)

    except Exception as e:
        print(f"An unexpected error occurred: {e}", file=sys.stderr)

if __name__ == "__main__":
    check_youbike_stations()


Fetching YouBike station data from the API...

--- Checking Stations ---

❌ Station ID '500101183' was NOT found in the data.
❌ Station ID '500105052' was NOT found in the data.
❌ Station ID '500107057' was NOT found in the data.
❌ Station ID '500107074' was NOT found in the data.
❌ Station ID '500108130' was NOT found in the data.
❌ Station ID '500106141' was NOT found in the data.


In [4]:
import pandas as pd
import os
import sys
from google.colab import drive

print("Connecting to Google Drive...")
drive.mount('/content/drive')

# Define the directory and file path as provided
CLEAN_DATA_DIR = '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/'
file_to_analyze = 'consolidated_data_with_snaen_corrected.csv'
filepath = os.path.join(CLEAN_DATA_DIR, file_to_analyze)

# The list of station IDs to be deleted from the data
stations_to_delete = [
    "500101183",
    "500105052",
    "500107057",
    "500107074",
    "500108130",
    "500106141"
]

def remove_stations_from_csv_in_batches():
    """
    Reads a large CSV file in batches, removes specified station data,
    and saves the result to a new file to avoid overwriting the original.
    """
    print(f"Reading data from: {filepath}")

    try:
        # Create a new output file path with a '_cleaned' suffix
        filename, file_extension = os.path.splitext(file_to_analyze)
        new_filepath = os.path.join(CLEAN_DATA_DIR, f"{filename}_cleaned{file_extension}")

        # Check if the input file exists
        if not os.path.exists(filepath):
            print(f"Error: The input file was not found at {filepath}. Please check the path.", file=sys.stderr)
            return

        # Initialize counters
        total_rows_read = 0
        total_rows_deleted = 0
        batch_size = 6000000

        # Read the CSV in chunks
        chunk_iterator = pd.read_csv(filepath, chunksize=batch_size, iterator=True)

        # We need a flag to handle the header
        is_first_chunk = True

        for batch_number, chunk in enumerate(chunk_iterator, 1):
            # Check if the 'sno' column exists in the first chunk
            if is_first_chunk and 'sno' not in chunk.columns:
                print("Error: The 'sno' column was not found in the CSV file.", file=sys.stderr)
                return

            # Filter the chunk to remove the specified stations
            cleaned_chunk = chunk[~chunk['sno'].isin(stations_to_delete)]

            # Write the cleaned chunk to the new file
            # If it's the first chunk, write with header. Otherwise, append without header.
            cleaned_chunk.to_csv(
                new_filepath,
                index=False,
                mode='w' if is_first_chunk else 'a',
                header=is_first_chunk
            )

            total_rows_read += len(chunk)
            total_rows_deleted += len(chunk) - len(cleaned_chunk)

            print(f"Processed batch {batch_number}. Rows read: {len(chunk)}. Rows deleted: {len(chunk) - len(cleaned_chunk)}.")

            is_first_chunk = False

        print("\n--- Operation Complete ---")
        print(f"Total rows read: {total_rows_read}")
        print(f"Total rows deleted: {total_rows_deleted}")
        print(f"Final number of rows: {total_rows_read - total_rows_deleted}")
        print(f"Cleaned data saved to: {new_filepath}")

    except FileNotFoundError:
        print(f"Error: The file was not found at {filepath}. Please check the path.", file=sys.stderr)
    except Exception as e:
        print(f"An unexpected error occurred: {e}", file=sys.stderr)

if __name__ == "__main__":
    remove_stations_from_csv_in_batches()


Connecting to Google Drive...
Mounted at /content/drive
Reading data from: /content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/consolidated_data_with_snaen_corrected.csv
Processed batch 1. Rows read: 6000000. Rows deleted: 0.
Processed batch 2. Rows read: 6000000. Rows deleted: 0.
Processed batch 3. Rows read: 6000000. Rows deleted: 0.
Processed batch 4. Rows read: 6000000. Rows deleted: 0.
Processed batch 5. Rows read: 6000000. Rows deleted: 0.
Processed batch 6. Rows read: 6000000. Rows deleted: 0.
Processed batch 7. Rows read: 6000000. Rows deleted: 0.
Processed batch 8. Rows read: 6000000. Rows deleted: 0.
Processed batch 9. Rows read: 6000000. Rows deleted: 0.
Processed batch 10. Rows read: 6000000. Rows deleted: 0.
Processed batch 11. Rows read: 6000000. Rows deleted: 0.
Processed batch 12. Rows read: 6000000. Rows deleted: 0.
Processed batch 13. Rows read: 6000000. Rows deleted: 0.
Processed batch 14. Rows read: 6000000. Rows deleted: 0.
Processed ba

In [5]:
import pandas as pd
import os
import sys

# Define the directory and file path as provided
CLEAN_DATA_DIR = '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/'
file_to_analyze = 'ubike_consolidated_data.csv'
filepath = os.path.join(CLEAN_DATA_DIR, file_to_analyze)

# The list of station IDs to be deleted from the data
stations_to_delete = [
    "500101183",
    "500105052",
    "500107057",
    "500107074",
    "500108130",
    "500106141"
]

def remove_stations_from_csv_in_batches():
    """
    Reads a large CSV file in batches, removes specified station data,
    and saves the result to a new file to avoid overwriting the original.
    """
    print(f"Reading data from: {filepath}")

    try:
        # Create a new output file path with a '_cleaned' suffix
        filename, file_extension = os.path.splitext(file_to_analyze)
        new_filepath = os.path.join(CLEAN_DATA_DIR, f"{filename}_cleaned{file_extension}")

        # Check if the input file exists
        if not os.path.exists(filepath):
            print(f"Error: The input file was not found at {filepath}. Please check the path.", file=sys.stderr)
            return

        # Initialize counters
        total_rows_read = 0
        total_rows_deleted = 0
        batch_size = 12000000

        # Read the CSV in chunks
        chunk_iterator = pd.read_csv(filepath, chunksize=batch_size, iterator=True)

        # We need a flag to handle the header
        is_first_chunk = True

        for batch_number, chunk in enumerate(chunk_iterator, 1):
            # Check if the 'sno' column exists in the first chunk
            if is_first_chunk and 'sno' not in chunk.columns:
                print("Error: The 'sno' column was not found in the CSV file.", file=sys.stderr)
                return

            # CRITICAL FIX: Convert the 'sno' column to a string and strip whitespace
            chunk['sno'] = chunk['sno'].astype(str).str.strip()

            # Filter the chunk to remove the specified stations
            cleaned_chunk = chunk[~chunk['sno'].isin(stations_to_delete)]

            # Write the cleaned chunk to the new file
            # If it's the first chunk, write with header. Otherwise, append without header.
            cleaned_chunk.to_csv(
                new_filepath,
                index=False,
                mode='w' if is_first_chunk else 'a',
                header=is_first_chunk
            )

            total_rows_read += len(chunk)
            total_rows_deleted += len(chunk) - len(cleaned_chunk)

            print(f"Processed batch {batch_number}. Total rows processed so far: {total_rows_read}. Rows deleted in this batch: {len(chunk) - len(cleaned_chunk)}.")

            is_first_chunk = False

        print("\n--- Operation Complete ---")
        print(f"Total rows read: {total_rows_read}")
        print(f"Total rows deleted: {total_rows_deleted}")
        print(f"Final number of rows: {total_rows_read - total_rows_deleted}")
        print(f"Cleaned data saved to: {new_filepath}")

    except FileNotFoundError:
        print(f"Error: The file was not found at {filepath}. Please check the path.", file=sys.stderr)
    except Exception as e:
        print(f"An unexpected error occurred: {e}", file=sys.stderr)

if __name__ == "__main__":
    remove_stations_from_csv_in_batches()


Reading data from: /content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/ubike_consolidated_data.csv
Processed batch 1. Total rows processed so far: 12000000. Rows deleted in this batch: 23351.
Processed batch 2. Total rows processed so far: 24000000. Rows deleted in this batch: 14982.
Processed batch 3. Total rows processed so far: 36000000. Rows deleted in this batch: 28749.
Processed batch 4. Total rows processed so far: 48000000. Rows deleted in this batch: 14387.
Processed batch 5. Total rows processed so far: 60000000. Rows deleted in this batch: 14396.
Processed batch 6. Total rows processed so far: 72000000. Rows deleted in this batch: 11846.
Processed batch 7. Total rows processed so far: 84000000. Rows deleted in this batch: 9521.
Processed batch 8. Total rows processed so far: 88611411. Rows deleted in this batch: 2298.

--- Operation Complete ---
Total rows read: 88611411
Total rows deleted: 119530
Final number of rows: 88491881
Cleaned data saved

In [2]:
import pandas as pd
import os
import sys
from google.colab import drive

print("Connecting to Google Drive...")
drive.mount('/content/drive')

# Define the directory and file path as provided
CLEAN_DATA_DIR = '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/'
file_to_analyze = 'ubike_consolidated_data_cleaned.csv'
filepath = os.path.join(CLEAN_DATA_DIR, file_to_analyze)

# Define the output file path to avoid overwriting the original
output_file = 'ubike_consolidated_data_no_sna.csv'
output_filepath = os.path.join(CLEAN_DATA_DIR, output_file)

def drop_column_from_csv_in_batches():
    """
    Reads a large CSV file in batches, removes a specified column,
    and saves the result to a new file.
    """
    print(f"Reading data from: {filepath}")

    try:
        if not os.path.exists(filepath):
            print(f"Error: The input file was not found at {filepath}. Please check the path.", file=sys.stderr)
            return

        batch_size = 12000000
        chunk_iterator = pd.read_csv(filepath, chunksize=batch_size, iterator=True)

        is_first_chunk = True
        total_rows_processed = 0

        for batch_number, chunk in enumerate(chunk_iterator, 1):
            if is_first_chunk and 'sna' not in chunk.columns:
                print("Error: The 'sna' column was not found in the CSV file.", file=sys.stderr)
                return

            # Drop the 'sna' column from the current chunk
            chunk_without_sna = chunk.drop(columns=['sna'])

            # Write the processed chunk to the new file
            chunk_without_sna.to_csv(
                output_filepath,
                index=False,
                mode='w' if is_first_chunk else 'a',
                header=is_first_chunk
            )

            total_rows_processed += len(chunk)

            print(f"Processed batch {batch_number}. Total rows processed so far: {total_rows_processed}.")

            is_first_chunk = False

        print("\n--- Operation Complete ---")
        print(f"Total rows processed: {total_rows_processed}")
        print(f"Cleaned data saved to: {output_filepath}")

    except FileNotFoundError:
        print(f"Error: The file was not found at {filepath}. Please check the path.", file=sys.stderr)
    except Exception as e:
        print(f"An unexpected error occurred: {e}", file=sys.stderr)

if __name__ == "__main__":
    drop_column_from_csv_in_batches()


Connecting to Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Reading data from: /content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/ubike_consolidated_data_cleaned.csv
Processed batch 1. Total rows processed so far: 12000000.
Processed batch 2. Total rows processed so far: 24000000.
Processed batch 3. Total rows processed so far: 36000000.
Processed batch 4. Total rows processed so far: 48000000.
Processed batch 5. Total rows processed so far: 60000000.
Processed batch 6. Total rows processed so far: 72000000.
Processed batch 7. Total rows processed so far: 84000000.
Processed batch 8. Total rows processed so far: 88491881.

--- Operation Complete ---
Total rows processed: 88491881
Cleaned data saved to: /content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/ubike_consolidated_data_no_sna.csv


In [4]:
import pandas as pd
import os
import sys
from google.colab import drive

print("Connecting to Google Drive...")
drive.mount('/content/drive')

# Define the directory and file path
CLEAN_DATA_DIR = '/content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/'
file_to_view = 'ubike_consolidated_data_no_sna.csv'
filepath = os.path.join(CLEAN_DATA_DIR, file_to_view)

def show_csv_head():
    """
    Loads and displays the first few rows of the specified CSV file.
    """
    print(f"Loading and displaying the head of: {filepath}")

    try:
        if not os.path.exists(filepath):
            print(f"Error: The file was not found at {filepath}. Please check the path.", file=sys.stderr)
            return

        # Read only the first 5 rows to save memory and time
        df_head = pd.read_csv(filepath, nrows=5)

        print("\n--- File Head ---")
        print(df_head)

    except Exception as e:
        print(f"An unexpected error occurred: {e}", file=sys.stderr)

if __name__ == "__main__":
    show_csv_head()


Connecting to Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading and displaying the head of: /content/drive/MyDrive/Youbike_Master_Project/YouBike_Demand_Forecast/data_clean/ubike_consolidated_data_no_sna.csv

--- File Head ---
                  mday        sno  total  available_rent_bikes  \
0  2024-05-04 00:00:00  500101001   28.0                   6.0   
1  2024-05-04 00:10:00  500101001   28.0                   6.0   
2  2024-05-04 00:20:00  500101001   28.0                   3.0   
3  2024-05-04 00:30:00  500101001   28.0                   0.0   
4  2024-05-04 00:40:00  500101001   28.0                   1.0   

   available_return_bikes       lat       lng     sareaen  \
0                    22.0  25.02605  121.5436  Daan Dist.   
1                    22.0  25.02605  121.5436  Daan Dist.   
2                    25.0  25.02605  121.5436  Daan Dist.   
3                    28.0  25