In [3]:
import pandas as pd
import os

# Input location: the folder holding the five unzipped 'gtfs_...' directories,
# two levels above this notebook under data/raw.
DATA_DIR = os.path.join(os.pardir, os.pardir, "data", "raw")

# Output location: data/processed/gtfs_combined, a sibling branch of the raw
# folder under the same data root.
COMBINED_DIR = os.path.join(os.path.dirname(DATA_DIR), "processed", "gtfs_combined")

# Ensure the output folder is present; a no-op when it already exists.
os.makedirs(COMBINED_DIR, exist_ok=True)

In [4]:
import pandas as pd
import os

# Combine the per-borough GTFS feeds into one set of files.
#
# For each GTFS file name, read that file from every 'gtfs_<borough>' folder
# under DATA_DIR, stack the rows, remove duplicates, and write the result to
# COMBINED_DIR under the same file name.

# Borough abbreviations used as suffixes of the 'gtfs_<abbr>' folder names
boroughs = ['b', 'bx', 'm', 'q', 'si']

# The GTFS files to combine across the borough feeds
files_to_combine = [
    'stops.txt',
    'routes.txt',
    'stop_times.txt',
    'trips.txt',
    'calendar.txt',
    'calendar_dates.txt'
]

# Files with a single identifying column are deduplicated on that key.
# Any file not listed here is deduplicated on fully identical rows.
dedup_keys = {
    'stops.txt': ['stop_id'],
    'routes.txt': ['route_id'],
}

print("Starting the file combination and deduplication process...\n")

for filename in files_to_combine:

    list_of_dfs = []        # One DataFrame per borough for the current file
    missing_boroughs = []   # Boroughs whose folder lacks the current file

    for borough in boroughs:
        file_path = os.path.join(DATA_DIR, f"gtfs_{borough}", filename)

        if not os.path.exists(file_path):
            missing_boroughs.append(borough)
            continue

        # Read every column as text and disable NA parsing so values are
        # written back exactly as they were read. With type inference, integer
        # columns containing blanks become floats ("1" -> "1.0"), numeric-looking
        # IDs lose leading zeros, and the same ID column can be inferred as int
        # in one borough and str in another, which would stop drop_duplicates
        # from matching the same ID across feeds.
        df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
        list_of_dfs.append(df)

    # Surface incomplete inputs instead of silently producing a partial file
    if missing_boroughs and list_of_dfs:
        print(f"Warning: '{filename}' is missing for borough(s): {', '.join(missing_boroughs)}")

    if list_of_dfs:
        combined_df = pd.concat(list_of_dfs, ignore_index=True)

        before_count = len(combined_df)

        # subset=None (file has no entry in dedup_keys) compares entire rows;
        # the first occurrence of each duplicate is the one kept.
        combined_df = combined_df.drop_duplicates(subset=dedup_keys.get(filename))

        after_count = len(combined_df)
        duplicates_removed = before_count - after_count

        print(f"Combined '{filename}': {after_count} rows ({duplicates_removed} duplicates removed)")

        # Save the new combined file
        output_path = os.path.join(COMBINED_DIR, filename)
        combined_df.to_csv(output_path, index=False)
    else:
        print(f"No files found for '{filename}'")

print("\n✅ Combination and deduplication complete!")

Starting the file combination and deduplication process...

Combined 'stops.txt': 11523 rows (175 duplicates removed)
Combined 'routes.txt': 288 rows (1152 duplicates removed)
Combined 'stop_times.txt': 6374563 rows (0 duplicates removed)
Combined 'trips.txt': 185050 rows (0 duplicates removed)
Combined 'calendar.txt': 104 rows (0 duplicates removed)
Combined 'calendar_dates.txt': 2284 rows (0 duplicates removed)

✅ Combination and deduplication complete!
