In [3]:
import pandas as pd
import glob
import os

# ‚úÖ Make sure the output folder exists
os.makedirs('D:/MSDS/Network Science/project/green_combined_data_sampled/', exist_ok=True)

# Get all green taxi files (2014‚Äì2024)
file_list = sorted(glob.glob('D:/MSDS/Network Science/project/raw_data/green_tripdata_*.parquet'))

print(f"Found {len(file_list)} green taxi files to process.")

# Group into batches of 4 files (for every 4 months)
batch_size = 4
batches = [file_list[i:i + batch_size] for i in range(0, len(file_list), batch_size)]

for batch_num, batch_files in enumerate(batches, start=1):
    print(f"\nüîÑ Processing Green Taxi batch {batch_num}:")
    batch_dfs = []
    
    for file in batch_files:
        try:
            needed_columns = ['VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime',
                              'PULocationID', 'DOLocationID',
                              'trip_distance', 'fare_amount', 'tip_amount', 'total_amount']
            
            # Read the full file, then subset columns
            df = pd.read_parquet(file)
            available_columns = [col for col in needed_columns if col in df.columns]
            df = df[available_columns]
            
            # Random sample of 10,000 rows
            df_sampled = df.sample(n=10000, random_state=42)
            
            # Drop rows with missing PU/DO locations
            df_cleaned = df_sampled.dropna(subset=['PULocationID', 'DOLocationID'])
            
            batch_dfs.append(df_cleaned)
            print(f" - Sampled and cleaned {file} ({df_cleaned.shape[0]} rows kept)")

        except Exception as e:
            print(f"‚ùó Error reading/cleaning {file}: {e}")
    
    if batch_dfs:
        # Combine the 4 files into one DataFrame
        combined_df = pd.concat(batch_dfs, ignore_index=True)
        
        # Extract the start and end month/year for naming
        start_file = os.path.basename(batch_files[0])
        end_file = os.path.basename(batch_files[-1])
        
        start_period = start_file.replace('green_tripdata_', '').replace('.parquet', '')
        end_period = end_file.replace('green_tripdata_', '').replace('.parquet', '')
        
        # Save combined CSV file
        output_file = f'D:/MSDS/Network Science/project/green_combined_data_sampled/green_tripdata_{start_period}_to_{end_period}_sampled.csv'
        
        combined_df.to_csv(output_file, index=False)
        print(f"üíæ Saved sampled Green Taxi CSV file: {output_file}")
    else:
        print(f"‚ö†Ô∏è No data to save for Green Taxi batch {batch_num}.")



Found 121 green taxi files to process.

üîÑ Processing Green Taxi batch 1:
 - Sampled and cleaned D:/MSDS/Network Science/project/raw_data\green_tripdata_2014-01.parquet (10000 rows kept)
 - Sampled and cleaned D:/MSDS/Network Science/project/raw_data\green_tripdata_2014-02.parquet (10000 rows kept)
 - Sampled and cleaned D:/MSDS/Network Science/project/raw_data\green_tripdata_2014-03.parquet (10000 rows kept)
 - Sampled and cleaned D:/MSDS/Network Science/project/raw_data\green_tripdata_2014-04.parquet (10000 rows kept)
üíæ Saved sampled Green Taxi CSV file: D:/MSDS/Network Science/project/green_combined_data_sampled/green_tripdata_2014-01_to_2014-04_sampled.csv

üîÑ Processing Green Taxi batch 2:
 - Sampled and cleaned D:/MSDS/Network Science/project/raw_data\green_tripdata_2014-05.parquet (10000 rows kept)
 - Sampled and cleaned D:/MSDS/Network Science/project/raw_data\green_tripdata_2014-06.parquet (10000 rows kept)
 - Sampled and cleaned D:/MSDS/Network Science/project/raw_data

In [5]:
import pandas as pd

# Load one Yellow Taxi file (change the path to a file you have)
df = pd.read_parquet('D:/MSDS/Network Science/project/raw_data/yellow_tripdata_2019-01.parquet')

# Show first 5 rows
print(df.head(5))

# Show all columns to see what's available
print("\nColumns in this dataset:")
print(df.columns.tolist())


   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         1  2019-01-01 00:46:40   2019-01-01 00:53:20              1.0   
1         1  2019-01-01 00:59:47   2019-01-01 01:18:59              1.0   
2         2  2018-12-21 13:48:30   2018-12-21 13:52:40              3.0   
3         2  2018-11-28 15:52:25   2018-11-28 15:55:45              5.0   
4         2  2018-11-28 15:56:57   2018-11-28 15:58:33              5.0   

   trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0            1.5         1.0                  N           151           239   
1            2.6         1.0                  N           239           246   
2            0.0         1.0                  N           236           236   
3            0.0         1.0                  N           193           193   
4            0.0         2.0                  N           193           193   

   payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  \


In [7]:
import pandas as pd
import glob
import os

# ‚úÖ Make sure the output folder exists
os.makedirs('D:/MSDS/Network Science/project/yellow_combined_data_sampled/', exist_ok=True)

# Get all yellow taxi files
file_list = sorted(glob.glob('D:/MSDS/Network Science/project/raw_data/yellow_tripdata_*.parquet'))

print(f"Found {len(file_list)} yellow taxi files to process.")

# Group into batches of 4 files (for every 4 months)
batch_size = 4
batches = [file_list[i:i + batch_size] for i in range(0, len(file_list), batch_size)]

for batch_num, batch_files in enumerate(batches, start=1):
    print(f"\nüîÑ Processing Yellow Taxi batch {batch_num}:")
    batch_dfs = []
    
    for file in batch_files:
        try:
            # Extract year from filename
            file_name = os.path.basename(file)
            year_str = file_name.split('_')[2].split('-')[0]
            year = int(year_str)
            
            # Skip files from 2009‚Äì2014
            if year < 2015:
                print(f"‚ö†Ô∏è Skipping {file} because it is from {year} (before 2015).")
                continue
            
            needed_columns = ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
                              'PULocationID', 'DOLocationID',
                              'trip_distance', 'fare_amount', 'tip_amount', 'total_amount']
            
            # Read the full file
            df = pd.read_parquet(file)
            
            # Subset only available needed columns
            available_columns = [col for col in needed_columns if col in df.columns]
            df = df[available_columns]
            
            # Random sample of 10,000 rows
            df_sampled = df.sample(n=10000, random_state=42)
            
            # Drop rows with missing PU/DO locations
            df_cleaned = df_sampled.dropna(subset=['PULocationID', 'DOLocationID'])
            
            batch_dfs.append(df_cleaned)
            print(f" - Sampled and cleaned {file} ({df_cleaned.shape[0]} rows kept)")

        except Exception as e:
            print(f"‚ùó Error reading/cleaning {file}: {e}")
    
    if batch_dfs:
        # Combine the 4 files into one DataFrame
        combined_df = pd.concat(batch_dfs, ignore_index=True)
        
        # Extract the start and end month/year for naming
        start_file = os.path.basename(batch_files[0])
        end_file = os.path.basename(batch_files[-1])
        
        start_period = start_file.replace('yellow_tripdata_', '').replace('.parquet', '')
        end_period = end_file.replace('yellow_tripdata_', '').replace('.parquet', '')
        
        # Save combined CSV file
        output_file = f'D:/MSDS/Network Science/project/yellow_combined_data_sampled/yellow_tripdata_{start_period}_to_{end_period}_sampled.csv'
        
        combined_df.to_csv(output_file, index=False)
        print(f"üíæ Saved sampled Yellow Taxi CSV file: {output_file}")
    else:
        print(f"‚ö†Ô∏è No data to save for Yellow Taxi batch {batch_num}.")



    



Found 181 yellow taxi files to process.

üîÑ Processing Yellow Taxi batch 1:
‚ö†Ô∏è Skipping D:/MSDS/Network Science/project/raw_data\yellow_tripdata_2009-01.parquet because it is from 2009 (before 2015).
‚ö†Ô∏è Skipping D:/MSDS/Network Science/project/raw_data\yellow_tripdata_2009-02.parquet because it is from 2009 (before 2015).
‚ö†Ô∏è Skipping D:/MSDS/Network Science/project/raw_data\yellow_tripdata_2009-03.parquet because it is from 2009 (before 2015).
‚ö†Ô∏è Skipping D:/MSDS/Network Science/project/raw_data\yellow_tripdata_2009-04.parquet because it is from 2009 (before 2015).
‚ö†Ô∏è No data to save for Yellow Taxi batch 1.

üîÑ Processing Yellow Taxi batch 2:
‚ö†Ô∏è Skipping D:/MSDS/Network Science/project/raw_data\yellow_tripdata_2009-05.parquet because it is from 2009 (before 2015).
‚ö†Ô∏è Skipping D:/MSDS/Network Science/project/raw_data\yellow_tripdata_2009-06.parquet because it is from 2009 (before 2015).
‚ö†Ô∏è Skipping D:/MSDS/Network Science/project/raw_data\yellow_trip

In [8]:
import pandas as pd
import glob
import os

# ‚úÖ Make sure the output folder exists
os.makedirs('D:/MSDS/Network Science/project/fhv_combined_data_sampled/', exist_ok=True)

# Get all FHV files
file_list = sorted(glob.glob('D:/MSDS/Network Science/project/raw_data/fhv_tripdata_*.parquet'))

print(f"Found {len(file_list)} FHV files to process.")

# Group into batches of 4 files (for every 4 months)
batch_size = 4
batches = [file_list[i:i + batch_size] for i in range(0, len(file_list), batch_size)]

for batch_num, batch_files in enumerate(batches, start=1):
    print(f"\nüîÑ Processing FHV batch {batch_num}:")
    batch_dfs = []
    
    for file in batch_files:
        try:
            # Extract year from filename
            file_name = os.path.basename(file)
            year_str = file_name.split('_')[2].split('-')[0]
            year = int(year_str)
            
            # Skip files from 2009‚Äì2014
            if year < 2015:
                print(f"‚ö†Ô∏è Skipping {file} because it is from {year} (before 2015).")
                continue
            
            needed_columns = ['dispatching_base_num', 'pickup_datetime', 'dropOff_datetime',
                              'PUlocationID', 'DOlocationID']
            
            # Read the full file
            df = pd.read_parquet(file)
            
            # Skip if PU/DO are not in the data
            if 'PUlocationID' not in df.columns or 'DOlocationID' not in df.columns:
                print(f"‚ö†Ô∏è Skipping {file} because it lacks PU/DO location IDs.")
                continue
            
            # Subset only needed columns
            available_columns = [col for col in needed_columns if col in df.columns]
            df = df[available_columns]
            
            # Random sample of 10,000 rows
            df_sampled = df.sample(n=10000, random_state=42)
            
            # Drop rows with missing PU/DO locations
            df_cleaned = df_sampled.dropna(subset=['PUlocationID', 'DOlocationID'])
            
            batch_dfs.append(df_cleaned)
            print(f" - Sampled and cleaned {file} ({df_cleaned.shape[0]} rows kept)")

        except Exception as e:
            print(f"‚ùó Error reading/cleaning {file}: {e}")
    
    if batch_dfs:
        # Combine the 4 files into one DataFrame
        combined_df = pd.concat(batch_dfs, ignore_index=True)
        
        # Extract the start and end month/year for naming
        start_file = os.path.basename(batch_files[0])
        end_file = os.path.basename(batch_files[-1])
        
        start_period = start_file.replace('fhv_tripdata_', '').replace('.parquet', '')
        end_period = end_file.replace('fhv_tripdata_', '').replace('.parquet', '')
        
        # Save combined CSV file
        output_file = f'D:/MSDS/Network Science/project/fhv_combined_data_sampled/fhv_tripdata_{start_period}_to_{end_period}_sampled.csv'
        
        combined_df.to_csv(output_file, index=False)
        print(f"üíæ Saved sampled FHV CSV file: {output_file}")
    else:
        print(f"‚ö†Ô∏è No data to save for FHV batch {batch_num}.")


Found 109 FHV files to process.

üîÑ Processing FHV batch 1:
 - Sampled and cleaned D:/MSDS/Network Science/project/raw_data\fhv_tripdata_2015-01.parquet (11 rows kept)
 - Sampled and cleaned D:/MSDS/Network Science/project/raw_data\fhv_tripdata_2015-02.parquet (106 rows kept)
 - Sampled and cleaned D:/MSDS/Network Science/project/raw_data\fhv_tripdata_2015-03.parquet (131 rows kept)
 - Sampled and cleaned D:/MSDS/Network Science/project/raw_data\fhv_tripdata_2015-04.parquet (471 rows kept)
üíæ Saved sampled FHV CSV file: D:/MSDS/Network Science/project/fhv_combined_data_sampled/fhv_tripdata_2015-01_to_2015-04_sampled.csv

üîÑ Processing FHV batch 2:
 - Sampled and cleaned D:/MSDS/Network Science/project/raw_data\fhv_tripdata_2015-05.parquet (324 rows kept)
 - Sampled and cleaned D:/MSDS/Network Science/project/raw_data\fhv_tripdata_2015-06.parquet (28 rows kept)
 - Sampled and cleaned D:/MSDS/Network Science/project/raw_data\fhv_tripdata_2015-07.parquet (19 rows kept)
 - Sampled an