In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
def stratified_sample_dataframe(df, size, col1, col2):
    """
    Draw a reproducible stratified sample of ``size`` rows from ``df``.

    Rows are stratified jointly on ``col1`` and ``col2``: the pair of values
    in those two columns is used as the class label handed to
    ``train_test_split``. The labels are built as a ``MultiIndex`` so no
    temporary column is added to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from. It is not modified.
    size : int
        Number of rows wanted. A fractional value is rounded up.
    col1, col2 : str
        Names of the two columns whose combined values define the strata.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself (the same object, not a copy) when ``size >= len(df)``;
        otherwise the "test" part of the split, holding ``size`` rows.

    Raises
    ------
    ValueError
        Propagated from ``train_test_split`` when it cannot build the
        requested stratified split.
    """
    if size >= len(df):
        return df

    import math

    stratify_on = pd.MultiIndex.from_frame(df[[col1, col2]])

    # Pass an absolute row count rather than size/len(df): scikit-learn turns
    # a float fraction back into a count with ceil(), so round-off in the
    # division could yield size + 1 rows.
    n_rows = math.ceil(size)

    # random_state is fixed so repeated runs select the same rows.
    _, stratified_df = train_test_split(
        df,
        test_size=n_rows,
        stratify=stratify_on,
        random_state=42,
    )

    return stratified_df


# Build 'Price_waiting_processed.csv': a stratified sample of every input
# parquet file, concatenated into a single frame.

parquet_file_paths = ['01-2021.parquet', '02-2021.parquet', '03-2021.parquet', '04-2021.parquet', '05-2021.parquet'] # List of parquet files to process

SAMPLE_ROWS_PER_FILE = 1_000_000 # upper bound on the rows kept from each file

sampled_dfs = [] # one sampled dataframe per input file

for path in parquet_file_paths:
    df = pd.read_parquet(path)

    # The .dt accessor needs a datetime column. Parse into a local series so
    # the stored 'request_datetime' column is left exactly as it was read;
    # this is a no-op when the column is already datetime.
    request_ts = pd.to_datetime(df['request_datetime'])
    df['month'] = request_ts.dt.month
    df['day_of_month'] = request_ts.dt.day

    # Stratify on (license number, day of month) pairs.
    sampled_df = stratified_sample_dataframe(df, SAMPLE_ROWS_PER_FILE, 'hvfhs_license_num', 'day_of_month')

    sampled_dfs.append(sampled_df)


df_all = pd.concat(sampled_dfs, ignore_index=True) # Concatenate all sampled dataframes

df_all.to_csv('Price_waiting_processed.csv', index=False)

print("Processed data saved to 'Price_waiting_processed.csv'")

In [2]:
# Build 'Demand_forecast.csv': number of requests per hour bucket
# ('YYYY-MM-DD-HH') across all input parquet files.

file_names = ['01-2021.parquet', '02-2021.parquet', '03-2021.parquet', '04-2021.parquet', '05-2021.parquet'] # List of file names

# Per-file hourly counts are collected here and concatenated once after the
# loop; growing a DataFrame with pd.concat on every iteration re-copies all
# previously accumulated rows each time.
hourly_count_frames = []

for file_name in file_names:
    # NOTE(review): only 'request_datetime' is used in this cell; reading just
    # that column would cut I/O if no other cell relies on `df` afterwards.
    df = pd.read_parquet(file_name)

    df['request_datetime'] = pd.to_datetime(df['request_datetime']) # Convert 'request_datetime' to datetime
    df['date_request_h'] = df['request_datetime'].dt.strftime('%Y-%m-%d-%H') # Create a new column with year-month-day-hour

    # One row per hour bucket with the number of rows that fall in it.
    number_fares_df = df.groupby('date_request_h').size().reset_index(name='Number_fares')

    hourly_count_frames.append(number_fares_df)

all_number_fares_df = pd.concat(hourly_count_frames, ignore_index=True)

# Sum the counts of any hour bucket that shows up in more than one file.
all_number_fares_df = all_number_fares_df.groupby('date_request_h').sum().reset_index()

all_number_fares_df.to_csv('Demand_forecast.csv', index=False) # Save the DataFrame to a CSV file

all_number_fares_df.head()