In [1]:
import pandas as pd

# Load the cleaned taxi trip records into a DataFrame.
# NOTE(review): the path is absolute and specific to this Codespaces workspace;
# adjust it when running the notebook elsewhere.
taxi_data = pd.read_csv('/workspaces/codespaces-jupyter/data/final_cleaned_taxi_data.csv')



In [2]:
taxi_data.columns

Index(['trip_start_timestamp', 'trip_end_timestamp', 'trip_duration_seconds',
       'trip_distance_miles', 'pickup_community_area',
       'dropoff_community_area', 'pickup_latitude', 'pickup_longitude',
       'dropoff_latitude', 'dropoff_longitude'],
      dtype='object')

In [2]:
# Remove the 'Unnamed: 0' column (presumably an index saved into the CSV).
# Reassigning with errors='ignore' keeps the cell re-runnable: once the column
# is gone, the in-place drop without it would raise KeyError.
taxi_data = taxi_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [3]:
import pandas as pd

def aggregate_taxi_data(taxi_data, months, focus_areas):
    """
    Filters taxi trips by month and community area and averages them per quarter-hour slot.

    The start timestamp of every kept trip is floored to its 15-minute slot, then
    trips sharing the same slot, end timestamp, community areas and coordinates are
    collapsed into one row holding their mean distance and mean duration.

    Args:
    taxi_data (pd.DataFrame): The taxi trip data. It is read only; the caller's
        frame is not modified.
    months (list): List of integers representing the months to include in the analysis.
    focus_areas (list): List of integers representing the community areas to focus on.
        A trip is kept only if both its pickup and its dropoff area are in this list.

    Returns:
    pd.DataFrame: One row per (slot start, end timestamp, pickup/dropoff area,
        pickup/dropoff coordinates) with the mean 'trip_distance_miles' and
        'trip_duration_seconds'.
    """
    # Parse into a local Series instead of writing back into `taxi_data`, so the
    # caller's column keeps its original dtype.
    start_times = pd.to_datetime(taxi_data['trip_start_timestamp'])

    in_months = start_times.dt.month.isin(months)
    in_focus_areas = (
        taxi_data['pickup_community_area'].isin(focus_areas)
        & taxi_data['dropoff_community_area'].isin(focus_areas)
    )
    keep = in_months & in_focus_areas

    # `.assign` builds a new frame, which avoids SettingWithCopyWarning on the
    # filtered selection. Flooring to '15min' equals "floor to the hour, then add
    # (minute // 15) * 15 minutes" without relying on the deprecated 'H' alias.
    data_filtered = taxi_data.loc[keep].assign(
        trip_start_timestamp=start_times[keep].dt.floor('15min')
    )

    group_keys = [
        'trip_start_timestamp',
        'trip_end_timestamp',
        'pickup_community_area',
        'dropoff_community_area',
        'pickup_latitude',
        'pickup_longitude',
        'dropoff_latitude',
        'dropoff_longitude',
    ]
    aggregated_data = (
        data_filtered
        .groupby(group_keys)
        .agg({
            'trip_distance_miles': 'mean',
            'trip_duration_seconds': 'mean',
        })
        .reset_index()
    )

    return aggregated_data



## Updated Travel Time Matrix

In [10]:
taxi_data.columns

Index(['trip_start_timestamp', 'trip_end_timestamp', 'trip_duration_seconds',
       'trip_distance_miles', 'pickup_community_area',
       'dropoff_community_area', 'pickup_latitude', 'pickup_longitude',
       'dropoff_latitude', 'dropoff_longitude'],
      dtype='object')

In [100]:
# Restrict the analysis to December (month 12) only.
months = [12] 
# Keep only trips whose pickup AND dropoff community areas are both in this list.
# NOTE(review): 28 appears twice; harmless for the `isin` filter, but probably a typo.
focus_areas = [6,7, 8, 32, 21,22,23,24,26,28, 27,28] 
# NOTE(review): despite the `nov_` prefix this frame holds December records
# (see `months` above); the name is kept because later cells reference it.
nov_records = aggregate_taxi_data(taxi_data, months, focus_areas)

  data_filtered['trip_start_timestamp'] = data_filtered['trip_start_timestamp'].dt.floor('H') + \


In [5]:
nov_records.head(20)

Unnamed: 0,trip_start_timestamp,trip_end_timestamp,pickup_community_area,dropoff_community_area,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,trip_distance_miles,trip_duration_seconds
0,2023-12-01 00:00:00,2023-12-01 00:00:00,28.0,8.0,41.874005,-87.663518,41.899602,-87.633308,2.0,360.0
1,2023-12-01 00:00:00,2023-12-01 00:00:00,28.0,32.0,41.874005,-87.663518,41.878866,-87.625192,2.427921,437.025806
2,2023-12-01 00:00:00,2023-12-01 00:15:00,8.0,7.0,41.899602,-87.633308,41.922686,-87.649489,3.06,550.8
3,2023-12-01 00:00:00,2023-12-01 00:15:00,8.0,28.0,41.899602,-87.633308,41.874005,-87.663518,3.35,603.0
4,2023-12-01 00:00:00,2023-12-01 00:15:00,8.0,32.0,41.899602,-87.633308,41.878866,-87.625192,1.41,253.8
5,2023-12-01 00:00:00,2023-12-01 00:15:00,28.0,7.0,41.874005,-87.663518,41.922686,-87.649489,3.54,637.2
6,2023-12-01 00:00:00,2023-12-01 00:15:00,32.0,8.0,41.878866,-87.625192,41.899602,-87.633308,1.55,279.0
7,2023-12-01 00:15:00,2023-12-01 00:15:00,28.0,8.0,41.874005,-87.663518,41.899602,-87.633308,1.64,295.2
8,2023-12-01 00:15:00,2023-12-01 00:15:00,28.0,32.0,41.874005,-87.663518,41.878866,-87.625192,1.518677,273.361925
9,2023-12-01 00:15:00,2023-12-01 00:30:00,8.0,7.0,41.899602,-87.633308,41.922686,-87.649489,2.32,417.6


In [101]:
df= nov_records.copy()

In [102]:
import pandas as pd

# `df` is expected to hold the aggregated records (one row per slot and location pair).
# Make sure both timestamp columns are real datetimes before filtering on them.
df['trip_start_timestamp'] = pd.to_datetime(df['trip_start_timestamp'])
df['trip_end_timestamp'] = pd.to_datetime(df['trip_end_timestamp'])

# Keep the records that start on 1 December 2023 (the upper bound is exclusive).
# `.copy()` detaches the selection from `df`, so adding the `hour` column below
# no longer triggers pandas' SettingWithCopyWarning.
filtered_df = df[(df['trip_start_timestamp'] >= '2023-12-01') & (df['trip_start_timestamp'] < '2023-12-02')].copy()

# Extract the hour from the trip_start_timestamp
filtered_df['hour'] = filtered_df['trip_start_timestamp'].dt.hour

# Records starting between 8 AM (inclusive) and 2 PM (exclusive).
# NOTE(review): `time_filtered_df` is not used by the statements below - the pair
# counts are computed on the whole day in `filtered_df`. Confirm which one is intended
# before switching, because it changes which pairs end up in the top 30.
time_filtered_df = filtered_df[(filtered_df['hour'] >= 8) & (filtered_df['hour'] < 14)]

# Count how many records exist for each pickup/dropoff coordinate pair
location_pairs = filtered_df.groupby(['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']).size().reset_index(name='trip_count')

# Keep the 30 pairs with the highest counts
top_location_pairs = location_pairs.sort_values(by='trip_count', ascending=False).head(30)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['hour'] = filtered_df['trip_start_timestamp'].dt.hour


In [103]:
top_location_pairs.shape

(30, 5)

In [104]:
top_location_pairs

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,trip_count
533,41.899602,-87.633308,41.922686,-87.649489,113
529,41.899602,-87.633308,41.878866,-87.625192,101
536,41.899602,-87.633308,41.944227,-87.655998,98
50,41.874005,-87.663518,41.899602,-87.633308,95
704,41.922686,-87.649489,41.899602,-87.633308,87
92,41.878866,-87.625192,41.899602,-87.633308,84
195,41.884987,-87.620993,41.880994,-87.632746,83
527,41.899602,-87.633308,41.874005,-87.663518,79
114,41.879255,-87.642649,41.880994,-87.632746,77
765,41.944227,-87.655998,41.899602,-87.633308,77


In [105]:
# Inner-join `df` against the top pairs on the four coordinate columns, which
# keeps only the records whose pickup/dropoff pair is one of the top pairs.
pair_columns = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
top_locations_df = df.merge(top_location_pairs[pair_columns], on=pair_columns)

# Date window: trips starting on 1 or 2 December 2023 (upper bound exclusive).
start_times = top_locations_df['trip_start_timestamp']
in_date_window = (start_times >= '2023-12-01') & (start_times < '2023-12-03')
final_filtered_df = top_locations_df[in_date_window]

# Time-of-day window: trips starting from 8 AM up to, but not including, 2 PM.
start_hours = final_filtered_df['trip_start_timestamp'].dt.hour
final_filtered_df = final_filtered_df[(start_hours >= 8) & (start_hours < 14)]



In [106]:
final_filtered_df.sort_values(by='trip_start_timestamp', ascending=True)

Unnamed: 0,trip_start_timestamp,trip_end_timestamp,pickup_community_area,dropoff_community_area,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,trip_distance_miles,trip_duration_seconds
230,2023-12-01 08:00:00,2023-12-01 08:00:00,8.0,28.0,41.899602,-87.633308,41.874005,-87.663518,2.000000,360.000000
255,2023-12-01 08:00:00,2023-12-01 08:30:00,28.0,24.0,41.874005,-87.663518,41.901207,-87.676356,2.228654,401.157746
254,2023-12-01 08:00:00,2023-12-01 08:30:00,6.0,8.0,41.944227,-87.655998,41.899602,-87.633308,5.190000,934.200000
253,2023-12-01 08:00:00,2023-12-01 08:15:00,32.0,32.0,41.884987,-87.620993,41.880994,-87.632746,1.035748,186.434580
252,2023-12-01 08:00:00,2023-12-01 08:15:00,32.0,28.0,41.884987,-87.620993,41.879255,-87.642649,1.577938,284.028785
...,...,...,...,...,...,...,...,...,...,...
2576,2023-12-02 13:45:00,2023-12-02 13:45:00,8.0,32.0,41.892508,-87.626215,41.884987,-87.620993,0.804041,144.727463
2575,2023-12-02 13:45:00,2023-12-02 13:45:00,8.0,24.0,41.899602,-87.633308,41.901207,-87.676356,2.200000,396.000000
2595,2023-12-02 13:45:00,2023-12-02 14:15:00,32.0,6.0,41.878866,-87.625192,41.944227,-87.655998,5.750000,1035.000000
2584,2023-12-02 13:45:00,2023-12-02 14:00:00,7.0,8.0,41.922686,-87.649489,41.899602,-87.633308,1.700000,306.000000


In [107]:
final_filtered_df.head(30)

Unnamed: 0,trip_start_timestamp,trip_end_timestamp,pickup_community_area,dropoff_community_area,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,trip_distance_miles,trip_duration_seconds
230,2023-12-01 08:00:00,2023-12-01 08:00:00,8.0,28.0,41.899602,-87.633308,41.874005,-87.663518,2.0,360.0
231,2023-12-01 08:00:00,2023-12-01 08:00:00,8.0,32.0,41.892508,-87.626215,41.880994,-87.632746,1.151043,207.187708
232,2023-12-01 08:00:00,2023-12-01 08:00:00,8.0,32.0,41.899602,-87.633308,41.878866,-87.625192,1.16,208.8
233,2023-12-01 08:00:00,2023-12-01 08:00:00,28.0,24.0,41.874005,-87.663518,41.901207,-87.676356,1.34,241.2
234,2023-12-01 08:00:00,2023-12-01 08:00:00,28.0,32.0,41.879255,-87.642649,41.877406,-87.621972,1.256561,226.181052
235,2023-12-01 08:00:00,2023-12-01 08:00:00,28.0,32.0,41.879255,-87.642649,41.880994,-87.632746,0.660695,118.925045
236,2023-12-01 08:00:00,2023-12-01 08:00:00,28.0,32.0,41.879255,-87.642649,41.884987,-87.620993,1.26,226.8
237,2023-12-01 08:00:00,2023-12-01 08:00:00,28.0,32.0,41.8853,-87.642808,41.880994,-87.632746,0.846467,152.364053
238,2023-12-01 08:00:00,2023-12-01 08:00:00,32.0,8.0,41.880994,-87.632746,41.892508,-87.626215,1.2,216.0
239,2023-12-01 08:00:00,2023-12-01 08:00:00,32.0,8.0,41.884987,-87.620993,41.892508,-87.626215,0.804041,144.727463


In [108]:
final_filtered_df.to_csv('/workspaces/codespaces-jupyter/data/test/filtered_sample_2.csv', index=False)

In [109]:
top_location_pairs.to_csv('/workspaces/codespaces-jupyter/data/test/filtered_sample_2_locations.csv', index=False)

In [110]:
# Remove the columns that are not needed for feature preparation.
# Reassigning (instead of `inplace=True`) together with errors='ignore' keeps the
# cell re-runnable: a second in-place drop would raise KeyError.
final_filtered_df = final_filtered_df.drop(
    columns=['trip_end_timestamp', 'pickup_community_area', 'dropoff_community_area'],
    errors='ignore',
)

In [111]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

def prepare_dataset(df, day_period=6, quarter_period=15):
    """
    Adds cyclical and binary time features derived from 'trip_start_timestamp'.

    The input frame is left untouched: a copy with the extra columns is returned.
    No columns are dropped.

    Parameters:
    - df: DataFrame with a 'trip_start_timestamp' column (datetimes or parseable strings).
    - day_period: Divisor of the day-of-week angle. The default 6 reproduces the
      original encoding. Note that dayofweek takes 7 values (0-6), so with 6 Monday
      and Sunday land on the same angle; 7 gives every day a distinct angle.
    - quarter_period: Divisor of the minute angle. The default 15 reproduces the
      original encoding. Note that for timestamps on quarter-hour boundaries
      (minute 0/15/30/45) this yields whole turns, i.e. quarter_cos is 1 and
      quarter_sin is ~0 for every row; 60 spreads the quarters around the circle.

    NOTE(review): change the two defaults only together with the scaler/model -
    they were presumably fitted on features built with these values (TODO confirm).

    Returns:
    - A new DataFrame with the original columns followed by hour_cos, hour_sin,
      day_cos, day_sin, quarter_cos, quarter_sin, is_weekend, is_morning_rush and
      is_evening_rush (in that order).
    """
    # Work on a copy so the caller's frame is not silently extended.
    prepared = df.copy()
    prepared['trip_start_timestamp'] = pd.to_datetime(prepared['trip_start_timestamp'])

    start = prepared['trip_start_timestamp'].dt
    hour, weekday, minute = start.hour, start.dayofweek, start.minute

    # Hour of the day on a 24-hour circle
    prepared['hour_cos'] = np.cos(2 * np.pi * hour / 24)
    prepared['hour_sin'] = np.sin(2 * np.pi * hour / 24)

    # Day of the week (see `day_period` in the docstring)
    prepared['day_cos'] = np.cos(2 * np.pi * weekday / day_period)
    prepared['day_sin'] = np.sin(2 * np.pi * weekday / day_period)

    # Position within the hour (see `quarter_period` in the docstring)
    prepared['quarter_cos'] = np.cos(2 * np.pi * minute / quarter_period)
    prepared['quarter_sin'] = np.sin(2 * np.pi * minute / quarter_period)

    # Binary flags stored as 0/1 integers: Saturday/Sunday, 6-9 AM, 4-7 PM
    prepared['is_weekend'] = (weekday >= 5).astype(int)
    prepared['is_morning_rush'] = ((hour >= 6) & (hour < 9)).astype(int)
    prepared['is_evening_rush'] = ((hour >= 16) & (hour < 19)).astype(int)

    return prepared




In [112]:
taxi_trips=final_filtered_df.copy()

In [72]:
# import pandas as pd
# taxi_trips=pd.read_csv("/workspaces/codespaces-jupyter/data/test/generated_trips.csv")

In [73]:
# # Rename columns

# taxi_trips = taxi_trips.rename(columns={
#     'pickup_latitude': 'pickup_latitude',
#     'pickup_longitude': 'pickup_longitude',
#     'dropoff_latitude': 'dropoff_latitude',
#     'dropoff_longitude': 'dropoff_longitude',
#     'trip_start_timestamp': 'trip_start_timestamp',
#     'trip_distance_miles': 'trip_distance_miles'
# })

# # Reorder columns
# taxi_trips = taxi_trips[['trip_start_timestamp', 'pickup_latitude', 'pickup_longitude',
#                          'dropoff_latitude', 'dropoff_longitude', 'trip_distance_miles']]



In [113]:
taxi_trips.head(10)

Unnamed: 0,trip_start_timestamp,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,trip_distance_miles,trip_duration_seconds
230,2023-12-01 08:00:00,41.899602,-87.633308,41.874005,-87.663518,2.0,360.0
231,2023-12-01 08:00:00,41.892508,-87.626215,41.880994,-87.632746,1.151043,207.187708
232,2023-12-01 08:00:00,41.899602,-87.633308,41.878866,-87.625192,1.16,208.8
233,2023-12-01 08:00:00,41.874005,-87.663518,41.901207,-87.676356,1.34,241.2
234,2023-12-01 08:00:00,41.879255,-87.642649,41.877406,-87.621972,1.256561,226.181052
235,2023-12-01 08:00:00,41.879255,-87.642649,41.880994,-87.632746,0.660695,118.925045
236,2023-12-01 08:00:00,41.879255,-87.642649,41.884987,-87.620993,1.26,226.8
237,2023-12-01 08:00:00,41.8853,-87.642808,41.880994,-87.632746,0.846467,152.364053
238,2023-12-01 08:00:00,41.880994,-87.632746,41.892508,-87.626215,1.2,216.0
239,2023-12-01 08:00:00,41.884987,-87.620993,41.892508,-87.626215,0.804041,144.727463


In [115]:
taxi_trips.shape

(1066, 7)

In [119]:
# Duration is the quantity being predicted, so it must not be among the input columns.
# Reassigning with errors='ignore' keeps the cell re-runnable (a repeated in-place
# drop would raise KeyError).
taxi_trips = taxi_trips.drop(columns=['trip_duration_seconds'], errors='ignore')

In [75]:
# # Ensure the column is in datetime format
# taxi_trips['trip_start_timestamp'] = pd.to_datetime(taxi_trips['trip_start_timestamp'])

# # Define the new date
# new_date = pd.Timestamp('2023-12-02')

# # Function to change the date part of the timestamp
# def change_date(timestamp, new_date):
#     return timestamp.replace(year=new_date.year, month=new_date.month, day=new_date.day)

# # Apply the function to the trip_start_timestamp column
# taxi_trips['trip_start_timestamp'] = taxi_trips['trip_start_timestamp'].apply(lambda x: change_date(x, new_date))


In [120]:
prepared_df = prepare_dataset(taxi_trips)

In [121]:
prepared_df.head()

Unnamed: 0,trip_start_timestamp,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,trip_distance_miles,hour_cos,hour_sin,day_cos,day_sin,quarter_cos,quarter_sin,is_weekend,is_morning_rush,is_evening_rush
230,2023-12-01 08:00:00,41.899602,-87.633308,41.874005,-87.663518,2.0,-0.5,0.866025,-0.5,-0.866025,1.0,0.0,0,1,0
231,2023-12-01 08:00:00,41.892508,-87.626215,41.880994,-87.632746,1.151043,-0.5,0.866025,-0.5,-0.866025,1.0,0.0,0,1,0
232,2023-12-01 08:00:00,41.899602,-87.633308,41.878866,-87.625192,1.16,-0.5,0.866025,-0.5,-0.866025,1.0,0.0,0,1,0
233,2023-12-01 08:00:00,41.874005,-87.663518,41.901207,-87.676356,1.34,-0.5,0.866025,-0.5,-0.866025,1.0,0.0,0,1,0
234,2023-12-01 08:00:00,41.879255,-87.642649,41.877406,-87.621972,1.256561,-0.5,0.866025,-0.5,-0.866025,1.0,0.0,0,1,0


**Create a separate class specifically designed for preparing data for predictions.**

This will make the code easier to manage, especially when working with different data flows in a production environment.

In [122]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

class PredictionDataPreparer:
    """Scales a prepared feature frame and slices it into fixed-width, overlapping windows."""

    def __init__(self, feature_scaler, target_scaler, input_width):
        """
        Initializes the PredictionDataPreparer class with given scalers and input width.

        Args:
        feature_scaler (StandardScaler): The scaler used to normalize features, already fitted to the training data.
        target_scaler (StandardScaler): The scaler used to normalize the target variable, already fitted to the training data.
            It is only stored; `prepare_data` does not use it.
        input_width (int): The number of timesteps each input sequence should contain.
        """
        self.feature_scaler = feature_scaler
        self.target_scaler = target_scaler
        self.input_width = input_width

    def prepare_data(self, data):
        """
        Scales the feature columns and cuts the rows into sliding windows for the model.

        Every column except 'trip_start_timestamp' is treated as a feature, in the
        frame's column order. Windows start at every row (stride 1), so consecutive
        windows overlap by `input_width - 1` rows. `data` itself is not modified,
        which makes repeated calls on the same frame return the same result.

        Args:
        data (pd.DataFrame): The preprocessed data to be prepared for prediction.

        Returns:
        tuple: `(sequence_data, sequence_timestamps)` where
            sequence_data is a float32 array of shape (n_windows, input_width, n_features) and
            sequence_timestamps is an array of shape (n_windows, input_width) holding the
            'trip_start_timestamp' values of the rows in each window.
            n_windows is `len(data) - input_width + 1`; when `data` has fewer rows than
            `input_width` both arrays are empty but keep their trailing dimensions.
        """
        timestamps = data['trip_start_timestamp'].values
        feature_columns = [col for col in data.columns if col not in ['trip_start_timestamp']]

        # Scale into a separate array instead of writing back into `data`: the
        # caller's frame keeps its original values and a second call cannot
        # scale already-scaled numbers.
        scaled = np.asarray(self.feature_scaler.transform(data[feature_columns]), dtype=np.float32)

        n_windows = len(data) - self.input_width + 1
        if n_windows <= 0:
            # Too few rows for a single window: return correctly shaped empties so
            # callers can still unpack `.shape` or pass the array to the model.
            empty_data = np.empty((0, self.input_width, len(feature_columns)), dtype=np.float32)
            empty_timestamps = np.empty((0, self.input_width), dtype=timestamps.dtype)
            return empty_data, empty_timestamps

        window_starts = range(n_windows)
        sequence_data = np.stack([scaled[start:start + self.input_width] for start in window_starts])
        sequence_timestamps = np.stack([timestamps[start:start + self.input_width] for start in window_starts])

        return sequence_data, sequence_timestamps


In [123]:
from joblib import load

# Load the feature and target scalers stored in the `best_model` directory.
# NOTE(review): joblib files are pickle-based, so loading one executes whatever the
# file contains - only load artifacts that come from a trusted source.
feature_scaler = load('/workspaces/codespaces-jupyter/notebooks/best_model/feature_scaler_FV_2.pkl')
target_scaler = load('/workspaces/codespaces-jupyter/notebooks/best_model/target_scaler_FV_2.pkl')


In [124]:
preparer = PredictionDataPreparer(feature_scaler,target_scaler,input_width=8)
prepared_data, prepared_timestamps = preparer.prepare_data(prepared_df)

**Load the model and scalers**

In [125]:
from tensorflow.keras.models import load_model
from tensorflow.keras.losses import MeanSquaredError

# Load the model and explicitly define 'mse' if it's unrecognized
model = load_model('/workspaces/codespaces-jupyter/notebooks/best_model/Improved_LSTM_model_11_M.h5', custom_objects={'mse': MeanSquaredError()})





In [126]:
predictions = model.predict(prepared_data)

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step


In [127]:
predictions

array([[[-0.07101591],
        [-0.5668808 ],
        [-0.5627879 ],
        ...,
        [-0.85888916],
        [-0.50566036],
        [-0.74837655]],

       [[-0.5685528 ],
        [-0.5626503 ],
        [-0.4660332 ],
        ...,
        [-0.5057034 ],
        [-0.74711055],
        [-0.54323506]],

       [[-0.5619605 ],
        [-0.46542555],
        [-0.50787336],
        ...,
        [-0.7487095 ],
        [-0.541832  ],
        [-0.77353716]],

       ...,

       [[-0.3165704 ],
        [-0.5728974 ],
        [-0.7730388 ],
        ...,
        [-0.73147887],
        [ 2.2184715 ],
        [ 1.7150903 ]],

       [[-0.5710321 ],
        [-0.7729521 ],
        [ 0.7317021 ],
        ...,
        [ 2.2169712 ],
        [ 1.7009366 ],
        [ 2.1264825 ]],

       [[-0.7695963 ],
        [ 0.7339745 ],
        [-0.55031645],
        ...,
        [ 1.699424  ],
        [ 2.1230152 ],
        [ 1.6744184 ]]], dtype=float32)

In [128]:
# Collapse the first two axes so the scaler receives a 2-D array with one
# row per predicted timestep.
n_outputs = predictions.shape[2]
predictions_reshaped = predictions.reshape(-1, n_outputs)

# Undo the target scaling, then restore the original 3-D layout of `predictions`.
original_scale_predictions = target_scaler.inverse_transform(predictions_reshaped)
original_scale_predictions = original_scale_predictions.reshape(predictions.shape)


In [129]:
original_scale_predictions

array([[[ 359.689  ],
        [ 209.32153],
        [ 210.56268],
        ...,
        [ 120.77212],
        [ 227.8862 ],
        [ 154.28427]],

       [[ 208.81453],
        [ 210.6044 ],
        [ 239.90285],
        ...,
        [ 227.87315],
        [ 154.66818],
        [ 216.49194]],

       [[ 210.81358],
        [ 240.08711],
        [ 227.21512],
        ...,
        [ 154.1833 ],
        [ 216.91742],
        [ 146.6545 ]],

       ...,

       [[ 285.22638],
        [ 207.49706],
        [ 146.80562],
        ...,
        [ 159.40837],
        [1053.9595 ],
        [ 901.3128 ]],

       [[ 208.06268],
        [ 146.83192],
        [ 603.1074 ],
        ...,
        [1053.5045 ],
        [ 897.02075],
        [1026.0645 ]],

       [[ 147.84955],
        [ 603.7965 ],
        [ 214.34456],
        ...,
        [ 896.5621 ],
        [1025.0131 ],
        [ 888.9793 ]]], dtype=float32)

In [130]:
# Rebuild a flat table of (features in original units, timestamp, predicted duration)
# from the windowed model input and output.
#
# NOTE(review): the windows in `prepared_data` were cut with stride 1 by
# PredictionDataPreparer in this notebook, so they overlap. After flattening, a source
# row appears once per window that contains it, i.e. `final_df` has
# num_samples * num_timesteps rows and the same trip can occur several times,
# each time with the prediction made at a different position in the window.

# Flatten the 3D predictions to 2D (one row per predicted timestep)
predictions_reshaped = predictions.reshape(-1, predictions.shape[2])

# Apply inverse scaling to predictions
original_scale_predictions = target_scaler.inverse_transform(predictions_reshaped)

# Restore the 3D layout of `predictions`
original_scale_predictions = original_scale_predictions.reshape(predictions.shape)

# Flatten the windows so the feature scaler receives a 2D array
num_samples, num_timesteps, num_features = prepared_data.shape
flat_prepared_data = prepared_data.reshape(-1, num_features)

# Reverse normalization of features.
# NOTE(review): `prepared_data` is float32, so the values recovered here are only
# approximately the originals (e.g. flags come back as ~1e-9 instead of exactly 0).
original_scale_features = feature_scaler.inverse_transform(flat_prepared_data)

# Reshape back to (num_samples, num_timesteps, num_features)
original_scale_features = original_scale_features.reshape(num_samples, num_timesteps, num_features)

# Flatten the original_scale_features and original_scale_predictions for DataFrame creation
flat_data = original_scale_features.reshape(-1, num_features)  # Reshape to (num_samples * num_timesteps, num_features)
flat_predictions = original_scale_predictions.reshape(-1, 1)  # Reshape to (num_samples * num_timesteps, 1)

# Convert to pandas DataFrames.
# The names must be listed in the same order as the feature columns that were scaled
# (every column of the prepared frame except 'trip_start_timestamp').
feature_names = [
    'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude',
    'trip_distance_miles', 'hour_cos', 'hour_sin', 'day_cos', 'day_sin',
    'quarter_cos', 'quarter_sin', 'is_weekend', 'is_morning_rush', 'is_evening_rush'
]
data_df = pd.DataFrame(flat_data, columns=feature_names)
predictions_df = pd.DataFrame(flat_predictions, columns=['predicted_trip_duration_seconds'])

# Flatten the per-window timestamps so they line up row by row with flat_data and flat_predictions
flat_timestamps = prepared_timestamps.reshape(-1)  # Flatten the 2D array to 1D

# Add the timestamps to the data DataFrame
data_df['trip_start_timestamp'] = flat_timestamps

# Put features and predictions side by side (both frames share the default 0..n-1 index)
final_df = pd.concat([data_df, predictions_df], axis=1)

In [131]:
import pandas as pd

# Column layout of the result table: timestamp first, then the input features,
# and the prediction last.
desired_order = [
    'trip_start_timestamp',
    'pickup_latitude',
    'pickup_longitude',
    'dropoff_latitude',
    'dropoff_longitude',
    'trip_distance_miles',
    'hour_cos',
    'hour_sin',
    'day_cos',
    'day_sin',
    'quarter_cos',
    'quarter_sin',
    'is_weekend',
    'is_morning_rush',
    'is_evening_rush',
    'predicted_trip_duration_seconds',
]

# Select the columns in that order
final_df = final_df.loc[:, desired_order]


In [132]:
final_df.head()

Unnamed: 0,trip_start_timestamp,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,trip_distance_miles,hour_cos,hour_sin,day_cos,day_sin,quarter_cos,quarter_sin,is_weekend,is_morning_rush,is_evening_rush,predicted_trip_duration_seconds
0,2023-12-01 08:00:00,41.899601,-87.633308,41.874004,-87.663521,2.0,-0.5,0.866025,-0.5,-0.866025,1.0,1.7839340000000002e-23,2.153939e-09,1.0,-4.942907e-09,359.688995
1,2023-12-01 08:00:00,41.892509,-87.626213,41.880993,-87.632744,1.151043,-0.5,0.866025,-0.5,-0.866025,1.0,1.7839340000000002e-23,2.153939e-09,1.0,-4.942907e-09,209.321533
2,2023-12-01 08:00:00,41.899601,-87.633308,41.878864,-87.625191,1.16,-0.5,0.866025,-0.5,-0.866025,1.0,1.7839340000000002e-23,2.153939e-09,1.0,-4.942907e-09,210.562683
3,2023-12-01 08:00:00,41.874004,-87.663521,41.901207,-87.676353,1.34,-0.5,0.866025,-0.5,-0.866025,1.0,1.7839340000000002e-23,2.153939e-09,1.0,-4.942907e-09,239.900299
4,2023-12-01 08:00:00,41.879253,-87.642647,41.877407,-87.621971,1.256561,-0.5,0.866025,-0.5,-0.866025,1.0,1.7839340000000002e-23,2.153939e-09,1.0,-4.942907e-09,227.190094


In [133]:
final_df.to_csv('/workspaces/codespaces-jupyter/data/test/predicted_sample.csv', index=False)

In [134]:
import pandas as pd

# Collect every coordinate pair that occurs as a pickup or as a dropoff point,
# under common column names so the two sets can be stacked.
pickup_locations = final_df.loc[:, ['pickup_latitude', 'pickup_longitude']].rename(
    columns={'pickup_latitude': 'latitude', 'pickup_longitude': 'longitude'}
)
dropoff_locations = final_df.loc[:, ['dropoff_latitude', 'dropoff_longitude']].rename(
    columns={'dropoff_latitude': 'latitude', 'dropoff_longitude': 'longitude'}
)

# Stack both sets, keep each coordinate pair once and renumber the rows from 0
stacked_locations = pd.concat([pickup_locations, dropoff_locations])
all_locations = stacked_locations.drop_duplicates().reset_index(drop=True)

# The row position doubles as the location's integer id
all_locations = all_locations.assign(location_index=all_locations.index)

In [135]:
all_locations.head(10)

Unnamed: 0,latitude,longitude,location_index
0,41.899601,-87.633308,0
1,41.892509,-87.626213,1
2,41.874004,-87.663521,2
3,41.879253,-87.642647,3
4,41.8853,-87.642807,4
5,41.880993,-87.632744,5
6,41.884987,-87.620995,6
7,41.878864,-87.625191,7
8,41.922688,-87.64949,8
9,41.944225,-87.655998,9


In [136]:
all_locations.to_csv("/workspaces/codespaces-jupyter/data/test/sample_locations.csv", index=False)

### Create travel time matrix

In [137]:
# Attach the integer id of the pickup location to every row
pickup_lookup = all_locations.rename(columns={
    'latitude': 'pickup_latitude', 'longitude': 'pickup_longitude', 'location_index': 'pickup_index'
})
final_df = final_df.merge(pickup_lookup, on=['pickup_latitude', 'pickup_longitude'], how='left')

# Attach the integer id of the dropoff location to every row
dropoff_lookup = all_locations.rename(columns={
    'latitude': 'dropoff_latitude', 'longitude': 'dropoff_longitude', 'location_index': 'dropoff_index'
})
final_df = final_df.merge(dropoff_lookup, on=['dropoff_latitude', 'dropoff_longitude'], how='left')

# Keep only the first occurrence of any repeated column name
first_occurrence = ~final_df.columns.duplicated()
final_df = final_df.loc[:, first_occurrence]


In [138]:
final_df.head()

Unnamed: 0,trip_start_timestamp,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,trip_distance_miles,hour_cos,hour_sin,day_cos,day_sin,quarter_cos,quarter_sin,is_weekend,is_morning_rush,is_evening_rush,predicted_trip_duration_seconds,pickup_index,dropoff_index
0,2023-12-01 08:00:00,41.899601,-87.633308,41.874004,-87.663521,2.0,-0.5,0.866025,-0.5,-0.866025,1.0,1.7839340000000002e-23,2.153939e-09,1.0,-4.942907e-09,359.688995,0,2
1,2023-12-01 08:00:00,41.892509,-87.626213,41.880993,-87.632744,1.151043,-0.5,0.866025,-0.5,-0.866025,1.0,1.7839340000000002e-23,2.153939e-09,1.0,-4.942907e-09,209.321533,1,5
2,2023-12-01 08:00:00,41.899601,-87.633308,41.878864,-87.625191,1.16,-0.5,0.866025,-0.5,-0.866025,1.0,1.7839340000000002e-23,2.153939e-09,1.0,-4.942907e-09,210.562683,0,7
3,2023-12-01 08:00:00,41.874004,-87.663521,41.901207,-87.676353,1.34,-0.5,0.866025,-0.5,-0.866025,1.0,1.7839340000000002e-23,2.153939e-09,1.0,-4.942907e-09,239.900299,2,11
4,2023-12-01 08:00:00,41.879253,-87.642647,41.877407,-87.621971,1.256561,-0.5,0.866025,-0.5,-0.866025,1.0,1.7839340000000002e-23,2.153939e-09,1.0,-4.942907e-09,227.190094,3,12


**Step 1: Create List of DataFrames for Each Timestamp**

In [68]:
# Group the final DataFrame by the 'trip_start_timestamp' column
grouped = final_df.groupby('trip_start_timestamp')

# Build a dict that maps each timestamp to its own DataFrame (rows renumbered from 0)
timestamp_dfs = {timestamp: group.reset_index(drop=True) for timestamp, group in grouped}

# Preview only the first few groups to keep the output readable.
# The loop variable is deliberately not called `df`, so the notebook-level `df`
# used by earlier cells is not overwritten by the last group.
for timestamp, slot_df in list(timestamp_dfs.items())[:3]:
    print(f"Timestamp: {timestamp}")
    print(slot_df.head())
    print("\n")


Timestamp: 2024-01-12 08:00:00
  trip_start_timestamp  pickup_latitude  pickup_longitude  dropoff_latitude  \
0  2024-01-12 08:00:00        41.934761        -87.639854         41.900223   
1  2024-01-12 08:00:00        41.934761        -87.639854         41.911972   
2  2024-01-12 08:00:00        41.934761        -87.639854         41.878864   
3  2024-01-12 08:00:00        41.934761        -87.639854         41.929047   
4  2024-01-12 08:00:00        41.934761        -87.639854         41.899590   

   dropoff_longitude  trip_distance_miles  hour_cos  hour_sin  day_cos  \
0         -87.629105             2.936951      -0.5  0.866025     -0.5   
1         -87.683640             3.301227      -0.5  0.866025     -0.5   
2         -87.625191             4.717405      -0.5  0.866025     -0.5   
3         -87.651314             0.852113      -0.5  0.866025     -0.5   
4         -87.674721             3.624476      -0.5  0.866025     -0.5   

    day_sin  quarter_cos   quarter_sin    is_week

**Step 2: Create Travel Time Matrix for Each Timestamp**

In [139]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from geopy.distance import geodesic

# Split final_df by 'trip_start_timestamp': `timestamp_dfs` is a dict that maps each
# timestamp to the DataFrame of its rows (renumbered from 0)
grouped = final_df.groupby('trip_start_timestamp')
timestamp_dfs = {timestamp: group.reset_index(drop=True) for timestamp, group in grouped}

# Directory for the 15-minute matrices; it is created if it does not exist yet
save_path = "/workspaces/codespaces-jupyter/data/test/tt_matrices/15_min"  # Change this to your desired path
os.makedirs(save_path, exist_ok=True)

# Function to create the travel time matrix for a given DataFrame and timestamp
def create_travel_time_matrix(df, num_locations):
    """
    Builds a square matrix of predicted travel times between location indices.

    Args:
    df (pd.DataFrame): Rows with 'pickup_index', 'dropoff_index' and
        'predicted_trip_duration_seconds' columns.
    num_locations (int): Number of locations; the matrix is num_locations x num_locations.

    Returns:
    np.ndarray: Matrix whose [pickup, dropoff] cell holds the predicted duration.
        Cells without any row stay 0, and when several rows share a pair the
        last row wins.

    Raises:
    ValueError: If an index column contains missing values (they cannot be cast to int).
    """
    travel_time_matrix = np.zeros((num_locations, num_locations))

    # Read the columns directly instead of using iterrows(): iterrows() converts each
    # row to a single dtype, so on an all-numeric frame the integer indices come back
    # as floats and numpy rejects them as array indices.
    pickup_indices = df['pickup_index'].astype(int)
    dropoff_indices = df['dropoff_index'].astype(int)
    predicted_times = df['predicted_trip_duration_seconds']

    for pickup_index, dropoff_index, predicted_time in zip(pickup_indices, dropoff_indices, predicted_times):
        travel_time_matrix[pickup_index, dropoff_index] = predicted_time
    return travel_time_matrix

# Create and save one travel time matrix per 15-minute timestamp
travel_time_matrices = {}
num_locations = len(all_locations)
# The loop variable is deliberately not called `df`, so the notebook-level `df`
# used by earlier cells is not overwritten by the last group.
for timestamp, slot_df in timestamp_dfs.items():
    travel_time_matrix = create_travel_time_matrix(slot_df, num_locations)
    travel_time_matrices[timestamp] = travel_time_matrix
    
    # Label rows and columns with the location ids so the CSV is self-describing
    matrix_df = pd.DataFrame(travel_time_matrix, index=all_locations['location_index'], columns=all_locations['location_index'])
    
    # Generate the filename based on the timestamp
    timestamp_str = timestamp.strftime("%Y-%m-%d_%H-%M-%S")
    filename = f"travel_time_matrix_{timestamp_str}.csv"
    
    # Save the matrix as a CSV file
    matrix_df.to_csv(os.path.join(save_path, filename))
    
    print(f"Saved travel time matrix for {timestamp} to {filename}")

# Optionally, display the first few matrices for verification
for timestamp, matrix in list(travel_time_matrices.items())[:3]:  # Display first 3 for brevity
    print(f"Travel Time Matrix for {timestamp}:\n{matrix}\n")

Saved travel time matrix for 2023-12-01 08:00:00 to travel_time_matrix_2023-12-01_08-00-00.csv
Saved travel time matrix for 2023-12-01 08:15:00 to travel_time_matrix_2023-12-01_08-15-00.csv
Saved travel time matrix for 2023-12-01 08:30:00 to travel_time_matrix_2023-12-01_08-30-00.csv
Saved travel time matrix for 2023-12-01 08:45:00 to travel_time_matrix_2023-12-01_08-45-00.csv
Saved travel time matrix for 2023-12-01 09:00:00 to travel_time_matrix_2023-12-01_09-00-00.csv
Saved travel time matrix for 2023-12-01 09:15:00 to travel_time_matrix_2023-12-01_09-15-00.csv
Saved travel time matrix for 2023-12-01 09:30:00 to travel_time_matrix_2023-12-01_09-30-00.csv
Saved travel time matrix for 2023-12-01 09:45:00 to travel_time_matrix_2023-12-01_09-45-00.csv
Saved travel time matrix for 2023-12-01 10:00:00 to travel_time_matrix_2023-12-01_10-00-00.csv
Saved travel time matrix for 2023-12-01 10:15:00 to travel_time_matrix_2023-12-01_10-15-00.csv
Saved travel time matrix for 2023-12-01 10:30:00 t

In [140]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from geopy.distance import geodesic


# Resample to hourly intervals and aggregate by mean travel time.
# Each trip is bucketed into the hour it started in, then the predicted
# duration is averaged per (hour, pickup, dropoff) combination.
# NOTE: this adds an 'hourly_timestamp' column to final_df itself.
# The lowercase 'h' alias is used because pandas 2.2 deprecated the uppercase
# 'H' spelling for hourly frequencies (it emits a FutureWarning).
final_df['hourly_timestamp'] = final_df['trip_start_timestamp'].dt.floor('h')
hourly_grouped = final_df.groupby(['hourly_timestamp', 'pickup_index', 'dropoff_index']).agg({
    'predicted_trip_duration_seconds': 'mean'
}).reset_index()

# Define the path where the matrices will be saved
save_path = "/workspaces/codespaces-jupyter/data/test/tt_matrices/1_H"  # Change this to your desired path
# exist_ok=True keeps the cell re-runnable when the directory already exists.
os.makedirs(save_path, exist_ok=True)

# Function to create the travel time matrix for a given DataFrame and timestamp
def create_travel_time_matrix(df, num_locations):
    """Build a dense pickup-by-dropoff matrix of predicted trip durations.

    Args:
        df (pd.DataFrame): Must contain the columns 'pickup_index',
            'dropoff_index' and 'predicted_trip_duration_seconds'. The two
            index columns are cast to int and used as row / column positions.
        num_locations (int): Side length of the square matrix.

    Returns:
        np.ndarray: Float array of shape (num_locations, num_locations).
        Pairs that do not occur in ``df`` stay 0.0; when a pair occurs more
        than once, the value from the last such row is kept.

    Raises:
        KeyError: If one of the required columns is missing.
        IndexError: If an index does not fit inside the matrix.
    """
    travel_time_matrix = np.zeros((num_locations, num_locations))

    # Read the columns as arrays instead of iterating with iterrows():
    # iterrows() coerces every row to a single dtype, so on an all-numeric
    # frame the indices come back as floats, which NumPy rejects as indices.
    pickups = df['pickup_index'].to_numpy().astype(int)
    dropoffs = df['dropoff_index'].to_numpy().astype(int)
    durations = df['predicted_trip_duration_seconds'].to_numpy()

    # Assign in row order so that duplicates resolve to the last row.
    for pickup_index, dropoff_index, predicted_time in zip(pickups, dropoffs, durations):
        travel_time_matrix[pickup_index, dropoff_index] = predicted_time
    return travel_time_matrix

# Create and save travel time matrices for each hourly timestamp.
# Every group of `hourly_grouped` (one per distinct 'hourly_timestamp') becomes
# one matrix, stored in `travel_time_matrices` and written to `save_path`.
# This rebinds `travel_time_matrices` to a fresh dict, so any dictionary
# previously held under that name is replaced.
travel_time_matrices = {}
num_locations = len(all_locations)  # side length of every square matrix
for timestamp, df in hourly_grouped.groupby('hourly_timestamp'):
    travel_time_matrix = create_travel_time_matrix(df, num_locations)
    travel_time_matrices[timestamp] = travel_time_matrix
    
    # Convert the matrix to a DataFrame for easier saving to CSV.
    # Rows and columns both get all_locations['location_index'] as labels;
    # presumably that column lists the matrix positions in order — TODO confirm.
    matrix_df = pd.DataFrame(travel_time_matrix, index=all_locations['location_index'], columns=all_locations['location_index'])
    
    # Generate the filename based on the timestamp
    timestamp_str = timestamp.strftime("%Y-%m-%d_%H-%M-%S")
    filename = f"travel_time_matrix_{timestamp_str}.csv"
    
    # Save the matrix as a CSV file; a file with the same name is overwritten.
    matrix_df.to_csv(os.path.join(save_path, filename))
    
    # One progress line per file written.
    print(f"Saved travel time matrix for {timestamp} to {filename}")

# Optionally, display the first few matrices for verification
for timestamp, matrix in list(travel_time_matrices.items())[:3]:  # Display first 3 for brevity
    print(f"Travel Time Matrix for {timestamp}:\n{matrix}\n")


Saved travel time matrix for 2023-12-01 08:00:00 to travel_time_matrix_2023-12-01_08-00-00.csv
Saved travel time matrix for 2023-12-01 09:00:00 to travel_time_matrix_2023-12-01_09-00-00.csv
Saved travel time matrix for 2023-12-01 10:00:00 to travel_time_matrix_2023-12-01_10-00-00.csv
Saved travel time matrix for 2023-12-01 11:00:00 to travel_time_matrix_2023-12-01_11-00-00.csv
Saved travel time matrix for 2023-12-01 12:00:00 to travel_time_matrix_2023-12-01_12-00-00.csv
Saved travel time matrix for 2023-12-01 13:00:00 to travel_time_matrix_2023-12-01_13-00-00.csv
Saved travel time matrix for 2023-12-02 08:00:00 to travel_time_matrix_2023-12-02_08-00-00.csv
Saved travel time matrix for 2023-12-02 09:00:00 to travel_time_matrix_2023-12-02_09-00-00.csv
Saved travel time matrix for 2023-12-02 10:00:00 to travel_time_matrix_2023-12-02_10-00-00.csv
Saved travel time matrix for 2023-12-02 11:00:00 to travel_time_matrix_2023-12-02_11-00-00.csv
Saved travel time matrix for 2023-12-02 12:00:00 t

  final_df['hourly_timestamp'] = final_df['trip_start_timestamp'].dt.floor('H')
