# 1_preprocessing

The goal of this notebook is to test different combinations of preprocessed datasets to get the best result.
The best outcome will be saved in "preprocessed.py".

In [7]:
# Frameworks that we're going to use
import pandas as pd
import osmnx as ox
import numpy as np
from sklearn.model_selection import train_test_split


In [8]:
city = 'New York'

# Historical trip records.
# Only the columns needed for our problem are read from the parquet file.
TRIP_COLUMNS = [
    'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'trip_distance', 'RatecodeID', 'congestion_surcharge',
    'PULocationID', 'DOLocationID', 'fare_amount', 'extra',
    'tolls_amount', 'Airport_fee', 'cbd_congestion_fee',
]
TRIPS_PATH = r'C:\Users\wikto\OneDrive\Dokumenty\AA_projects\road-optimization\data\yellow_tripdata_2025-01.parquet'

df = pd.read_parquet(TRIPS_PATH, columns=TRIP_COLUMNS)



In [9]:



# Making the distance unit explicit.
# The NYC TLC yellow-taxi data dictionary documents `trip_distance` in miles,
# so a plain rename to "km" would mislabel the values (and the km/h speed
# derived from them later). Convert to kilometres while renaming.
# Using the source column by name makes an accidental re-run fail with a
# KeyError instead of silently converting twice.
MILES_TO_KM = 1.609344
df = (
    df
    .assign(**{'trip_distance km': df['trip_distance'] * MILES_TO_KM})
    .drop(columns=['trip_distance'])
)

# Feature engineering: trip duration (dropoff minus pickup), still a timedelta here
df['time_diffrence'] = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']


# Lookup tables with the names of all places.
# df_dist_OSM is only loaded in this cell; it is not merged into df here.
df_dist = pd.read_csv(r'C:\Users\wikto\OneDrive\Dokumenty\AA_projects\road-optimization\data\id_lookup.csv')
df_dist_OSM = pd.read_csv(r'C:\Users\wikto\OneDrive\Dokumenty\AA_projects\road-optimization\data\OSM_Street_lookup.csv', delimiter=';')

# Attach the lookup twice on location id: first for pickup, then for dropoff.
# The pickup-side lookup columns end up with the _x suffix and the
# dropoff-side ones with _y (see Borough_x / Borough_y below).
for location_col in ('PULocationID', 'DOLocationID'):
    df = df.merge(df_dist, how='left', left_on=location_col, right_on='LocationID')

# Keep only the columns we need, then give the borough columns clearer names
kept_columns = [
    'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'time_diffrence', 'trip_distance km',
    'RatecodeID', 'congestion_surcharge',
    'PULocationID', 'Borough_x', 'DOLocationID', 'Borough_y',
    'fare_amount', 'extra', 'tolls_amount', 'Airport_fee', 'cbd_congestion_fee',
]
df = df[kept_columns].rename(columns={'Borough_x': 'PULBorough', 'Borough_y': 'DOLBorough'})


# Trip duration: timedelta -> float hours, with the unit added to the column name
duration_hours = df['time_diffrence'].dt.total_seconds() / 3600
df = (
    df
    .assign(time_diffrence=duration_hours)
    .rename(columns={'time_diffrence': 'time_diffrence h'})
)

# Average speed, rounded to 2 decimals.
# Speeds of exactly 0 (after rounding) and +/- infinity (zero duration) are
# turned into NaN.
speed = (df['trip_distance km'] / df['time_diffrence h']).round(2)
df['average_speed km/h'] = (
    speed
    .replace(0, np.nan)
    .replace([np.inf, -np.inf], np.nan)
)


# Seed for the row shuffle below, so that Restart & Run All reproduces
# exactly the same dataset.
RANDOM_SEED = 42

# Reading weather csv & changing datatype to datetime64[us]
# (presumably to match the resolution of the taxi timestamps - TODO confirm)
df_weather = pd.read_csv(r'C:\Users\wikto\OneDrive\Dokumenty\AA_projects\road-optimization\data\weather-data.csv')
df_weather['Time'] = pd.to_datetime(df_weather['Time']).astype('datetime64[us]')
df_time_weather = df_weather.set_index('Time', drop=False)

# Join key for the hourly weather data: pickup time rounded to 1 hour.
# The rounded value is used ONLY as the join index. 'tpep_pickup_datetime'
# itself is left untouched, so that 'pickup_hour' below is the real clock hour
# of the pickup. Rounding the column in place would turn a 10:45 pickup into
# hour 11 and a 23:45 pickup into hour 0, which is inconsistent with
# 'dropoff_hour' (taken from the unrounded dropoff time).
weather_hour = df['tpep_pickup_datetime'].dt.round('h').rename('weather_hour')
df = df.set_index(weather_hour)

# Left join on the index: every taxi row gets the weather row of its hour
df = df.join(df_time_weather, lsuffix='_taxi', rsuffix='_weather')

# Shuffle the rows (seeded) and throw away the temporary join index
df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# 'Visibility' is removed before dropna(), so its missing values do not
# cause rows to be discarded; afterwards every row containing a NaN is dropped.
df = df.drop(columns=['Visibility']).dropna()

# Changing datatypes to 'category' and numbers for XGBoost
# Category
cat_cols = ['PULBorough','DOLBorough']
df = df.astype({col: 'category' for col in cat_cols})

# Hour-of-day features; the raw timestamp columns are dropped afterwards
df['pickup_hour'] = df['tpep_pickup_datetime'].dt.hour
df['dropoff_hour'] = df['tpep_dropoff_datetime'].dt.hour
df = df.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'Time'])



In [None]:
# Snapshot of the merged dataset, written before the error/outlier rows are
# dropped, so the pre-dropped results can be checked.
PRE_DROP_PATH = r'C:\Users\wikto\OneDrive\Dokumenty\AA_projects\road-optimization\data\dataset-marged-before-dropping.parquet'
df.to_parquet(path=PRE_DROP_PATH)

# Feature engineering

Dropping errors, NaNs and outliers to check the training results

In [10]:
# Number of standard deviations from the mean beyond which a trip duration
# is treated as an outlier.
OUTLIER_SIGMA = 3

# 1) Errors: a fare can't be negative
df = df.drop(index=df.index[df['fare_amount'] < 0])

# 2) Fresh consecutive id, used as the index (named 'user_id').
#    It is already ascending, so no extra sort is needed.
df = df.assign(user_id=np.arange(len(df))).set_index('user_id')

# 3) Errors: travel time can't be negative.
#    This is done BEFORE the outlier limits are computed, so that these
#    erroneous rows do not distort the mean / standard deviation.
df = df.drop(index=df.index[df['time_diffrence h'] < 0])

# 4) Deleting time_diffrence outliers: keep only rows within
#    mean +/- OUTLIER_SIGMA * std (values exactly on a limit are kept)
time_mean = df['time_diffrence h'].mean()
time_std = df['time_diffrence h'].std()

upper_limit = time_mean + OUTLIER_SIGMA * time_std
lower_limit = time_mean - OUTLIER_SIGMA * time_std

outside_limits = (df['time_diffrence h'] > upper_limit) | (df['time_diffrence h'] < lower_limit)
df = df.drop(index=df.index[outside_limits])

In [12]:

# Inspect the dtypes of the final columns
print(df.dtypes)

# Persist the final dataset as .parquet
FINAL_DATASET_PATH = r'C:\Users\wikto\OneDrive\Dokumenty\AA_projects\road-optimization\data\dataset-marged.parquet'
df.to_parquet(path=FINAL_DATASET_PATH)

time_diffrence h         float64
trip_distance km         float64
RatecodeID               float64
congestion_surcharge     float64
PULocationID               int32
PULBorough              category
DOLocationID               int32
DOLBorough              category
fare_amount              float64
extra                    float64
tolls_amount             float64
Airport_fee              float64
cbd_congestion_fee       float64
average_speed km/h       float64
Temperature              float64
Snowfall                 float64
Showers                  float64
Rain                     float64
Precipitation            float64
Wind_speed_10m           float64
pickup_hour                int32
dropoff_hour               int32
dtype: object
