In [95]:
from datetime import date
from geopy.distance import geodesic
import holidays
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.cluster import KMeans

In [96]:
# Yearly daily-climate files, 2014 through 2022. Each year appears exactly once:
# a repeated file would put every one of its days into weather.csv twice, and each
# later left-merge on the date would then duplicate the matching ride rows.
weather_list = [
    f'data/weather_data/en_climate_daily_QC_7025251_{year}_P1D.csv'
    for year in range(2014, 2023)
]

# Stack the yearly tables into one frame.
df_weather = pd.concat([pd.read_csv(file) for file in weather_list])

# The index is written out as before so the column layout of weather.csv is
# unchanged for existing readers of the file.
df_weather.to_csv('data/preprocessed_data/weather.csv')

In [98]:
# Station snapshot files 20220104 ... 20220111 (one file per day)
stations2022_list = [
    f'data/stations_2022/202201{day:02d}_stations.csv' for day in range(4, 12)
]

# Stack all snapshots into a single table and keep each distinct row only once
daily_snapshots = [pd.read_csv(path) for path in stations2022_list]
df_stations_2022 = pd.concat(daily_snapshots, ignore_index=True).drop_duplicates()

df_stations_2022.to_csv('data/stations/Stations_2022.csv', index=False)

In [99]:
# Combine the yearly station files (2014-2022) and cluster the stations spatially
def prepare_stations():
    """Build one table of all stations, with distance to centre and a cluster id.

    Reads ``data/stations/Stations_<year>.csv`` for 2014 through 2022, keeps the
    first occurrence of every station ``code``, and writes two files:

    * ``data/regression_analysis/all_stations.csv`` - the de-duplicated table
    * ``data/regression_analysis/all_stations_clustered.csv`` - the same table
      restricted to stations with coordinates, plus ``distance_to_center`` and
      ``stations_cluster``

    Returns
    -------
    pandas.DataFrame
        The clustered station table (one row per station ``code``).
    """
    yearly_frames = []
    for year in range(2014, 2023):
        df = pd.read_csv(f'data/stations/Stations_{year}.csv')
        # Apply the same column normalisation preprocess_data uses for station
        # files (lowercase names, 'pk' as a fallback for 'code') so every year
        # contributes to the same 'code' column before de-duplication.
        df.columns = df.columns.str.lower()
        if 'code' not in df.columns:
            df = df.rename(columns={'pk': 'code'})
        yearly_frames.append(df)

    # Remove duplicates: a station present in several years is kept once
    df_stations = pd.concat(yearly_frames, ignore_index=True)
    df_stations = df_stations.drop_duplicates(subset=['code'], keep="first")
    df_stations.to_csv("data/regression_analysis/all_stations.csv")

    # Neither the distance computation nor KMeans can handle missing coordinates
    df_stations = df_stations.dropna(subset=['latitude', 'longitude']).copy()

    # Compute the distance from each row's own coordinates. Taking the values
    # from a different table (e.g. the 2022 stations) would align them by index
    # label and attach distances to unrelated stations.
    df_stations['distance_to_center'] = df_stations.apply(calculate_distance_to_center, axis=1)

    coordinates = df_stations[['latitude', 'longitude']]
    # Fixed random_state keeps the cluster assignment reproducible between calls
    kmeans = KMeans(n_clusters=50, random_state=0).fit(coordinates)
    df_stations['stations_cluster'] = kmeans.labels_

    df_stations.to_csv("data/regression_analysis/all_stations_clustered.csv")
    return df_stations

In [100]:
def include_clusters(rides_df, stations_df):
    """Aggregate rides per (start_date, start cluster, end cluster).

    ``stations_df`` is left-joined onto the rides twice on its ``code`` column,
    once via ``start_station_code`` and once via ``end_station_code``, so each
    ride carries the cluster of both endpoints. The rides are then grouped by
    date and cluster pair.

    Returns a frame with the three grouping columns plus ``count`` (rides in the
    group), ``duration_sec`` (mean duration) and the first ``is_holiday`` /
    ``is_weekend`` value of each group.
    """
    endpoint_columns = (
        ('start_station_code', 'start_station_cluster'),
        ('end_station_code', 'end_station_cluster'),
    )

    trips = rides_df
    for code_column, cluster_column in endpoint_columns:
        # Attach the cluster of this endpoint, then discard the join key again
        trips = (
            trips
            .merge(stations_df, left_on=code_column, right_on='code', how='left')
            .rename(columns={'stations_cluster': cluster_column})
            .drop(columns='code')
        )

    group_keys = ['start_date', 'start_station_cluster', 'end_station_cluster']
    return (
        trips
        .groupby(group_keys)
        .agg(
            count=('start_date', 'size'),
            duration_sec=('duration_sec', 'mean'),
            is_holiday=("is_holiday", "first"),
            is_weekend=("is_weekend", "first"),
        )
        .reset_index()
    )

In [116]:
def get_weather():
    """Load the daily weather table with exactly one row per calendar date.

    Reads ``data/preprocessed_data/weather.csv``, keeps the date, mean
    temperature and total precipitation columns, and fills missing measurements
    by linear interpolation along the date order.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (``datetime.date``), ``mean_temperature`` and
        ``total_precipitation``, sorted by ``date`` with unique dates.
    """
    # No positional parse_dates here: the date column is selected and converted
    # by name below, so the result does not depend on the file's column order.
    df_weather = pd.read_csv('data/preprocessed_data/weather.csv')
    df_weather.columns = df_weather.columns.str.lower()
    df_weather = df_weather[["date/time", "mean temp (°c)", "total precip (mm)"]]
    df_weather = df_weather.rename(columns={"date/time": "date",'mean temp (°c)': 'mean_temperature','total precip (mm)': 'total_precipitation'})
    df_weather['date'] = pd.to_datetime(df_weather['date']).dt.date

    # A date that occurs more than once (e.g. a yearly file concatenated twice)
    # would multiply rows in every left-merge on the date, so keep one row per
    # day. Sorting makes the interpolation run between neighbouring days.
    df_weather = (
        df_weather
        .drop_duplicates(subset='date', keep='first')
        .sort_values('date')
        .reset_index(drop=True)
    )

    # interpolate missing data
    value_cols = ['mean_temperature', 'total_precipitation']
    df_weather[value_cols] = df_weather[value_cols].interpolate()
    return df_weather

In [101]:
def include_weather(grouped, weather):
    """Attach the weather columns to ``grouped`` by matching dates.

    Left-joins ``weather`` (keyed by ``date``) onto ``grouped`` (keyed by
    ``start_date``) and returns the result without the redundant ``date``
    column. Rows of ``grouped`` with no matching date keep NaN weather values.
    """
    joined = grouped.merge(weather, how='left', left_on='start_date', right_on='date')
    return joined.drop(columns='date')

In [102]:
def calculate_distance_to_center(row):
    """Return the geodesic distance in kilometres from a station to the city center.

    ``row`` must provide ``'latitude'`` and ``'longitude'`` entries (for example
    a DataFrame row passed by ``DataFrame.apply(..., axis=1)``).
    """
    # Fixed reference point used as the city center, as (latitude, longitude)
    city_center = (45.508888, -73.554167)
    station = (row['latitude'], row['longitude'])
    return geodesic(city_center, station).kilometers

In [103]:
def preprocess_data(rides_file_path, stations_file_path):
    """Turn one rides file into per-station, 12-hour ride counts with features.

    Parameters
    ----------
    rides_file_path : str
        CSV with one row per ride. After lowercasing the column names it must
        provide ``start_date``, ``end_date`` and either ``start_station_code`` /
        ``end_station_code`` or ``emplacement_pk_start`` / ``emplacement_pk_end``.
    stations_file_path : str
        CSV with one row per station. After lowercasing it must provide
        ``latitude``, ``longitude`` and either ``code`` or ``pk``.

    Returns
    -------
    rides_count : pandas.DataFrame
        One row per (12-hour bucket, start station) with the columns
        ``latitude, longitude, distance_to_center, year, month, day, weekday,
        am_pm, is_holiday, is_weekend, mean_temperature, total_precipitation,
        stations_count, rides_count``.
    rides_data : pandas.DataFrame
        The individual rides with integer station codes, ``year`` / ``month`` /
        ``weekday``, ``is_holiday`` / ``is_weekend`` flags, and ``start_date``
        reduced to a calendar date.
    """
    # Read rides data and stations data
    # NOTE(review): parse_dates is positional and covers columns 0-3; presumably
    # that includes the two station-code columns as well as the two timestamps -
    # confirm against the raw file layout.
    rides_data = pd.read_csv(rides_file_path, parse_dates=[0, 1, 2, 3])
    stations_data = pd.read_csv(stations_file_path)

    # Convert column names to lowercase
    rides_data.columns = rides_data.columns.str.lower()
    stations_data.columns = stations_data.columns.str.lower()

    # Rename columns if necessary
    if 'start_station_code' not in rides_data.columns:
        rides_data = rides_data.rename(columns={'emplacement_pk_start': 'start_station_code',
                                                'emplacement_pk_end': 'end_station_code'})

    if 'code' not in stations_data.columns:
        stations_data = stations_data.rename(columns={'pk': 'code'})

    # Convert station codes to integers. Both columns are always normalised (a
    # no-op for columns that are already integer) instead of deciding from the
    # dtype of the start column alone, so the end codes cannot stay unconverted.
    code_cols = ['start_station_code', 'end_station_code']
    for col in code_cols:
        rides_data[col] = pd.to_numeric(rides_data[col], errors='coerce')
    rides_data = rides_data.dropna(subset=code_cols).astype(dict.fromkeys(code_cols, int))

    # Extract year, month, and weekday information
    ride_start = rides_data['start_date']
    rides_data['year'] = ride_start.dt.year
    rides_data['month'] = ride_start.dt.month
    rides_data['weekday'] = ride_start.dt.weekday

    # Group rides into 12-hour buckets per start station and count them
    # (count() counts the non-null end_date values of each group)
    rides_count = (
        rides_data
        .groupby([pd.Grouper(key='start_date', freq='12h'), 'start_station_code'])['end_date']
        .count()
        .reset_index()
        .rename(columns={'end_date': 'rides_count'})
    )

    # AM/PM flag: True when the 12-hour bucket starts before noon
    rides_count['am_pm'] = rides_count['start_date'].dt.hour < 12

    # Count the number of stations
    rides_count['stations_count'] = stations_data['code'].nunique()

    # Check if the date is a holiday or weekend
    holidays_canada = holidays.country_holidays('CA', subdiv='QC')
    rides_count['is_holiday'] = rides_count['start_date'].dt.date.map(lambda x: x in holidays_canada)
    rides_count['is_weekend'] = rides_count['start_date'].dt.weekday > 4

    rides_data['is_holiday'] = rides_data['start_date'].dt.date.map(lambda x: x in holidays_canada)
    rides_data['is_weekend'] = rides_data['start_date'].dt.weekday > 4

    # Reduce the per-ride start timestamp to a plain calendar date. This must
    # happen after every .dt-based feature above has been derived from it.
    rides_data['start_date'] = pd.to_datetime(rides_data['start_date']).dt.date

    # Merge ride counts with station data. The station table is used as a lookup,
    # so it is reduced to one row per code first; a code listed twice would
    # otherwise duplicate every count row of that station.
    station_lookup = stations_data.drop_duplicates(subset='code', keep='first')
    rides_count = rides_count.merge(station_lookup, left_on='start_station_code', right_on='code', how='left')

    # Drop rows with NaN coordinates (start stations missing from the station file)
    rides_count = rides_count.dropna(subset=['latitude', 'longitude']).copy()

    # Calculate distance to city center for each station
    rides_count['distance_to_center'] = rides_count.apply(calculate_distance_to_center, axis=1)

    # Load the interpolated weather table through the shared helper and make sure
    # each date occurs once, so the merge below cannot multiply count rows.
    weather_data = get_weather().drop_duplicates(subset='date', keep='first')

    # Merge ride counts with weather data
    rides_count['ride_date'] = rides_count['start_date'].dt.date
    rides_count = rides_count.merge(weather_data, left_on='ride_date', right_on='date', how='left')

    # Extract year, month, day, and weekday information of each bucket
    bucket_start = rides_count['start_date']
    rides_count['year'] = bucket_start.dt.year
    rides_count['month'] = bucket_start.dt.month
    rides_count['day'] = bucket_start.dt.day
    rides_count['weekday'] = bucket_start.dt.weekday

    # Select relevant columns for the final output
    rides_count = rides_count[[
        'latitude', 'longitude', 'distance_to_center', 'year', 'month', 'day', 'weekday', 'am_pm', 'is_holiday',
        'is_weekend', 'mean_temperature', 'total_precipitation', 'stations_count', 'rides_count'
    ]]

    return rides_count, rides_data

In [110]:

def preprocess_task1(data):
    """Aggregate ride counts per half-day and bin them into ordinal labels.

    ``data`` must contain ``year``, ``month``, ``day``, ``am_pm``,
    ``rides_count`` and the day-level feature columns listed below.

    The result has one row per (year, month, day, am_pm) with the summed
    ``rides_count``, the day-level features, and two categorical columns:

    * ``label3`` - classes 1..3 with upper bounds 10, 30, +inf
    * ``label5`` - classes 1..5 with upper bounds 5, 15, 30, 50, +inf

    Upper bounds are inclusive (``pd.cut`` default).
    """
    day_data_cols = ['year', 'month', 'day', 'weekday', 'is_holiday', 'is_weekend',
                     'mean_temperature', 'total_precipitation']
    date_keys = ['year', 'month', 'day']

    neg_inf, pos_inf = float('-inf'), float('inf')
    label_edges = {
        'label3': [neg_inf, 10, 30, pos_inf],
        'label5': [neg_inf, 5, 15, 30, 50, pos_inf],
    }

    # One row of day-level features per distinct combination of values
    per_day = data[day_data_cols].drop_duplicates()

    # Total rides over all rows sharing the same day and AM/PM half
    half_day_totals = data.groupby(date_keys + ['am_pm'], as_index=False)['rides_count'].sum()

    # Bring the day-level features back onto the aggregated rows
    data_new = half_day_totals.merge(per_day, how='left', on=date_keys)

    # n+1 bin edges define n classes, numbered from 1
    for column, edges in label_edges.items():
        data_new[column] = pd.cut(data_new['rides_count'], bins=edges,
                                  labels=list(range(1, len(edges))))

    return data_new


In [113]:
def preprocess_task2(rides):
    """Aggregate rides between station clusters per day and add daily weather.

    Looks up the cluster of every station via ``prepare_stations``, groups the
    rides by date and (start cluster, end cluster) with ``include_clusters``,
    joins the weather from ``get_weather`` and returns the result with exact
    duplicate rows removed.
    """
    station_clusters = prepare_stations()[["code", "stations_cluster"]]
    clustered_rides = include_clusters(rides, station_clusters)
    with_weather = include_weather(clustered_rides, get_weather())
    return with_weather.drop_duplicates()

In [114]:
def generate_preprocessed_data():
    """Preprocess every split and write the results to ``data/preprocessed_data/``.

    For each split (train: 2014-2018, valid: 2019, test: 2022) the monthly ride
    files for April through October are run through ``preprocess_data`` and
    combined. Three files are written per split:

    * ``<split>.csv``    - per-station 12-hour ride counts with features
    * ``t1_<split>.csv`` - output of ``preprocess_task1`` on those counts
    * ``t2_<split>.csv`` - output of ``preprocess_task2`` on the individual rides
    """
    # split name -> [years covered, file name of the combined count table]
    dataset_years = {
        'train': [list(range(2014, 2019)), 'train.csv'],
        'valid': [(2019,), 'valid.csv'],
        'test': [(2022,), 'test.csv']
    }

    for dataset, (years, filename) in dataset_years.items():
        # Collect the monthly frames and concatenate once per split; growing a
        # DataFrame with concat inside the loop re-copies all earlier rows on
        # every iteration.
        count_frames = []
        ride_frames = []
        for year in years:
            stations_file_path = f'data/stations/Stations_{year}.csv'
            for month in range(4, 11):
                rides_file_path = f'data/bike_rides/OD_{year}-{month:02d}.csv'
                df_month, rides_month = preprocess_data(rides_file_path, stations_file_path)
                count_frames.append(df_month)
                ride_frames.append(rides_month)

        df_dataset = pd.concat(count_frames, ignore_index=True)
        df_rides = pd.concat(ride_frames)

        df_dataset.to_csv(f'data/preprocessed_data/{filename}', index=False)
        df_dataset_t1 = preprocess_task1(df_dataset)
        df_dataset_t1.to_csv(f'data/preprocessed_data/t1_{dataset}.csv', index=False)
        df_dataset_t2 = preprocess_task2(df_rides)
        # The index is written for the task-2 file, as before, so its column
        # layout stays the same for existing readers.
        df_dataset_t2.to_csv(f'data/preprocessed_data/t2_{dataset}.csv')


In [117]:
# Run the full pipeline: writes the train / valid / test files to data/preprocessed_data/
generate_preprocessed_data()

