# Data Preprocessing

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from numpy import cos, sin, arcsin, sqrt
from math import radians
from datetime import date
import holidays

In [2]:
def _to_int_or_none(val):
    '''
    Convert a raw station code to an integer.

    :param val: raw value read from the rides file
    :return: ``int(val)``, or ``None`` when the value can not be converted
    '''
    try:
        return int(val)
    except (ValueError, TypeError):
        return None


def _distance_to_center_km(latitude, longitude, center_lat=45.50354, center_lon=-73.56878):
    '''
    Haversine (great-circle) distance between each coordinate pair and a fixed reference point.

    Works element-wise on scalars, numpy arrays and pandas Series.

    :param latitude: latitude(s) in degrees
    :param longitude: longitude(s) in degrees
    :param float center_lat: latitude of the reference point in degrees
    :param float center_lon: longitude of the reference point in degrees
    :return: distance(s) in kilometres, same shape as the inputs
    '''
    lat1 = np.radians(center_lat)
    lon1 = np.radians(center_lon)
    lat2 = np.radians(latitude)
    lon2 = np.radians(longitude)
    a = np.sin((lat2 - lat1) / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2
    # 6367 km is the earth radius used by the original implementation; kept so feature values do not shift
    return 6367 * 2 * np.arcsin(np.sqrt(a))


def read_and_preprocess(rides_file_path, stations_file_path,
                        weather_file_path='data/Canadian_climate_history.csv'):
    '''
    Read a rides file and all other necessary files and do the preprocessing.

    Rides are aggregated per start station into 12-hour bins (0:00-12:00 and 12:00-0:00) and
    enriched with station coordinates, calendar features, a holiday flag, daily weather and the
    distance of the station to the city center.

    :param str rides_file_path: path to the rides file
    :param str stations_file_path: path to the stations file
    :param str weather_file_path: path to the climate history file (defaults to the original location)
    :return pd.DataFrame: dataframe ready to be used in the modelling process
    '''

    rides_df = pd.read_csv(rides_file_path, parse_dates=[0, 2])
    stations_df = pd.read_csv(stations_file_path)

    # turn all column names to lower case
    rides_df.columns = rides_df.columns.str.lower()
    stations_df.columns = stations_df.columns.str.lower()

    # this happens in file of August 2019, because of invalid station codes.
    # is_integer_dtype is used because comparing the dtype with 'int' depends on the
    # platform's default integer width.
    if not pd.api.types.is_integer_dtype(rides_df['start_station_code']):
        code_columns = ['start_station_code', 'end_station_code']
        for column in code_columns:
            rides_df[column] = rides_df[column].apply(_to_int_or_none)

        # drop every row where station codes could not be converted to integer
        # (only the code columns are inspected, missing values elsewhere do not remove a ride)
        rides_df = rides_df.dropna(subset=code_columns)
        rides_df = rides_df.astype({column: 'int64' for column in code_columns})

    # aggregate rides: sum up rides between 0:00 to 12:00 and 12:00 to 0:00
    ride_counts_df = rides_df.groupby(
        [pd.Grouper(key='start_date', freq='12h'), 'start_station_code']
    )['end_date'].count()
    ride_counts_df = ride_counts_df.to_frame()
    ride_counts_df = ride_counts_df.rename(columns={'end_date': 'count'})
    ride_counts_df = ride_counts_df.reset_index()

    # add am/pm flags (am = 0, pm = 1)
    ride_counts_df['pm'] = ride_counts_df['start_date'].dt.hour.map({0: 0, 12: 1})

    # join coordinates of stations
    ride_counts_df = ride_counts_df.merge(
        stations_df[['code', 'latitude', 'longitude']],
        left_on='start_station_code',
        right_on='code',
        # inner join removes any station not specified in the stations dataset
        how='inner'
    )

    # add month and weekday
    ride_counts_df['month'] = ride_counts_df['start_date'].dt.month
    ride_counts_df['weekday'] = ride_counts_df['start_date'].dt.weekday

    # add holiday flag
    # The holidays object is filled lazily when a date is looked up with `in`, and its keys are
    # plain dates. Passing it to Series.isin would compare 0:00/12:00 timestamps against a
    # (still empty) key set and flag nothing, so each calendar date is tested explicitly.
    ca_qc_holidays = holidays.country_holidays('CA', subdiv='QC')
    ride_dates = ride_counts_df['start_date'].dt.date
    ride_counts_df['holiday'] = ride_dates.map(lambda day: day in ca_qc_holidays).astype(bool)

    # add weather data
    weather_df = pd.read_csv(weather_file_path, parse_dates=[0])
    weather_df.columns = weather_df.columns.str.lower()
    weather_df = weather_df[['local_date', 'mean_temperature_montreal', 'total_precipitation_montreal']]
    weather_df = weather_df.rename(
        columns={'mean_temperature_montreal': 'mean_temperature',
                 'total_precipitation_montreal': 'total_precipitation'}
    )
    # add date attribute to join on
    ride_counts_df['date'] = pd.to_datetime(ride_dates)
    ride_counts_df = ride_counts_df.merge(weather_df, left_on='date', right_on='local_date')

    # add distance to city center (vectorised over all rows instead of a row-wise apply)
    ride_counts_df['distance_to_center'] = _distance_to_center_km(
        ride_counts_df['latitude'], ride_counts_df['longitude']
    )

    # add density of stations in nearby area???

    # only keep these columns
    ride_counts_df = ride_counts_df[[
        'start_date',
        'latitude', 'longitude',
        'month', 'weekday', 'pm',
        'mean_temperature', 'total_precipitation',
        'distance_to_center', 'holiday',
        # ...
        'count'
    ]]

    return ride_counts_df
    

In [3]:
# Build (or load from cache) the preprocessed training and test datasets.
import os


def _load_or_build_dataset(cache_path, years):
    '''
    Return the preprocessed dataset for the given years, using a cached csv file when present.

    :param str cache_path: path of the preprocessed csv file (read if it exists, written otherwise)
    :param years: iterable of years whose monthly ride files (April to October) are processed
    :return pd.DataFrame: the preprocessed dataset
    '''
    # try reading file first
    try:
        return pd.read_csv(cache_path, parse_dates=['start_date'])
    except FileNotFoundError:
        pass

    monthly_frames = []
    for year in years:
        stations_file_path = f'data/{year}/Stations_{year}.csv'
        for month in range(4, 11):
            rides_file_path = f'data/{year}/OD_{year}-{month:02d}.csv'
            # run preprocessing function and collect df
            monthly_frames.append(read_and_preprocess(rides_file_path, stations_file_path))

    # concatenate once instead of growing a dataframe inside the loop
    dataset_df = pd.concat(monthly_frames, ignore_index=True)

    # save file (create the target directory first so the write can not fail after all the work)
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    dataset_df.to_csv(cache_path, index=False)
    return dataset_df


# 2017 & 2018 (training data)
train_df = _load_or_build_dataset('data/preprocessed_data/train.csv', range(2017, 2019))

# 2019 (test data)
# the year is given explicitly; it must not depend on a variable left over from the training loop
test_df = _load_or_build_dataset('data/preprocessed_data/test.csv', [2019])