In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from numpy import cos, sin, arcsin, sqrt
from math import radians
from datetime import date
import holidays
from sklearn.cluster import KMeans
from scipy import stats

In [2]:
# List with files for stations 2022
stations2022_list = ['data/stations_2022/20220104_stations.csv','data/stations_2022/20220105_stations.csv','data/stations_2022/20220106_stations.csv','data/stations_2022/20220107_stations.csv','data/stations_2022/20220108_stations.csv','data/stations_2022/20220109_stations.csv','data/stations_2022/20220110_stations.csv','data/stations_2022/20220111_stations.csv']

df_stations_2022 = []

for file in stations2022_list:
    df = pd.read_csv(file)
    df_stations_2022.append(df)

# Remove Duplicates
df_stations_2022 = pd.concat(df_stations_2022, ignore_index= True)
df_stations_2022 = df_stations_2022.drop_duplicates()

df_stations_2022.to_csv('data/stations/Stations_2022.csv', index= False)

In [3]:
def preprocesing_task2(data, n_clusters):
    data =  data.dropna()
    coordinates = data[['latitude', 'longitude']]

    aggregations = {
        'rides_count': 'sum',  
        'mean_temperature': 'mean',  
        'total_precipitation': 'mean',  
        'isHoliday': lambda x: round(x.mean()), 
        'isWeekend': lambda x: round(x.mean()), 
    }

    column_order = ['year', 'month', 'day', 'mean_temperature', 'total_precipitation', 'isHoliday', 'isWeekend', 'stations_cluster', 'rides_count']

    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(coordinates)
    data['stations_cluster'] =  kmeans.labels_

    stations_cluster = data.groupby(['year', 'month', 'day', 'am_pm', 'stations_cluster']).agg(aggregations).reset_index().reindex(columns=column_order)

    stations_cluster['isHoliday'] = stations_cluster['isHoliday'].astype(bool)
    stations_cluster['isWeekend'] = stations_cluster['isWeekend'].astype(bool)

    return stations_cluster

In [4]:
def preprocess_data(rides_file_path, stations_file_path):

    df_rides = pd.read_csv(rides_file_path, parse_dates=[0, 1, 2, 3])
    df_stations = pd.read_csv(stations_file_path)

    df_rides.columns = df_rides.columns.str.lower()
    df_stations.columns = df_stations.columns.str.lower()
    
    # Rename Columns in some Stations files due to inconsistency in naming
    if 'start_station_code' not in df_rides.columns:
        df_rides = df_rides.rename(columns={'emplacement_pk_start': 'start_station_code',
                                        'emplacement_pk_end': 'end_station_code'})
    
    if 'code' not in df_stations.columns:
        df_stations = df_stations.rename(columns={'pk': 'code'})

    # Problems in Aug 2019 cause station codes aren't of type int
    if df_rides['start_station_code'].dtype != 'int':

        def to_int_or_Err(val):
            try:
                return int(val)
            except ValueError:
                return None

        df_rides['start_station_code_int'] = df_rides['start_station_code'].apply(to_int_or_Err)
        df_rides['end_station_code_int'] = df_rides['end_station_code'].apply(to_int_or_Err)

        # drop every row where station code which couldn't be converted to int
        df_rides = df_rides.dropna()
        df_rides['start_station_code'] = df_rides['start_station_code_int'].astype(int)
        df_rides['end_station_code'] = df_rides['end_station_code_int'].astype(int)
        df_rides = df_rides.drop(columns=['start_station_code_int', 'end_station_code_int'])

    # add year, month and weekday
    df_rides['year'] = df_rides['start_date'].dt.year
    df_rides['month'] = df_rides['start_date'].dt.month
    df_rides['weekday'] = df_rides['start_date'].dt.weekday

    # Sum up rides between Midnight and 12 and 12:00 AM to Midnight
    df_rides_count = df_rides.groupby([pd.Grouper(key='start_date', freq='12h'), 'start_station_code'])['end_date'].count()
    df_rides_count = df_rides_count.to_frame().rename(columns={'end_date': 'rides_count'}).reset_index()

    # add boolean am/pm (am = 0, pm = 1
    df_rides_count['am_pm'] = df_rides_count['start_date'].dt.hour < 12

    # add total count of stations per year 
    df_stations_count = len(df_stations['code'].unique())
    df_rides_count['stations_count'] = df_stations_count

    # add isHoliday column as boolean
    holidaysCanada = holidays.country_holidays('CA', subdiv='QC')
    df_rides_count['isHoliday'] = df_rides_count['start_date'].apply(lambda x: x.date() in holidaysCanada)

    # add isWeekend column as boolean
    df_rides_count['isWeekend'] = df_rides_count['start_date'].dt.weekday > 4

    # add distance to center of Montreal
    def distanceToCenter(row):
        lon1 = -73.554167
        lat1 = 45.508888
        lon2 = row['longitude']
        lat2 = row['latitude']
        lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
        dlon = lon2 - lon1 
        dlat = lat2 - lat1 
        a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
        c = 2 * arcsin(sqrt(a)) 
        km = 6367 * c
        return km
    
    df_rides_count = df_rides_count.merge(df_stations, left_on='start_station_code', right_on='code', how='left')
    df_rides_count['distance_to_center'] = df_rides_count.apply(lambda row: distanceToCenter(row), axis=1)

    # add weather data
    df_weather = pd.read_csv('data/preprocessed_data/weather.csv', parse_dates=[4])
    df_weather.columns = df_weather.columns.str.lower()
    df_weather = df_weather[["date/time", "mean temp (°c)", "total precip (mm)"]]
    df_weather = df_weather.rename(columns={"date/time": "tmp_date",'mean temp (°c)': 'mean_temperature','total precip (mm)': 'total_precipitation'})
    df_weather['tmp_date'] = pd.to_datetime(df_weather['tmp_date']).dt.date
    
    # interpolate missing data
    df_weather[['mean_temperature','total_precipitation']] = df_weather[['mean_temperature','total_precipitation']].interpolate()

    # add attribute and join dataframes
    df_rides_count['date'] = pd.to_datetime(df_rides_count['start_date']).dt.date
    df_rides_count = df_rides_count.merge(df_weather, left_on='date', right_on='tmp_date', how='left')
    
    # Create more date related columns
    df_rides_count['year'], df_rides_count['month'], df_rides_count['day'], df_rides_count['weekday'] = df_rides_count['start_date'].dt.year, df_rides_count['start_date'].dt.month, df_rides_count['start_date'].dt.day, df_rides_count['start_date'].dt.dayofweek
    
    # drop na colums
    df_rides_count = df_rides_count[[
        'latitude','longitude','distance_to_center','year','month','day','weekday','am_pm','isHoliday','isWeekend','mean_temperature','total_precipitation',
        'stations_count','rides_count'
    ]]

    return df_rides_count



In [5]:
# Training data from year 2014 till 2018

df_train = pd.DataFrame()
for year in range(2014, 2018):
    stations_file_path = f'data/stations/Stations_{year}.csv'
    for month in range(4, 11):
        rides_file_path = f'data/bike_rides/OD_{year}-{month:02d}.csv'
        df_month = preprocess_data(rides_file_path, stations_file_path)
        df_train = pd.concat([df_train, df_month]).reset_index(drop=True)
# save file
df_train.to_csv('data/preprocessed_data/train.csv', index=False)
df_train = preprocesing_task2(df_train, 100)
df_train.to_csv('data/preprocessed_data/t2_train.csv', index=False)

In [6]:
# Validation data from year 2019

df_val = pd.DataFrame()
stations_file_path = f'data/stations/Stations_2019.csv'
for month in range(4, 11):
    rides_file_path = f'data/bike_rides/OD_2019-{month:02d}.csv'
    df_month = preprocess_data(rides_file_path, stations_file_path)
    df_val = pd.concat([df_val, df_month]).reset_index(drop=True)
# save file
df_val.to_csv('data/preprocessed_data/valid.csv', index=False)
df_val = preprocesing_task2(df_val, 100)
df_val.to_csv('data/preprocessed_data/t2_valid.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['stations_cluster'] =  kmeans.labels_


In [7]:
# Covid data from year 2020 till 2021

df_covid = pd.DataFrame()
for year in range(2020, 2021):
    stations_file_path = f'data/stations/Stations_{year}.csv'
    rides_file_path = f'data/bike_rides/OD_{year}.csv'
    df_month = preprocess_data(rides_file_path, stations_file_path)
    df_covid = pd.concat([df_covid, df_month]).reset_index(drop=True)
# save file
df_covid.to_csv('data/preprocessed_data/covid.csv', index=False)
df_covid = preprocesing_task2(df_covid, 100)
df_covid.to_csv('data/preprocessed_data/t2_covid.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'data/bike_rides/OD_2020.csv'

In [8]:
# Test data from year 2022

df_test = pd.DataFrame()
stations_file_path = f'data/stations/Stations_2022.csv'
for month in range(4, 11):
    rides_file_path = f'data/bike_rides/OD_2022-{month:02d}.csv'
    df_month = preprocess_data(rides_file_path, stations_file_path)
    df_test = pd.concat([df_test, df_month]).reset_index(drop=True)
# save file
df_test.to_csv('data/preprocessed_data/test.csv', index=False)
df_test = preprocesing_task2(df_test, 100)
df_test.to_csv('data/preprocessed_data/t2_test.csv', index=False)