### Downloading the data

In [None]:
from main import main
key = '<replace with your key>' # this is the key for the weather data API

In [None]:
# Uncomment the call below at your own risk:
# it takes ~15-20 minutes to download the data and do some cleaning.
#main(key)

In [None]:
from configs import merged_cycle_data_file
import pandas as pd
from sklearn.model_selection import train_test_split

### Load the weather & holiday data

In [None]:
from configs import weather_data_csv

# Both tables are indexed by their parsed datetime column.
holidays = 'holidays.csv'
holiday_read_options = {'index_col': 'date', 'parse_dates': ['date']}
weather_read_options = {'index_col': 'timestamp', 'parse_dates': ['timestamp']}

hol_df = pd.read_csv(holidays, **holiday_read_options)
weather_df = pd.read_csv(weather_data_csv, **weather_read_options)

# Preview the weather table.
weather_df.head()

In [None]:
# Collect the calendar date (time of day stripped) of every holiday into a set.
hol_set = {stamp.date() for stamp in hol_df.index}

### Load the merged csv file by reading it in chunks

# Warning:
The next cell takes a very long time to run (3 h 40 min on one machine), so skip it. Its result is saved to `shares-ungrouped.csv`, which the cell after it loads directly.

In [None]:
# %%time
# from datetime import datetime
# import time
# chunk_size = 1000000
# date_mapper = lambda x: pd.to_datetime(datetime(year=x.year, month=x.month, day=x.day, hour=x.hour))

# bike_share_df = pd.DataFrame()
# print('Started loading merged_cycle_data_file.')
# iter_ = pd.read_csv(merged_cycle_data_file, chunksize=chunk_size, iterator=True,
#         index_col='Rental Id',
#         parse_dates=['End Date', 'Start Date'])
# print('Finished loading merged_cycle_data_file.')

# r_start = time.time()
# for i, df in enumerate(iter_):
#     r_end = time.time()
#     print(f'{i+1}. Read rows {chunk_size*i}:{chunk_size*(i+1)} in {r_end-r_start:.3f}. ', end='')

#     start = time.time()
#     df = df.dropna()
#     # leave only entries that have valid duration
#     df = df[df['Duration'] > 0]
    
#     diff = df['End Date'] - df['Start Date'] # compute the difference between the objects
#     seconds = diff.map(lambda x: x.total_seconds()) # map to seconds
#     df = df[(df['Duration'] == seconds) & (seconds >= 0)] # check if duration matches the result and if the result is positive
    
    
#     # keep only year, month, day, hour information from the start date
#     df['Start Date'] = df['Start Date'].map(date_mapper)
    
#     share_df = df.groupby('Start Date').agg({'Start Date': 'count'}).rename(columns={'Start Date': 'share_count'})
#     share_df = share_df.join(weather_df)
#     share_df = share_df.reset_index()
#     share_df = share_df.dropna()
    
#     share_df['month'] = share_df['Start Date'].apply(lambda t: t.month)
#     share_df['weekday'] = share_df['Start Date'].apply(lambda t: t.weekday())
#     share_df['hour'] = share_df['Start Date'].apply(lambda t: t.hour)
#     share_df['is_holiday'] = share_df['Start Date'].map(lambda x: x.date() in hol_set).map(lambda x: '1' if x else '0')
#     # check if start date hits on a weekend
#     # monday is 0, sunday is 6
#     share_df['is_weekend'] = share_df['Start Date'].map(lambda x: x.weekday() > 4).map(lambda x: '1' if x else '0')
#     share_df['weatherCode'] = share_df['weatherCode'].map(lambda x: str(int(x)))
    
#     bike_share_df = bike_share_df.append(share_df)
#     end = time.time()
#     print(f'Completed cleaning & merging in {end-start:3.3f} seconds.')
#     r_start = time.time()

# print('Finished reading!')
# bike_share_df = bike_share_df.reset_index().drop(columns=['index']) # fix the index
# # save the data to a file, so that we can load it faster next time
# bike_share_df.to_csv('shares-ungrouped.csv')

In [None]:
# Read the categorical columns as strings rather than letting pandas infer a dtype.
types = dict.fromkeys(['weatherCode', 'is_holiday', 'is_weekend'], str)
saved_shares = pd.read_csv('shares-ungrouped.csv', parse_dates=['Start Date'], dtype=types)
# 'Unnamed: 0' is presumably the index that was saved alongside the data.
bike_share_df = saved_shares.drop(columns=['Unnamed: 0'])

In [None]:
# Number of rows and columns in the loaded data.
bike_share_df.shape

In [None]:
# Print a per-column summary of the loaded data.
bike_share_df.info()

In [None]:
# Preview the first rows.
bike_share_df.head()

In [None]:
# Because the data was read in chunks, some hours appear on more than one row;
# look at one example hour.
duplicated_hour = bike_share_df['Start Date'] == '2018-12-07 10:00:00'
bike_share_df[duplicated_hour]

In [None]:
# Collapse repeated hours into one row each: share counts are summed, every
# other column takes the value from the first row of its hour.
hourly_groups = bike_share_df.groupby('Start Date')
share_total = hourly_groups.agg({'share_count': 'sum'})
share_row = hourly_groups.agg(lambda column: column.iloc[0])
share_row['share_count'] = share_total['share_count']

# Turn 'Start Date' back into a regular column and save the grouped table.
bike_share_df = share_row.reset_index()
bike_share_df.to_csv('shares-grouped.csv', index=False)

In [None]:
# Print the per-column summary again, now for the grouped data.
bike_share_df.info()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Histogram of each numeric column (50 bins) on one large figure.
bike_share_df.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Histogram of the month column with 12 bins.
bike_share_df['month'].hist(bins=12)

In [None]:
# Number of rows per weather code.
bike_share_df['weatherCode'].value_counts()

In [None]:
# Histogram of the hour column with 24 bins.
bike_share_df['hour'].hist(bins=24)

In [None]:
# Histogram of the temperature column with 24 bins.
bike_share_df['temperature'].hist(bins=24)

In [None]:
# Shape of the subset of rows with a share count of at most 100.
bike_share_df[bike_share_df['share_count'] <= 100].shape

In [None]:
# Fine-grained (2000-bin) histogram of the share counts.
bike_share_df['share_count'].hist(bins=2000, figsize=(24,24))

In [None]:
# Zoom in on the low end: histogram (100 bins) of share counts of at most 300.
bike_share_df[bike_share_df['share_count'] <= 300]['share_count'].hist(bins=100, figsize=(24,24))

In [None]:
# LabelBinarizer can apply the transformation from text categories
# to integer categories, then from integer categories to one-hot vectors
# basically, it combines a label encoder with one-hot encoder
# from sklearn.preprocessing import LabelBinarizer
# encoder = LabelBinarizer()
# holiday_cat = bike_share_df['weatherCode']
# holiday_cat_1hot = encoder.fit_transform(holiday_cat.to_numpy())
# holiday_cat_1hot

In [None]:
# One-hot encode the weather code into 'weather_'-prefixed indicator columns
# and swap them in for the original column.
encoded_weather_codes = pd.get_dummies(bike_share_df['weatherCode'], prefix='weather')
bike_share_df_enc = (
    bike_share_df
    .drop(columns=['weatherCode'])
    .join(encoded_weather_codes)
)
bike_share_df_enc.head(5)

In [None]:
# One-hot encode the holiday and weekend flags, then replace the original
# string columns with the indicator columns.
holiday_enc = pd.get_dummies(bike_share_df['is_holiday'], prefix='holiday')
week_enc = pd.get_dummies(bike_share_df['is_weekend'], prefix='weekend')

with_flags = bike_share_df_enc.join(holiday_enc)
with_flags = with_flags.join(week_enc)
bike_share_df_enc = with_flags.drop(columns=['is_holiday', 'is_weekend'])

In [None]:
# "weekday" could be dropped due to the fact that we have "is_weekend"
# "month" could be dropped due to the fact that the model might not be able to extract any useful information
# out of it
bike_share_df_enc = bike_share_df_enc.drop(columns=['weekday'])

In [None]:
import numpy as np

# Map hour (period 24) and month (period 12, shifted by one) to angles and
# store their sine/cosine instead of the raw values.
hour_angle = bike_share_df_enc.hour * (2. * np.pi / 24)
month_angle = (bike_share_df_enc.month - 1) * (2. * np.pi / 12)

bike_share_df_enc['hr_sin'] = np.sin(hour_angle)
bike_share_df_enc['hr_cos'] = np.cos(hour_angle)
bike_share_df_enc['mnth_sin'] = np.sin(month_angle)
bike_share_df_enc['mnth_cos'] = np.cos(month_angle)

# The raw columns are replaced by their encodings.
bike_share_df_enc = bike_share_df_enc.drop(columns=['month', 'hour'])

In [None]:
# Preview the encoded frame.
bike_share_df_enc.head(5)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, SVR
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import Lasso, ElasticNet, Ridge

In [None]:
# Hold out 20% of the rows for testing; the timestamp column is removed first.
model_frame = bike_share_df_enc.drop(columns=['Start Date'])
train_set, test_set = train_test_split(model_frame, test_size=0.2, random_state=42069)

# 'share_count' is the target; every remaining column is a feature.
X_train = train_set.drop(columns=['share_count']).to_numpy()
X_test = test_set.drop(columns=['share_count']).to_numpy()
Y_train = train_set['share_count'].to_numpy()
Y_test = test_set['share_count'].to_numpy()

In [None]:
random_state = 42069

# Models to compare, as (label, estimator) pairs.
# Currently disabled candidates:
#     ('Decision Tree', DecisionTreeClassifier()),
#     ('Random Forest', RandomForestClassifier(n_estimators=100)),
#     ('NN (50, 50, 16)', MLPClassifier(hidden_layer_sizes=(16, 16, 16), verbose=True, learning_rate='adaptive', activation='tanh', learning_rate_init=0.01))
seeded_linear = [('Lasso', Lasso), ('ElasticNet', ElasticNet), ('Ridge', Ridge)]
svr_kernels = [('SVR liniar', 'linear'), ('SVR rbf', 'rbf')]

classifiers = [(label, model(random_state=random_state)) for label, model in seeded_linear]
classifiers += [(label, SVR(kernel=kernel, verbose=True)) for label, kernel in svr_kernels]

In [None]:
import time
def learn(classifiers, X_train, Y_train, X_test, Y_test, train_score_size=10000):
    """Fit each model and print its fit/score timings and scores.

    Args:
        classifiers: iterable of ``(name, estimator)`` pairs; each estimator
            must provide ``fit(X, y)`` and ``score(X, y)``.
        X_train: training features.
        Y_train: training targets.
        X_test: held-out features.
        Y_test: held-out targets.
        train_score_size: number of leading training rows used to compute the
            training score (presumably limited to keep scoring fast).
            ``None`` scores on the whole training set.

    Returns:
        None. All results are printed.
    """
    for name, clf in classifiers:
        print(f'** {name}')
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump if the system clock changes.
        t0 = time.perf_counter()
        clf.fit(X_train, Y_train)
        t1 = time.perf_counter()
        score_train = clf.score(X_train[:train_score_size], Y_train[:train_score_size])
        t2 = time.perf_counter()
        score_test = clf.score(X_test, Y_test)
        t3 = time.perf_counter()
        print(f'\tTraining time {t1 - t0:3.3f}')
        print(f'\tPrediction time (train) {t2 - t1:3.3f}')
        print(f'\tPrediction time (test) {t3 - t2:3.3f}')
        print(f'\tScore train: {score_train:.3f}\tScore Test: {score_test:.3f}')

In [None]:
# Fit and score every model on the unscaled feature matrix.
learn(classifiers, X_train, Y_train, X_test, Y_test)

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply that same
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(split) for split in (X_train, X_test))
X_train_scaled

In [None]:
# Build fresh estimator instances before training on the scaled features.
# Currently disabled candidates:
#     ('Decision Tree', DecisionTreeClassifier()),
#     ('Random Forest', RandomForestClassifier(n_estimators=100)),
#     ('NN (50, 50, 16)', MLPClassifier(hidden_layer_sizes=(16, 16, 16), verbose=True, learning_rate='adaptive', activation='tanh', learning_rate_init=0.01))
seeded_linear = [('Lasso', Lasso), ('ElasticNet', ElasticNet), ('Ridge', Ridge)]
svr_kernels = [('SVR liniar', 'linear'), ('SVR rbf', 'rbf')]

classifiers = [(label, model(random_state=random_state)) for label, model in seeded_linear]
classifiers += [(label, SVR(kernel=kernel, verbose=True)) for label, kernel in svr_kernels]

learn(classifiers, X_train_scaled, Y_train, X_test_scaled, Y_test)

In [None]:
# Single-feature variant: temperature reshaped into one column; the targets
# are taken as pandas Series.
X_train_temp = train_set['temperature'].to_numpy().reshape(-1, 1)
Y_train_temp = train_set['share_count']

X_test_temp = test_set['temperature'].to_numpy().reshape(-1, 1)
Y_test_temp = test_set['share_count']

In [None]:
# Imported here because the tree-based regressors are not among the imports
# made earlier in the notebook.
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

# The target (share_count) is a count, so the tree models must be regressors:
# the classifier variants treat every distinct count as its own class and
# report exact-match accuracy, which cannot be compared with the R^2 score
# reported by the other models in this list. They are seeded like the rest
# so that runs are reproducible.
classifiers = [
    ('Decision Tree', DecisionTreeRegressor(random_state=random_state)),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=random_state)),
#     ('NN (50, 50, 16)', MLPClassifier(hidden_layer_sizes=(16, 16, 16), verbose=True, learning_rate='adaptive', activation='tanh', learning_rate_init=0.01))
    ('Lasso', Lasso(random_state=random_state)),
    ('ElasticNet', ElasticNet(random_state=random_state)),
    ('Ridge', Ridge(random_state=random_state)),
    ('SVR liniar', SVR(kernel='linear', verbose=True)),
    ('SVR rbf', SVR(kernel='rbf', verbose=True)),
]

In [None]:
# Fit and score the models using temperature as the only feature.
learn(classifiers, X_train_temp, Y_train_temp, X_test_temp, Y_test_temp)

In [None]:
# Reduced feature set: remove the timestamp, the cyclical hour/month
# encodings and the holiday/weekend indicator columns.
dropped_columns = [
    'Start Date',
    'hr_sin', 'hr_cos', 'mnth_sin', 'mnth_cos',
    'holiday_0', 'holiday_1',
    'weekend_1', 'weekend_0',
]
noBSdata = bike_share_df_enc.drop(columns=dropped_columns)

# Same split settings as before: 20% held out, fixed seed.
train_set, test_set = train_test_split(noBSdata, test_size=0.2, random_state=42069)
X_train = train_set.drop(columns=['share_count']).to_numpy()
X_test = test_set.drop(columns=['share_count']).to_numpy()
Y_train = train_set['share_count'].to_numpy()
Y_test = test_set['share_count'].to_numpy()

# Fit the scaler on the training split only and apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fresh estimator instances for this experiment.
# Currently disabled candidates:
#     ('Decision Tree', DecisionTreeClassifier()),
#     ('Random Forest', RandomForestClassifier(n_estimators=100)),
#     ('NN (50, 50, 16)', MLPClassifier(hidden_layer_sizes=(16, 16, 16), verbose=True, learning_rate='adaptive', activation='tanh', learning_rate_init=0.01))
seeded_linear = [('Lasso', Lasso), ('ElasticNet', ElasticNet), ('Ridge', Ridge)]
svr_kernels = [('SVR liniar', 'linear'), ('SVR rbf', 'rbf')]

classifiers = [(label, model(random_state=random_state)) for label, model in seeded_linear]
classifiers += [(label, SVR(kernel=kernel, verbose=True)) for label, kernel in svr_kernels]

In [None]:
# Fit and score the models on the scaled feature matrices.
learn(classifiers, X_train_scaled, Y_train, X_test_scaled, Y_test)