# Bike sharing in Washington
<em>Individual Project Python</em>
___

                              Master in Business Analytics & Big data
                                 Professor Juan Luis Cano Rodriguez
                                           Martin Hofbauer
                                              _________

## Libraries

In [68]:
from distributed import Client, progress
import dask
import dask.dataframe as dd
import dask.array as da
import numpy as np
import dask_ml
from dask_ml.preprocessing import Categorizer, DummyEncoder
import pandas as pd
from sklearn.pipeline import Pipeline, make_pipeline
from dask_ml.linear_model import LinearRegression
from dask_ml.metrics import mean_squared_error
from dask_ml.metrics import r2_score

In [58]:
# Create a distributed Client with default arguments and keep it in `client`;
# the bare name on the last line makes the notebook display its summary.
client = Client()
client

Failed to start diagnostics server on port 8787. [WinError 10048] Normalerweise darf jede Socketadresse (Protokoll, Netzwerkadresse oder Anschluss) nur jeweils einmal verwendet werden


0,1
Client  Scheduler: tcp://127.0.0.1:56691  Dashboard: http://127.0.0.1:56694/status,Cluster  Workers: 4  Cores: 8  Memory: 17.10 GB


## Data Loading

In [59]:
# Read the hourly CSV from the remote gist through dask.dataframe.
# `dteday` is parsed as a datetime column while loading.
hour_data = dd.read_csv(
    "https://gist.githubusercontent.com/geraldwal/b5a83f4c670abe0a662abce558e5d433/raw/bce4bbfc63355606e4503964e25798b5d2190b9b/hour%2520-%2520Python%2520Bike%2520Sharing",
    sep=",",
    parse_dates=["dteday"],
)

## Preparation for Baseline

### Renaming the Columns

In [60]:
# New column labels, paired by position with the frame's current columns.
new_columns = [
    "instant", "dteday", "season", "year", "month", "hour",
    "holiday", "weekday", "workingday", "weather",
    "temp", "atemp", "humidity", "windspeed",
    "casual", "registered", "count",
]
hour_data = hour_data.rename(
    columns={old: new for old, new in zip(hour_data.columns, new_columns)}
)

In [32]:
# Remove the `dteday` column (axis=1 selects columns) and rebind `hour_data`.
# NOTE(review): because the name is rebound, re-running this cell alone will
# presumably fail once the column is gone -- run the notebook top to bottom.
hour_data = hour_data.drop(["dteday"], axis=1)

### Applying the Correct Datatypes and DummyEncoding

In [33]:
# Columns handed to the Categorizer / DummyEncoder steps.
categs = [
    "season",
    "year",
    "month",
    "hour",
    "holiday",
    "weekday",
    "workingday",
    "weather",
]

In [34]:
# Two-step preprocessing: first categorize the columns in `categs`, then
# dummy-encode them. The step names are the lower-cased class names, which is
# what make_pipeline would generate for the same estimators.
pipeline = Pipeline(
    steps=[
        ("categorizer", Categorizer(columns=categs)),
        ("dummyencoder", DummyEncoder(columns=categs)),
    ]
)
hour_data_cl_hot_encoded = pipeline.fit_transform(hour_data)

In [71]:
# Show the dtype of every column of the encoded frame.
hour_data_cl_hot_encoded.dtypes

instant           int64
temp            float64
atemp           float64
humidity        float64
windspeed       float64
casual            int64
registered        int64
count             int64
season_1          uint8
season_2          uint8
season_3          uint8
season_4          uint8
year_0            uint8
year_1            uint8
month_1           uint8
month_2           uint8
month_3           uint8
month_4           uint8
month_5           uint8
month_6           uint8
month_7           uint8
month_8           uint8
month_9           uint8
month_10          uint8
month_11          uint8
month_12          uint8
hour_0            uint8
hour_1            uint8
hour_2            uint8
hour_3            uint8
                 ...   
hour_9            uint8
hour_10           uint8
hour_11           uint8
hour_12           uint8
hour_13           uint8
hour_14           uint8
hour_15           uint8
hour_16           uint8
hour_17           uint8
hour_18           uint8
hour_19         

### Splitting the Dataset

The initial split function needed to be adapted slightly: `X` and `y` are now selected with `.loc`. Because `.loc` slices include their end label, the training slice stops at `train_size - 1` so that no row ends up in both the train and the test set.

In [35]:
def split_data(dataset, Target, train_fraction=0.875):
    """Split ``dataset`` into ordered train/test features and target.

    The leading ``train_fraction`` of the rows (selected by index label)
    become the training set and the remaining rows the test set. Rows are
    not shuffled.

    Parameters
    ----------
    dataset : DataFrame-like
        Frame supporting ``.loc``, ``.columns`` and ``len``. Rows are
        selected by label, so this assumes the index holds the consecutive
        integers ``0 .. len(dataset) - 1`` -- TODO confirm this holds when
        the frame has more than one partition.
    Target : str
        Name of the column to predict; it is excluded from the features.
    train_fraction : float, optional
        Share of the rows assigned to the training set. Defaults to 0.875.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``train_fraction`` is not strictly between 0 and 1.
    """
    if not 0 < train_fraction < 1:
        raise ValueError("train_fraction must be strictly between 0 and 1")

    # len() can be expensive on a lazily evaluated frame, so call it once
    # and reuse the result.
    n_rows = len(dataset)
    train_size = int(n_rows * train_fraction)

    X = dataset.loc[:, dataset.columns != Target]
    y = dataset.loc[:, Target]

    # .loc slices are label-based and include the stop label, so the training
    # slice ends at train_size - 1 to keep train and test disjoint.
    last_train_label = train_size - 1
    X_train = X.loc[0:last_train_label]
    X_test = X.loc[train_size:n_rows]
    y_train = y.loc[0:last_train_label]
    y_test = y.loc[train_size:n_rows]
    return X_train, X_test, y_train, y_test

In [36]:
# One ordered train/test split of the encoded frame per target column.
x_train_reg, x_test_reg, y_train_reg, y_test_reg = split_data(
    dataset=hour_data_cl_hot_encoded, Target="registered"
)
x_train_casual, x_test_casual, y_train_casual, y_test_casual = split_data(
    dataset=hour_data_cl_hot_encoded, Target="casual"
)
x_train_count, x_test_count, y_train_count, y_test_count = split_data(
    dataset=hour_data_cl_hot_encoded, Target="count"
)

In [37]:
# Features for the "registered" target: remove the `casual` and `count` columns.
x_train_reg_drop = x_train_reg.drop(["casual", "count"], axis=1)
x_test_reg_drop = x_test_reg.drop(["casual", "count"], axis=1)

In [38]:
# Take `.values` of every part of the "registered" split.
x_train_reg_arr = x_train_reg_drop.values
x_test_reg_arr = x_test_reg_drop.values
y_train_reg_arr = y_train_reg.values
y_test_reg_arr = y_test_reg.values

In [39]:
# Features for the "casual" target: remove the `registered` and `count` columns.
x_train_casual_drop = x_train_casual.drop(["registered", "count"], axis=1)
x_test_casual_drop = x_test_casual.drop(["registered", "count"], axis=1)

In [40]:
# Take `.values` of every part of the "casual" split.
x_train_casual_arr = x_train_casual_drop.values
x_test_casual_arr = x_test_casual_drop.values
y_train_casual_arr = y_train_casual.values
y_test_casual_arr = y_test_casual.values

In [41]:
# Features for the "count" target: remove the `casual` and `registered` columns.
x_train_count_drop = x_train_count.drop(["casual", "registered"], axis=1)
x_test_count_drop = x_test_count.drop(["casual", "registered"], axis=1)

In [42]:
# Take `.values` of every part of the "count" split.
x_train_count_arr = x_train_count_drop.values
x_test_count_arr = x_test_count_drop.values
y_train_count_arr = y_train_count.values
y_test_count_arr = y_test_count.values

In [72]:
def score_lin(X_train, X_test, y_train, y_test):
    """Fit a linear regression and report how it does on the test data.

    A ``LinearRegression`` is fitted on ``X_train`` / ``y_train``. The fitted
    intercept, the coefficients and the mean squared error of the predictions
    for ``X_test`` against ``y_test`` are printed.

    Returns
    -------
    The predictions for ``X_test``.
    """
    model = LinearRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print("Intercept:", model.intercept_)
    print("Coefficients:", model.coef_)

    # The error is evaluated only after the parameters have been printed.
    mse = mean_squared_error(y_test, predictions)
    print("Mean squared error (MSE): {:.2f}".format(mse))
    # R2 reporting is currently switched off:
    # print("Variance score (R2): {:.2f}".format(r2_score(y_test, predictions)))
    return predictions

In [73]:
# Baseline predictions and score for the `count` target.
# NOTE(review): despite "registered" in its name, this variable receives the
# predictions for `count`; the name is kept so existing references still work.
baseline_registered_count = score_lin(
    X_train=x_train_count_arr,
    X_test=x_test_count_arr,
    y_train=y_train_count_arr,
    y_test=y_test_count_arr,
)

  contains = index in indices
  sub[blockwise_token(i)] = blockwise_token(indices.index(index))


Intercept: -46.42112740023391
Coefficients: [ 6.43302601e+01  4.50575452e-03  1.17771970e+02  9.29013330e+01
 -8.26383575e+01 -3.01002803e+01 -1.75271798e+01  1.14335274e+01
 -2.06408918e+00  2.39771706e+01 -2.25634845e+01  2.64342305e+01
 -8.27196825e+00 -5.96714879e+00  7.66168075e+00  4.24776439e+00
  1.86606490e+01  6.22753287e+00 -9.64980103e+00  6.60016121e+00
  2.77375273e+01  5.40501318e+00 -1.21735800e+01 -1.46738021e+01
 -1.16708998e+02 -1.34093669e+02 -1.42242065e+02 -1.53280421e+02
 -1.56498566e+02 -1.40354809e+02 -8.32647284e+01  4.63709524e+01
  1.78730044e+02  3.88505442e+01 -1.20025965e+01  1.21106074e+01
  4.90852042e+01  4.50753598e+01  2.99070680e+01  3.75864220e+01
  9.76274690e+01  2.50836932e+02  2.23488940e+02  1.19593544e+02
  4.06950686e+01 -7.80853412e+00 -4.48572337e+01 -8.44972491e+01
  1.50743417e+01 -3.16770353e+00  8.74035944e+00 -3.51544206e+00
 -1.24139372e+00  1.63980441e+00  3.18344623e+00  3.50487695e+00
  6.81555993e+00  8.54607685e-01  3.27248246e+

  contains = index in indices


Mean squared error (MSE): 14814.57


In [74]:
# Baseline predictions and score for the `registered` target.
baseline_registered_pred = score_lin(
    X_train=x_train_reg_arr,
    X_test=x_test_reg_arr,
    y_train=y_train_reg_arr,
    y_test=y_test_reg_arr,
)

  contains = index in indices
  sub[blockwise_token(i)] = blockwise_token(indices.index(index))


Intercept: -44.743612844467634
Coefficients: [ 3.65324335e+01  5.83099214e-03  4.09506581e+01  8.46803644e+01
 -5.52042952e+01 -1.18298549e+01 -1.41629636e+01  5.56213897e+00
 -2.34379178e-01  2.53669788e+01 -1.04739472e+01  1.43495793e+01
  1.31171448e+00  3.77377721e+00  6.30218198e+00  3.84271827e+00
  1.64515077e+01  1.08503312e+01 -4.89201674e+00  6.00568681e+00
  1.80724429e+01 -7.86658830e+00 -2.02444591e+01 -1.91199493e+01
 -9.49405748e+01 -1.09363673e+02 -1.16434725e+02 -1.25250483e+02
 -1.27962050e+02 -1.13712780e+02 -5.99804051e+01  6.32074192e+01
  1.87091775e+02  4.10688965e+01 -2.22798122e+01 -7.69162155e+00
  2.33513589e+01  1.71206440e+01 -5.84744489e-03  8.53106244e+00
  6.89993983e+01  2.19016569e+02  2.01780533e+02  1.07723541e+02
  4.02080955e+01 -1.28020888e+00 -3.35645926e+01 -6.72584818e+01
  1.52036979e+01 -3.28202987e+00 -2.74811988e+00 -1.09721290e+01
  1.57074390e+00  7.56988058e+00  9.02710714e+00  9.50212751e+00
  5.45329847e+00 -1.00020730e+01  1.41343728e

  contains = index in indices


Mean squared error (MSE): 11519.35


In [75]:
# Baseline predictions and score for the `casual` target.
baseline_casual_pred = score_lin(
    X_train=x_train_casual_arr,
    X_test=x_test_casual_arr,
    y_train=y_train_casual_arr,
    y_test=y_test_casual_arr,
)

  contains = index in indices
  sub[blockwise_token(i)] = blockwise_token(indices.index(index))


Intercept: 0.7003294966976737
Coefficients: [ 2.68204328e+00 -1.38034821e-03  7.67873912e+01  8.27411937e+00
 -2.74474861e+01 -1.82784540e+01 -1.11986120e+00  8.09247877e+00
  3.89586310e-01  8.37598868e-01 -1.03917621e+01  1.42641331e+01
 -6.33708275e+00 -6.45776987e+00  4.68995497e+00  3.79204831e+00
  5.63442167e+00 -1.15984132e+00 -1.25546851e+00  4.13981814e+00
  1.32505512e+01  1.68926303e+01  1.17338343e+01  8.14277593e+00
 -1.69630982e+01 -1.99244694e+01 -2.10014789e+01 -2.32236670e+01
 -2.37300514e+01 -2.18352983e+01 -1.84773052e+01 -1.20297833e+01
 -3.55533918e+00  2.58740343e+00  1.50821633e+01  2.46061698e+01
  3.05370939e+01  3.27574675e+01  3.47156243e+01  3.38580626e+01
  3.34311575e+01  3.66237715e+01  2.65122157e+01  1.66745224e+01
  5.29172039e+00 -1.72313300e+00 -6.48690446e+00 -1.24328853e+01
  5.83861014e+00  6.07302668e+00  1.42262111e+01  1.01938807e+01
 -6.42755090e-02 -3.18396696e+00 -3.09746635e+00 -3.25115511e+00
  4.10856420e+00  1.29278450e+01 -8.79902227e+

  contains = index in indices


Mean squared error (MSE): 1075.61
