In [362]:
import ssl

import numpy as np

# NOTE(review): this swaps the default HTTPS context factory for one that does
# NOT verify server certificates, and it applies to every HTTPS request made
# through the standard library for the rest of the kernel session.
# Presumably a workaround so the CSV download below succeeds on machines with
# a broken certificate store -- confirm, and prefer fixing the local CA bundle.
ssl._create_default_https_context = ssl._create_unverified_context

In [None]:
import pandas as pd

# load the raw dataset; `odf` is kept as the untouched original frame
odf = pd.read_csv("https://npg-ds.s3.amazonaws.com/bike_sharing.csv")
# display the frame itself: copying it first only duplicates it in memory
odf

In [None]:
# work on an independent copy so the originally loaded frame (odf) stays
# available, unmodified, for later usage
df = odf.copy(deep=True)

The dataset has the following features:

1. Datetime: hourly date + timestamp
2. Season:
    - 1 = spring
    - 2 = summer
    - 3 = fall
    - 4 = winter
3. Holiday: whether the day is considered a holiday
4. Workingday: whether the day is neither a weekend nor holiday
5. Weather:
    - 1: Clear, Few clouds, Partly cloudy
    - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
    - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
    - 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
6. Temp: temperature in Celsius
7. Atemp: "feels like" temperature in Celsius
8. Humidity: relative humidity
9. Windspeed: wind speed
10. Casual: number of non-registered user rentals initiated
11. Registered: number of registered user rentals initiated
12. Count: number of total rentals

Let's see some statistics on the data.

In [None]:
# summary statistics per column; describe() does not modify df, so no copy is needed
df.describe()

Let's now start checking some initial statistics about our dataset. For instance, we can count the null values per feature:

In [None]:
# number of missing values in each column; read-only, so no copy is needed
df.isnull().sum()

Now let's convert the dates into timestamps.

In [None]:
# import kept so the name `datetime` stays available to any later cell
from datetime import datetime

# Convert the ISO-formatted date strings to seconds since the Unix epoch.
# The conversion is vectorized and treats the naive values as UTC, so the
# result no longer depends on the time zone / DST rules of the machine running
# the notebook. Reading the strings from `odf` (same rows and index as `df` at
# this point) keeps the cell re-runnable after the column has become numeric.
UNIX_EPOCH = pd.Timestamp("1970-01-01")
df["datetime"] = (pd.to_datetime(odf["datetime"]) - UNIX_EPOCH) / pd.Timedelta(seconds=1)

We'll now have to normalize the values.

In [None]:
# z-score the continuous columns only: the target (count) and the
# categorical/boolean codes keep their original values, as the note above says.
# Index.difference ignores names that are not present, so this is safe even if
# one of the listed columns is missing.
NOT_NORMALIZED = ["count", "season", "holiday", "workingday", "weather"]
cols = df.columns.difference(NOT_NORMALIZED)
df[cols] = (df[cols] - df[cols].mean()) / df[cols].std()
df

Let's check the correlation matrix.

In [None]:
import matplotlib.pyplot as plt

# compute the pairwise correlations once and reuse them for plot and table
corr = df.corr()

# plot heatmap, labelling both axes with the column names so that each cell
# can be attributed to a pair of features
fig, ax = plt.subplots(figsize=(8, 8))
im = ax.matshow(corr)
ax.set_xticks(range(len(corr.columns)))
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticks(range(len(corr.columns)))
ax.set_yticklabels(corr.columns)
ax.set_title("Feature correlation matrix")
fig.colorbar(im)
plt.show()

# show numerical values
corr

From the correlation matrix we can observe:

1. Temp and Atemp are highly correlated. This makes sense, as we can expect people's "feel" for the temperature to be close to the actual number. We can probably be okay using only one of them. Let's keep Temp, as it is the more objective value.
2. Holiday and workingday are correlated. This makes sense given that a working day is, by definition, never a holiday. We are okay keeping only one of these; let's use holiday.
3. Windspeed and Humidity are highly correlated fields; we'll keep windspeed.
4. Registered vs Casual, both represent the same concept, whether the user is registered or not. We are going to keep only registered.

In [None]:
# Drop one column from each redundant pair found in the correlation analysis,
# keeping temp, holiday, windspeed and registered.
# The analysis keeps `holiday`, so `workingday` (its counterpart) is the column
# removed here -- dropping `holiday` would contradict that decision.
df = df.drop(columns=["atemp", "workingday", "humidity", "casual"])
df

Let's now search the data for outliers.

In [None]:
from sklearn.ensemble import IsolationForest

# estimate outliers: fit_predict labels every row 1 (inlier) or -1 (outlier).
# A fixed random_state (same seed as the train/test split) makes the set of
# flagged rows reproducible across runs.
outlier_labels = IsolationForest(random_state=25).fit_predict(df)
is_outlier = outlier_labels == -1
print("Outliers detected:", is_outlier.sum())

# remove outliers (-1), keep inliers (1)
# drop=True discards the old row labels; without it reset_index() would add
# them as an "index" column, which would then be used as a model feature
df = df[~is_outlier].reset_index(drop=True)
df

Now we can start training our model. First, let's split our data into training and test sets.

In [None]:
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for testing; the fixed seed keeps the split stable
training_data, testing_data = train_test_split(df, test_size=0.2, random_state=25)

# NOTE(review): `registered` is still among the features. If count is the sum
# of casual and registered rentals (as the feature list suggests), it reveals
# most of the target -- confirm this is intended.

# training split: every column except the target is a feature
training_features = training_data.drop(columns="count")
training_labels = training_data["count"]

# testing split: same separation; the labels stay a one-column DataFrame
testing_features = testing_data.drop(columns="count")
testing_labels = testing_data[["count"]]

Now we can train our model. In this case we will use linear regression.

In [None]:
from sklearn.linear_model import LinearRegression

# fit an ordinary least squares model on the training split
linear_regression = LinearRegression()
linear_regression.fit(training_features, training_labels)

# score() reports the coefficient of determination (R^2) on the given data
training_score = linear_regression.score(training_features, training_labels)
test_score = linear_regression.score(testing_features, testing_labels)

print(f"Training Score: {training_score}")
print(f"Test Score: {test_score}")