Read bike data

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Load the bike data set from the CSV file (path is relative to the notebook's
# working directory) and preview its first rows as the cell output.
bike_csv = 'data/bike.csv'
df = pd.read_csv(bike_csv)
df.head()

Create histograms for all numeric variables

In [None]:
# Draw one 10-bin histogram per column that DataFrame.hist can plot,
# laid out on a single large figure.
df.hist(figsize=(20, 15), bins=10)
# Tighten the layout of the figure that hist() just created so the
# subplot titles and tick labels do not overlap.
plt.gcf().tight_layout()
plt.show()

Create a heat map to visualize the interdependencies between numeric variables

In [None]:
# Pairwise correlation matrix; numeric_only=True restricts it to numeric columns.
corr = df.corr(numeric_only=True)

plt.figure(figsize=(10, 5))
# 'BrBG' is a diverging colormap: pin the colour scale to the full correlation
# range [-1, 1] so that its neutral midpoint corresponds to zero correlation.
sns.heatmap(corr, cmap='BrBG', annot=True, vmin=-1, vmax=1)
plt.title('Correlation between numeric variables')
# End on plt.show(), like the other plotting cells, so the Axes repr is not
# echoed as the cell output.
plt.show()

Box plot of the distribution of the hour values (`hr`) for each weekday

In [None]:
# DataFrame.boxplot(by=...) creates its own figure when no `ax` is passed, so a
# preceding plt.figure() call would only leave an empty figure behind and its
# size would not apply to the plot. Pass the size to pandas directly instead.
boxplot = df.boxplot(column=['hr'], by='weekday', figsize=(12, 6))

# Label the axes pandas just drew on (they are the current axes).
plt.xlabel('Weekday')
plt.ylabel('hours')
plt.show()

Find out if data is missing (answer: no)

In [None]:
# Count the missing values in every column; all zeros means nothing is missing.
missing_counts = df.isna().sum()
print(missing_counts)

Visualize outliers in the column "cnt"

In [None]:
# Horizontal box plot of 'cnt'; points drawn beyond the whiskers are the
# outlier candidates.
sns.boxplot(x=df['cnt'])
plt.title("Distribution of 'cnt'")
# End on plt.show(), like the other plotting cells, so the Axes repr is not
# echoed as the cell output.
plt.show()

Remove the column 'dteday', apply a power transform to make the remaining data more normally distributed, and save the transformed data in a new data frame

In [None]:
from sklearn.preprocessing import PowerTransformer

# Remove 'dteday' before transforming. Rebinding `df` (instead of dropping in
# place) together with errors='ignore' keeps this cell idempotent: running it a
# second time no longer raises a KeyError for the already-removed column.
df = df.drop(columns=['dteday'], errors='ignore')

# PowerTransformer with its default settings.
# NOTE(review): the transformer is fitted on every row and every column,
# including the target 'cnt', i.e. before any train/test split - confirm that
# this is intended.
pt = PowerTransformer()
transformed_data = pt.fit_transform(df)

# fit_transform returns a plain array; restore the column labels and keep the
# original row index so the rows of df_new stay aligned with df.
df_new = pd.DataFrame(transformed_data, columns=df.columns, index=df.index)
df_new.head()

Create training and test data with 'cnt' as the variable to predict 

In [None]:
from sklearn.model_selection import train_test_split

# Target: the 'cnt' column of the transformed frame.
y = df_new['cnt']
# Predictors: every remaining column.
X = df_new.drop(columns='cnt')

# Hold out 33% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

Fit a linear regression model and evaluate it on the test data with the R² metric

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Fit an ordinary linear regression on the training split; fit() returns the
# estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and score the predictions against the true
# targets. Both come from df_new, so the score refers to the transformed 'cnt'.
y_pred = model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print('R2-Score: ', r2)