In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

from patsy import dmatrices
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate

import pickle


In [2]:
# read in data
df = pd.read_csv('weatheravailability.csv')

df.head(5)

Unnamed: 0.1,Unnamed: 0,number,last_update,available_bike_stands,available_bikes,total_bike_stands,description
0,0,2,1551726000,15,5,20,Rain
1,1,3,1551726000,19,1,20,Rain
2,2,4,1551726000,20,0,20,Rain
3,3,5,1551726000,11,29,40,Rain
4,4,6,1551726000,16,4,20,Rain


In [3]:
# want to remove the 'Unnamed: 0' column.
# to make it easier, strip the space and conver to lower case...
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')

df.head(5)

Unnamed: 0,unnamed:_0,number,last_update,available_bike_stands,available_bikes,total_bike_stands,description
0,0,2,1551726000,15,5,20,Rain
1,1,3,1551726000,19,1,20,Rain
2,2,4,1551726000,20,0,20,Rain
3,3,5,1551726000,11,29,40,Rain
4,4,6,1551726000,16,4,20,Rain


In [4]:
# it's now easier to reference, and dropped
df = df.drop('unnamed:_0', axis=1)

In [5]:
# rename the weather description column for clarity
df.rename(columns={"description": "weather_desc"}, inplace=True)

In [6]:
df.head(5)

Unnamed: 0,number,last_update,available_bike_stands,available_bikes,total_bike_stands,weather_desc
0,2,1551726000,15,5,20,Rain
1,3,1551726000,19,1,20,Rain
2,4,1551726000,20,0,20,Rain
3,5,1551726000,11,29,40,Rain
4,6,1551726000,16,4,20,Rain


In [7]:
df.dtypes

number                    int64
last_update               int64
available_bike_stands     int64
available_bikes           int64
total_bike_stands         int64
weather_desc             object
dtype: object

In [8]:
continuous_features = ['available_bikes', 'available_bike_stands', 'last_update', 'total_bike_stands']

In [9]:
# the target feature is the feature that we want to investigate.
target_feature = ['available_bikes']

In [10]:
df['weather_desc'].value_counts()

Clouds     58312
Rain       23672
Drizzle     3380
Clear       3277
Mist        1921
Name: weather_desc, dtype: int64

## UNDERSTANDING THE DATA

In [11]:
# there seems to be no correlation...

df[['weather_desc', 'available_bikes', 'available_bike_stands']].corr()

Unnamed: 0,available_bikes,available_bike_stands
available_bikes,1.0,-0.765681
available_bike_stands,-0.765681,1.0


## MODEL

In [12]:
# Use more features for training
# Train a model using the weather description.
# As we are dealing with a categorical feature, we need a way to be able to make comparisons. We cannot do this with strings/objects, so instead we create dummy columns with true/false values
weather_desc_dummies = pd.get_dummies(df['weather_desc'], prefix='weather_desc_')
print("weather_desc_dummies:", weather_desc_dummies)

categorical_features = weather_desc_dummies.columns.values.tolist()

features = continuous_features + categorical_features
print("\nCont features: ", continuous_features)
print("Categ features: ", categorical_features)
print("Features: ", features)

weather_desc_dummies:        weather_desc__Clear  weather_desc__Clouds  weather_desc__Drizzle  \
0                        0                     0                      0   
1                        0                     0                      0   
2                        0                     0                      0   
3                        0                     0                      0   
4                        0                     0                      0   
5                        0                     0                      0   
6                        0                     0                      0   
7                        0                     0                      0   
8                        0                     0                      0   
9                        0                     0                      0   
10                       0                     0                      0   
11                       0                     0                      0   
12 

In [13]:
# create a new dataframe with the new dummy columns..
df_all = pd.concat([df, weather_desc_dummies], axis=1)
# print(df_all)

# drop the original column in this new dataframe as it isn't needed
df_all = df_all.drop('weather_desc', axis = 1)
# print(df_all)

In [14]:
# df = pd.get_dummies(df, drop_first=True)
df = pd.get_dummies(df)
# df

Unnamed: 0,number,last_update,available_bike_stands,available_bikes,total_bike_stands,weather_desc_Clear,weather_desc_Clouds,weather_desc_Drizzle,weather_desc_Mist,weather_desc_Rain
0,2,1551726000,15,5,20,0,0,0,0,1
1,3,1551726000,19,1,20,0,0,0,0,1
2,4,1551726000,20,0,20,0,0,0,0,1
3,5,1551726000,11,29,40,0,0,0,0,1
4,6,1551726000,16,4,20,0,0,0,0,1
5,7,1551726000,9,19,28,0,0,0,0,1
6,8,1551726000,15,15,30,0,0,0,0,1
7,9,1551726000,5,19,24,0,0,0,0,1
8,10,1551726000,7,9,16,0,0,0,0,1
9,11,1551726000,30,0,30,0,0,0,0,1


In [15]:
# prepare the descriptive features
X = df_all[features]
y = df_all.available_bikes

In [16]:
# now train the model with linear regression
multiple_linreg = LinearRegression().fit(X, y)

print("\nIntercept: \n", multiple_linreg.intercept_)
print("Features and coeficients:", list(zip(features, multiple_linreg.coef_)))


Intercept: 
 8.624923708566712e-08
Features and coeficients: [('available_bikes', 0.666666666666646), ('available_bike_stands', -0.33333333333342324), ('last_update', -5.551115123125783e-17), ('total_bike_stands', 0.3333333333332543), ('weather_desc__Clear', 1.5137483030872435e-15), ('weather_desc__Clouds', -7.992825236440232e-17), ('weather_desc__Drizzle', 2.3530227130948252e-15), ('weather_desc__Mist', -3.4406628434207246e-15), ('weather_desc__Rain', -3.4376742167065133e-16)]


In [17]:
# now test using the trained model
multiple_linreg_predictions = multiple_linreg.predict(X[features])

print("\nPredictions with multiple linear regression: \n")
actual_vs_predicted_multiplelinreg = pd.concat([df, pd.DataFrame(multiple_linreg_predictions, columns=['Predicted'])], axis=1)
print(actual_vs_predicted_multiplelinreg)


Predictions with multiple linear regression: 

       number  last_update  available_bike_stands  available_bikes  \
0           2   1551726000                     15                5   
1           3   1551726000                     19                1   
2           4   1551726000                     20                0   
3           5   1551726000                     11               29   
4           6   1551726000                     16                4   
5           7   1551726000                      9               19   
6           8   1551726000                     15               15   
7           9   1551726000                      5               19   
8          10   1551726000                      7                9   
9          11   1551726000                     30                0   
10         12   1551726000                     16                4   
11         13   1551726000                     29                1   
12         14   1551726000                

In [18]:
#This function is used repeatedly to compute all metrics
def printMetrics(testActualVal, predictions):
    #classification evaluation measures
    print('\n==============================================================================')
    print("MAE: ", metrics.mean_absolute_error(testActualVal, predictions))
    #print("MSE: ", metrics.mean_squared_error(testActualVal, predictions))
    print("RMSE: ", metrics.mean_squared_error(testActualVal, predictions)**0.5)
    print("R2: ", metrics.r2_score(testActualVal, predictions))
        

In [19]:
printMetrics(y, multiple_linreg_predictions)


MAE:  4.199655013674659e-11
RMSE:  4.915303657284433e-11
R2:  1.0


## Evaluation with train/test split

In [20]:
# Split the data into train and test sets
# Take a third (random) data samples as test data, rest as training data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print("Training data:\n", pd.concat([X_train, y_train], axis=1))
print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))

Training data:
        available_bikes  available_bike_stands  last_update  total_bike_stands  \
80666                8                     32   1554811200                 40   
21896                5                     35   1552899600                 40   
23937               14                     16   1552968000                 30   
20477               36                      4   1552852800                 40   
16297                0                     20   1552719600                 20   
29165                0                     36   1553133600                 36   
38596               30                      0   1553443200                 30   
15266               12                      8   1552687200                 20   
21859               22                     17   1552899600                 39   
47001                0                     35   1553720400                 35   
28352               20                      3   1553108400                 23   
76448       

In [21]:
# Train on the training sample, and test on the test sample.
linreg = LinearRegression().fit(X_train, y_train)

# Print the weights learned for each feature.
# print(linreg_train.coef_)
print("Features and coeficients:", list(zip(features, linreg.coef_)))

Features and coeficients: [('available_bikes', 0.6666666666667179), ('available_bike_stands', -0.33333333333316967), ('last_update', 0.0), ('total_bike_stands', 0.33333333333339266), ('weather_desc__Clear', -9.227351272517243e-16), ('weather_desc__Clouds', 2.3957743519145464e-15), ('weather_desc__Drizzle', 2.6343630918687854e-15), ('weather_desc__Mist', -4.434955759079662e-15), ('weather_desc__Rain', 3.1673293579093185e-16)]


In [22]:
# Predicted bike availability on training set
train_predictions = linreg.predict(X_train)
print("Actual values of training:\n", y_train)
print("Predictions on training:", train_predictions)
printMetrics(y_train, train_predictions)

Actual values of training:
 80666     8
21896     5
23937    14
20477    36
16297     0
29165     0
38596    30
15266    12
21859    22
47001     0
28352    20
76448     1
31784    25
62175     5
6644      2
1039     10
34853     5
68963     5
11310    20
32769    15
64063     0
39306     5
47722     0
8322      5
58958     1
4525      0
2467      1
45036    15
66766     6
22575     2
         ..
5229      3
70430     2
14194     0
34225     1
22942    20
42246    29
82103     0
61835     0
20300     0
12557     3
53678    17
70605    15
25215     4
67333    38
83950     4
67215    11
42984     7
18164    12
69550    20
84155     2
48199     5
75316     8
42899     0
87002     4
5899      2
63292    35
45136    27
57080     8
66322     0
78283    29
Name: available_bikes, Length: 63393, dtype: int64
Predictions on training: [8.00000000e+00 5.00000000e+00 1.40000000e+01 ... 8.00000000e+00
 1.49808823e-12 2.90000000e+01]

MAE:  1.595949375437292e-12
RMSE:  1.878180377449027e-12
R2:  1.0


In [23]:
# Predicted bike availability on test set
test_predictions = linreg.predict(X_test)
print("Actual values of test:\n", y_test)
print("Predictions on test:", test_predictions)
printMetrics(y_test, test_predictions)

Actual values of test:
 14171    40
62435    20
26658    21
31391     4
86856     3
12548    11
58999     1
1702     12
36910     3
35132    25
10710     0
47355    16
19554    23
14843    13
84697     0
31882     0
31524     4
19492     9
48158    18
24319    22
71443    25
45445    20
8221      9
60967     2
84415     0
27802    29
23719     1
34961     0
82730     0
83432     2
         ..
35539     0
10156    28
77766     0
50132     1
23753     0
58222     4
84252     0
70732    10
88428     2
60120     9
5386     25
26438     0
8225      2
20324    16
57674     4
41758    12
67677    19
25345     2
27923     5
27174     1
65433    11
3260      0
55725    10
74860     9
8007      8
9882     18
11256     8
23536    37
44386    10
168      30
Name: available_bikes, Length: 27169, dtype: int64
Predictions on test: [40. 20. 21. ... 37. 10. 30.]

MAE:  1.6031566331677818e-12
RMSE:  1.884673247774802e-12
R2:  1.0


In [24]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [25]:
with open('model.pkl', 'wb') as handle:
    pickle.dump(model, handle, pickle.HIGHEST_PROTOCOL) 