In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

from patsy import dmatrices
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate

import pickle


In [2]:
# read in data
df = pd.read_csv('weatheravailability.csv')

df.head(5)

Unnamed: 0.1,Unnamed: 0,number,last_update,available_bike_stands,available_bikes,total_bike_stands,description
0,0,2,1551726000,15,5,20,Rain
1,1,3,1551726000,19,1,20,Rain
2,2,4,1551726000,20,0,20,Rain
3,3,5,1551726000,11,29,40,Rain
4,4,6,1551726000,16,4,20,Rain


In [3]:
# want to remove the 'Unnamed: 0' column.
# to make it easier, strip the space and conver to lower case...
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')

df.head(5)

Unnamed: 0,unnamed:_0,number,last_update,available_bike_stands,available_bikes,total_bike_stands,description
0,0,2,1551726000,15,5,20,Rain
1,1,3,1551726000,19,1,20,Rain
2,2,4,1551726000,20,0,20,Rain
3,3,5,1551726000,11,29,40,Rain
4,4,6,1551726000,16,4,20,Rain


In [4]:
df = df.drop('unnamed:_0', axis=1)


In [5]:
df.rename(columns={"description": "weather_desc"}, inplace=True)

In [6]:
df.head(5)

Unnamed: 0,number,last_update,available_bike_stands,available_bikes,total_bike_stands,weather_desc
0,2,1551726000,15,5,20,Rain
1,3,1551726000,19,1,20,Rain
2,4,1551726000,20,0,20,Rain
3,5,1551726000,11,29,40,Rain
4,6,1551726000,16,4,20,Rain


In [7]:
df.dtypes

number                    int64
last_update               int64
available_bike_stands     int64
available_bikes           int64
total_bike_stands         int64
weather_desc             object
dtype: object

In [8]:
continuous_features = ['available_bikes', 'available_bike_stands', 'last_update', 'total_bike_stands']

In [9]:
# categorical_features = df[['weather_desc']].columns

In [10]:
target_feature = ['available_bikes']

In [11]:
df['weather_desc'].value_counts()

Clouds     58312
Rain       23672
Drizzle     3380
Clear       3277
Mist        1921
Name: weather_desc, dtype: int64

In [12]:
# # turn the weather description values into integers for easier representation and comparison

# df['weather_desc'] = df['weather_desc'].map({'Clouds': 1, 'Rain': 2, 'Drizzle': 3, 'Clear': 4, 'Mist': 5})

In [13]:
df.head(5)

Unnamed: 0,number,last_update,available_bike_stands,available_bikes,total_bike_stands,weather_desc
0,2,1551726000,15,5,20,Rain
1,3,1551726000,19,1,20,Rain
2,4,1551726000,20,0,20,Rain
3,5,1551726000,11,29,40,Rain
4,6,1551726000,16,4,20,Rain


## UNDERSTANDING THE DATA

In [14]:
# no correlation...

df[['weather_desc', 'available_bikes', 'available_bike_stands']].corr()

Unnamed: 0,available_bikes,available_bike_stands
available_bikes,1.0,-0.765681
available_bike_stands,-0.765681,1.0


## MODEL

In [15]:
# print(df.head(10))

# X = df[['weather_desc', 'last_update', 'number']]
# y = df.available_bikes

# print("\nDescriptive features in X:\n", X)
# print("\nTarget feature in y:\n", y)

In [16]:
# Use more features for training
# Train aka fit, a model using all continuous and categorical features.
weather_desc_dummies = pd.get_dummies(df['weather_desc'], prefix='weather_desc_')
print("weather_desc_dummies:", weather_desc_dummies)

categorical_features = weather_desc_dummies.columns.values.tolist()

features = continuous_features + categorical_features
print("\nCont features: ", continuous_features)
print("Categ features: ", categorical_features)
print("Features: ", features)

weather_desc_dummies:        weather_desc__Clear  weather_desc__Clouds  weather_desc__Drizzle  \
0                        0                     0                      0   
1                        0                     0                      0   
2                        0                     0                      0   
3                        0                     0                      0   
4                        0                     0                      0   
5                        0                     0                      0   
6                        0                     0                      0   
7                        0                     0                      0   
8                        0                     0                      0   
9                        0                     0                      0   
10                       0                     0                      0   
11                       0                     0                      0   
12 

In [17]:
df_all = pd.concat([df, weather_desc_dummies], axis=1)
# print(df_all)

df_all = df_all.drop('weather_desc', axis = 1)
print(df_all)

       number  last_update  available_bike_stands  available_bikes  \
0           2   1551726000                     15                5   
1           3   1551726000                     19                1   
2           4   1551726000                     20                0   
3           5   1551726000                     11               29   
4           6   1551726000                     16                4   
5           7   1551726000                      9               19   
6           8   1551726000                     15               15   
7           9   1551726000                      5               19   
8          10   1551726000                      7                9   
9          11   1551726000                     30                0   
10         12   1551726000                     16                4   
11         13   1551726000                     29                1   
12         14   1551726000                      5               25   
13         15   1551

In [18]:
# df = pd.get_dummies(df, drop_first=True)
df = pd.get_dummies(df)
df

Unnamed: 0,number,last_update,available_bike_stands,available_bikes,total_bike_stands,weather_desc_Clear,weather_desc_Clouds,weather_desc_Drizzle,weather_desc_Mist,weather_desc_Rain
0,2,1551726000,15,5,20,0,0,0,0,1
1,3,1551726000,19,1,20,0,0,0,0,1
2,4,1551726000,20,0,20,0,0,0,0,1
3,5,1551726000,11,29,40,0,0,0,0,1
4,6,1551726000,16,4,20,0,0,0,0,1
5,7,1551726000,9,19,28,0,0,0,0,1
6,8,1551726000,15,15,30,0,0,0,0,1
7,9,1551726000,5,19,24,0,0,0,0,1
8,10,1551726000,7,9,16,0,0,0,0,1
9,11,1551726000,30,0,30,0,0,0,0,1


In [19]:
X = df_all[features]
y = df_all.available_bikes

In [20]:
multiple_linreg = LinearRegression().fit(X, y)

print("\nIntercept: \n", multiple_linreg.intercept_)
print("Features and coeficients:", list(zip(features, multiple_linreg.coef_)))


Intercept: 
 8.624923708566712e-08
Features and coeficients: [('available_bikes', 0.666666666666646), ('available_bike_stands', -0.33333333333342324), ('last_update', -5.551115123125783e-17), ('total_bike_stands', 0.3333333333332543), ('weather_desc__Clear', 1.5137483030872435e-15), ('weather_desc__Clouds', -7.992825236440232e-17), ('weather_desc__Drizzle', 2.3530227130948252e-15), ('weather_desc__Mist', -3.4406628434207246e-15), ('weather_desc__Rain', -3.4376742167065133e-16)]


In [21]:
multiple_linreg_predictions = multiple_linreg.predict(X[features])

print("\nPredictions with multiple linear regression: \n")
actual_vs_predicted_multiplelinreg = pd.concat([df, pd.DataFrame(multiple_linreg_predictions, columns=['Predicted'])], axis=1)
print(actual_vs_predicted_multiplelinreg)


Predictions with multiple linear regression: 

       number  last_update  available_bike_stands  available_bikes  \
0           2   1551726000                     15                5   
1           3   1551726000                     19                1   
2           4   1551726000                     20                0   
3           5   1551726000                     11               29   
4           6   1551726000                     16                4   
5           7   1551726000                      9               19   
6           8   1551726000                     15               15   
7           9   1551726000                      5               19   
8          10   1551726000                      7                9   
9          11   1551726000                     30                0   
10         12   1551726000                     16                4   
11         13   1551726000                     29                1   
12         14   1551726000                

In [22]:
#This function is used repeatedly to compute all metrics
def printMetrics(testActualVal, predictions):
    #classification evaluation measures
    print('\n==============================================================================')
    print("MAE: ", metrics.mean_absolute_error(testActualVal, predictions))
    #print("MSE: ", metrics.mean_squared_error(testActualVal, predictions))
    print("RMSE: ", metrics.mean_squared_error(testActualVal, predictions)**0.5)
    print("R2: ", metrics.r2_score(testActualVal, predictions))
        

In [23]:
printMetrics(y, multiple_linreg_predictions)


MAE:  4.199655013674659e-11
RMSE:  4.915303657284433e-11
R2:  1.0


## Evaluation with train/test split

In [24]:
# Split the data into train and test sets
# Take a third (random) data samples as test data, rest as training data
# Note that this training set if very small and the model will not be very reliable due to this sample size problem.
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print("Training data:\n", pd.concat([X_train, y_train], axis=1))
print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))

Training data:
        available_bikes  available_bike_stands  last_update  total_bike_stands  \
81317                1                     29   1554832800                 30   
76036                8                     32   1554663600                 40   
57681               20                     20   1554066000                 40   
5632                13                     27   1552377600                 40   
77288                5                     33   1554703200                 38   
85190                1                     35   1554966000                 36   
26480               16                      6   1553047200                 22   
51538                0                     31   1553864400                 31   
73291               16                     14   1554577200                 30   
60977                1                     29   1554174000                 30   
50999                0                     40   1553846400                 40   
32750       

In [25]:
# Train on the training sample and test on the test sample.
linreg = LinearRegression().fit(X_train, y_train)
# Print the weights learned for each feature.
#print(linreg_train.coef_)
print("Features and coeficients:", list(zip(features, linreg.coef_)))

Features and coeficients: [('available_bikes', 0.6666666666665384), ('available_bike_stands', -0.3333333333334026), ('last_update', 5.551115123125783e-17), ('total_bike_stands', 0.33333333333321624), ('weather_desc__Clear', -4.619999266605467e-16), ('weather_desc__Clouds', 4.568445069491977e-15), ('weather_desc__Drizzle', 1.389616015096375e-15), ('weather_desc__Mist', -9.218057468201292e-15), ('weather_desc__Rain', 3.725515943927565e-15)]


In [26]:
# Predicted price on training set
train_predictions = linreg.predict(X_train)
print("Actual values of training:\n", y_train)
print("Predictions on training:", train_predictions)
printMetrics(y_train, train_predictions)



Actual values of training:
 81317     1
76036     8
57681    20
5632     13
77288     5
85190     1
26480    16
51538     0
73291    16
60977     1
50999     0
32750     0
24578     0
46023     0
26927     3
57689    22
65963    27
61242     1
84964     0
10698    39
33252     0
26306     0
27559     4
12565    33
4799      2
77830    26
89627    21
79362    15
36224    23
90469     3
         ..
67424    19
48250     5
83339     7
56654     2
35881     3
3981      2
3894     31
74440     3
53156    12
12472     0
39617    15
14157    32
70901    16
37827     4
45988     0
43419     0
49146    28
41495     5
66963    11
51440     0
47224     0
13930    38
7420      8
75424    27
19149    10
52028     0
55442     4
83025    12
17266     0
6162      1
Name: available_bikes, Length: 63393, dtype: int64
Predictions on training: [ 1.00000000e+00  8.00000000e+00  2.00000000e+01 ...  1.20000000e+01
 -5.05425083e-11  1.00000000e+00]

MAE:  4.193741490568568e-11
RMSE:  4.910463274638963e-11
R2:

In [27]:
# Predicted price on test set
test_predictions = linreg.predict(X_test)
print("Actual values of test:\n", y_test)
print("Predictions on test:", test_predictions)
printMetrics(y_test, test_predictions)

Actual values of test:
 16635     3
51003    10
36927     3
33518     0
1330      1
26502     0
28123     4
57483    15
38015     8
87660     0
67371     0
81332     1
70882     4
23823     8
85620    18
22329    34
89012    10
43602     0
68713    24
22200    11
31074     9
29156     0
57420     6
84725    30
19636     0
56908     2
85012    12
24357     0
63040     4
33157     0
         ..
43191     0
72305     1
47635     0
9468      6
6101     28
88297     1
55884     1
81684     1
62270    28
31443    14
8365     10
50146     2
72730    17
38029    16
8981     23
37671     1
77115     0
10285     0
14081     0
78046    20
43059     9
39989     1
27377     2
73057     7
65965     1
29710    18
10848    10
64826    12
9176      4
21053     7
Name: available_bikes, Length: 27169, dtype: int64
Predictions on test: [ 3. 10.  3. ... 12.  4.  7.]

MAE:  4.201949266732135e-11
RMSE:  4.916971587036695e-11
R2:  1.0


In [28]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [29]:
with open('model.pkl', 'wb') as handle:
    pickle.dump(model, handle, pickle.HIGHEST_PROTOCOL) 