# Train, test, and evaluate the model

In [20]:
import os
import pickle

import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

%matplotlib inline

In [21]:
# read the dataset; the first CSV column is used as the row index
DATA_FILE = 'data.csv'
df = pd.read_csv(DATA_FILE, index_col=0)

# (rows, columns) of the loaded frame
df.shape

(218486, 8)

In [22]:
# list the column names available in the loaded frame
df.columns

Index(['available_bikes', 'number', 'temp', 'humidity', 'wind_speed',
       'weather_id', 'weekday', 'hour'],
      dtype='object')

In [23]:
# preview the first five rows
df.head(5)

Unnamed: 0,available_bikes,number,temp,humidity,wind_speed,weather_id,weekday,hour
45,9,56,281.41,76.0,2.57,803,0,22
46,15,48,281.41,76.0,2.57,803,0,22
47,14,34,281.41,76.0,2.57,803,0,22
48,15,49,281.41,76.0,2.57,803,0,22
49,11,65,281.41,76.0,2.57,803,0,22


In [24]:
# inspect the column dtypes as read from the CSV
df.dtypes

available_bikes      int64
number               int64
temp               float64
humidity           float64
wind_speed         float64
weather_id           int64
weekday              int64
hour                 int64
dtype: object

In [25]:
# treat the station number and the weather id as categorical columns
df = df.astype({'number': 'category', 'weather_id': 'category'})

# confirm the two columns now report the category dtype
df.dtypes

available_bikes       int64
number             category
temp                float64
humidity            float64
wind_speed          float64
weather_id         category
weekday               int64
hour                  int64
dtype: object

In [26]:
# model inputs used for both training and testing, in the order they are fed to the model
features = [
    'temp',
    'humidity',
    'wind_speed',
    'weather_id',
    'weekday',
    'hour',
]

In [27]:
# distinct station numbers, in order of first appearance in the data
station_numbers = df['number'].drop_duplicates().tolist()
print(station_numbers)

[56, 48, 34, 49, 65, 77, 61, 90, 25, 74, 97, 116, 114, 22, 19, 112, 107, 106, 105, 40, 30, 51, 43, 62, 17, 11, 95, 6, 59, 57, 68, 98, 92, 13, 93, 7, 83, 28, 44, 12, 24, 27, 8, 101, 10, 5, 79, 73, 53, 72, 54, 18, 50, 103, 9, 76, 47, 78, 38, 108, 111, 85, 84, 75, 87, 94, 23, 16, 71, 63, 109, 15, 2, 66, 41, 82, 21, 36, 104, 29, 67, 4, 26, 33, 3, 39, 115, 58, 86, 37, 45, 32, 52, 110, 42, 102, 55, 64, 100, 96, 91, 99, 89, 31, 80, 69, 117, 113, 88, 507]


In [28]:
# group by station number
# 'number' is a categorical column here; observed=True is passed explicitly because
# the implicit default for categorical groupers is deprecated in recent pandas.
# Every category comes from the data itself, so the resulting groups are unchanged.
df_groups = df.groupby('number', observed=True)

In [29]:
# settings for this cell
SEED = 42            # fixes the train/test split so a re-run reproduces the same scores
R2_THRESHOLD = 0.6   # a station model is counted as valid when its test R^2 exceeds this

# the pickles below are written into models/; create the folder if it is missing
os.makedirs('models', exist_ok=True)

valid = 0
evaluation_rows = []  # one record of test metrics per station

# for each station train and test the model, evaluate results
for number in station_numbers:

    # rows of this station only, without the station-number column
    dfa = df_groups.get_group(number).drop('number', axis=1)

    X = dfa[features]
    y = dfa['available_bikes']

    # hold out 30% of the rows for testing; seeded so the split is repeatable
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=SEED
    )

    # create and train model
    # NOTE(review): available_bikes is a count but is fit with a classifier, so each
    # distinct count is treated as its own class while the score below is a
    # regression metric (R^2) - confirm a classifier (not a regressor) is intended.
    model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

    # write to a pickle file (one file per station, named after the station number)
    with open('models/' + str(number) + '.pkl', 'wb') as file:
        pickle.dump(model, file, pickle.HIGHEST_PROTOCOL)

    # get predictions for X_test
    test_predictions = model.predict(X_test)

    # keep the per-station metrics instead of printing them
    r2 = metrics.r2_score(y_test, test_predictions)
    evaluation_rows.append({
        'station': number,
        'MAE': metrics.mean_absolute_error(y_test, test_predictions),
        'RMSE': metrics.mean_squared_error(y_test, test_predictions) ** 0.5,
        'R2': r2,
    })

    # count valid models
    if r2 > R2_THRESHOLD:
        valid += 1

# per-station evaluation table, available for inspection in a later cell
evaluation = pd.DataFrame(evaluation_rows)

print('Percent of models with r^2 over 60%:', valid / len(station_numbers) * 100)

Percent of models with r^2 over 60%: 81.81818181818183


### Get predictions for a given station

In [30]:
# model inputs, in the same column order used for training
features = ['temp', 'humidity', 'wind_speed', 'weather_id', 'weekday', 'hour']

# station to inspect
station = 110

# all rows of that station, without the station-number column
dfa = df_groups.get_group(station).drop(columns='number')
X, y = dfa[features], dfa['available_bikes']

# restore the saved model for this station
# (pickle.load can execute code stored in the file - only load model files you trust)
model_path = 'models/'+str(station)+'.pkl'
with open(model_path, 'rb') as file:
    model = pickle.load(file)

# predict every row of the station; X also contains rows the model was trained on,
# so this comparison is a sanity check rather than a held-out evaluation
predictions = model.predict(X)

# align the predictions with the observed values through the shared index
predicted = pd.Series(predictions, index=y.index, name='predicted_available_bikes')
actual_vs_predicted = pd.concat([y, predicted], axis=1)

# show actual vs predicted values
print(actual_vs_predicted)

        available_bikes  predicted_available_bikes
143                  14                         13
256                  13                         13
367                  13                         13
486                  13                         13
596                  13                         13
...                 ...                        ...
218147               22                         22
218265               22                         22
218389               22                         22
218425               23                         22
218512               22                         22

[1965 rows x 2 columns]


### Prediction example

In [31]:
def predict(station_number, temp=281, humidity=60, wind_speed=0, weather_id=803, week_day=0, hour=12):
    """Predict the number of available bikes for one station.

    Loads the pickled model stored at ``models/<station_number>.pkl`` and
    asks it for a prediction on a single row built from the arguments.

    Parameters
    ----------
    station_number : value used (via ``str``) to build the model file name.
    temp, humidity, wind_speed, weather_id, hour : values placed in the
        columns of the same name.
    week_day : value placed in the ``weekday`` column.

    Returns
    -------
    int
        The first (and only) prediction, rounded to the nearest integer.

    Raises
    ------
    FileNotFoundError
        If no model file exists for ``station_number``.
    """
    # single-row frame holding one value per model input column
    sample = pd.DataFrame({
        'temp': [temp],
        'humidity': [humidity],
        'wind_speed': [wind_speed],
        'weather_id': [weather_id],
        'weekday': [week_day],
        'hour': [hour],
    })

    # pickle.load can execute code stored in the file - only load model files you trust
    model_path = 'models/' + str(station_number) + '.pkl'
    with open(model_path, 'rb') as model_file:
        estimator = pickle.load(model_file)

    # the model returns one value per input row; there is exactly one row here
    first_prediction = estimator.predict(sample)[0]
    return int(round(first_prediction))
    
# example request: station 91, weekday value 3, hour 20; all other inputs use their defaults
example_request = {'station_number': 91, 'week_day': 3, 'hour': 20}
predict(**example_request)

25