In [36]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18 and
# sklearn.grid_search was removed in 0.20; fall back only for old installs.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# Set randomizer seed: fixes NumPy's global random state so that the
# np.random.permutation shuffle used for the train/dev split gives the same
# split on every fresh top-to-bottom run.
np.random.seed(0)

In [33]:
# Load data, build feature matrices and split off a dev set.

# Number of shuffled examples kept for training; everything after that
# forms the dev set (about 7:1 ratio).
N_TRAIN = 9500


def load_bike_csv(path):
    """
    Inputs: path to a CSV whose first column is named "datetime"
    Outputs: DataFrame with that column parsed as datetimes and three derived
             columns ("year", "dayofyear", "hour") inserted at positions 1-3
    """
    df = pd.read_csv(path, parse_dates=[0])
    stamps = df["datetime"].dt
    # Same treatment for train and test, so the two can never drift apart
    for position, part in enumerate(("year", "dayofyear", "hour"), start=1):
        df.insert(position, part, getattr(stamps, part))
    return df


# Load train data
train_path = '~/datasets/bike/train.csv' # path on local machine
df_train = load_bike_csv(train_path)
# Get feature names (excluding datetime, casual, registered, count)
print ("Feature names: ")
print (df_train.columns.values[1:-3])
# Extract numerical features: drop datetime (column 0), use everything except
# the last three columns as features and the last column (count) as the label
num_data = df_train.iloc[:, 1:].values
X, Y = num_data[:, :-3], num_data[:, -1]
# Print out first 5 training examples
print ("\nFirst 5 examples: ")
print (X[0:5])

# Load and process test data (same steps as above)
test_path = '~/datasets/bike/test.csv' # path on local machine
df_test = load_bike_csv(test_path)
feature_names_test = df_test.columns.values[1:]
test_data = df_test.iloc[:, 1:].values
# Models are fitted on X and later applied to test_data, so the number of
# feature columns has to agree
assert test_data.shape[1] == X.shape[1], "train/test feature columns differ"

# Split all training data into train set and dev set (about 7:1 ratio)
assert X.shape[0] > N_TRAIN, "not enough rows to leave a non-empty dev set"
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, Y = X[shuffle], Y[shuffle]
print ("All train data: {}".format(X.shape))
train_data, train_labels = X[:N_TRAIN], Y[:N_TRAIN]
dev_data, dev_labels = X[N_TRAIN:], Y[N_TRAIN:]

# Array sizes
print ("Train data: {}".format(train_data.shape))
print ("Dev data: {}".format(dev_data.shape))
print ("Test data: {}".format(test_data.shape))

Feature names: 
['year' 'dayofyear' 'hour' 'season' 'holiday' 'workingday' 'weather' 'temp'
 'atemp' 'humidity' 'windspeed']

First 5 examples: 
[[  2.01100000e+03   1.00000000e+00   0.00000000e+00   1.00000000e+00
    0.00000000e+00   0.00000000e+00   1.00000000e+00   9.84000000e+00
    1.43950000e+01   8.10000000e+01   0.00000000e+00]
 [  2.01100000e+03   1.00000000e+00   1.00000000e+00   1.00000000e+00
    0.00000000e+00   0.00000000e+00   1.00000000e+00   9.02000000e+00
    1.36350000e+01   8.00000000e+01   0.00000000e+00]
 [  2.01100000e+03   1.00000000e+00   2.00000000e+00   1.00000000e+00
    0.00000000e+00   0.00000000e+00   1.00000000e+00   9.02000000e+00
    1.36350000e+01   8.00000000e+01   0.00000000e+00]
 [  2.01100000e+03   1.00000000e+00   3.00000000e+00   1.00000000e+00
    0.00000000e+00   0.00000000e+00   1.00000000e+00   9.84000000e+00
    1.43950000e+01   7.50000000e+01   0.00000000e+00]
 [  2.01100000e+03   1.00000000e+00   4.00000000e+00   1.00000000e+00
    0.000

In [50]:
# Evaluation: Root Mean Squared Logarithmic Error (RMSLE)
def RMSLE(actual, predicted):
    """
    Inputs: actual counts, predicted counts (NumPy arrays, Python lists or
            scalars of matching shape; values <= -1 have no finite logarithm)
    Outputs: Root Mean Squared Logarithmic Error
    """
    # Coerce to float arrays so plain Python sequences work as well as arrays
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    # log1p(x) equals log(x + 1) but stays accurate when x is close to zero
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt(np.mean(np.square(log_diff)))

In [51]:
# Linear Regression
# Baseline score: predict the mean training label for every dev example
mean_label = np.mean(train_labels)
base_preds = np.full(dev_labels.shape, mean_label)
print ("RMSLE(baseline): {}".format(RMSLE(dev_labels, base_preds)))

# Plain least-squares model fitted on the train split, scored on the dev split
lr_model = LinearRegression()
lr_model.fit(train_data, train_labels)
raw_preds = lr_model.predict(dev_data)
# Counts cannot be negative, so floor the predictions at zero
preds = np.where(raw_preds < 0, 0, raw_preds)

# Report RMSLE and R2 on the dev set
dev_rmsle = RMSLE(dev_labels, preds)
dev_r2 = metrics.r2_score(dev_labels, preds)
print ("RMSLE: {}".format(dev_rmsle))
print ("R-square on dev: {}".format(dev_r2))

RMSLE(baseline): 1.52309106369
RMSLE: 1.27241681542
R-square on dev: 0.37209391508


In [85]:
# Linear Regression with L2 Regularization
# Strength of the L2 penalty passed to Ridge
ridge_alpha = 2400
lr_ridge_model = Ridge(alpha=ridge_alpha)
lr_ridge_model.fit(train_data, train_labels)

# Score on the dev split; counts cannot be negative, so floor at zero first
ridge_raw_preds = lr_ridge_model.predict(dev_data)
preds = np.where(ridge_raw_preds < 0, 0, ridge_raw_preds)
ridge_rmsle = RMSLE(dev_labels, preds)
ridge_r2 = metrics.r2_score(dev_labels, preds)
print ("RMSLE: {}".format(ridge_rmsle))
print ("R-square on dev: {}".format(ridge_r2))

RMSLE: 1.26044812959
R-square on dev: 0.347722009279


In [115]:
# Linear Regression with L1 Regularization
# Strength of the L1 penalty passed to Lasso
lasso_alpha = 8
lr_lasso_model = Lasso(alpha=lasso_alpha)
lr_lasso_model.fit(train_data, train_labels)

# Score on the dev split; counts cannot be negative, so floor at zero first
lasso_raw_preds = lr_lasso_model.predict(dev_data)
preds = np.where(lasso_raw_preds < 0, 0, lasso_raw_preds)
lasso_rmsle = RMSLE(dev_labels, preds)
lasso_r2 = metrics.r2_score(dev_labels, preds)
print ("RMSLE: {}".format(lasso_rmsle))
print ("R-square on dev: {}".format(lasso_r2))

RMSLE: 1.25750980355
R-square on dev: 0.353691196355


In [5]:
# Write results in Kaggle format
# Refit on ALL training data (train set + dev set) before predicting the test set.
# NOTE: this rebinds lr_model, replacing the model fitted on the train split only.
lr_model = LinearRegression()
lr_model.fit(X, Y)
kaggle_preds = lr_model.predict(test_data)
# Counts cannot be negative, convert negative predictions to zero
kaggle_preds[kaggle_preds<0] = 0

# Write to csv in the same home-relative directory the input CSVs are read
# from, instead of an absolute path tied to one user account. "~" is expanded
# explicitly so the result does not depend on the pandas version doing it.
out_path = os.path.expanduser('~/datasets/bike/kaggle_submission.csv')
# Two columns: the test timestamps and the predicted count
df_submission = df_test[["datetime"]].copy()
df_submission["count"] = kaggle_preds
df_submission.to_csv(out_path, index=False)