In [2]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# Standard library.
import os

# General libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# Set randomizer seed
# Seeds NumPy's global random generator; the train/dev split below shuffles
# the rows with np.random.permutation, which draws from this generator.
np.random.seed(0)

In [3]:
# ---- Training data ----
train_path = '~/datasets/bike/train.csv' # path on local machine
df_train = pd.read_csv(train_path)
# Timestamps are parsed and kept apart from the numeric feature matrix
all_train_dates = pd.to_datetime(df_train["datetime"])
# Feature columns sit between datetime (first column) and
# casual / registered / count (last three columns)
feature_names_train = df_train.columns.values[1:-3]
print("Feature names: ")
print(feature_names_train)
# Every column except datetime, as a plain array
num_data = df_train.iloc[:,1:].values
X = num_data[:, :-3]  # features: drop the last three columns
Y = num_data[:, -1]   # target: the last column (count)
# Show the first 5 training examples
print("\nFirst 5 examples: ")
print(X[:5])

# ---- Test data (same processing; the file has no label columns to strip) ----
test_path = '~/datasets/bike/test.csv' # path on local machine
df_test = pd.read_csv(test_path)
feature_names_test = df_test.columns.values[1:]
test_dates = pd.to_datetime(df_test["datetime"])
test_data = df_test.iloc[:,1:].values

# ---- Train / dev split (about 7:1 ratio) ----
# Apply one random row order to features, labels and dates so they stay aligned
shuffle = np.random.permutation(np.arange(len(X)))
X = X[shuffle]
Y = Y[shuffle]
all_train_dates = all_train_dates[shuffle]
print("All train data: {}".format(X.shape))

# The first n_train shuffled rows form the train set, the remainder the dev set
n_train = 9500
train_data, dev_data = X[:n_train], X[n_train:]
train_labels, dev_labels = Y[:n_train], Y[n_train:]
train_dates, dev_dates = all_train_dates[:n_train], all_train_dates[n_train:]

# Report the resulting array sizes
print("Train data: {}".format(train_data.shape))
print("Dev data: {}".format(dev_data.shape))
print("Test data: {}".format(test_data.shape))

Feature names: 
['season' 'holiday' 'workingday' 'weather' 'temp' 'atemp' 'humidity'
 'windspeed']

First 5 examples: 
[[  1.      0.      0.      1.      9.84   14.395  81.      0.   ]
 [  1.      0.      0.      1.      9.02   13.635  80.      0.   ]
 [  1.      0.      0.      1.      9.02   13.635  80.      0.   ]
 [  1.      0.      0.      1.      9.84   14.395  75.      0.   ]
 [  1.      0.      0.      1.      9.84   14.395  75.      0.   ]]
All train data: (10886, 8)
Train data: (9500, 8)
Dev data: (1386, 8)
Test data: (6493, 8)


In [4]:
# Evaluation: Root Mean Squared Logarithmic Error (RMSLE)
def RMSLE(actual, predicted):
    """
    Root Mean Squared Logarithmic Error between actual and predicted counts:

        sqrt(mean((log(predicted + 1) - log(actual + 1)) ** 2))

    Inputs:
        actual: actual counts (scalar, list, tuple or numpy array)
        predicted: predicted counts, same shape as (or broadcastable to) actual
        All values must be greater than -1 for the logarithm to be defined.
    Outputs: Root Mean Squared Logarithmic Error (float)
    """
    # Coerce to float arrays so plain Python lists/tuples work as well as
    # numpy arrays (list + 1 would otherwise raise a TypeError).
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    # log1p(x) computes log(x + 1) but stays accurate when x is close to zero.
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt(np.mean(np.square(log_diff)))

In [5]:
# Fit a simple linear regression on the train split and predict the dev split
lr_model = LinearRegression()
lr_model.fit(train_data, train_labels)
preds = lr_model.predict(dev_data)

# Baseline score: predict the mean train label for every dev example
mean_label = np.mean(train_labels)
base_preds = np.repeat(mean_label, dev_labels.shape)
baseline_rmsle = RMSLE(dev_labels, base_preds)
print("RMSLE(baseline): {}".format(baseline_rmsle))

# Linear regression score
# Counts cannot be negative, so negative predictions are replaced by zero
preds = np.where(preds < 0, 0, preds)
model_rmsle = RMSLE(dev_labels, preds)
print("RMSLE: {}".format(model_rmsle))

# R2 on the dev set (computed on the zero-floored predictions)
dev_r2 = metrics.r2_score(dev_labels, preds)
print("R-square on dev: {}".format(dev_r2))

RMSLE(baseline): 1.5629246036
RMSLE: 1.39378752988
R-square on dev: 0.280842519381


In [5]:
# Write results in Kaggle format
# Refit on ALL labelled data (train set + dev set) before predicting the test set
lr_model = LinearRegression()
lr_model.fit(X, Y)
kaggle_preds = lr_model.predict(test_data)
# Predictions cannot be negative, convert negative predictions to zero
kaggle_preds[kaggle_preds<0] = 0

# Write to csv
# The output path is resolved from the current user's home directory (the same
# '~/datasets/bike' folder the input CSVs are read from) rather than a
# machine-specific absolute path.
out_path = os.path.expanduser('~/datasets/bike/kaggle_submission.csv')
# One row per test datetime with its predicted count
df_submission = pd.DataFrame(df_test["datetime"])
df_submission["count"] = kaggle_preds
df_submission.to_csv(out_path, index=False)