Code from the BikeStarter notebook to load and prepare the data

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib as mpl
import seaborn as sns
import dill

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error, make_scorer
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor

# Set number of CPU cores for parallel algorithms
import os


def detect_num_cpus(environ=None):
    """Return the number of CPU cores to use for parallel algorithms.

    Parameters
    ----------
    environ : mapping of str to str, optional
        Environment to inspect. Defaults to ``os.environ``.

    Returns
    -------
    int
        The whole-number part of ``CPU_LIMIT`` when that variable is set,
        otherwise ``os.cpu_count()``. The result is never less than 1.

    Raises
    ------
    ValueError
        If ``CPU_LIMIT`` is set but the text before its first ``.`` is not
        an integer.
    """
    env = os.environ if environ is None else environ
    if "CPU_LIMIT" in env:
        # If you are on JupyterHub, this gives you the right number of CPUs for your virtual machine.
        # The limit can be fractional (e.g. "3.5"), so only the part before the '.' is kept.
        detected = int(env["CPU_LIMIT"].split('.')[0])
    else:
        # If you are not on JupyterHub, this gives you the right number for your computer.
        # os.cpu_count() returns None when the count cannot be determined.
        detected = os.cpu_count()
    # A fractional limit below 1 truncates to 0 and an undetermined count is None;
    # neither is a usable worker count, so fall back to a single core.
    return max(1, detected or 1)


num_cpus = detect_num_cpus()

In [2]:
# Nicer plotting defaults: a larger square figure plus bigger legend,
# axis-label, title and tick-label text.
# Keep this in its own cell, separate from the matplotlib import (works around a bug).
params = dict([
    ('legend.fontsize', 'large'),
    ('figure.figsize', (11.0, 11.0)),
    ('axes.labelsize', 'x-large'),
    ('axes.titlesize', 'xx-large'),
    ('xtick.labelsize', 'large'),
    ('ytick.labelsize', 'large'),
])
mpl.rcParams.update(params)

# Show up to 200 columns so wide dataframes are not truncated horizontally.
pd.set_option('display.max_columns', 200)

In [3]:
# Load the training and test sets from CSV files in the working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [4]:
# Parse the 'datetime' column of both frames into pandas datetime values
# so the .dt accessor can be used on it afterwards.
for _frame in (train_df, test_df):
    _frame['datetime'] = pd.to_datetime(_frame['datetime'])

In [5]:
# Derive calendar features from the parsed 'datetime' column.
# The same four columns are added to the training frame first, then the test frame.
for _frame in (train_df, test_df):
    _stamp = _frame['datetime'].dt
    _frame['year'] = _stamp.year
    _frame['month'] = _stamp.month
    _frame['hour'] = _stamp.hour
    _frame['DOW'] = _stamp.dayofweek  # day of week as an integer

The list of independent variables was modified to the following:

In [6]:
# Feature columns used as model inputs: the raw columns first,
# then the features derived from 'datetime'.
ind_variables_selected = [
    'season', 'holiday', 'workingday', 'weather',
    'temp', 'humidity', 'windspeed',
    'year', 'hour', 'DOW',
]

In [7]:
# Full labelled data: the selected feature columns and the 'count' target.
X_orig_train, y_orig_train = (
    train_df[ind_variables_selected],
    train_df['count'],
)

In [8]:
# Feature matrix for the test set, using the same columns (in the same order)
# as X_orig_train.
X_test = test_df[ind_variables_selected]

In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error, make_scorer
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor

In [10]:
# Hold out 25% of the labelled rows for validation.
# The fixed random_state makes the split repeatable across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_orig_train,
    y_orig_train,
    test_size=0.25,
    random_state=201,
)

Final Model Code

In [11]:
# The final model is fitted on every labelled row (X_orig_train / y_orig_train),
# not only on the X_train portion of the validation split.
xgb_model_final = XGBRegressor(
    n_estimators=30,
    max_depth=10,
    learning_rate=0.1,
    random_state=201,
)
xgb_model_final.fit(X_orig_train, y_orig_train)



XGBRegressor(max_depth=10, n_estimators=30, random_state=201)

In [12]:
# Predict the target for every test-set row with the fitted final model.
xgb_final_pred = xgb_model_final.predict(X_test)

In [13]:
# Raise any negative predictions to 0; there is no upper bound.
final_pred = np.clip(xgb_final_pred, 0, None)

Submission Code

In [14]:
# Load the sample submission file; it is reused as the template for the output.
sample_submission = pd.read_csv('sampleSubmission.csv')

In [15]:
# Overwrite the 'count' column with the clipped predictions.
# The values are assigned by position.
# NOTE(review): assumes sampleSubmission.csv rows are in the same order as test.csv — confirm.
sample_submission['count'] = final_pred

In [16]:
# Write the submission to disk without the dataframe index column.
sample_submission.to_csv('finalSubmission.csv', index=False)