In [36]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report  
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import PolynomialFeatures

plt.rcParams['figure.figsize'] = (10,6)

In [37]:
df = pd.read_csv('../data/train.csv', index_col=0, parse_dates=True)

In [38]:
df['hour'] = df.index.hour
df['weekday'] = df.index.weekday
df['month'] = df.index.month
df['year'] = df.index.year
df['year'] = df['year'] - 2011
df['month_count'] = df['month']+df['year']*12
df[df['month']==2]

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,hour,weekday,month,year,month_count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2011-02-01 00:00:00,1,0,1,2,6.56,9.090,64,7.0015,2,6,8,0,1,2,0,2
2011-02-01 01:00:00,1,0,1,2,6.56,9.090,69,7.0015,0,3,3,1,1,2,0,2
2011-02-01 02:00:00,1,0,1,2,6.56,11.365,69,0.0000,0,2,2,2,1,2,0,2
2011-02-01 03:00:00,1,0,1,2,6.56,11.365,69,0.0000,0,2,2,3,1,2,0,2
2011-02-01 05:00:00,1,0,1,3,5.74,10.605,93,0.0000,0,3,3,5,1,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-02-19 19:00:00,1,0,0,2,12.30,14.395,52,16.9979,18,101,119,19,6,2,1,14
2012-02-19 20:00:00,1,0,0,2,12.30,14.395,52,19.0012,22,81,103,20,6,2,1,14
2012-02-19 21:00:00,1,0,0,3,10.66,12.880,65,11.0014,3,33,36,21,6,2,1,14
2012-02-19 22:00:00,1,0,0,3,9.84,12.880,75,6.0032,8,47,55,22,6,2,1,14


In [39]:
y = df['count']
X = df[['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'weekday', 'hour', 'month_count']]
X,y

(                     season  holiday  workingday  weather   temp   atemp  \
 datetime                                                                   
 2011-01-01 00:00:00       1        0           0        1   9.84  14.395   
 2011-01-01 01:00:00       1        0           0        1   9.02  13.635   
 2011-01-01 02:00:00       1        0           0        1   9.02  13.635   
 2011-01-01 03:00:00       1        0           0        1   9.84  14.395   
 2011-01-01 04:00:00       1        0           0        1   9.84  14.395   
 ...                     ...      ...         ...      ...    ...     ...   
 2012-12-19 19:00:00       4        0           1        1  15.58  19.695   
 2012-12-19 20:00:00       4        0           1        1  14.76  17.425   
 2012-12-19 21:00:00       4        0           1        1  13.94  15.910   
 2012-12-19 22:00:00       4        0           1        1  13.94  17.425   
 2012-12-19 23:00:00       4        0           1        1  13.12  16.665   

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,) #

In [41]:
ylog_train = np.log1p(y_train)
ylog_test = np.log1p(y_test)

In [42]:
numerical_pipeline = Pipeline([
    ('num_scaler',StandardScaler()),
    ('polynomial', PolynomialFeatures(degree= 2,)),
    #('num_discritazer',KBinsDiscretizer(n_bins=, strategy='uniform'),)
])

In [43]:
categorical_pipeline = Pipeline([
    ('categorical_ohe',OneHotEncoder(sparse=False,drop = 'first')),
    #
])

In [44]:
polynomial_pipeline_4 = Pipeline([
    ('polynomial', PolynomialFeatures(degree= 4,)),
    ('categorical_ohe',OneHotEncoder(sparse=False,drop = 'first')),
    
])

In [45]:
polynomial_pipeline_2 = Pipeline([
    ('polynomial', PolynomialFeatures(degree= 2,)),
    ('categorical_ohe',OneHotEncoder(sparse=False,drop = 'first')),
    
])

In [46]:
column_transformer = ColumnTransformer([
    ('numerical_pipeline',numerical_pipeline,['temp',  'humidity',]), #'atemp',
    ('categorical_pipeline',categorical_pipeline,['hour', 'weekday',  'holiday',]),#
    ('polynomial_4', polynomial_pipeline_4, ['season', 'workingday',]),
    ('polynomial_2', polynomial_pipeline_2, [ 'weather',]),
    ('do_nothing', 'passthrough',[ 'month_count',])
])

In [47]:
column_transformer.fit(X_train) #learn how to do the transforamtion
X_train_fe = column_transformer.transform(X_train) #Do the accual transformation
X_test_fe = column_transformer.transform(X_test) #Do the same transformation on test set 

In [48]:
m_l1 = Lasso()

m_l1.fit(X_train_fe,ylog_train)

alphas = [1,5,10,]#50,100,10000, 100000]
for i in alphas:
    m_l1 = Ridge(alpha = i)
    m_l1.fit(X_train_fe,ylog_train)  #Train the linear model with ridge regularization
    print(cross_val_score(m_l1, X_train_fe, ylog_train,cv = 5).mean())
    

0.8342349598492358
0.8328536682920866
0.8304576549566447


In [49]:
m_l1.score(X_test_fe,ylog_test)

0.83627850441772

In [50]:
y_predlog = m_l1.predict(X_test_fe)

In [51]:
ypred = np.exp(y_predlog)-1

In [52]:
from sklearn.metrics import mean_squared_log_error
mean_squared_log_error(y_test, ypred)

0.34483104014064975

In [53]:
df_k = pd.read_csv('../data/test.csv', index_col=0, parse_dates=True)
df_k['hour'] = df_k.index.hour
df_k['weekday'] = df_k.index.weekday
df_k['month'] = df_k.index.month
df_k['year'] = df_k.index.year
df_k['year'] = df_k['year'] - 2011
df_k['month_count'] = df_k['month']+df_k['year']*12
X_test_kaggle = df_k[['season', 'holiday', 'workingday', 'weather', 'temp',
                     'atemp', 'humidity', 'windspeed', 'weekday', 'hour', 'month_count']]

In [55]:
X_test_kaggle_fe = column_transformer.transform(X_test_kaggle)
y_predlog_k = m_l1.predict(X_test_kaggle_fe)
ypred_k = np.exp(y_predlog_k)-1
ypred_k

array([ 27.65615675,  10.64263872,   6.16603983, ..., 150.13869849,
       120.12885491,  80.19408036])

In [56]:
df_s = pd.read_csv('../data/sampleSubmission.csv', index_col=0, parse_dates=True)
pd.DataFrame(df_s).to_csv("predictions")