In [1]:
import numpy as np
import pandas as pd
import random as rnd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
combine = [train_df,test_df]

In [3]:
train_df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [4]:
train_df["datetime"].head(10)

0    2011-01-01 00:00:00
1    2011-01-01 01:00:00
2    2011-01-01 02:00:00
3    2011-01-01 03:00:00
4    2011-01-01 04:00:00
5    2011-01-01 05:00:00
6    2011-01-01 06:00:00
7    2011-01-01 07:00:00
8    2011-01-01 08:00:00
9    2011-01-01 09:00:00
Name: datetime, dtype: object

In [5]:
train_df["datetime"][4]

'2011-01-01 04:00:00'

In [6]:
train_df["datetime"][4][11:13]

'04'

In [7]:
train_df.loc[:,["datetime"]].describe()

Unnamed: 0,datetime
count,10886
unique,10886
top,2011-05-04 11:00:00
freq,1


In [8]:
N = len(train_df["datetime"])
a = []
for n in range(0,N):
    a.append(train_df["datetime"][n][11:13])

In [9]:
a[:10]

['00', '01', '02', '03', '04', '05', '06', '07', '08', '09']

In [10]:
train_df["datetime"] = a

In [11]:
train_df["datetime"].head()

0    00
1    01
2    02
3    03
4    04
Name: datetime, dtype: object

In [12]:
size_mapping = {'00':0,"01":1, '02':2, '03':3, '04':4, '05':5, "06":6, "07":7, "08":8, "09":9, "10":10,"11":11, "12":12,
                "13":13,"14":14,"15":15,"16":16,"17":17,"18":18,"19":19,"20":20,"21":21,"22":22,"23":23
               }

In [13]:
train_df['datetime'] = train_df['datetime'].map(size_mapping)
train_df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,0,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,1,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,3,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,4,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [14]:
X = train_df.loc[:,train_df.columns[0:9]].values
y = train_df.loc[:,["count"]].values

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [16]:
# 重回帰の線形回帰を実行
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [17]:
#重回帰線形モデルの学習
# trainデータのみを用いていることに注意
lr.fit(X_train, y_train)                            

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [18]:
# 自由度調整済み決定係数
# (決定係数, trainまたはtestのサンプル数, 利用した特徴量の数)
def adjusted(score, n_sample, n_features):
    adjusted_score = 1 - (1 - score) * ((n_sample - 1) / (n_sample - n_features - 1))
    return adjusted_score

In [19]:
# 自由度調整済み決定係数を出力
print('adjusted R^2')
print('train: %3f' % adjusted(lr.score(X_train, y_train), len(y_train), 2))
print('test : %3f' % adjusted(lr.score(X_test, y_test), len(y_test), 2))

adjusted R^2
train: 0.336406
test : 0.331809


In [20]:
# LASSO回帰を実行
from sklearn.linear_model import Lasso                         # LASSO回帰のクラスを読み込み
model_lasso = Lasso(alpha=10)                                # LASSO回帰のインスタンスを生成
model_lasso.fit(X_train, y_train)                                      # データを学習させる

Lasso(alpha=10, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [21]:
model_lasso.score(X_test,y_test)

0.3315748018848248

In [56]:
# MSEを出力する関数を読み込む
from sklearn.metrics import mean_squared_error as mse

# RMSEをtrainとtestに分けて出力
# 過学習をしているかどうかを確認
print('RMSE')
print('train: %.3f' % (mse(y_train, lr.predict(X_train)) ** (1/2)))
print('test : %.3f' % (mse(y_test, lr.predict(X_test)) ** (1/2)))

RMSE
train: 147.848
test : 147.262


In [59]:
train_df.isnull().sum()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
count         0
dtype: int64

In [64]:
def rmsle(real, predicted):
    sum=0.0
    for x in range(len(predicted)):
        if predicted[x]<0 or real[x]<0: #check for negative values
            continue
        p = np.log(predicted[x]+1)
        r = np.log(real[x]+1)
        sum = sum + (p - r)**2
    return (sum/len(predicted))**0.5

In [65]:
rmsle(y_test, lr.predict(X_test))

array([1.16495542])

In [66]:
rmsle(y_train, lr.predict(X_train))

array([1.16853273])

### 結果
スコアはあまり良くない・・・どうすれば？

https://www.analyticsvidhya.com/blog/2015/06/solution-kaggle-competition-bike-sharing-demand/