In [13]:
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from datasist.structdata import detect_outliers


In [14]:
df=pd.read_csv('bikes.csv')
df

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,humidity,windspeed,casual,registered,rented_bikes_count
0,2011-01-01 00:00:00,Spring,0.0,0.0,Clear,9.84,81.0,,3,13,16
1,2011-01-01 01:00:00,Spring,0.0,0.0,,9.02,80.0,0.0000,8,32,40
2,2011-01-01 02:00:00,Spring,0.0,0.0,Clear,9.02,,0.0000,5,27,32
3,2011-01-01 03:00:00,Spring,0.0,0.0,Clear,9.84,75.0,0.0000,3,10,13
4,2011-01-01 04:00:00,,0.0,0.0,Clear,,75.0,,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
10881,2012-12-19 19:00:00,Winter,0.0,1.0,Clear,15.58,50.0,26.0027,7,329,336
10882,2012-12-19 20:00:00,Winter,0.0,1.0,,14.76,,,10,231,241
10883,2012-12-19 21:00:00,,0.0,1.0,Clear,13.94,61.0,15.0013,4,164,168
10884,2012-12-19 22:00:00,Winter,,1.0,Clear,13.94,,6.0032,12,117,129


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   datetime            10886 non-null  object 
 1   season              10672 non-null  object 
 2   holiday             10030 non-null  float64
 3   workingday          9388 non-null   float64
 4   weather             8746 non-null   object 
 5   temp                8104 non-null   float64
 6   humidity            7462 non-null   float64
 7   windspeed           6820 non-null   float64
 8   casual              10886 non-null  int64  
 9   registered          10886 non-null  int64  
 10  rented_bikes_count  10886 non-null  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 935.6+ KB


In [16]:
df.isnull().sum()

datetime                 0
season                 214
holiday                856
workingday            1498
weather               2140
temp                  2782
humidity              3424
windspeed             4066
casual                   0
registered               0
rented_bikes_count       0
dtype: int64

In [17]:
df.dropna(axis=0,inplace=True)

In [18]:
df.isnull().sum()

datetime              0
season                0
holiday               0
workingday            0
weather               0
temp                  0
humidity              0
windspeed             0
casual                0
registered            0
rented_bikes_count    0
dtype: int64

In [19]:
num_col=df.select_dtypes('number').columns
num_col

Index(['holiday', 'workingday', 'temp', 'humidity', 'windspeed', 'casual',
       'registered', 'rented_bikes_count'],
      dtype='object')

In [20]:
df['datetime']=pd.to_datetime(df['datetime'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2177 entries, 3 to 10885
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   datetime            2177 non-null   datetime64[ns]
 1   season              2177 non-null   object        
 2   holiday             2177 non-null   float64       
 3   workingday          2177 non-null   float64       
 4   weather             2177 non-null   object        
 5   temp                2177 non-null   float64       
 6   humidity            2177 non-null   float64       
 7   windspeed           2177 non-null   float64       
 8   casual              2177 non-null   int64         
 9   registered          2177 non-null   int64         
 10  rented_bikes_count  2177 non-null   int64         
dtypes: datetime64[ns](1), float64(5), int64(3), object(2)
memory usage: 204.1+ KB


In [21]:
indexes=detect_outliers(df,0,num_col)
len(indexes)

337

In [22]:
df.drop(indexes,axis=0,inplace=True)
df

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,humidity,windspeed,casual,registered,rented_bikes_count
3,2011-01-01 03:00:00,Spring,0.0,0.0,Clear,9.84,75.0,0.0000,3,10,13
14,2011-01-01 14:00:00,Spring,0.0,0.0,Mist,18.86,72.0,19.0012,35,71,106
15,2011-01-01 15:00:00,Spring,0.0,0.0,Mist,18.04,77.0,19.9995,40,70,110
16,2011-01-01 16:00:00,Spring,0.0,0.0,Mist,17.22,82.0,19.9995,41,52,93
26,2011-01-02 02:00:00,Spring,0.0,0.0,Mist,17.22,100.0,19.0012,1,8,9
...,...,...,...,...,...,...,...,...,...,...,...
10844,2012-12-18 06:00:00,Winter,0.0,1.0,Clear,14.76,93.0,11.0014,1,117,118
10848,2012-12-18 10:00:00,Winter,0.0,1.0,Clear,18.04,77.0,6.0032,22,162,184
10873,2012-12-19 11:00:00,Winter,0.0,1.0,Clear,16.40,54.0,15.0013,31,169,200
10881,2012-12-19 19:00:00,Winter,0.0,1.0,Clear,15.58,50.0,26.0027,7,329,336


In [23]:
df['year']=df['datetime'].dt.year
df['month']=df['datetime'].dt.month


In [24]:
cat_col=df.select_dtypes('object').columns
cat_col

Index(['season', 'weather'], dtype='object')

In [25]:
df=pd.get_dummies(df,cat_col,drop_first=True)
df

Unnamed: 0,datetime,holiday,workingday,temp,humidity,windspeed,casual,registered,rented_bikes_count,year,month,season_Spring,season_Summer,season_Winter,weather_Mist,weather_Rainy
3,2011-01-01 03:00:00,0.0,0.0,9.84,75.0,0.0000,3,10,13,2011,1,True,False,False,False,False
14,2011-01-01 14:00:00,0.0,0.0,18.86,72.0,19.0012,35,71,106,2011,1,True,False,False,True,False
15,2011-01-01 15:00:00,0.0,0.0,18.04,77.0,19.9995,40,70,110,2011,1,True,False,False,True,False
16,2011-01-01 16:00:00,0.0,0.0,17.22,82.0,19.9995,41,52,93,2011,1,True,False,False,True,False
26,2011-01-02 02:00:00,0.0,0.0,17.22,100.0,19.0012,1,8,9,2011,1,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10844,2012-12-18 06:00:00,0.0,1.0,14.76,93.0,11.0014,1,117,118,2012,12,False,False,True,False,False
10848,2012-12-18 10:00:00,0.0,1.0,18.04,77.0,6.0032,22,162,184,2012,12,False,False,True,False,False
10873,2012-12-19 11:00:00,0.0,1.0,16.40,54.0,15.0013,31,169,200,2012,12,False,False,True,False,False
10881,2012-12-19 19:00:00,0.0,1.0,15.58,50.0,26.0027,7,329,336,2012,12,False,False,True,False,False


In [26]:
df.drop('datetime',axis=1,inplace=True)
df

Unnamed: 0,holiday,workingday,temp,humidity,windspeed,casual,registered,rented_bikes_count,year,month,season_Spring,season_Summer,season_Winter,weather_Mist,weather_Rainy
3,0.0,0.0,9.84,75.0,0.0000,3,10,13,2011,1,True,False,False,False,False
14,0.0,0.0,18.86,72.0,19.0012,35,71,106,2011,1,True,False,False,True,False
15,0.0,0.0,18.04,77.0,19.9995,40,70,110,2011,1,True,False,False,True,False
16,0.0,0.0,17.22,82.0,19.9995,41,52,93,2011,1,True,False,False,True,False
26,0.0,0.0,17.22,100.0,19.0012,1,8,9,2011,1,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10844,0.0,1.0,14.76,93.0,11.0014,1,117,118,2012,12,False,False,True,False,False
10848,0.0,1.0,18.04,77.0,6.0032,22,162,184,2012,12,False,False,True,False,False
10873,0.0,1.0,16.40,54.0,15.0013,31,169,200,2012,12,False,False,True,False,False
10881,0.0,1.0,15.58,50.0,26.0027,7,329,336,2012,12,False,False,True,False,False


In [27]:
df.corr()

Unnamed: 0,holiday,workingday,temp,humidity,windspeed,casual,registered,rented_bikes_count,year,month,season_Spring,season_Summer,season_Winter,weather_Mist,weather_Rainy
holiday,,,,,,,,,,,,,,,
workingday,,1.0,0.094854,-0.11465,0.032562,-0.142112,0.153716,0.102123,-0.012734,-0.004296,-0.018418,0.040212,-0.020422,-0.010584,-0.022377
temp,,0.094854,1.0,-0.012533,-0.015724,0.502379,0.259482,0.331634,0.043601,0.260963,-0.557247,0.169799,-0.245318,-0.036706,0.000544
humidity,,-0.11465,-0.012533,1.0,-0.31554,-0.305999,-0.24322,-0.275474,-0.030709,0.217994,-0.19988,-0.002904,0.143062,0.170684,0.325507
windspeed,,0.032562,-0.015724,-0.31554,1.0,0.104184,0.070151,0.082871,-0.037907,-0.153992,0.129266,0.038972,-0.086406,-0.035059,0.014059
casual,,-0.142112,0.502379,-0.305999,0.104184,1.0,0.56656,0.70331,0.090148,0.124788,-0.256189,0.081697,-0.083106,0.003257,-0.103451
registered,,0.153716,0.259482,-0.24322,0.070151,0.56656,1.0,0.98425,0.163281,0.166078,-0.165905,-0.014273,0.091266,-0.002598,-0.103892
rented_bikes_count,,0.102123,0.331634,-0.275474,0.082871,0.70331,0.98425,1.0,0.160203,0.170047,-0.198089,0.005214,0.060906,-0.001543,-0.111822
year,,-0.012734,0.043601,-0.030709,-0.037907,0.090148,0.163281,0.160203,1.0,0.032608,-0.005354,-0.030562,0.042038,0.068003,-0.004914
month,,-0.004296,0.260963,0.217994,-0.153992,0.124788,0.166078,0.170047,0.032608,1.0,-0.76326,-0.247706,0.766847,0.017462,-0.010059


In [28]:
df.drop('holiday',axis=1,inplace=True)

In [29]:
df.drop(['casual','registered'],axis=1,inplace=True)
df

Unnamed: 0,workingday,temp,humidity,windspeed,rented_bikes_count,year,month,season_Spring,season_Summer,season_Winter,weather_Mist,weather_Rainy
3,0.0,9.84,75.0,0.0000,13,2011,1,True,False,False,False,False
14,0.0,18.86,72.0,19.0012,106,2011,1,True,False,False,True,False
15,0.0,18.04,77.0,19.9995,110,2011,1,True,False,False,True,False
16,0.0,17.22,82.0,19.9995,93,2011,1,True,False,False,True,False
26,0.0,17.22,100.0,19.0012,9,2011,1,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
10844,1.0,14.76,93.0,11.0014,118,2012,12,False,False,True,False,False
10848,1.0,18.04,77.0,6.0032,184,2012,12,False,False,True,False,False
10873,1.0,16.40,54.0,15.0013,200,2012,12,False,False,True,False,False
10881,1.0,15.58,50.0,26.0027,336,2012,12,False,False,True,False,False


In [30]:
x=df.drop('rented_bikes_count',axis=1)
y=df['rented_bikes_count']

In [31]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)


In [32]:
models={
    'LR':LinearRegression(),
    'KNN':KNeighborsRegressor(n_neighbors=7),
    'SVR':SVR()
}

In [35]:
for name,model in models.items():
    print('using:',name)
    model.fit(x_train,y_train)
    print('train accuracy: ',model.score(x_train,y_train))
    print('test accuracy: ',model.score(x_test,y_test))
    print('-'*20)

using: LR
train accuracy:  0.24913257296251579
test accuracy:  0.19544808052899132
--------------------
using: KNN
train accuracy:  0.3738649399846389
test accuracy:  0.10636506511114285
--------------------
using: SVR
train accuracy:  -0.046884338941015624
test accuracy:  -0.06390851158378186
--------------------
