In [27]:
import pandas as pd
import numpy as np
import sidetable as stb
import datetime as dt

In [5]:
df = pd.read_csv('datos/bikes.csv', index_col = 0)

In [6]:
df.sample(10)

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
161,162,11-06-2018,summer,0,6,0,6,0,1,29.725,33.9019,65.4583,10.37495,1729,3237,4966
189,190,09-07-2018,autumn,0,7,0,6,0,1,30.066653,33.2079,60.9167,11.250104,1988,3348,5336
84,85,26-03-2018,summer,0,3,0,6,0,1,10.899153,12.87855,39.4167,14.041257,981,1515,2496
53,54,23-02-2018,spring,0,2,0,3,1,1,9.091299,12.28585,42.3043,6.305571,139,1778,1917
707,708,09-12-2019,winter,1,12,0,0,0,2,15.750847,19.5073,90.5417,10.584325,441,2787,3228
285,286,13-10-2018,winter,0,10,0,4,1,2,24.155847,27.5902,89.6667,9.499729,290,2623,2913
66,67,08-03-2018,spring,0,3,0,2,1,1,11.9925,15.12,42.0833,8.08355,316,1817,2133
47,48,17-02-2018,spring,0,2,0,4,1,1,17.869153,21.4329,50.5,15.416968,259,2216,2475
12,13,13-01-2018,spring,0,1,0,4,1,1,6.765,7.54415,47.0417,20.167,38,1368,1406
168,169,18-06-2018,summer,0,6,0,6,0,1,28.563347,32.1977,67.0417,8.000336,1807,3312,5119


day.csv have the following fields:
	
	- instant: record index
	- dteday : date
	- season : season (spring, summer, autumn, winter)
	- yr : year (0: 2018, 1:2019)
	- mnth : month ( 1 to 12)
	- holiday : weather day is a holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
	- weekday : day of the week
	- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
	+ weathersit : 
		- 1: Clear, Few clouds, Partly cloudy, Partly cloudy
		- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
		- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
		- 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
	- temp : temperature in Celsius
	- atemp: feeling temperature in Celsius
	- hum: humidity
	- windspeed: wind speed
	- casual: count of casual users
	- registered: count of registered users
	- cnt: count of total rental bikes including both casual and registered

In [7]:
df.shape

(730, 16)

In [10]:
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,01-01-2018,spring,0,1,0,6,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985
1,2,02-01-2018,spring,0,1,0,0,0,2,14.902598,17.68695,69.6087,16.652113,131,670,801
2,3,03-01-2018,spring,0,1,0,1,1,1,8.050924,9.47025,43.7273,16.636703,120,1229,1349
3,4,04-01-2018,spring,0,1,0,2,1,1,8.2,10.6061,59.0435,10.739832,108,1454,1562
4,5,05-01-2018,spring,0,1,0,3,1,1,9.305237,11.4635,43.6957,12.5223,82,1518,1600


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 730 entries, 0 to 729
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     730 non-null    int64  
 1   dteday      730 non-null    object 
 2   season      730 non-null    object 
 3   yr          730 non-null    int64  
 4   mnth        730 non-null    int64  
 5   holiday     730 non-null    int64  
 6   weekday     730 non-null    int64  
 7   workingday  730 non-null    int64  
 8   weathersit  730 non-null    int64  
 9   temp        730 non-null    float64
 10  atemp       730 non-null    float64
 11  hum         730 non-null    float64
 12  windspeed   730 non-null    float64
 13  casual      730 non-null    int64  
 14  registered  730 non-null    int64  
 15  cnt         730 non-null    int64  
dtypes: float64(4), int64(10), object(2)
memory usage: 97.0+ KB


In [18]:
df.stb.missing().T

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
missing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
total,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0
percent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
instant,730.0,365.5,210.877136,1.0,183.25,365.5,547.75,730.0
yr,730.0,0.5,0.500343,0.0,0.0,0.5,1.0,1.0
mnth,730.0,6.526027,3.450215,1.0,4.0,7.0,10.0,12.0
holiday,730.0,0.028767,0.167266,0.0,0.0,0.0,0.0,1.0
weekday,730.0,2.99726,2.006161,0.0,1.0,3.0,5.0,6.0
workingday,730.0,0.683562,0.465405,0.0,0.0,1.0,1.0,1.0
weathersit,730.0,1.394521,0.544807,1.0,1.0,1.0,2.0,3.0
temp,730.0,20.319259,7.506729,2.424346,13.811885,20.465826,26.880615,35.328347
atemp,730.0,23.726322,8.150308,3.95348,16.889713,24.368225,30.445775,42.0448
hum,730.0,62.765175,14.237589,0.0,52.0,62.625,72.989575,97.25


In [21]:
df['weekday'].unique()

array([6, 0, 1, 2, 3, 4, 5])

In [15]:
df.describe(include='object').T

Unnamed: 0,count,unique,top,freq
dteday,730,730,01-01-2018,1
season,730,4,autumn,188


In [22]:
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,01-01-2018,spring,0,1,0,6,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985
1,2,02-01-2018,spring,0,1,0,0,0,2,14.902598,17.68695,69.6087,16.652113,131,670,801
2,3,03-01-2018,spring,0,1,0,1,1,1,8.050924,9.47025,43.7273,16.636703,120,1229,1349
3,4,04-01-2018,spring,0,1,0,2,1,1,8.2,10.6061,59.0435,10.739832,108,1454,1562
4,5,05-01-2018,spring,0,1,0,3,1,1,9.305237,11.4635,43.6957,12.5223,82,1518,1600


In [24]:
df.duplicated().sum()

0

In [25]:
df['dteday'] = df['dteday'].apply(pd.to_datetime)

  df['dteday'] = df['dteday'].apply(pd.to_datetime)


In [28]:
df['month'] = df['dteday'].dt.month_name()

In [32]:
df['year'] = df['dteday'].dt.year

In [36]:
df['weekday'] = df['dteday'].dt.day_name()

In [38]:
df['day'] = df['dteday'].dt.day

In [40]:
df.sample(10)

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,month,year,day_name,day
168,169,2018-06-18,summer,0,6,0,Monday,0,1,28.563347,32.1977,67.0417,8.000336,1807,3312,5119,June,2018,Monday,18
151,152,2018-01-06,summer,0,6,0,Saturday,1,2,31.330847,36.04835,67.7083,13.875164,513,3461,3974,January,2018,Saturday,6
112,113,2018-04-23,summer,0,4,0,Monday,0,2,18.86,22.50605,88.7917,15.458575,1462,2574,4036,April,2018,Monday,23
592,593,2019-08-16,autumn,1,8,0,Friday,1,1,29.485847,32.7344,51.9167,9.500332,1338,6267,7605,August,2019,Friday,16
245,246,2018-03-09,autumn,0,9,0,Friday,0,1,27.435847,31.66065,71.6667,12.416775,1935,2549,4484,March,2018,Friday,9
74,75,2018-03-16,spring,0,3,0,Friday,1,2,14.973897,18.3465,77.6522,13.608839,321,1871,2192,March,2018,Friday,16
627,628,2019-09-20,autumn,1,9,0,Friday,1,1,22.413347,26.6096,61.8333,7.917189,939,6781,7720,September,2019,Friday,20
271,272,2018-09-29,winter,0,9,0,Saturday,1,1,25.283347,28.7256,69.9167,11.583161,653,4186,4839,September,2018,Saturday,29
303,304,2018-10-31,winter,0,10,0,Wednesday,1,1,13.94,17.80315,70.3333,7.12545,362,3307,3669,October,2018,Wednesday,31
485,486,2019-01-05,summer,1,5,0,Saturday,1,2,25.146653,28.85105,65.9583,10.458432,653,5087,5740,January,2019,Saturday,5


In [42]:
df.drop(['day_name','yr', 'mnth', 'season'], inplace = True, axis = 1)

In [43]:
df.head()

Unnamed: 0,instant,dteday,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,month,year,day
0,1,2018-01-01,0,Monday,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985,January,2018,1
1,2,2018-02-01,0,Thursday,0,2,14.902598,17.68695,69.6087,16.652113,131,670,801,February,2018,1
2,3,2018-03-01,0,Thursday,1,1,8.050924,9.47025,43.7273,16.636703,120,1229,1349,March,2018,1
3,4,2018-04-01,0,Sunday,1,1,8.2,10.6061,59.0435,10.739832,108,1454,1562,April,2018,1
4,5,2018-05-01,0,Tuesday,1,1,9.305237,11.4635,43.6957,12.5223,82,1518,1600,May,2018,1


In [None]:
def get_season(elem):
    if  elem <=  2018-03-20:
        return 'Winter'
    elif elem > "14" and elem < "18":
        return "Tarde"
    else:
        return "Noche"

In [None]:
season_list = []

for month in df['dteday']:

In [None]:
df['season'] = pd.cut(df['dteday'], bins = 4, la)