In [1]:
import pandas as pd
import numpy as np
import sidetable as stb
import datetime as dt

In [2]:
df = pd.read_csv('datos/bikes.csv', index_col = 0)

In [3]:
df.sample(10)

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
498,499,14-05-2019,summer,1,5,0,1,1,2,23.506653,27.495,78.9583,14.250364,342,2501,2843
315,316,12-11-2018,winter,0,11,0,6,0,1,14.623347,17.8971,55.2917,14.208154,1275,2792,4067
543,544,28-06-2019,autumn,1,6,0,4,1,1,30.715847,33.7756,42.25,11.50055,921,5958,6879
70,71,12-03-2018,spring,0,3,0,6,0,1,13.495847,16.2875,59.4583,14.791925,724,1408,2132
673,674,05-11-2019,winter,1,11,0,1,1,1,13.085847,15.40375,49.4167,15.833775,378,4881,5259
439,440,16-03-2019,spring,1,3,0,5,1,2,17.869153,21.81145,84.2083,7.583864,548,3830,4378
713,714,15-12-2019,winter,1,12,0,6,0,1,13.290847,16.91915,65.0417,7.12545,767,4280,5047
222,223,11-08-2018,autumn,0,8,0,4,1,1,29.4175,32.57605,42.375,11.041332,812,3980,4792
241,242,30-08-2018,autumn,0,8,0,2,1,1,26.205847,29.7352,54.8333,8.375536,775,4429,5204
54,55,24-02-2018,spring,0,2,0,4,1,2,12.121732,14.45955,69.7391,16.783232,100,1707,1807


day.csv have the following fields:
	
	- instant: record index
	- dteday : date
	- season : season (spring, summer, autumn, winter)
	- yr : year (0: 2018, 1:2019)
	- mnth : month ( 1 to 12)
	- holiday : weather day is a holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
	- weekday : day of the week
	- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
	+ weathersit : 
		- 1: Clear, Few clouds, Partly cloudy, Partly cloudy
		- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
		- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
		- 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
	- temp : temperature in Celsius
	- atemp: feeling temperature in Celsius
	- hum: humidity
	- windspeed: wind speed
	- casual: count of casual users
	- registered: count of registered users
	- cnt: count of total rental bikes including both casual and registered

In [4]:
df.shape

(730, 16)

In [5]:
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,01-01-2018,spring,0,1,0,6,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985
1,2,02-01-2018,spring,0,1,0,0,0,2,14.902598,17.68695,69.6087,16.652113,131,670,801
2,3,03-01-2018,spring,0,1,0,1,1,1,8.050924,9.47025,43.7273,16.636703,120,1229,1349
3,4,04-01-2018,spring,0,1,0,2,1,1,8.2,10.6061,59.0435,10.739832,108,1454,1562
4,5,05-01-2018,spring,0,1,0,3,1,1,9.305237,11.4635,43.6957,12.5223,82,1518,1600


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 730 entries, 0 to 729
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     730 non-null    int64  
 1   dteday      730 non-null    object 
 2   season      730 non-null    object 
 3   yr          730 non-null    int64  
 4   mnth        730 non-null    int64  
 5   holiday     730 non-null    int64  
 6   weekday     730 non-null    int64  
 7   workingday  730 non-null    int64  
 8   weathersit  730 non-null    int64  
 9   temp        730 non-null    float64
 10  atemp       730 non-null    float64
 11  hum         730 non-null    float64
 12  windspeed   730 non-null    float64
 13  casual      730 non-null    int64  
 14  registered  730 non-null    int64  
 15  cnt         730 non-null    int64  
dtypes: float64(4), int64(10), object(2)
memory usage: 97.0+ KB


In [7]:
df.stb.missing().T

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
missing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
total,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0
percent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
instant,730.0,365.5,210.877136,1.0,183.25,365.5,547.75,730.0
yr,730.0,0.5,0.500343,0.0,0.0,0.5,1.0,1.0
mnth,730.0,6.526027,3.450215,1.0,4.0,7.0,10.0,12.0
holiday,730.0,0.028767,0.167266,0.0,0.0,0.0,0.0,1.0
weekday,730.0,2.99726,2.006161,0.0,1.0,3.0,5.0,6.0
workingday,730.0,0.683562,0.465405,0.0,0.0,1.0,1.0,1.0
weathersit,730.0,1.394521,0.544807,1.0,1.0,1.0,2.0,3.0
temp,730.0,20.319259,7.506729,2.424346,13.811885,20.465826,26.880615,35.328347
atemp,730.0,23.726322,8.150308,3.95348,16.889713,24.368225,30.445775,42.0448
hum,730.0,62.765175,14.237589,0.0,52.0,62.625,72.989575,97.25


In [9]:
df['weekday'].unique()

array([6, 0, 1, 2, 3, 4, 5])

In [10]:
df.describe(include='object').T

Unnamed: 0,count,unique,top,freq
dteday,730,730,01-01-2018,1
season,730,4,autumn,188


In [11]:
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,01-01-2018,spring,0,1,0,6,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985
1,2,02-01-2018,spring,0,1,0,0,0,2,14.902598,17.68695,69.6087,16.652113,131,670,801
2,3,03-01-2018,spring,0,1,0,1,1,1,8.050924,9.47025,43.7273,16.636703,120,1229,1349
3,4,04-01-2018,spring,0,1,0,2,1,1,8.2,10.6061,59.0435,10.739832,108,1454,1562
4,5,05-01-2018,spring,0,1,0,3,1,1,9.305237,11.4635,43.6957,12.5223,82,1518,1600


In [12]:
df.duplicated().sum()

0

In [13]:
df['dteday'] = df['dteday'].apply(pd.to_datetime)

  df['dteday'] = df['dteday'].apply(pd.to_datetime)


In [14]:
df['month'] = df['dteday'].dt.month_name()

In [15]:
df['year'] = df['dteday'].dt.year

In [16]:
df['weekday'] = df['dteday'].dt.day_name()

In [17]:
df['day'] = df['dteday'].dt.day

In [18]:
df.sample(10)

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,month,year,day
349,350,2018-12-16,winter,0,12,0,Sunday,1,2,15.375,17.99125,50.0417,17.458525,178,3399,3577,December,2018,16
580,581,2019-04-08,autumn,1,8,0,Monday,0,1,32.526653,37.56335,61.3333,17.249686,2345,4479,6824,April,2019,8
91,92,2018-02-04,summer,0,4,0,Sunday,0,2,12.915,15.78185,65.375,13.208782,898,1354,2252,February,2018,4
515,516,2019-05-31,summer,1,5,0,Friday,1,1,27.88,31.56645,49.2917,13.083693,1100,6238,7338,May,2019,31
305,306,2018-02-11,winter,0,11,0,Sunday,1,1,15.4775,19.50665,71.875,5.500144,370,3816,4186,February,2018,11
525,526,2019-10-06,summer,1,6,0,Sunday,0,1,29.793347,33.17585,53.8333,8.959307,2224,4374,6598,October,2019,6
16,17,2018-01-17,spring,0,1,1,Wednesday,0,2,7.209153,8.83855,53.75,12.999139,117,883,1000,January,2018,17
585,586,2019-09-08,autumn,1,8,0,Sunday,1,1,30.989153,34.9754,62.0417,10.4587,1196,6090,7286,September,2019,8
406,407,2019-11-02,spring,1,2,0,Saturday,0,3,9.190847,10.54335,73.125,19.416332,192,1977,2169,November,2019,2
228,229,2018-08-17,autumn,0,8,0,Friday,1,1,29.656653,33.33355,57.5417,9.625689,668,4026,4694,August,2018,17


In [19]:
df.drop(['day_name','yr', 'mnth', 'season'], inplace = True, axis = 1)

KeyError: "['day_name'] not found in axis"

In [None]:
df.head()

Unnamed: 0,instant,dteday,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,month,year,day
0,1,2018-01-01,0,Monday,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985,January,2018,1
1,2,2018-02-01,0,Thursday,0,2,14.902598,17.68695,69.6087,16.652113,131,670,801,February,2018,1
2,3,2018-03-01,0,Thursday,1,1,8.050924,9.47025,43.7273,16.636703,120,1229,1349,March,2018,1
3,4,2018-04-01,0,Sunday,1,1,8.2,10.6061,59.0435,10.739832,108,1454,1562,April,2018,1
4,5,2018-05-01,0,Tuesday,1,1,9.305237,11.4635,43.6957,12.5223,82,1518,1600,May,2018,1


In [None]:
def get_season(elem):
    if  elem <=  2018-03-20:
        return 'Winter'
    elif elem > "14" and elem < "18":
        return "Tarde"
    else:
        return "Noche"

In [None]:
season_list = []

for month in df['dteday']:

In [None]:
df['season'] = pd.cut(df['dteday'], bins = 4, la)

In [23]:


# definir los límites de los intervalos para cada temporada
intervalos = [pd.Timestamp('2017-03-20'), pd.Timestamp('2017-06-21'), pd.Timestamp('2017-09-22'), pd.Timestamp('2017-12-22'), 
              pd.Timestamp('2018-03-20'), pd.Timestamp('2018-06-21'), pd.Timestamp('2018-09-22'), pd.Timestamp('2018-12-22'), pd.Timestamp('2018-12-31')]
etiquetas = ['Invierno', 'Primavera', 'Verano', 'Otoño', 'Invierno', 'Primavera', 'Verano', 'Otoño',]

# categorizar las fechas por temporada
df['season'] = pd.cut(df['dteday'], bins=intervalos, labels=etiquetas, ordered = False)



In [25]:
df.sample(10)

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,month,year,day
112,113,2018-04-23,Invierno,0,4,0,Monday,0,2,18.86,22.50605,88.7917,15.458575,1462,2574,4036,April,2018,23
429,430,2019-06-03,,1,3,0,Monday,1,1,10.591653,12.7521,45.6667,13.458625,221,3735,3956,June,2019,3
722,723,2019-12-24,,1,12,0,Tuesday,1,2,9.483464,12.945,79.1304,5.174437,174,746,920,December,2019,24
396,397,2019-01-02,,1,2,0,Wednesday,1,1,19.235847,23.3269,50.7917,12.667489,304,4275,4579,January,2019,2
525,526,2019-10-06,,1,6,0,Sunday,0,1,29.793347,33.17585,53.8333,8.959307,2224,4374,6598,October,2019,6
348,349,2018-12-15,Verano,0,12,0,Saturday,1,2,17.3225,20.61185,63.4167,17.958814,181,3528,3709,December,2018,15
250,251,2018-08-09,Primavera,0,9,0,Thursday,1,3,25.990433,27.76805,93.9565,12.914116,153,1689,1842,August,2018,9
650,651,2019-10-13,,1,10,0,Sunday,0,1,16.126653,19.5698,49.4583,9.791514,2252,4857,7109,October,2019,13
238,239,2018-08-27,Primavera,0,8,0,Monday,0,2,27.88,31.7778,85.0,25.166339,226,889,1115,August,2018,27
466,467,2019-12-04,,1,4,0,Wednesday,1,1,16.2975,19.3802,46.625,19.458743,663,4746,5409,December,2019,4
