In [1]:
import pandas as pd

# Dataset: Bike Sharing Daily
* Source: https://www.kaggle.com/contactprad/bike-share-daily-data
* Licence / citation: [1] Fanaee-T, Hadi, and Gama, Joao, "Event labeling combining ensemble detectors and background knowledge", Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg, doi:10.1007/s13748-013-0040-3.

@article{fanaee2013event,
year={2013},
issn={2192-6352},
journal={Progress in Artificial Intelligence},
doi={10.1007/s13748-013-0040-3},
title={Event labeling combining ensemble detectors and background knowledge},
url={http://dx.doi.org/10.1007/s13748-013-0040-3},
publisher={Springer Berlin Heidelberg},
keywords={Event labeling; Event detection; Ensemble learning; Background knowledge},
author={Fanaee-T, Hadi and Gama, Joao},
pages={1-15}
}

In [17]:
# Read the daily bike-sharing table from disk; `data` keeps every original column.
data = pd.read_csv('../sklearn-pipelines/datasets/bike_sharing_daily.csv')

# Working frame: only the date and temperature columns, with the date parsed
# to datetime and its year / month split out as separate columns.
# `.assign` returns a new frame, so `data` itself is never modified.
df = (
    data[['dteday', 'temp']]
    .assign(dteday=lambda frame: pd.to_datetime(frame['dteday']))
    .assign(
        year=lambda frame: frame['dteday'].dt.year,
        month=lambda frame: frame['dteday'].dt.month,
    )
)
df

Unnamed: 0,dteday,temp,year,month
0,2011-01-01,0.344167,2011,1
1,2011-01-02,0.363478,2011,1
2,2011-01-03,0.196364,2011,1
3,2011-01-04,0.200000,2011,1
4,2011-01-05,0.226957,2011,1
...,...,...,...,...
726,2012-12-27,0.254167,2012,12
727,2012-12-28,0.253333,2012,12
728,2012-12-29,0.253333,2012,12
729,2012-12-30,0.255833,2012,12


# Simple ranking: highest temperatures

* pandas `DataFrame.rank` documentation: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rank.html

In [19]:
# Rank every day by temperature, hottest first, once per tie-breaking rule:
#   'first' -> tied values get distinct ranks in order of appearance
#   'min'   -> tied values share the lowest rank of their group
#   'dense' -> like 'min', but the rank grows by exactly 1 between groups
tie_rules = {'row_number_temp': 'first', 'rank': 'min', 'dense_rank_temp': 'dense'}
for column, rule in tie_rules.items():
    df[column] = df['temp'].rank(method=rule, ascending=False)

# Show the frame ordered from the hottest day (dense rank 1) downwards.
df.sort_values(by='dense_rank_temp')

Unnamed: 0,dteday,temp,year,month,row_number_temp,rank,dense_rank_temp
553,2012-07-07,0.861667,2012,7,1.0,1.0,1.0
203,2011-07-23,0.849167,2011,7,2.0,2.0,2.0
202,2011-07-22,0.848333,2011,7,3.0,3.0,3.0
209,2011-07-29,0.838333,2011,7,4.0,4.0,4.0
545,2012-06-29,0.834167,2012,6,5.0,5.0,5.0
...,...,...,...,...,...,...,...
407,2012-02-12,0.127500,2012,2,727.0,727.0,495.0
368,2012-01-04,0.107500,2012,1,728.0,728.0,496.0
23,2011-01-24,0.097391,2011,1,729.0,729.0,497.0
22,2011-01-23,0.096522,2011,1,730.0,730.0,498.0


# Simple ranking: lowest temperatures

In [20]:
# Same three ranking columns as before, but now rank 1 is the coldest day.
# Note: this overwrites the existing 'row_number_temp', 'rank' and
# 'dense_rank_temp' columns of `df`.
temps = df['temp']
df['row_number_temp'] = temps.rank(ascending=True, method='first')  # ties broken by order of appearance
df['rank'] = temps.rank(ascending=True, method='min')               # ties share the lowest rank
df['dense_rank_temp'] = temps.rank(ascending=True, method='dense')  # ties share a rank, no gaps after them

# Show the frame ordered from the coldest day (dense rank 1) upwards.
df.sort_values(by='dense_rank_temp')

Unnamed: 0,dteday,temp,year,month,row_number_temp,rank,dense_rank_temp
21,2011-01-22,0.059130,2011,1,1.0,1.0,1.0
22,2011-01-23,0.096522,2011,1,2.0,2.0,2.0
23,2011-01-24,0.097391,2011,1,3.0,3.0,3.0
368,2012-01-04,0.107500,2012,1,4.0,4.0,4.0
407,2012-02-12,0.127500,2012,2,5.0,5.0,5.0
...,...,...,...,...,...,...,...
545,2012-06-29,0.834167,2012,6,727.0,727.0,495.0
209,2011-07-29,0.838333,2011,7,728.0,728.0,496.0
202,2011-07-22,0.848333,2011,7,729.0,729.0,497.0
203,2011-07-23,0.849167,2011,7,730.0,730.0,498.0


# Partition By rankings: highest temperatures for each year-month

In [23]:
# Rank temperatures separately inside each (year, month) group, hottest first;
# with method='min', tied days share the lowest rank of their group.
temp_by_month = df.groupby(['year', 'month'])['temp']
df['rank_by_date'] = temp_by_month.rank(ascending=False, method='min')

# Display chronologically by month, with each month's days ordered by their rank.
df.sort_values(by=['year', 'month', 'rank_by_date'])

Unnamed: 0,dteday,temp,year,month,row_number_temp,rank,dense_rank_temp,rank_by_date
1,2011-01-02,0.363478,2011,1,218.0,218.0,161.0,1.0
0,2011-01-01,0.344167,2011,1,196.0,196.0,143.0,2.0
18,2011-01-19,0.292174,2011,1,121.0,121.0,93.0,3.0
19,2011-01-20,0.261667,2011,1,75.0,75.0,60.0,4.0
14,2011-01-15,0.233333,2011,1,55.0,55.0,48.0,5.0
...,...,...,...,...,...,...,...,...
728,2012-12-29,0.253333,2012,12,66.0,65.0,54.0,26.0
722,2012-12-23,0.245833,2012,12,62.0,61.0,51.0,28.0
725,2012-12-26,0.243333,2012,12,60.0,59.0,50.0,29.0
723,2012-12-24,0.231304,2012,12,53.0,53.0,46.0,30.0
