In [41]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import numpy as np
import pandas as pd


# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns


#  Gestión de warnings
# ------------------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")


# Codificación variables categóricas
# ------------------------------------------------------------------------------
from sklearn.preprocessing import OneHotEncoder


# Modelado y evaluación
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

# Show every column when a DataFrame is rendered (no column truncation).
pd.set_option('display.max_columns', None)

In [42]:
# Load the pickled dataset and preview its first rows.
ruta_datos = '../datos/bikes_registered.pkl'
df = pd.read_pickle(ruta_datos)
df.head()

Unnamed: 0,month,year,weekday,workingday,holiday,weathersit,registered,temp,hum,windspeed
0,January,2018,Monday,0,1,2,654,-0.827613,1.252343,-0.387833
1,February,2018,Thursday,1,0,2,670,-0.722069,0.480996,0.748899
2,March,2018,Thursday,1,0,1,1229,-1.635432,-1.338073,0.745931
3,April,2018,Sunday,0,0,1,1454,-1.61556,-0.261577,-0.389769
4,May,2018,Tuesday,1,0,1,1518,-1.468226,-1.340294,-0.046477


In [43]:
# Summary of column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 730 entries, 0 to 729
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   month       730 non-null    category
 1   year        730 non-null    category
 2   weekday     730 non-null    category
 3   workingday  730 non-null    category
 4   holiday     730 non-null    category
 5   weathersit  730 non-null    category
 6   registered  730 non-null    int64   
 7   temp        730 non-null    float64 
 8   hum         730 non-null    float64 
 9   windspeed   730 non-null    float64 
dtypes: category(6), float64(3), int64(1)
memory usage: 33.1 KB


In [44]:
# Number of rows for each value of `holiday`.
df['registered'].groupby(df['holiday']).count()

holiday
0    708
1     22
Name: registered, dtype: int64

In [45]:
# Keep only the columns stored with the pandas `category` dtype.
categoricas = df.select_dtypes(include=['category'])
categoricas.head()

Unnamed: 0,month,year,weekday,workingday,holiday,weathersit
0,January,2018,Monday,0,1,2
1,February,2018,Thursday,1,0,2
2,March,2018,Thursday,1,0,1
3,April,2018,Sunday,0,0,1
4,May,2018,Tuesday,1,0,1


In [46]:
# For every categorical column, compute the percentage of the total
# `registered` count that falls in each of its categories.
# The result is one dict per column, in the column order of `categoricas`.
total_registrados = df['registered'].sum()  # loop-invariant denominator

lista_diccionarios = []
for col in categoricas:

    # Take the categories from the index of value_counts() instead of
    # `reset_index()['index']`: since pandas 2.0 the reset column is named
    # after the Series, so looking up 'index' raises KeyError there.
    categorias = df[col].value_counts().index

    dicc = {}
    for valor in categorias:
        registrados_valor = df.loc[df[col] == valor, 'registered'].sum()
        dicc[valor] = (registrados_valor * 100 / total_registrados).round(2)

    lista_diccionarios.append(dicc)

In [47]:
# Display the per-column percentage dictionaries built in the previous cell.
lista_diccionarios

[{'August': 9.59,
  'December': 7.03,
  'January': 6.07,
  'July': 9.46,
  'March': 7.92,
  'May': 9.13,
  'October': 9.17,
  'April': 8.21,
  'June': 9.5,
  'November': 7.89,
  'September': 9.84,
  'February': 6.19},
 {2018: 37.29, 2019: 62.71},
 {'Monday': 12.45,
  'Tuesday': 13.84,
  'Friday': 15.27,
  'Saturday': 15.17,
  'Sunday': 13.53,
  'Thursday': 15.37,
  'Wednesday': 14.38},
 {1: 69.82, 0: 30.18},
 {0: 97.86, 1: 2.14},
 {1: 67.83, 2: 30.9, 3: 1.27}]

In [48]:
# Numeric codes used to replace each categorical value.

# Month and weekday codes: hard-coded values, one per category.
mapa_month = {
    'January': 6.07,
    'February': 6.19,
    'March': 7.92,
    'April': 8.21,
    'May': 9.13,
    'June': 9.5,
    'July': 9.46,
    'August': 9.59,
    'September': 9.84,
    'October': 9.17,
    'November': 7.89,
    'December': 7.03,
}

mapa_weekday = {
    'Monday': 12.45,
    'Tuesday': 13.84,
    'Wednesday': 14.38,
    'Thursday': 15.37,
    'Friday': 15.27,
    'Saturday': 15.17,
    'Sunday': 13.53,
}

# Manually chosen codes for the remaining columns.
mapa_year = {
    2018: 2.7,
    2019: 4.6,
}

mapa_holiday = {
    0: 1,
    1: 0,
}

mapa_weathersit = {
    1: 4,
    2: 3,
    3: 1,
}


In [49]:
# `workingday` is excluded from the columns that get a numeric mapping.
# Rebind instead of dropping in place, and ignore a missing column, so the
# cell can be re-run without raising KeyError.
categoricas = categoricas.drop(columns=['workingday'], errors='ignore')

In [50]:
# Mapping dictionaries, listed in the order in which they are applied to
# the columns of `categoricas`.
lista_mapas = [
    mapa_month,
    mapa_year,
    mapa_weekday,
    mapa_holiday,
    mapa_weathersit,
]

In [51]:
# Replace every categorical column of `df` by its numeric code.
# Maps are matched to columns by position, so first make sure both
# sequences have the same length; otherwise columns would be encoded with
# the wrong map before any error surfaced.
columnas_a_mapear = list(categoricas.columns)
if len(columnas_a_mapear) != len(lista_mapas):
    raise ValueError(
        f"Expected one map per column: {len(columnas_a_mapear)} columns "
        f"{columnas_a_mapear} but {len(lista_mapas)} maps"
    )

for col, mapa in zip(columnas_a_mapear, lista_mapas):

    codificada = df[col].map(mapa)

    # Values missing from the map become NaN. That also happens when this
    # cell is run twice, because the column then already holds the codes.
    if codificada.isna().sum() > df[col].isna().sum():
        raise ValueError(
            f"Column '{col}' has values that are not keys of its map "
            "(was this cell already executed?)"
        )

    df[col] = codificada

In [52]:
# Preview the frame after the categorical columns were replaced by their codes.
df.head()

Unnamed: 0,month,year,weekday,workingday,holiday,weathersit,registered,temp,hum,windspeed
0,6.07,2.7,12.45,0,0,3,654,-0.827613,1.252343,-0.387833
1,6.19,2.7,15.37,1,1,3,670,-0.722069,0.480996,0.748899
2,7.92,2.7,15.37,1,1,4,1229,-1.635432,-1.338073,0.745931
3,8.21,2.7,13.53,0,1,4,1454,-1.61556,-0.261577,-0.389769
4,9.13,2.7,13.84,1,1,4,1518,-1.468226,-1.340294,-0.046477


Consideramos que la columna 'workingday' no tiene un orden intrínseco, por lo que la codificaremos con el método One-Hot Encoding.

In [53]:
def one_hot_encoder_one(df,columna,keep_first=True):
    """One-hot encode a single column and return the result as a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    columna : column label
        Column of `df` to encode.
    keep_first : bool, default True
        Accepted for backward compatibility; it currently has no effect
        (every level of `columna` always gets its own indicator column).

    Returns
    -------
    pandas.DataFrame
        `df` without `columna`, followed by one integer indicator column per
        distinct value, named by `OneHotEncoder.get_feature_names_out()`.
    """
    oh = OneHotEncoder(dtype='int')

    transformados = oh.fit_transform(df[[columna]])

    # Reuse the index of `df`: with a default RangeIndex, the axis=1 concat
    # below would misalign rows (and create NaN-filled ones) whenever `df`
    # is not indexed 0..n-1, e.g. after filtering or a train/test split.
    oh_df = pd.DataFrame(
        transformados.toarray(),
        columns=oh.get_feature_names_out(),
        index=df.index,
    )

    return pd.concat([df.drop(columns=columna), oh_df], axis=1)

In [54]:
# One-hot encode `workingday`; the other columns are carried over unchanged.
df_registered_encod = one_hot_encoder_one(df, columna='workingday')


In [55]:
# Display the encoded frame (the rendered output below is truncated).
df_registered_encod

Unnamed: 0,month,year,weekday,holiday,weathersit,registered,temp,hum,windspeed,workingday_0,workingday_1
0,6.07,2.7,12.45,0,3,654,-0.827613,1.252343,-0.387833,1,0
1,6.19,2.7,15.37,1,3,670,-0.722069,0.480996,0.748899,0,1
2,7.92,2.7,15.37,1,4,1229,-1.635432,-1.338073,0.745931,0,1
3,8.21,2.7,13.53,1,4,1454,-1.615560,-0.261577,-0.389769,1,0
4,9.13,2.7,13.84,1,4,1518,-1.468226,-1.340294,-0.046477,0,1
...,...,...,...,...,...,...,...,...,...,...,...
725,7.03,4.6,15.27,1,3,1867,-1.319509,0.177576,2.059845,0,1
726,7.03,4.6,15.17,1,3,2451,-1.324068,-0.264634,-0.452029,1,0
727,7.03,4.6,13.53,1,3,1182,-1.324068,0.880424,-0.853182,1,0
728,7.03,4.6,12.45,1,4,1432,-1.310404,-1.014341,2.067858,0,1


In [56]:
# Persist the encoded frame.
# This notebook reads '../datos/bikes_registered.pkl' and keeps `registered`
# as a column, so the output is written under a "registered" name. The
# previous target, 'df_casual_prep.pkl', looked like a copy-paste leftover.
# NOTE(review): confirm that downstream notebooks read this file name.
df_registered_encod.to_pickle('../datos/df_registered_prep.pkl')