In [1]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import numpy as np
import pandas as pd


# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns


#  Gestión de warnings
# ------------------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")


# Codificación variables categóricas
# ------------------------------------------------------------------------------
from sklearn.preprocessing import OneHotEncoder


# Modelado y evaluación
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

# Show every column when a DataFrame is rendered (no column truncation).
pd.set_option("display.max_columns", None)

In [2]:
# Load the pickled bike-rental frame and preview its first rows.
df = pd.read_pickle('../datos/bikes_cnt.pkl')
df.head(n=5)

Unnamed: 0,month,year,weekday,workingday,holiday,weathersit,cnt,temp,hum,windspeed
0,January,2018,Monday,0,1,2,985,-0.827613,1.252343,-0.387833
1,February,2018,Thursday,1,0,2,801,-0.722069,0.480996,0.748899
2,March,2018,Thursday,1,0,1,1349,-1.635432,-1.338073,0.745931
3,April,2018,Sunday,0,0,1,1562,-1.61556,-0.261577,-0.389769
4,May,2018,Tuesday,1,0,1,1600,-1.468226,-1.340294,-0.046477


In [3]:
# Summary of the loaded frame: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 730 entries, 0 to 729
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   month       730 non-null    category
 1   year        730 non-null    category
 2   weekday     730 non-null    category
 3   workingday  730 non-null    category
 4   holiday     730 non-null    category
 5   weathersit  730 non-null    category
 6   cnt         730 non-null    int64   
 7   temp        730 non-null    float64 
 8   hum         730 non-null    float64 
 9   windspeed   730 non-null    float64 
dtypes: category(6), float64(3), int64(1)
memory usage: 33.1 KB


In [5]:
# Number of rows per 'holiday' level (counted on the 'cnt' column).
df['cnt'].groupby(df['holiday']).count()

holiday
0    708
1     22
Name: cnt, dtype: int64

In [4]:
# Keep only the columns stored with the pandas 'category' dtype and preview them.
categoricas = df.select_dtypes(include=['category'])
categoricas.head(n=5)

Unnamed: 0,month,year,weekday,workingday,holiday,weathersit
0,January,2018,Monday,0,1,2
1,February,2018,Thursday,1,0,2
2,March,2018,Thursday,1,0,1
3,April,2018,Sunday,0,0,1
4,May,2018,Tuesday,1,0,1


In [5]:
# For every categorical column, compute the share (in %) of the total 'cnt'
# contributed by each of its levels. One dict per column, in column order.
total_cnt = df['cnt'].sum()  # loop-invariant, computed once

lista_diccionarios = []
for col in categoricas:

    dicc = {}

    # Iterate the levels straight from the value_counts index (most frequent
    # first). This avoids relying on the column name produced by
    # `.reset_index()`, which is 'index' on pandas < 2.0 but the original
    # column name on pandas >= 2.0.
    for valor in df[col].value_counts().index:
        # .loc with a boolean mask instead of chained indexing df['cnt'][mask]
        peso = (df.loc[df[col] == valor, 'cnt'].sum() * 100 / total_cnt).round(2)
        dicc[valor] = peso

    lista_diccionarios.append(dicc)

In [8]:
# Display the per-column dictionaries of level -> % of total 'cnt'.
lista_diccionarios

[{'August': 9.67,
  'December': 6.64,
  'January': 5.69,
  'July': 9.65,
  'March': 8.13,
  'May': 9.54,
  'October': 9.2,
  'April': 8.47,
  'June': 9.63,
  'November': 7.61,
  'September': 9.82,
  'February': 5.95},
 {2018: 37.77, 2019: 62.23},
 {'Monday': 13.82,
  'Tuesday': 13.98,
  'Friday': 14.77,
  'Saturday': 14.59,
  'Sunday': 14.37,
  'Thursday': 14.54,
  'Wednesday': 13.93},
 {1: 69.33, 0: 30.67},
 {0: 97.46, 1: 2.54},
 {1: 68.61, 2: 30.24, 3: 1.15}]

In [9]:
# Level -> numeric weight used to encode each categorical column.
# month / weekday reuse the computed % of total 'cnt' shown in the previous
# output; year, holiday and weathersit use small hand-picked integers.
mapa_month = {
    'August': 9.67,
    'December': 6.64,
    'January': 5.69,
    'July': 9.65,
    'March': 8.13,
    'May': 9.54,
    'October': 9.2,
    'April': 8.47,
    'June': 9.63,
    'November': 7.61,
    'September': 9.82,
    'February': 5.95,
}
mapa_year = {2018: 4, 2019: 6}
mapa_weekday = {
    'Monday': 13.82,
    'Tuesday': 13.98,
    'Friday': 14.77,
    'Saturday': 14.59,
    'Sunday': 14.37,
    'Thursday': 14.54,
    # was 13.9: aligned with the computed weight (13.93) like every other entry
    'Wednesday': 13.93,
}
mapa_holiday = {0: 4, 1: 3}
mapa_weathersit = {1: 4, 2: 3, 3: 1}


In [10]:
# 'workingday' is excluded from the weight-based mapping (it is one-hot
# encoded further down). Reassigning instead of inplace=True avoids mutating
# the select_dtypes result in place, and errors='ignore' makes the cell safe
# to re-run once the column is already gone.
categoricas = categoricas.drop(columns=['workingday'], errors='ignore')

In [11]:
# Mappings in the order in which they are consumed by position when the
# categorical columns are encoded.
lista_mapas = [
    mapa_month,
    mapa_year,
    mapa_weekday,
    mapa_holiday,
    mapa_weathersit,
]

In [12]:
# Replace every level of the remaining categorical columns with its weight.
# Columns and mappings are paired by position, so verify that both sides have
# the same length *before* touching df; otherwise a stale `categoricas`
# (e.g. 'workingday' not dropped yet) would overwrite some columns with the
# wrong mapping and only then fail.
columnas_a_mapear = list(categoricas.columns)
if len(columnas_a_mapear) != len(lista_mapas):
    raise ValueError(
        f"Expected {len(lista_mapas)} categorical columns to map, "
        f"got {len(columnas_a_mapear)}: {columnas_a_mapear}"
    )

for col, mapa in zip(columnas_a_mapear, lista_mapas):
    df[col] = df[col].map(mapa)

In [13]:
# Preview the frame after the categorical levels were replaced by their weights.
df.head()

Unnamed: 0,month,year,weekday,workingday,holiday,weathersit,cnt,temp,hum,windspeed
0,5.69,4,13.82,0,3,3,985,-0.827613,1.252343,-0.387833
1,5.95,4,14.54,1,4,3,801,-0.722069,0.480996,0.748899
2,8.13,4,14.54,1,4,4,1349,-1.635432,-1.338073,0.745931
3,8.47,4,14.37,0,4,4,1562,-1.61556,-0.261577,-0.389769
4,9.54,4,13.98,1,4,4,1600,-1.468226,-1.340294,-0.046477


Consideramos que la columna 'workingday' no tiene un orden inherente, por lo que para codificarla utilizaremos el método One-Hot Encoding.

In [10]:
import pickle

In [11]:
def one_hot_encoder_one(df,columna,keep_first=True):
    """One-hot encode a single column and return a new DataFrame.

    A ``OneHotEncoder(dtype='int')`` is fitted on ``df[[columna]]``. The
    resulting indicator columns (named by ``get_feature_names_out()``) are
    appended to the right of ``df`` and the original column is removed.
    The fitted encoder is also pickled to ``one_hot.pkl`` (relative path,
    overwritten on every call).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columna : str
        Name of the column to encode.
    keep_first : bool, default True
        Currently unused; kept so existing calls keep working. It has no
        effect on the result.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``columna`` plus one indicator column per
        category found in it.
    """
    oh = OneHotEncoder(dtype='int')

    transformados = oh.fit_transform(df[[columna]])

    # Reuse df's index: with a default RangeIndex, the axis=1 concat below
    # would align on labels and produce NaN rows whenever df.index is not
    # exactly 0..n-1 (e.g. after filtering or a train/test split).
    oh_df = pd.DataFrame(
        transformados.toarray(),
        columns=oh.get_feature_names_out(),
        index=df.index,
    )

    final = pd.concat([df.drop(columns=[columna]), oh_df], axis=1)

    # Persist the fitted encoder so the same encoding can be reapplied later.
    with open("one_hot.pkl", "wb") as f:
        pickle.dump(oh, f)

    return final

In [12]:
# One-hot encode 'workingday'; the other columns are carried over unchanged.
df_cnt_encod = one_hot_encoder_one(df, columna='workingday')


In [16]:
# Display the encoded frame (pandas truncates the rendering of long frames).
df_cnt_encod

Unnamed: 0,month,year,weekday,holiday,weathersit,cnt,temp,hum,windspeed,workingday_0,workingday_1
0,5.69,4,13.82,3,3,985,-0.827613,1.252343,-0.387833,1,0
1,5.95,4,14.54,4,3,801,-0.722069,0.480996,0.748899,0,1
2,8.13,4,14.54,4,4,1349,-1.635432,-1.338073,0.745931,0,1
3,8.47,4,14.37,4,4,1562,-1.615560,-0.261577,-0.389769,1,0
4,9.54,4,13.98,4,4,1600,-1.468226,-1.340294,-0.046477,0,1
...,...,...,...,...,...,...,...,...,...,...,...
725,6.64,6,14.77,4,3,2114,-1.319509,0.177576,2.059845,0,1
726,6.64,6,14.59,4,3,3095,-1.324068,-0.264634,-0.452029,1,0
727,6.64,6,14.37,4,3,1341,-1.324068,0.880424,-0.853182,1,0
728,6.64,6,13.82,4,4,1796,-1.310404,-1.014341,2.067858,0,1


In [17]:
# Persist the fully encoded frame to disk as a pickle file.
df_cnt_encod.to_pickle(path='../datos/df_cnt_prep.pkl')