In [1]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import numpy as np
import pandas as pd


# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns


#  Gestión de warnings
# ------------------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")


# Codificación variables categóricas
# ------------------------------------------------------------------------------
from sklearn.preprocessing import OneHotEncoder


# Modelado y evaluación
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

pd.options.display.max_columns = None

In [2]:
# Load the dataset from the pickle file and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
df = pd.read_pickle('../datos/bikes_casual.pkl')
df.head()

Unnamed: 0,month,year,weekday,workingday,holiday,weathersit,casual,temp,hum,windspeed
0,January,2018,Monday,0,1,2,331,-0.827613,1.252343,-0.387833
1,February,2018,Thursday,1,0,2,131,-0.722069,0.480996,0.748899
2,March,2018,Thursday,1,0,1,120,-1.635432,-1.338073,0.745931
3,April,2018,Sunday,0,0,1,108,-1.61556,-0.261577,-0.389769
4,May,2018,Tuesday,1,0,1,82,-1.468226,-1.340294,-0.046477


In [3]:
# Print column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 730 entries, 0 to 729
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   month       730 non-null    category
 1   year        730 non-null    category
 2   weekday     730 non-null    category
 3   workingday  730 non-null    category
 4   holiday     730 non-null    category
 5   weathersit  730 non-null    category
 6   casual      730 non-null    int64   
 7   temp        730 non-null    float64 
 8   hum         730 non-null    float64 
 9   windspeed   730 non-null    float64 
dtypes: category(6), float64(3), int64(1)
memory usage: 33.1 KB


In [4]:
# Number of non-null 'casual' observations for each value of 'holiday'.
df.groupby('holiday')['casual'].count()

holiday
0    708
1     22
Name: casual, dtype: int64

In [5]:
# Subset of df containing only the columns with 'category' dtype.
categoricas = df.select_dtypes(include = 'category')
categoricas.head()

Unnamed: 0,month,year,weekday,workingday,holiday,weathersit
0,January,2018,Monday,0,1,2
1,February,2018,Thursday,1,0,2
2,March,2018,Thursday,1,0,1
3,April,2018,Sunday,0,0,1
4,May,2018,Tuesday,1,0,1


In [6]:
# For every categorical column, compute the percentage of the total 'casual'
# count contributed by each category. The result is one dictionary per column
# ({category: percentage}), collected in the same order as the columns of
# `categoricas`.

# The grand total does not change inside the loops, so compute it once.
total_casual = df['casual'].sum()

lista_diccionarios = []
for col in categoricas:

    dicc = {}

    # Iterate over value_counts().index (most frequent category first).
    # Going through reset_index() and reading an 'index' column only works on
    # pandas < 2.0; from 2.0 on that column is named after the Series and the
    # lookup raises KeyError.
    for valor in df[col].value_counts().index:
        peso = (df.loc[df[col] == valor, 'casual'].sum() * 100 / total_casual).round(2)
        dicc[valor] = peso

    lista_diccionarios.append(dicc)

In [7]:
# Show the percentage dictionaries, one per categorical column.
lista_diccionarios

[{'August': 10.03,
  'December': 4.96,
  'January': 4.05,
  'July': 10.47,
  'March': 9.01,
  'May': 11.33,
  'October': 9.3,
  'April': 9.59,
  'June': 10.19,
  'November': 6.39,
  'September': 9.76,
  'February': 4.92},
 {2018: 39.88, 2019: 60.12},
 {'Monday': 19.75,
  'Tuesday': 14.6,
  'Friday': 12.63,
  'Saturday': 12.12,
  'Sunday': 17.98,
  'Thursday': 10.93,
  'Wednesday': 11.98},
 {1: 67.22, 0: 32.78},
 {0: 95.78, 1: 4.22},
 {1: 72.0, 2: 27.37, 3: 0.63}]

In [8]:
# Numeric value assigned to each category when encoding the categorical
# columns.

# Month -> weight (same figures as the percentage share of 'casual' per month).
mapa_month = {
    'August': 10.03,
    'December': 4.96,
    'January': 4.05,
    'July': 10.47,
    'March': 9.01,
    'May': 11.33,
    'October': 9.3,
    'April': 9.59,
    'June': 10.19,
    'November': 6.39,
    'September': 9.76,
    'February': 4.92,
}

# NOTE(review): these year weights are hand-set and do not equal the computed
# shares (39.88 / 60.12) -- confirm they are intentional.
mapa_year = {
    2018: 2.7,
    2019: 4.6,
}

# Weekday -> weight (same figures as the percentage share of 'casual' per weekday).
mapa_weekday = {
    'Monday': 19.75,
    'Tuesday': 14.6,
    'Friday': 12.63,
    'Saturday': 12.12,
    'Sunday': 17.98,
    'Thursday': 10.93,
    'Wednesday': 11.98,
}

# NOTE(review): hand-set ranks rather than the computed shares
# (72.0 / 27.37 / 0.63) -- confirm they are intentional.
mapa_weathersit = {
    1: 4,
    2: 3,
    3: 1,
}

In [9]:
# Remove 'workingday' and 'holiday' from the categorical subset.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# the cell idempotent: running it a second time is a no-op rather than a
# KeyError for the already-removed columns.
categoricas = categoricas.drop(columns=['workingday', 'holiday'], errors='ignore')

In [10]:
# Maps listed in the order month, year, weekday, weathersit.
# NOTE(review): the mapping loop pairs these with the columns of `categoricas`
# by position, so this order must match that column order.
lista_mapas = [mapa_month, mapa_year, mapa_weekday, mapa_weathersit]

In [11]:
# Replace each category of the selected columns by its numeric weight.
# Columns and maps are paired by position, so first make sure there is exactly
# one map per column; otherwise a column would be encoded with another
# column's map.
columnas_a_mapear = list(categoricas.columns)
if len(columnas_a_mapear) != len(lista_mapas):
    raise ValueError(
        f"Expected {len(lista_mapas)} columns to map, got {len(columnas_a_mapear)}: {columnas_a_mapear}"
    )

for col, mapa in zip(columnas_a_mapear, lista_mapas):

    # Series.map turns every value missing from the map into NaN without any
    # warning (this is what happens if the cell is executed twice, or if the
    # map belongs to a different column). Fail loudly instead.
    sin_mapa = set(df[col].dropna().unique()) - set(mapa)
    if sin_mapa:
        raise ValueError(f"Column '{col}' has values without a mapping: {sorted(sin_mapa, key=str)}")

    df[col] = df[col].map(mapa)

In [12]:
# Preview df after replacing the categories with their numeric values.
df.head()

Unnamed: 0,month,year,weekday,workingday,holiday,weathersit,casual,temp,hum,windspeed
0,4.05,2.7,19.75,0,1,3,331,-0.827613,1.252343,-0.387833
1,4.92,2.7,10.93,1,0,3,131,-0.722069,0.480996,0.748899
2,9.01,2.7,10.93,1,0,4,120,-1.635432,-1.338073,0.745931
3,9.59,2.7,17.98,0,0,4,108,-1.61556,-0.261577,-0.389769
4,11.33,2.7,14.6,1,0,4,82,-1.468226,-1.340294,-0.046477


Consideramos que la columna 'workingday' no tiene orden, por lo que para codificarla utilizaremos el método One-Hot Encoding.

In [13]:
def one_hot_encoder_one(df,columna,keep_first=True):
    """One-hot encode a single column and return the result as a new DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    columna : str
        Name of the column to encode.
    keep_first : bool, default True
        If True, one indicator column is created for every category.
        If False, the indicator of the first category is dropped
        (``OneHotEncoder(drop='first')``), leaving k-1 columns for k categories.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``columna`` and with the integer indicator columns
        (named by ``OneHotEncoder.get_feature_names_out``) appended on the right.
    """
    oh = OneHotEncoder(dtype='int', drop=None if keep_first else 'first')

    transformados = oh.fit_transform(df[[columna]])

    # Reuse df's index: pd.concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign the rows (and introduce NaN) whenever df has
    # been filtered, shuffled or otherwise carries a non-default index.
    oh_df = pd.DataFrame(
        transformados.toarray(),
        columns=oh.get_feature_names_out(),
        index=df.index,
    )

    return pd.concat([df.drop(columns=columna), oh_df], axis=1)

In [14]:
# One-hot encode 'workingday'; the result is a new DataFrame.
df_casual_encod = one_hot_encoder_one(df,'workingday')

In [15]:
# Display the encoded DataFrame.
df_casual_encod

Unnamed: 0,month,year,weekday,holiday,weathersit,casual,temp,hum,windspeed,workingday_0,workingday_1
0,4.05,2.7,19.75,1,3,331,-0.827613,1.252343,-0.387833,1,0
1,4.92,2.7,10.93,0,3,131,-0.722069,0.480996,0.748899,0,1
2,9.01,2.7,10.93,0,4,120,-1.635432,-1.338073,0.745931,0,1
3,9.59,2.7,17.98,0,4,108,-1.615560,-0.261577,-0.389769,1,0
4,11.33,2.7,14.60,0,4,82,-1.468226,-1.340294,-0.046477,0,1
...,...,...,...,...,...,...,...,...,...,...,...
725,4.96,4.6,12.63,0,3,247,-1.319509,0.177576,2.059845,0,1
726,4.96,4.6,12.12,0,3,644,-1.324068,-0.264634,-0.452029,1,0
727,4.96,4.6,17.98,0,3,159,-1.324068,0.880424,-0.853182,1,0
728,4.96,4.6,19.75,0,4,364,-1.310404,-1.014341,2.067858,0,1


In [16]:
# Save the encoded DataFrame to a pickle file.
df_casual_encod.to_pickle('../datos/df_casual_prep.pkl')