In [1]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import numpy as np
import pandas as pd


# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns


#  Gestión de warnings
# ------------------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")


# Codificación variables categóricas
# ------------------------------------------------------------------------------
from sklearn.preprocessing import OneHotEncoder


# Modelado y evaluación
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

In [2]:
# Load the bike-rental frame from a pickle file (path is relative to the notebook).
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
df = pd.read_pickle('../datos/bikes_registered.pkl')
# Preview the first rows.
df.head()

Unnamed: 0,month,year,weekday,workingday,holiday,weathersit,registered,temp,hum,windspeed
0,January,2018,Monday,0,1,2,654,-0.827613,1.252343,-0.387833
1,February,2018,Thursday,1,0,2,670,-0.722069,0.480996,0.748899
2,March,2018,Thursday,1,0,1,1229,-1.635432,-1.338073,0.745931
3,April,2018,Sunday,0,0,1,1454,-1.61556,-0.261577,-0.389769
4,May,2018,Tuesday,1,0,1,1518,-1.468226,-1.340294,-0.046477


In [3]:
# Summary of column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 730 entries, 0 to 729
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   month       730 non-null    category
 1   year        730 non-null    category
 2   weekday     730 non-null    category
 3   workingday  730 non-null    category
 4   holiday     730 non-null    category
 5   weathersit  730 non-null    category
 6   registered  730 non-null    int64   
 7   temp        730 non-null    float64 
 8   hum         730 non-null    float64 
 9   windspeed   730 non-null    float64 
dtypes: category(6), float64(3), int64(1)
memory usage: 33.1 KB


In [4]:
# Number of non-null `registered` observations for each value of `holiday`.
df.groupby('holiday')['registered'].count()

holiday
0    708
1     22
Name: registered, dtype: int64

In [5]:
# Sub-frame holding only the columns whose dtype is `category`.
categoricas = df.select_dtypes(include = 'category')
# Preview the first rows of the categorical columns.
categoricas.head()

Unnamed: 0,month,year,weekday,workingday,holiday,weathersit
0,January,2018,Monday,0,1,2
1,February,2018,Thursday,1,0,2
2,March,2018,Thursday,1,0,1
3,April,2018,Sunday,0,0,1
4,May,2018,Tuesday,1,0,1


In [6]:
# For every categorical column, compute the percentage of the total `registered`
# count contributed by each of its levels. The result is one dict per column
# ({level: percentage rounded to 2 decimals}), appended in the column order of
# `categoricas`.

# The grand total does not change inside the loops, so compute it once.
total_registered = df['registered'].sum()

lista_diccionarios = []
for col in categoricas:

    dicc = {}

    # Take the levels from the index of value_counts() instead of from a
    # reset_index() column: that column is called 'index' on pandas < 2.0 but is
    # named after the Series on pandas >= 2.0, so df_cat['index'] raises KeyError
    # there. The index gives the same levels in the same order on both versions.
    for valor in df[col].value_counts().index:
        registrados_valor = df.loc[df[col] == valor, 'registered'].sum()
        peso = (registrados_valor*100/total_registered).round(2)
        dicc[valor] = peso

    lista_diccionarios.append(dicc)

In [7]:
# Display the per-level percentages, one dict per categorical column.
lista_diccionarios

[{'August': 9.59,
  'December': 7.03,
  'January': 6.07,
  'July': 9.46,
  'March': 7.92,
  'May': 9.13,
  'October': 9.17,
  'April': 8.21,
  'June': 9.5,
  'November': 7.89,
  'September': 9.84,
  'February': 6.19},
 {2018: 37.29, 2019: 62.71},
 {'Monday': 12.45,
  'Tuesday': 13.84,
  'Friday': 15.27,
  'Saturday': 15.17,
  'Sunday': 13.53,
  'Thursday': 15.37,
  'Wednesday': 14.38},
 {1: 69.82, 0: 30.18},
 {0: 97.86, 1: 2.14},
 {1: 67.83, 2: 30.9, 3: 1.27}]

In [8]:
# Hand-assigned numeric weights used to replace each level of the categorical
# columns. The values are typed in manually; they appear to track the percentage
# shares of `registered` shown in the previous cell output -- TODO confirm.

# Months, listed in calendar order.
mapa_month = {
    'January': 1,
    'February': 1.4,
    'March': 1.7,
    'April': 1.9,
    'May': 2.1,
    'June': 2.4,
    'July': 2.3,
    'August': 2.4,
    'September': 2.6,
    'October': 2.1,
    'November': 1.8,
    'December': 1.5,
}

mapa_year = {
    2018: 1,
    2019: 2,
}

# Weekdays, listed Monday through Sunday.
mapa_weekday = {
    'Monday': 1,
    'Tuesday': 1.2,
    'Wednesday': 1.3,
    'Thursday': 1.5,
    'Friday': 1.5,
    'Saturday': 1.5,
    'Sunday': 1.2,
}

# NOTE(review): the holiday level (1) holds about 2% of `registered` in the
# previous cell output yet gets the larger weight, unlike the other maps where
# the level with the larger share gets the larger weight -- confirm this is intended.
mapa_holiday = {
    0: 1,
    1: 3,
}

mapa_weathersit = {
    1: 4,
    2: 3,
    3: 1,
}

mapa_working_day = {
    0: 1,
    1: 2.5,
}


In [9]:
# The maps are applied to the columns of `categoricas` by position, so this list
# must follow the same order as those columns:
# month, year, weekday, workingday, holiday, weathersit.
lista_mapas = [mapa_month, mapa_year, mapa_weekday, mapa_working_day, mapa_holiday,  mapa_weathersit ]

In [10]:
# Replace every level of each categorical column in `df` with its numeric weight.
# `lista_mapas` is paired with the columns of `categoricas` by position.

# zip() would silently drop the extra items if the two lengths differed.
if len(lista_mapas) != len(categoricas.columns):
    raise ValueError(
        f"Expected one map per categorical column: got {len(lista_mapas)} maps "
        f"for {len(categoricas.columns)} columns"
    )

for col, mapa in zip(categoricas.columns, lista_mapas):

    # Series.map turns any value without a key in the dict into NaN. That happens
    # silently when this cell is run a second time (the column already holds
    # weights, not the original levels) or when a map misses a level, so fail
    # loudly instead of overwriting the column with NaN.
    sin_mapear = set(df[col].dropna().unique()) - set(mapa)
    if sin_mapear:
        raise ValueError(
            f"Column '{col}' has values with no entry in its map: {sorted(sin_mapear, key=str)}. "
            "Was this cell already executed?"
        )

    df[col] = df[col].map(mapa)

In [11]:
# Preview the frame after the categorical levels were replaced by their weights.
df.head()

Unnamed: 0,month,year,weekday,workingday,holiday,weathersit,registered,temp,hum,windspeed
0,1.0,1,1.0,1.0,3,3,654,-0.827613,1.252343,-0.387833
1,1.4,1,1.5,2.5,1,3,670,-0.722069,0.480996,0.748899
2,1.7,1,1.5,2.5,1,4,1229,-1.635432,-1.338073,0.745931
3,1.9,1,1.2,1.0,1,4,1454,-1.61556,-0.261577,-0.389769
4,2.1,1,1.2,2.5,1,4,1518,-1.468226,-1.340294,-0.046477


In [12]:
# Save the transformed frame to disk as a pickle (path is relative to the notebook).
df.to_pickle('../datos/df_registered_prep2.pkl')