# Descripción del proyecto

La compañía Sweet Lift Taxi ha recopilado datos históricos sobre pedidos de taxis en los aeropuertos. Para atraer a más conductores durante las horas pico, necesitamos predecir la cantidad de pedidos de taxis para la próxima hora.

## Preparación

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('taxi.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   datetime    26496 non-null  object
 1   num_orders  26496 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 414.1+ KB


In [5]:
# Conversión a tipo datetime y pasarlo como índice
df['datetime'] = pd.to_datetime(df['datetime'])
df = df.set_index('datetime')

In [6]:
# Sumar intervalos de una hora
df = df.resample('1H').sum()
df.head()

Unnamed: 0_level_0,num_orders
datetime,Unnamed: 1_level_1
2018-03-01 00:00:00,124
2018-03-01 01:00:00,85
2018-03-01 02:00:00,71
2018-03-01 03:00:00,66
2018-03-01 04:00:00,43


## Análisis

In [7]:
df.describe()

Unnamed: 0,num_orders
count,4416.0
mean,84.422781
std,45.023853
min,0.0
25%,54.0
50%,78.0
75%,107.0
max,462.0


### Enriqueceimiento de datos

In [None]:
# Creamos función que complete las características según nuestras necesidades
def new_characteristics(df, max_lag, rolling_mean_size):
    """
    Agrega características de año, mes, día del mes,
    día de la semana, hora, desfases elegidos y
    promedio móvil elegido y elimina los valores ausentes
    """

    df_new = df.copy()
    df_new['year'] = df_new.index.year
    df_new['month'] = df_new.index.month
    df_new['day'] = df_new.index.day
    df_new['dayofweek'] = df_new.index.dayofweek
    df_new['hour'] = df_new.index.hour

    for lag in range(1, max_lag + 1):
        df_new['lag_{}'.format(lag)] = df_new['num_orders'].shift(lag)

    df_new['rolling_mean'] = (
        df_new['num_orders'].shift().rolling(rolling_mean_size).mean()
    )

    old_len = len(df_new)
    df_new = df_new.dropna()
    new_len = len(df_new)
    print(f'Se eliminaron {old_len - new_len} filas.')

    return df_new


df_new = new_characteristics(df, 7, 12)
df_new.head(24)

Se eliminaron 12 filas.


Unnamed: 0_level_0,num_orders,year,month,day,dayofweek,hour,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,rolling_mean
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2018-03-01 12:00:00,30,2018,3,1,3,12,96.0,64.0,69.0,34.0,15.0,12.0,6.0,57.083333
2018-03-01 13:00:00,32,2018,3,1,3,13,30.0,96.0,64.0,69.0,34.0,15.0,12.0,49.25
2018-03-01 14:00:00,48,2018,3,1,3,14,32.0,30.0,96.0,64.0,69.0,34.0,15.0,44.833333
2018-03-01 15:00:00,66,2018,3,1,3,15,48.0,32.0,30.0,96.0,64.0,69.0,34.0,42.916667
2018-03-01 16:00:00,43,2018,3,1,3,16,66.0,48.0,32.0,30.0,96.0,64.0,69.0,42.916667
2018-03-01 17:00:00,44,2018,3,1,3,17,43.0,66.0,48.0,32.0,30.0,96.0,64.0,42.916667
2018-03-01 18:00:00,73,2018,3,1,3,18,44.0,43.0,66.0,48.0,32.0,30.0,96.0,46.083333
2018-03-01 19:00:00,45,2018,3,1,3,19,73.0,44.0,43.0,66.0,48.0,32.0,30.0,51.166667
2018-03-01 20:00:00,61,2018,3,1,3,20,45.0,73.0,44.0,43.0,66.0,48.0,32.0,53.666667
2018-03-01 21:00:00,66,2018,3,1,3,21,61.0,45.0,73.0,44.0,43.0,66.0,48.0,55.916667


Agregamos promedio móvil cada 12 horas porque me pareció que refleja las diferencias del movimiento matutino con el nocturno, y en caso del defase (lag) elegimos 7 para tener un análisis de patrones semanales.

## Formación

In [9]:
# Cargamos librerías
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb

In [None]:
# Dividimos dataset
random = 23451

df_train, df_test = train_test_split(df_new, shuffle=False, test_size=0.1)

features_train = df_train.drop(['num_orders'], axis=1)
target_train = df_train['num_orders']
features_test = df_test.drop(['num_orders'], axis=1)
target_test = df_test['num_orders']

In [11]:
# Regresión lineal
model_lr = LinearRegression()

model_lr.fit(features_train, target_train)
pred_train_lr = model_lr.predict(features_train)

mean_squared_error(target_train, pred_train_lr, squared=False)

30.4278616056613

In [None]:
# Modelo de Bosque aleatorio
model_forest = RandomForestRegressor(random_state=random,
                                     n_estimators=1000,
                                     max_depth=None,
                                     max_leaf_nodes=50,
                                     min_samples_split=2,
                                     min_samples_leaf=1,
                                     n_jobs=-1,
                                     verbose=0
                                     )

model_forest.fit(features_train, target_train)
pred_train_forest = model_forest.predict(features_train)

mean_squared_error(target_train, pred_train_forest, squared=False)

20.668398491949148

In [13]:
# Modelo con LightGBM
model_lgb = lgb.LGBMRegressor(n_estimators=700,
                              num_leaves=50,
                              learning_rate=0.15,
                              metric='RMSE',
                              verbose=-1
                              )

model_lgb.fit(features_train, target_train)
pred_train_lgb = model_lgb.predict(features_train)

mean_squared_error(target_train, pred_train_lgb, squared=False)

[WinError 2] El sistema no puede encontrar el archivo especificado
  File "c:\Users\lans_\anaconda3\envs\tripleten\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\lans_\anaconda3\envs\tripleten\lib\subprocess.py", line 493, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\lans_\anaconda3\envs\tripleten\lib\subprocess.py", line 858, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\lans_\anaconda3\envs\tripleten\lib\subprocess.py", line 1327, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


0.765947275164911

## Prueba

In [None]:
# Probamos cada modelo con el dataset de pruba
pred_test_lr = model_lr.predict(features_test)
pred_test_forest = model_forest.predict(features_test)
pred_test_lgb = model_lgb.predict(features_test)

print(
    f'\nEl RMSE del modelo de regresiónn lineal es: {mean_squared_error(target_test, pred_test_lr, squared=False)}')
print(
    f'El RMSE del modelo de bosque aleatorio es: {mean_squared_error(target_test, pred_test_forest, squared=False)}')
print(
    f'El RMSE del modelo lightgmb es: {mean_squared_error(target_test, pred_test_lgb, squared=False)}\n')


El RMSE del modelo de regresiónn lineal es: 52.78043670568286
El RMSE del modelo de bosque aleatorio es: 47.081992959733576
El RMSE del modelo lightgmb es: 43.978432529887705



### Conclusión
La mejor métrica de RMSE fue con el modelo **LightGBM** con **desfases semanales (7)** y un **promedio movil de 12 horas** ya que este rango facilita la diferenciación de movimiento matutino contra el nocturno.  
La calificación de la métrica fue de **43.98**, el modelo se equivoca en promedio por 44 ordenes por hora, no es el modelo más preciso pero cumple con los requerimientos.

## Predicciones

In [None]:
# Creamos función que complete las características según nuestras necesidades
def predict_next_hours(df, hours_to_predict, max_lag=7, rolling_mean_size=12):
    """
    Está fucnión devuelve las predicciones de las siguientes horas
    seleccionadas
    'NOTA: MAX_LAG Y ROLLING_MEAN_SIZE DEBEN SER IGUALES A LAS
    YA UTILIZADAS EN EL DATASET ENTRANTE'
    """

    df_predict = df.copy()

    # Con bucle for insertamos las filas seleccionadas a predecir
    for new_file in range(hours_to_predict):
        next_datetime = df_predict.index[-1] + pd.Timedelta(hours=1)

        # Devuelve las nuevas características
        df_predict.loc[next_datetime, 'year'] = next_datetime.year
        df_predict.loc[next_datetime, 'month'] = next_datetime.month
        df_predict.loc[next_datetime, 'day'] = next_datetime.day
        df_predict.loc[next_datetime, 'dayofweek'] = next_datetime.dayofweek
        df_predict.loc[next_datetime, 'hour'] = next_datetime.hour

        # Agrega los desfases y media movil seleccionada
        for lag in range(1, max_lag + 1):
            df_predict.loc[next_datetime, f'lag_{lag}'] = df_predict['num_orders'].shift(
                lag).iloc[-1]

        df_predict.loc[next_datetime, 'rolling_mean'] = (
            df_predict['num_orders'].iloc[-rolling_mean_size:].mean()
        )

        # Predice las horas seleccionadas
        features = df_predict.loc[next_datetime:next_datetime].drop(
            ['num_orders'], axis=1)
        prediction = model_lgb.predict(features)[0].round()

        # Arroja las predicciones
        df_predict.loc[next_datetime, 'num_orders'] = prediction

    return df_predict[['num_orders']].tail(hours_to_predict)

In [16]:
# Predecimos las siguientes 24 horas
df_predictions = predict_next_hours(df_new, 24)
df_predictions

Unnamed: 0_level_0,num_orders
datetime,Unnamed: 1_level_1
2018-09-01 00:00:00,213.0
2018-09-01 01:00:00,192.0
2018-09-01 02:00:00,182.0
2018-09-01 03:00:00,149.0
2018-09-01 04:00:00,136.0
2018-09-01 05:00:00,95.0
2018-09-01 06:00:00,51.0
2018-09-01 07:00:00,31.0
2018-09-01 08:00:00,89.0
2018-09-01 09:00:00,129.0


# Conclusión
El modelo fue entrenado correctamente y predice las siguientes horas con un promedio máximo de error de 44 ordenes, 4 ordenes mejor al promedio aceptado (48).