In [1]:
%pylab inline
import pandas as pd

from dataManager import DataManager
from model import ModelManager as manager

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error as mse

import plotly.graph_objects as go

Populating the interactive namespace from numpy and matplotlib


## Modelo 1
Creación de la nueva data organizada para el modelo. En ella se consideran los meses de `CANTIDAD == 0` para `REF`. Esta forma de organizar los datos no permite hacer disgregación en el modelo por punto de venta (`TIENDA`).

In [16]:
prods = data = DataManager().products.drop_duplicates().copy()
prods['AREA'] = prods.ANCHO * prods.FONDO

data = DataManager().sales_ref_month_sin_ventas_mayores()
data['DATE'] = data['ANIO'].astype(str) + '-' + data['MES'].astype(str).str.zfill(2)

df = data.pivot_table(index='REF',columns=['DATE','ANIO','MES'],values='CANTIDAD',aggfunc='sum').reset_index()
df = pd.melt(df,id_vars='REF')

df = df.sort_values(['REF','DATE'])
df = df.rename(columns={'value':'CANTIDAD'})
df = df.reset_index(drop=True).fillna(0)

df = df.merge(data.drop(columns=['TIENDA','ANIO','MES','CANTIDAD']),on=['REF','DATE'],how='left')
df = df[['REF','DATE','ANIO','MES','CANTIDAD','PRECIO','SUBTOTAL','DESCUENTO(%)','TOTAL','F_COVID']]
df = df.fillna(0)

df = df.merge(prods,on='REF',validate='m:1')
df = df.sort_values(['ANIO','MES']).reset_index(drop=True)

In [18]:
df[df.REF == 'XA0194:00009:'][['REF','DATE','CANTIDAD','PRECIO','TOTAL','COLOR_POS','DESC_LARGA']]

Unnamed: 0,REF,DATE,CANTIDAD,PRECIO,TOTAL,COLOR_POS,DESC_LARGA
591,XA0194:00009:,2019-01,4.0,2471881.0,1235940.0,CAFE,SOFA RECLINO WAYLAND 3P CF 2 RECLINO
592,XA0194:00009:,2019-01,4.0,2471881.0,1235940.0,CAFE,SOFA RECLINO WAYLAND 3P CF 2 RECLINO
593,XA0194:00009:,2019-01,4.0,2471881.0,1235940.0,CAFE,SOFA RECLINO WAYLAND 3P CF 2 RECLINO
594,XA0194:00009:,2019-01,4.0,2471881.0,1235940.0,CAFE,SOFA RECLINO WAYLAND 3P CF 2 RECLINO
2120,XA0194:00009:,2019-02,0.0,0.0,0.0,CAFE,SOFA RECLINO WAYLAND 3P CF 2 RECLINO
3636,XA0194:00009:,2019-03,2.0,2471881.0,1235940.0,CAFE,SOFA RECLINO WAYLAND 3P CF 2 RECLINO
3637,XA0194:00009:,2019-03,2.0,2471881.0,1235940.0,CAFE,SOFA RECLINO WAYLAND 3P CF 2 RECLINO
5287,XA0194:00009:,2019-04,4.0,2471881.0,2265390.0,CAFE,SOFA RECLINO WAYLAND 3P CF 2 RECLINO
5288,XA0194:00009:,2019-04,4.0,2471882.0,1977617.0,CAFE,SOFA RECLINO WAYLAND 3P CF 2 RECLINO
6852,XA0194:00009:,2019-05,2.0,2471881.0,1276638.0,CAFE,SOFA RECLINO WAYLAND 3P CF 2 RECLINO


Preparación de la data para entrenar el modelo

In [19]:
num_var=['AREA','ALTO','DESCUENTO(%)','PRECIO','CANTIDAD']
x_num=df[num_var[:-1]].astype('float')

cat_var=[
    'ANIO', 'MES', 'PUESTOS', 'COLOR_POS', 'SUBCATEGORIA_POS', 'ESTILO', 'F_COVID' 
]
x_cat=df[cat_var].astype('category')
x_cat_dummies=pd.get_dummies(x_cat)

y = df['CANTIDAD']

scaler = MinMaxScaler()
x_num_norm = scaler.fit_transform(x_num)
x = np.append(x_num_norm,x_cat_dummies,axis=1)

#split data till januar 2021

index = df[(df.ANIO==2021)].index[0]

x_train = x[:index-1]
y_train = y[:index-1]
x_test = x[index-1:]
y_test = y[index-1:]

Entrenamiento del modelo

In [20]:
%%time
model = GradientBoostingRegressor(**{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 300})
model.fit(x_train,y_train)

CPU times: user 14.6 s, sys: 205 ms, total: 14.8 s
Wall time: 20 s


GradientBoostingRegressor(max_depth=2, n_estimators=300)

Resultado del error

In [21]:
mse(model.predict(x_test),y_test)

17.322043539492785

Verificación de prueba con el test para identificar variabilidad en la predicción

In [22]:
df['PREDICTED'] = model.predict(np.concatenate([x_train,x_test],axis=0)).round()
res = df.groupby(['REF','DATE']).sum().reset_index()

In [23]:
aux = res.query('REF=="XA0194:00009:"')
fig = go.Figure()
fig.add_scatter(x=aux['DATE'], y=aux['PREDICTED'], mode='lines', name='Valores predichos')
fig.add_scatter(x=aux['DATE'], y=aux['CANTIDAD'], mode='lines', name='Valores reales')

## Modelo 2
El modelo anteriormente desarrollado por johan. Sirve de prueba para identificar las mejores variables a incluir en el modelo 1

In [None]:
def split_data(prevs):
    prevs.append(var)
    x_cat=data[prevs].astype('category')
    x_cat_dummies=pd.get_dummies(x_cat)

    y = data['CANTIDAD']

    scaler = MinMaxScaler()
    x_num_norm = scaler.fit_transform(x_num)
    x = np.append(x_num_norm,x_cat_dummies,axis=1)

    #split data till januar 2021
    index = data[(data.ANIO==2021)].index[0]

    x_train = x[:index-1]
    y_train = y[:index-1]
    x_test = x[index-1:]
    y_test = y[index-1:]
    
    return x_train,y_train,x_test,y_test,prevs
    
def fit_model(x_train,y_train):
    model = GradientBoostingRegressor(**{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 300})
    model.fit(x_train,y_train)
    
    return model
    
def get_mse(model,x_test,y_test):
    return mse(model.predict(x_test),y_test)

In [None]:
data = DataManager().sales_ref_month_sin_ventas_mayores()

num_var=['AREA','ALTO','DESCUENTO(%)','PRECIO','CANTIDAD']
x_num=data[num_var[:-1]].astype('float')

cat_var=[
    'ANIO', 'MES', 'TIENDA', 'PUESTOS', 'COLOR_POS', 'CATEGORIA', 'SUBCATEGORIA_POS', 'VIGENCIA',
    'ORIGEN', 'ESTILO', 'MATERIAL_POS', 'ACABADO', 'F_COVID', 'REF'
]

x_train,y_train,x_test,y_test,prevs = split_data(cat_var)
%time model = fit_model(x_train,y_train)
print('MSE for variables:')
print(prevs)
print(get_mse(model,x_test,y_test))
print('--------------------------------------')

In [None]:
data['PREDICTED'] = model.predict(np.concatenate([x_train,x_test],axis=0)).round()
data = data.groupby(['REF','DATE']).sum().reset_index()

In [None]:
aux = data.query('REF=="D00935:00048:00048"')
fig = go.Figure()
fig.add_scatter(x=aux['DATE'], y=aux['PREDICTED'], mode='lines', name='Valores predichos')
fig.add_scatter(x=aux['DATE'], y=aux['CANTIDAD'], mode='lines', name='Valores reales')