In [2]:
%pylab inline
import pandas as pd

from dataManager import DataManager
from model import ModelManager as manager

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error as mse

import plotly.graph_objects as go

Populating the interactive namespace from numpy and matplotlib


In [3]:
# Product catalogue without duplicate rows; AREA is the product of ANCHO and FONDO.
prods = DataManager().products.drop_duplicates().copy()
prods['AREA'] = prods.ANCHO * prods.FONDO

# Sales records (presumably one row per reference, store and month - confirm
# against DataManager). DATE is a sortable 'YYYY-MM' key built from ANIO and MES.
data = DataManager().sales_ref_month_sin_ventas_mayores()
data['DATE'] = data['ANIO'].astype(str) + '-' + data['MES'].astype(str).str.zfill(2)

# Build a dense REF x month grid: pivoting to wide form and melting back gives
# every reference one row for every month present in the data. Months without
# sales come out as NaN and are filled with 0 units.
df = data.pivot_table(index='REF',columns=['DATE','ANIO','MES'],values='CANTIDAD',aggfunc='sum').reset_index()
df = pd.melt(df,id_vars='REF')

df = df.sort_values(['REF','DATE'])
df = df.rename(columns={'value':'CANTIDAD'})
df = df.reset_index(drop=True).fillna(0)

# Attach the per-reference, per-month mean of price, discount and COVID flag.
# Only these three columns are averaged: they are the only ones kept below, and
# averaging every remaining column fails on non-numeric ones with pandas >= 2.0.
df = df.merge(
    data.groupby(['REF','DATE'])[['PRECIO','DESCUENTO(%)','F_COVID']].mean(),
    on=['REF','DATE'],how='left',validate='1:1',
)
df = df[['REF','DATE','ANIO','MES','CANTIDAD','PRECIO','DESCUENTO(%)','F_COVID']]
# Months without any sale have no price/discount/flag; they are set to 0.
df = df.fillna(0)

# Add the product attributes and order rows chronologically; the fresh
# 0..n-1 index is what the positional train/test split further down relies on.
df = df.merge(prods,on='REF',validate='m:1')
df = df.sort_values(['ANIO','MES']).reset_index(drop=True)

In [10]:
# Numeric columns; the last entry (CANTIDAD) is the target, not a feature.
num_var = ['AREA', 'ALTO', 'DESCUENTO(%)', 'PRECIO', 'CANTIDAD']
# Columns that are one-hot encoded.
cat_var = ['ANIO', 'MES', 'PUESTOS', 'COLOR_POS', 'SUBCATEGORIA_POS', 'ESTILO', 'F_COVID']

x_num = df[num_var[:-1]].astype('float')
x_cat = df[cat_var].astype('category')
x_cat_dummies = pd.get_dummies(x_cat)

y = df['CANTIDAD']

# Scale the numeric features to [0, 1] and place the dummy columns next to them.
scaler = MinMaxScaler()
x_num_norm = scaler.fit_transform(x_num)
x = np.concatenate((x_num_norm, x_cat_dummies), axis=1)

# Split the data at January 2021: everything before the first 2021 row is
# used for training, the rest for testing.
index = df.index[df['ANIO'] == 2021][0]

x_train, x_test = x[:index], x[index:]
y_train, y_test = y[:index], y[index:]

In [15]:
# Rows from the last training row onwards, to inspect the train/test boundary.
df.iloc[index - 1:]

Unnamed: 0,REF,DATE,ANIO,MES,CANTIDAD,PRECIO,DESCUENTO(%),F_COVID,ITEM,DESCRIPCION,...,PUESTOS,COLOR,ANCHO,ALTO,FONDO,DESC_LARGA,SUBCATEGORIA_POS,COLOR_POS,MATERIAL_POS,AREA
30071,XZ0993:00100:,2020-12,2020,12,0.0,0.000000,0.000000,0.0,11151,MESA CENTRO FILIS,...,,BLANCO,100.0,44.0,50.0,MESA CENTRO FILIS LACA BL 2C LIQUIDACIÓN,MESAS DE CENTRO,BLANCO,VIDRIO,5000.0
30072,A01040:00005:,2021-01,2021,1,0.0,0.000000,0.000000,0.0,11057,SILLA COMEDOR 85,...,1.0,NEGRO,40.0,77.0,47.0,SILLA COMEDOR 85-1062 NG NO GARANTIA,SILLAS DE COMEDOR,NEGRO,SINTÉTICO,1880.0
30073,BR0002:00193:,2021-01,2021,1,2.0,325593.000000,0.417836,2.0,19389,ESTANTE PRAIA,...,,BLANCO/RUSTICO,41.0,180.0,63.0,ESTANTE PRAIA 1 GAV RT3091 BL/RUST,OTROS,OTRO,MADERA LAMINADA,2583.0
30074,BR0003:00194:,2021-01,2021,1,1.0,324188.000000,0.392612,2.0,19390,ESTANTE PRAIA,...,,NEGRO/RUSTICO,41.0,180.0,63.0,ESTANTE PRAIA 1 GAV RT3091 NG/RUST,OTROS,OTRO,MADERA LAMINADA,2583.0
30075,BR0004:00193:,2021-01,2021,1,0.0,0.000000,0.000000,0.0,19391,ESCRITORIO RETRO L PARANA,...,,BLANCO/RUSTICO,136.0,78.0,108.0,ESCRITORIO RETRO L PARANA RT3077 BL/RUST,OTROS,OTRO,MADERA LAMINADA,14688.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35079,XZ0986:00005:,2021-04,2021,4,0.0,0.000000,0.000000,0.0,11123,COMEDOR SAMUEL,...,6.0,NEGRO,150.0,75.0,70.0,COMEDOR SAMUEL 150X90 6P NG 3C,SET 6P,NEGRO,VIDRIO,10500.0
35080,XZ0987:00024:,2021-04,2021,4,0.0,0.000000,0.000000,0.0,11127,COMEDOR SAMUEL,...,6.0,ROJO,150.0,75.0,70.0,COMEDOR SAMUEL 150X90 6P RJ 3C,SET 6P,ROJO,VIDRIO,10500.0
35081,XZ0988:00005:,2021-04,2021,4,0.0,0.000000,0.000000,0.0,11131,COMEDOR SAMUEL,...,4.0,NEGRO,120.0,75.0,70.0,COMEDOR SAMUEL 4P NG 3C,SET 4P,NEGRO,VIDRIO,8400.0
35082,XZ0989:00024:,2021-04,2021,4,0.0,0.000000,0.000000,0.0,11136,COMEDOR SAMUEL,...,4.0,ROJO,120.0,75.0,70.0,COMEDOR SAMUEL 120X70 4P RJ 3C,SET 4P,ROJO,METAL + VIDRIO,8400.0


In [13]:
%%time
model = GradientBoostingRegressor(**{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 300})
model.fit(x_train,y_train)

CPU times: user 10.1 s, sys: 194 ms, total: 10.3 s
Wall time: 13.9 s


GradientBoostingRegressor(max_depth=2, n_estimators=300)

In [22]:
# Mean squared error on the held-out rows; scikit-learn expects (y_true, y_pred).
mse(y_test, model.predict(x_test))

6.047817363608716

In [15]:
# Predict every row (training rows followed by test rows, i.e. the original
# row order) and round the predictions to whole units.
df['PREDICTED'] = np.round(model.predict(np.concatenate((x_train, x_test))))
# One row per reference and month, summing all columns within each group.
res = df.groupby(['REF', 'DATE']).sum().reset_index()

In [20]:
# Predicted vs. observed monthly quantity for a single reference.
aux = res[res['REF'] == "XA0539:00061:"]
fig = go.Figure()
fig.add_trace(go.Scatter(x=aux['DATE'], y=aux['PREDICTED'], mode='lines', name='Valores predichos'))
fig.add_trace(go.Scatter(x=aux['DATE'], y=aux['CANTIDAD'], mode='lines', name='Valores reales'))

In [None]:
def split_data(prevs, var=None):
    """Build the feature matrix and split it at the first 2021 row.

    Reads the module-level ``data`` (DataFrame with at least the columns in
    ``prevs`` plus ``CANTIDAD`` and ``ANIO``) and ``x_num`` (numeric feature
    columns of the same rows); both must be defined before calling.

    Parameters
    ----------
    prevs : list of str
        Categorical columns to one-hot encode. The list is not modified.
    var : str, optional
        Extra categorical column to add after ``prevs``.

    Returns
    -------
    x_train, y_train, x_test, y_test, prevs
        Features and targets before / from the first row whose ``ANIO`` is
        2021, plus the list of categorical columns actually used.

    Raises
    ------
    ValueError
        If ``data`` contains no row with ``ANIO == 2021``.
    """
    # Copy so the caller's list is never mutated.
    prevs = list(prevs)
    if var is not None:
        prevs.append(var)

    # Position (not index label) of the first 2021 row, so the split is also
    # correct when ``data`` does not carry a 0..n-1 index.
    is_2021 = (data['ANIO'] == 2021).to_numpy()
    if not is_2021.any():
        raise ValueError('data has no rows with ANIO == 2021; cannot split')
    split_pos = int(np.argmax(is_2021))

    x_cat_dummies = pd.get_dummies(data[prevs].astype('category'))
    y = data['CANTIDAD']

    # NOTE(review): the scaler is fitted on all rows, test rows included.
    x_num_norm = MinMaxScaler().fit_transform(x_num)
    x = np.append(x_num_norm, x_cat_dummies, axis=1)

    # Split the data at January 2021: rows before the first 2021 row are used
    # for training, that row and everything after it for testing. Rows are
    # taken in their current order, so ``data`` should be sorted by period.
    x_train, x_test = x[:split_pos], x[split_pos:]
    y_train, y_test = y.iloc[:split_pos], y.iloc[split_pos:]

    return x_train, y_train, x_test, y_test, prevs
    
def fit_model(x_train,y_train,**hyperparams):
    """Fit a GradientBoostingRegressor on the given training data.

    Parameters
    ----------
    x_train, y_train
        Training features and targets, passed straight to ``fit``.
    **hyperparams
        Optional keyword arguments for ``GradientBoostingRegressor``; they
        override the defaults below (e.g. ``random_state=0``).

    Returns
    -------
    The fitted estimator.
    """
    # Defaults used throughout this notebook; callers may override any of them.
    params = {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 300}
    params.update(hyperparams)

    model = GradientBoostingRegressor(**params)
    model.fit(x_train,y_train)

    return model
    
def get_mse(model,x_test,y_test):
    """Return the mean squared error of ``model`` on the given test data.

    ``model`` only needs a ``predict`` method; ``mse`` is scikit-learn's
    ``mean_squared_error``, whose signature is ``(y_true, y_pred)``.
    """
    return mse(y_test,model.predict(x_test))

In [None]:
data = DataManager().sales_ref_month_sin_ventas_mayores()

num_var=['AREA','ALTO','DESCUENTO(%)','PRECIO','CANTIDAD']
x_num=data[num_var[:-1]].astype('float')

cat_var=[
    'ANIO', 'MES', 'TIENDA', 'PUESTOS', 'COLOR_POS', 'CATEGORIA', 'SUBCATEGORIA_POS', 'VIGENCIA',
    'ORIGEN', 'ESTILO', 'MATERIAL_POS', 'ACABADO', 'F_COVID', 'REF'
]

x_train,y_train,x_test,y_test,prevs = split_data(cat_var)
%time model = fit_model(x_train,y_train)
print('MSE for variables:')
print(prevs)
print(get_mse(model,x_test,y_test))
print('--------------------------------------')

In [None]:
data = DataManager().sales_ref_month_sin_ventas_mayores()
data['DATE'] = data['ANIO'].astype(str) + '-' + data['MES'].astype(str).str.zfill(2)

num_var=['AREA','ALTO','DESCUENTO(%)','PRECIO','CANTIDAD']
x_num=data[num_var[:-1]].astype('float')

cat_var=[
    'ANIO', 'MES', 'TIENDA', 'PUESTOS', 'COLOR_POS', 'SUBCATEGORIA_POS', 'ESTILO', 'F_COVID' 
]

x_train,y_train,x_test,y_test,prevs = split_data(cat_var)
%time model = fit_model(x_train,y_train)
print('MSE for variables:')
print(prevs)
print(get_mse(model,x_test,y_test))
print('--------------------------------------')

In [None]:
# Predict every row (training rows followed by test rows, i.e. the original
# row order) and round the predictions to whole units.
data['PREDICTED'] = np.round(model.predict(np.concatenate((x_train, x_test))))
# Collapse to one row per reference and month by summing all columns; `data`
# is rebound to this aggregate, which the plotting cell reads.
data = data.groupby(['REF', 'DATE']).sum().reset_index()

In [None]:
# Predicted vs. observed monthly quantity for a single reference.
aux = data[data['REF'] == "D00935:00048:00048"]
fig = go.Figure()
fig.add_trace(go.Scatter(x=aux['DATE'], y=aux['PREDICTED'], mode='lines', name='Valores predichos'))
fig.add_trace(go.Scatter(x=aux['DATE'], y=aux['CANTIDAD'], mode='lines', name='Valores reales'))