In [211]:
import re
import pandas as pd

# Basic data cleaning.

In [212]:
def search_nanull(data: pd.DataFrame) -> list:
    result = []
    for col in data.columns:
        result.append(data[col].isna().unique() +
                      data[col].isnull().unique())
    return result

In [213]:
categorical2int = lambda x: int(re.findall('\d',x)[0])

In [214]:
dataset1 = pd.read_csv('data/Input1_clientes_estructura.csv', sep=';')

In [215]:
dataset1.sample(10)

Unnamed: 0,Cliente,Regional2,Gerencia2,SubCanal2,Categoria,Nevera
5231,5232,Regional 1,Gerencia_6,Subcanal_8,Categoria_2,1
8051,8052,Regional 1,Gerencia_8,Subcanal_3,Categoria_2,1
461,462,Regional 1,Gerencia_3,Subcanal_3,Categoria_2,0
14896,14897,Regional 1,Gerencia_2,Subcanal_8,Categoria_2,0
20321,20322,Regional 1,Gerencia_1,Subcanal_3,Categoria_1,0
2793,2794,Regional 1,Gerencia_5,Subcanal_2,Categoria_1,1
9701,9702,Regional 1,Gerencia_3,Subcanal_6,Categoria_2,0
13561,13562,Regional 1,Gerencia_3,Subcanal_6,Categoria_3,0
20193,20194,Regional 1,Gerencia_4,Subcanal_31,Categoria_2,0
10153,10154,Regional 1,Gerencia_1,Subcanal_3,Categoria_3,0


In [216]:
print(dataset1.dtypes)
dataset1.describe(exclude="int64")

Cliente       int64
Regional2    object
Gerencia2    object
SubCanal2    object
Categoria    object
Nevera        int64
dtype: object


Unnamed: 0,Regional2,Gerencia2,SubCanal2,Categoria
count,20921,20921,20921,20921
unique,1,11,32,6
top,Regional 1,Gerencia_10,Subcanal_3,Categoria_2
freq,20921,5441,8217,8773


In [217]:
dataset1.describe(include="int64")

Unnamed: 0,Cliente,Nevera
count,20921.0,20921.0
mean,10461.0,0.39783
std,6039.516827,0.489462
min,1.0,0.0
25%,5231.0,0.0
50%,10461.0,0.0
75%,15691.0,1.0
max,20921.0,1.0


In [218]:
dataset1.convert_dtypes()
dataset1 = dataset1.drop('Regional2', axis=1)

In [219]:
any(search_nanull(dataset1))

False

In [220]:
cols = {'Gerencia2': 'Distribuidor', 
        'SubCanal2': 'Tipo', 'Categoria':'Category'}
for col, new_col in zip(cols.keys(), cols.values()):
    print(new_col)
    dataset1[new_col] = dataset1[col].apply(categorical2int)
    dataset1 = dataset1.drop(col, axis=1)

Distribuidor
Tipo
Category


In [221]:
cols_structure = {'Category': 'Categoria'}
dataset1 = dataset1.rename(columns=cols_structure)

In [222]:
dataset1

Unnamed: 0,Cliente,Nevera,Distribuidor,Tipo,Categoria
0,1,0,1,1,1
1,2,0,1,1,1
2,3,0,1,1,1
3,4,1,1,1,1
4,5,1,1,1,2
...,...,...,...,...,...
20916,20917,0,1,3,2
20917,20918,0,4,3,1
20918,20919,0,9,3,1
20919,20920,0,1,3,1


In [223]:
dataset2 = pd.read_csv('data/Input2_clientes_venta.csv', sep=';')

In [224]:
dataset2.sample(10)

Unnamed: 0,Año,Mes,Cliente,SegmentoPrecio2,Marca2,Cupo2,CapacidadEnvase2,Volumen,disc,nr
416421,2019,11,1757,SegmentoPrecio_2,Marca_3,Cupo_2,CapacidadEnvase_10,0.065203,0.0,550.482608
830082,2019,10,12218,SegmentoPrecio_1,Marca_1,Cupo_2,CapacidadEnvase_10,0.282547,-338.449867,1972.221235
1312551,2020,7,1240,SegmentoPrecio_1,Marca_1,Cupo_2,CapacidadEnvase_16,0.263447,0.0,1690.601424
802028,2020,3,11458,SegmentoPrecio_1,Marca_1,Cupo_2,CapacidadEnvase_16,0.026345,0.0,22.910722
782540,2019,9,10932,SegmentoPrecio_2,Marca_3,Cupo_1,CapacidadEnvase_6,0.007087,0.0,74.553611
333691,2020,5,14856,SegmentoPrecio_1,Marca_1,Cupo_2,CapacidadEnvase_16,0.052689,0.0,338.120362
445878,2019,6,2914,SegmentoPrecio_1,Marca_1,Cupo_1,CapacidadEnvase_12,0.056114,0.0,472.236583
47986,2020,5,1894,SegmentoPrecio_1,Marca_4,Cupo_2,CapacidadEnvase_10,0.173875,-207.914642,1004.236528
1117300,2019,8,13810,SegmentoPrecio_1,Marca_1,Cupo_1,CapacidadEnvase_14,0.018692,0.0,138.453567
407540,2019,10,1604,SegmentoPrecio_1,Marca_4,Cupo_1,CapacidadEnvase_12,0.018705,0.0,165.614123


In [225]:
print(dataset2.dtypes)
dataset2.describe(exclude=['int64','float64'])

Año                   int64
Mes                   int64
Cliente               int64
SegmentoPrecio2      object
Marca2               object
Cupo2                object
CapacidadEnvase2     object
Volumen             float64
disc                float64
nr                  float64
dtype: object


Unnamed: 0,SegmentoPrecio2,Marca2,Cupo2,CapacidadEnvase2
count,1406116,1406116,1406116,1406116
unique,3,39,5,17
top,SegmentoPrecio_1,Marca_1,Cupo_2,CapacidadEnvase_10
freq,991679,406526,786172,707919


In [226]:
dataset2 = dataset2.drop(['Año', 'Mes'], axis=1)

In [227]:
dataset2.describe(include=['int64','float64'])

Unnamed: 0,Cliente,Volumen,disc,nr
count,1406116.0,1406116.0,1406116.0,1406116.0
mean,8328.064,0.2294363,-133.5855,1704.834
std,4906.4,1.395798,1491.833,10470.79
min,1.0,0.0,-479456.7,0.0
25%,4148.0,0.02173441,-28.77037,177.8085
50%,8097.0,0.05268947,0.0,423.3184
75%,12318.0,0.1496381,0.0,1117.963
max,20580.0,300.0,0.0,2302965.0


In [228]:
any(search_nanull(dataset2))

False

In [229]:
cols = {'CapacidadEnvase2':'CapacidadEnvase', 
        'SegmentoPrecio2':'CategoriaMarca',
        'Marca2':'Marca', 'Cupo2':'Envase'}
for col, new_col in zip(cols.keys(), cols.values()):
    print(new_col)
    dataset2[new_col] = dataset2[col].apply(categorical2int)
    dataset2 = dataset2.drop(col, axis=1)

CapacidadEnvase
CategoriaMarca
Marca
Envase


In [232]:
dataset2.head(10)

Unnamed: 0,Cliente,Volumen,disc,nr,CapacidadEnvase,CategoriaMarca,Marca,Envase
0,10,0.112229,-30.590603,900.328567,1,1,1,1
1,10,0.021734,0.0,149.184463,1,1,2,2
2,10,0.043469,0.0,359.625828,1,2,3,2
3,10,0.026345,-31.065261,134.748399,1,1,1,2
4,10,0.086938,0.0,496.901005,1,1,4,2
5,10,0.369485,-114.891895,2853.911219,1,1,1,2
6,10,0.825907,-221.071444,6153.614786,1,1,5,2
7,10,0.03663,0.0,303.759828,1,1,1,1
8,10,0.043469,0.0,358.300464,1,2,6,2
9,10,0.004347,0.0,43.469689,1,2,6,1


In [233]:
cols_structure = {'disc': 'Descuento', 'nr': 'IngresoNeto'}
dataset2 = dataset2.rename(columns=cols_structure)

In [234]:
dataset2

Unnamed: 0,Cliente,Volumen,Descuento,IngresoNeto,CapacidadEnvase,CategoriaMarca,Marca,Envase
0,10,0.112229,-30.590603,900.328567,1,1,1,1
1,10,0.021734,0.000000,149.184463,1,1,2,2
2,10,0.043469,0.000000,359.625828,1,2,3,2
3,10,0.026345,-31.065261,134.748399,1,1,1,2
4,10,0.086938,0.000000,496.901005,1,1,4,2
...,...,...,...,...,...,...,...,...
1406111,20577,0.039122,0.000000,544.150314,1,3,3,2
1406112,20580,0.000000,0.000000,0.000000,1,1,3,2
1406113,20580,0.058683,0.000000,759.218996,1,3,3,2
1406114,20580,0.000000,0.000000,0.000000,1,1,3,2


In [189]:
dataset2[dataset2['IngresoNeto'] <= 0]

Unnamed: 0,Cliente,Volumen,Descuento,IngresoNeto,CapacidadEnvase,CategoriaMarca,Marca,Envase
725145,9299,0.028540,0.0,0.0,1,1,1,2
910771,14481,0.021734,0.0,0.0,1,1,1,2
1387941,212,0.000000,0.0,0.0,1,3,3,2
1387949,591,0.000000,0.0,0.0,1,3,3,2
1387951,614,0.000000,0.0,0.0,1,3,3,2
...,...,...,...,...,...,...,...,...
1406103,20529,0.000000,0.0,0.0,1,3,3,2
1406105,20529,0.000000,0.0,0.0,1,3,3,2
1406108,20569,0.000000,0.0,0.0,1,1,3,2
1406112,20580,0.000000,0.0,0.0,1,1,3,2


In [190]:
dataset2[dataset2['Volumen'] <= 0]

Unnamed: 0,Cliente,Volumen,Descuento,IngresoNeto,CapacidadEnvase,CategoriaMarca,Marca,Envase
1387941,212,0.0,0.0,0.0,1,3,3,2
1387949,591,0.0,0.0,0.0,1,3,3,2
1387951,614,0.0,0.0,0.0,1,3,3,2
1387953,614,0.0,0.0,0.0,1,3,3,2
1387973,1006,0.0,0.0,0.0,1,3,3,2
...,...,...,...,...,...,...,...,...
1406103,20529,0.0,0.0,0.0,1,3,3,2
1406105,20529,0.0,0.0,0.0,1,3,3,2
1406108,20569,0.0,0.0,0.0,1,1,3,2
1406112,20580,0.0,0.0,0.0,1,1,3,2


In [248]:
dataset2 = dataset2.merge(dataset1)
dataset2.sample(10)

Unnamed: 0,Cliente,Volumen,Descuento,IngresoNeto,CapacidadEnvase,CategoriaMarca,Marca,Envase,Nevera,Distribuidor,Tipo,Categoria
526742,4126,0.052163,0.0,530.129362,1,1,5,1,1,1,1,2
87550,3885,0.05708,0.0,289.707708,1,1,5,2,0,1,3,3
1240442,14023,0.065203,-51.050905,397.970406,1,1,4,2,0,1,5,2
9965,435,0.108672,-69.320064,1026.0196,1,2,6,2,1,2,1,3
872444,13077,0.13831,0.0,1056.900121,1,1,1,2,0,6,5,2
485880,3609,0.130406,-249.926071,1001.187003,1,1,5,2,1,1,1,4
375070,92,0.711308,0.0,4564.623841,1,1,1,2,0,3,3,3
1234659,678,0.028057,-41.424746,455.334419,1,3,9,3,0,3,3,2
218400,9380,0.391219,-181.925353,2543.956769,1,1,4,2,1,8,3,3
179087,7560,0.014173,0.0,148.786992,6,3,7,1,0,8,6,2


In [249]:
dataset2.describe()

Unnamed: 0,Cliente,Volumen,Descuento,IngresoNeto,CapacidadEnvase,CategoriaMarca,Marca,Envase,Nevera,Distribuidor,Tipo,Categoria
count,1406116.0,1406116.0,1406116.0,1406116.0,1406116.0,1406116.0,1406116.0,1406116.0,1406116.0,1406116.0,1406116.0,1406116.0
mean,8328.064,0.2294363,-133.5855,1704.834,1.477345,1.427433,3.542452,1.757047,0.6164278,3.905715,3.278923,2.383874
std,4906.4,1.395798,1491.833,10470.79,1.449432,0.7142294,2.499,0.6219709,0.4862558,3.021207,1.967307,0.9703062
min,1.0,0.0,-479456.7,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
50%,8097.0,0.05268947,0.0,423.3184,1.0,1.0,4.0,2.0,1.0,3.0,3.0,2.0
max,20580.0,300.0,0.0,2302965.0,9.0,3.0,9.0,5.0,1.0,9.0,9.0,6.0


# Some relations and gruops of data.

In [237]:
dataset1.groupby(['Categoria']).agg({'Nevera':'count'})

Unnamed: 0_level_0,Nevera
Categoria,Unnamed: 1_level_1
1,5195
2,8773
3,5454
4,898
5,454
6,147


In [243]:
dataset1.groupby('Distribuidor').agg({'Nevera':'count'})

Unnamed: 0_level_0,Nevera
Distribuidor,Unnamed: 1_level_1
1,6847
2,1189
3,1860
4,1971
5,604
6,3074
7,223
8,2634
9,2519
