In [1]:
import re
import pandas as pd

# Basic data cleaning.

In [2]:
def search_nanull(data: pd.DataFrame) -> list:
    result = []
    for col in data.columns:
        result.append(data[col].isna().unique() +
                      data[col].isnull().unique())
    return result

In [3]:
def string_to_int(data: pd.DataFrame, 
                  cols: dict) -> pd.DataFrame:
    """convert cols (cols.keys()) in new_cols (cols.values())
    of data and drop the oldones
    Args:
        data: DataFreme to add new columns
        cols: names of the old and new columns
    Returns:
        DataFrame with changes
    """
    categorical2int = lambda x: int(re.findall('\d',x)[0])

    for col, new_col in zip(cols.keys(), cols.values()):
        data[new_col] = data[col].apply(categorical2int)
        data = data.drop(col, axis=1)

    return data

## Dataclean tests

In [4]:
categorical2int = lambda x: int(re.findall('\d',x)[0])

In [5]:
dataset1 = pd.read_csv('data/Input1_clientes_estructura.csv', sep=';')

In [6]:
dataset1.sample(10)

Unnamed: 0,Cliente,Regional2,Gerencia2,SubCanal2,Categoria,Nevera
8864,8865,Regional 1,Gerencia_6,Subcanal_3,Categoria_2,0
11581,11582,Regional 1,Gerencia_10,Subcanal_5,Categoria_2,1
3275,3276,Regional 1,Gerencia_10,Subcanal_3,Categoria_2,0
19499,19500,Regional 1,Gerencia_10,Subcanal_4,Categoria_4,0
12286,12287,Regional 1,Gerencia_8,Subcanal_1,Categoria_3,1
12564,12565,Regional 1,Gerencia_2,Subcanal_9,Categoria_2,0
6827,6828,Regional 1,Gerencia_6,Subcanal_11,Categoria_3,1
14830,14831,Regional 1,Gerencia_3,Subcanal_7,Categoria_3,0
8194,8195,Regional 1,Gerencia_10,Subcanal_1,Categoria_3,1
12416,12417,Regional 1,Gerencia_6,Subcanal_3,Categoria_2,0


In [7]:
print(dataset1.dtypes)
dataset1.describe(exclude="int64")

Cliente       int64
Regional2    object
Gerencia2    object
SubCanal2    object
Categoria    object
Nevera        int64
dtype: object


Unnamed: 0,Regional2,Gerencia2,SubCanal2,Categoria
count,20921,20921,20921,20921
unique,1,11,32,6
top,Regional 1,Gerencia_10,Subcanal_3,Categoria_2
freq,20921,5441,8217,8773


In [8]:
dataset1.describe(include="int64")

Unnamed: 0,Cliente,Nevera
count,20921.0,20921.0
mean,10461.0,0.39783
std,6039.516827,0.489462
min,1.0,0.0
25%,5231.0,0.0
50%,10461.0,0.0
75%,15691.0,1.0
max,20921.0,1.0


In [9]:
dataset1 = dataset1.drop('Regional2', axis=1)

In [10]:
any(search_nanull(dataset1))

False

In [11]:
cols = {'Gerencia2': 'Distribuidor', 
        'SubCanal2': 'Tipo', 'Categoria':'Category'}
for col, new_col in zip(cols.keys(), cols.values()):
    print(new_col)
    dataset1[new_col] = dataset1[col].apply(categorical2int)
    dataset1 = dataset1.drop(col, axis=1)

Distribuidor
Tipo
Category


In [12]:
cols_structure = {'Category': 'Categoria'}
dataset1 = dataset1.rename(columns=cols_structure)

In [13]:
dataset1

Unnamed: 0,Cliente,Nevera,Distribuidor,Tipo,Categoria
0,1,0,1,1,1
1,2,0,1,1,1
2,3,0,1,1,1
3,4,1,1,1,1
4,5,1,1,1,2
...,...,...,...,...,...
20916,20917,0,1,3,2
20917,20918,0,4,3,1
20918,20919,0,9,3,1
20919,20920,0,1,3,1


In [14]:
dataset2 = pd.read_csv('data/Input2_clientes_venta.csv', sep=';')

In [15]:
dataset2.sample(10)

Unnamed: 0,Año,Mes,Cliente,SegmentoPrecio2,Marca2,Cupo2,CapacidadEnvase2,Volumen,disc,nr
1396084,2020,7,7282,SegmentoPrecio_1,Marca_38,Cupo_2,CapacidadEnvase_10,0.0,0.0,0.0
969438,2020,2,17319,SegmentoPrecio_1,Marca_8,Cupo_2,CapacidadEnvase_10,0.021734,0.0,168.668673
783898,2020,8,10956,SegmentoPrecio_1,Marca_5,Cupo_1,CapacidadEnvase_10,0.086938,-55.623344,812.053169
818640,2020,5,11894,SegmentoPrecio_2,Marca_3,Cupo_2,CapacidadEnvase_10,0.043469,0.0,368.13549
1248438,2020,3,3240,SegmentoPrecio_2,Marca_6,Cupo_2,CapacidadEnvase_10,0.065203,-6.822769,643.81896
524633,2020,2,4165,SegmentoPrecio_1,Marca_4,Cupo_2,CapacidadEnvase_10,0.108672,-100.232277,650.895208
761849,2020,5,10321,SegmentoPrecio_1,Marca_1,Cupo_2,CapacidadEnvase_10,0.065203,0.0,539.185899
979034,2020,8,18065,SegmentoPrecio_1,Marca_1,Cupo_1,CapacidadEnvase_12,0.056114,0.0,531.453511
533237,2020,6,4268,SegmentoPrecio_1,Marca_5,Cupo_1,CapacidadEnvase_6,0.007087,0.0,67.852473
1024063,2019,5,5710,SegmentoPrecio_2,Marca_16,Cupo_2,CapacidadEnvase_10,0.021734,0.0,181.911219


In [16]:
print(dataset2.dtypes)
dataset2.describe(exclude=['int64','float64'])

Año                   int64
Mes                   int64
Cliente               int64
SegmentoPrecio2      object
Marca2               object
Cupo2                object
CapacidadEnvase2     object
Volumen             float64
disc                float64
nr                  float64
dtype: object


Unnamed: 0,SegmentoPrecio2,Marca2,Cupo2,CapacidadEnvase2
count,1406116,1406116,1406116,1406116
unique,3,39,5,17
top,SegmentoPrecio_1,Marca_1,Cupo_2,CapacidadEnvase_10
freq,991679,406526,786172,707919


In [17]:
dataset2 = dataset2.drop(['Año', 'Mes'], axis=1)

In [18]:
any(search_nanull(dataset2))

False

In [19]:
cols = {'CapacidadEnvase2':'CapacidadEnvase', 
        'SegmentoPrecio2':'CategoriaMarca',
        'Marca2':'Marca', 'Cupo2':'Envase'}
for col, new_col in zip(cols.keys(), cols.values()):
    print(new_col)
    dataset2[new_col] = dataset2[col].apply(categorical2int)
    dataset2 = dataset2.drop(col, axis=1)

CapacidadEnvase
CategoriaMarca
Marca
Envase


In [20]:
dataset2.head(10)

Unnamed: 0,Cliente,Volumen,disc,nr,CapacidadEnvase,CategoriaMarca,Marca,Envase
0,10,0.112229,-30.590603,900.328567,1,1,1,1
1,10,0.021734,0.0,149.184463,1,1,2,2
2,10,0.043469,0.0,359.625828,1,2,3,2
3,10,0.026345,-31.065261,134.748399,1,1,1,2
4,10,0.086938,0.0,496.901005,1,1,4,2
5,10,0.369485,-114.891895,2853.911219,1,1,1,2
6,10,0.825907,-221.071444,6153.614786,1,1,5,2
7,10,0.03663,0.0,303.759828,1,1,1,1
8,10,0.043469,0.0,358.300464,1,2,6,2
9,10,0.004347,0.0,43.469689,1,2,6,1


In [21]:
cols_structure = {'disc': 'Descuento', 'nr': 'IngresoNeto'}
dataset2 = dataset2.rename(columns=cols_structure)

In [22]:
dataset2

Unnamed: 0,Cliente,Volumen,Descuento,IngresoNeto,CapacidadEnvase,CategoriaMarca,Marca,Envase
0,10,0.112229,-30.590603,900.328567,1,1,1,1
1,10,0.021734,0.000000,149.184463,1,1,2,2
2,10,0.043469,0.000000,359.625828,1,2,3,2
3,10,0.026345,-31.065261,134.748399,1,1,1,2
4,10,0.086938,0.000000,496.901005,1,1,4,2
...,...,...,...,...,...,...,...,...
1406111,20577,0.039122,0.000000,544.150314,1,3,3,2
1406112,20580,0.000000,0.000000,0.000000,1,1,3,2
1406113,20580,0.058683,0.000000,759.218996,1,3,3,2
1406114,20580,0.000000,0.000000,0.000000,1,1,3,2


In [23]:
dataset2[dataset2['IngresoNeto'] <= 0]

Unnamed: 0,Cliente,Volumen,Descuento,IngresoNeto,CapacidadEnvase,CategoriaMarca,Marca,Envase
725145,9299,0.028540,0.0,0.0,1,1,1,2
910771,14481,0.021734,0.0,0.0,1,1,1,2
1387941,212,0.000000,0.0,0.0,1,3,3,2
1387949,591,0.000000,0.0,0.0,1,3,3,2
1387951,614,0.000000,0.0,0.0,1,3,3,2
...,...,...,...,...,...,...,...,...
1406103,20529,0.000000,0.0,0.0,1,3,3,2
1406105,20529,0.000000,0.0,0.0,1,3,3,2
1406108,20569,0.000000,0.0,0.0,1,1,3,2
1406112,20580,0.000000,0.0,0.0,1,1,3,2


In [24]:
dataset2[dataset2['Volumen'] <= 0]

Unnamed: 0,Cliente,Volumen,Descuento,IngresoNeto,CapacidadEnvase,CategoriaMarca,Marca,Envase
1387941,212,0.0,0.0,0.0,1,3,3,2
1387949,591,0.0,0.0,0.0,1,3,3,2
1387951,614,0.0,0.0,0.0,1,3,3,2
1387953,614,0.0,0.0,0.0,1,3,3,2
1387973,1006,0.0,0.0,0.0,1,3,3,2
...,...,...,...,...,...,...,...,...
1406103,20529,0.0,0.0,0.0,1,3,3,2
1406105,20529,0.0,0.0,0.0,1,3,3,2
1406108,20569,0.0,0.0,0.0,1,1,3,2
1406112,20580,0.0,0.0,0.0,1,1,3,2


In [25]:
dataset2 = dataset2.merge(dataset1)
dataset2.sample(10)

Unnamed: 0,Cliente,Volumen,Descuento,IngresoNeto,CapacidadEnvase,CategoriaMarca,Marca,Envase,Nevera,Distribuidor,Tipo,Categoria
173890,7292,0.374095,0.0,3366.471952,1,1,1,1,0,9,5,2
1241505,2301,0.105379,0.0,677.658735,1,1,5,2,0,8,3,3
354439,16379,0.018705,0.0,167.914918,1,1,4,1,1,2,4,3
1139905,272,0.065203,-151.885159,466.989565,1,1,5,2,0,6,3,3
1023008,11351,0.004347,0.0,34.253092,1,1,2,1,1,1,4,1
1034860,4489,0.021734,-23.022637,145.309242,1,1,5,2,0,6,3,3
546408,4390,0.017388,-9.675629,139.183304,1,1,2,1,0,1,3,1
63107,2639,0.026345,-31.065261,134.748399,1,1,1,2,1,6,1,1
915784,14288,0.20865,-463.96091,1413.063549,1,1,5,1,1,1,8,3
303885,13340,0.065203,0.0,459.65636,1,1,1,2,1,8,3,2


In [26]:
dataset2['Nevera'] = dataset2['Nevera'].astype('bool')
category_cols = ['CapacidadEnvase','CategoriaMarca', 'Marca', 'Envase', 'Distribuidor', 'Tipo', 'Categoria']
for col in category_cols:
    dataset2[col] = dataset2[col].astype('category')

In [27]:
dataset2.describe(include=['category','bool'])

Unnamed: 0,CapacidadEnvase,CategoriaMarca,Marca,Envase,Nevera,Distribuidor,Tipo,Categoria
count,1406116,1406116,1406116,1406116,1406116,1406116,1406116,1406116
unique,9,3,9,5,2,9,9,6
top,1,1,1,2,True,1,3,2
freq,1255266,991679,545290,786172,866769,574489,574647,586357


## Data clean block

In [28]:
# All dataclean process for now
# read
dataset1 = pd.read_csv('data/Input1_clientes_estructura.csv', sep=';')
dataset2 = pd.read_csv('data/Input2_clientes_venta.csv', sep=';')

# string data to int
dataset1 = dataset1.drop('Regional2', axis=1)
cols_dataset1 = {'Gerencia2': 'Distribuidor', 
        'SubCanal2': 'Tipo', 'Categoria':'Category'}
dataset1 = string_to_int(dataset1, cols_dataset1)

dataset2 = dataset2.drop(['Año', 'Mes'], axis=1)
cols_dataset2 = {'CapacidadEnvase2':'CapacidadEnvase', 
        'SegmentoPrecio2':'CategoriaMarca',
        'Marca2':'Marca', 'Cupo2':'Envase'}
dataset2 = string_to_int(dataset2, cols_dataset2)

# Rename columns
cols_structure = {'Category': 'Categoria'}
dataset1 = dataset1.rename(columns=cols_structure)

cols_sell = {'disc': 'Descuento', 'nr': 'IngresoNeto'}
dataset2 = dataset2.rename(columns=cols_sell)

# merge data
dataset = dataset2.merge(dataset1)

# corret dtypes in dataset
dataset['Nevera'] = dataset['Nevera'].astype('bool')
category_cols = ['CapacidadEnvase','CategoriaMarca', 'Marca',
                 'Envase', 'Distribuidor', 'Tipo', 'Categoria']

for col in category_cols:
    dataset[col] = dataset[col].astype('category')

# Search for NA or NULL data
msg = "Some data have na or null values"
assert not any(search_nanull(dataset)), msg

In [29]:
dataset.dtypes

Cliente               int64
Volumen             float64
Descuento           float64
IngresoNeto         float64
CapacidadEnvase    category
CategoriaMarca     category
Marca              category
Envase             category
Nevera                 bool
Distribuidor       category
Tipo               category
Categoria          category
dtype: object

In [30]:
dataset.describe(include=['category','bool'])

Unnamed: 0,CapacidadEnvase,CategoriaMarca,Marca,Envase,Nevera,Distribuidor,Tipo,Categoria
count,1406116,1406116,1406116,1406116,1406116,1406116,1406116,1406116
unique,9,3,9,5,2,9,9,6
top,1,1,1,2,True,1,3,2
freq,1255266,991679,545290,786172,866769,574489,574647,586357


In [31]:
dataset.describe()

Unnamed: 0,Cliente,Volumen,Descuento,IngresoNeto
count,1406116.0,1406116.0,1406116.0,1406116.0
mean,8328.064,0.2294363,-133.5855,1704.834
std,4906.4,1.395798,1491.833,10470.79
min,1.0,0.0,-479456.7,0.0
25%,4148.0,0.02173441,-28.77037,177.8085
50%,8097.0,0.05268947,0.0,423.3184
75%,12318.0,0.1496381,0.0,1117.963
max,20580.0,300.0,0.0,2302965.0


# Some relations and gruops of data.

In [32]:
dataset1.groupby(['Categoria']).agg({'Nevera':'count'})

Unnamed: 0_level_0,Nevera
Categoria,Unnamed: 1_level_1
1,5195
2,8773
3,5454
4,898
5,454
6,147


In [33]:
dataset1.groupby('Distribuidor').agg({'Nevera':'count'})

Unnamed: 0_level_0,Nevera
Distribuidor,Unnamed: 1_level_1
1,6847
2,1189
3,1860
4,1971
5,604
6,3074
7,223
8,2634
9,2519
