In [1]:
import pandas as pd
import re
from extract.uncompress_file import extract

# Run the project-local helper imported from extract.uncompress_file.
# NOTE(review): presumably this unpacks the raw CSV inputs under data/ that
# are read further down — confirm against the helper's implementation.
extract()

# Basic data cleaning.

In [2]:
def search_nanull(data: pd.DataFrame) -> list:
    """Report, column by column, whether missing values are present.

    Args:
        data: DataFrame to inspect.
    Returns:
        A list with one plain ``bool`` per column, in column order:
        ``True`` when that column holds at least one NA/null value.
    """
    # isna() and isnull() are aliases, so a single call is enough. Reducing
    # each column with .any() yields scalars, which keeps
    # ``any(search_nanull(df))`` well defined even when a column mixes
    # missing and present values (arrays there would have an ambiguous
    # truth value).
    return [bool(has_missing) for has_missing in data.isna().any()]


def string_to_int(data: pd.DataFrame,
                  cols: dict) -> pd.DataFrame:
    """Convert string-coded columns into integer columns.

    For every ``old_name -> new_name`` pair in ``cols``, the digits found in
    each value of ``data[old_name]`` are concatenated and parsed as an int
    (e.g. ``'Marca_12'`` -> ``12``). The result is stored under ``new_name``
    and the old column is dropped.

    Args:
        data: DataFrame holding the columns to convert. It is left untouched.
        cols: mapping from existing column name to new column name. When
            both names are equal the column is converted in place.
    Returns:
        A new DataFrame with the converted columns.
    Raises:
        ValueError: if a value contains no digit at all.
    """
    # Raw string: '\D' in a normal string literal is an invalid escape.
    non_digits = re.compile(r'\D')

    def categorical2int(value):
        digits = non_digits.sub('', value)
        if not digits:
            raise ValueError(
                'no digits found in value {!r}'.format(value))
        return int(digits)

    # Work on a copy so the caller's frame does not silently gain columns.
    result = data.copy()
    for col, new_col in cols.items():
        result[new_col] = result[col].apply(categorical2int)
        # Dropping when old and new names match would delete the column
        # that was just converted.
        if new_col != col:
            result = result.drop(columns=[col])

    return result

In [3]:
# Complete data-cleaning pipeline (so far).
# Load both ';'-separated input files.
dataset1 = pd.read_csv('data/Input1_clientes_estructura.csv', sep=';')
dataset2 = pd.read_csv('data/Input2_clientes_venta.csv', sep=';')

# Client-structure table: discard 'Regional2', then turn the string-coded
# columns into integer columns stored under their new names.
cols_dataset1 = {
    'Gerencia2': 'Distribuidor',
    'SubCanal2': 'Tipo',
    'Categoria': 'Category',
}
dataset1 = string_to_int(dataset1.drop(columns='Regional2'), cols_dataset1)

# Sales table: discard the date columns, then apply the same conversion.
cols_dataset2 = {
    'CapacidadEnvase2': 'CapacidadEnvase',
    'SegmentoPrecio2': 'CategoriaMarca',
    'Marca2': 'Marca',
    'Cupo2': 'Envase',
}
dataset2 = string_to_int(dataset2.drop(columns=['Año', 'Mes']), cols_dataset2)

# Give the remaining columns their final names.
cols_structure = {'Category': 'Categoria'}
dataset1 = dataset1.rename(columns=cols_structure)

cols_sell = {'disc': 'Descuento', 'nr': 'IngresoNeto'}
dataset2 = dataset2.rename(columns=cols_sell)

# Join sales with client structure. No key is passed, so pandas joins on
# the columns the two frames have in common.
data = dataset2.merge(dataset1)

# Correct the dtypes of the merged dataset.
data['Nevera'] = data['Nevera'].astype('bool')
category_cols = [
    'CapacidadEnvase', 'CategoriaMarca', 'Marca', 'Envase',
    'Distribuidor', 'Tipo', 'Categoria',
]
for col in category_cols:
    data[col] = data[col].astype('category')

# Stop here if any NA / null value is left in the data.
msg = "Some data have na or null values"
assert not any(search_nanull(data)), msg

In [4]:
# Overall dimensions of the merged dataset.
n_rows, n_cols = data.shape
print('Size of data: {}'.format(data.shape))
print('Number of entries: {}'.format(n_rows))
print('Number of columns: {}'.format(n_cols))

# Column names, one per line.
print('\nList of columns in dataset:')

for col in data.columns:
    print(col)

Size of data: (1406116, 12)
Number of entries: 1406116
Number of columns: 12

List of columns in dataset:
Cliente
Volumen
Descuento
IngresoNeto
CapacidadEnvase
CategoriaMarca
Marca
Envase
Nevera
Distribuidor
Tipo
Categoria


In [5]:
# Check the dtype of every column after the casts in the cleaning cell.
data.dtypes

Cliente               int64
Volumen             float64
Descuento           float64
IngresoNeto         float64
CapacidadEnvase    category
CategoriaMarca     category
Marca              category
Envase             category
Nevera                 bool
Distribuidor       category
Tipo               category
Categoria          category
dtype: object

In [6]:
# Summary (count, unique, top, freq) restricted to the categorical and
# boolean columns.
data.describe(include=['category','bool'])

Unnamed: 0,CapacidadEnvase,CategoriaMarca,Marca,Envase,Nevera,Distribuidor,Tipo,Categoria
count,1406116,1406116,1406116,1406116,1406116,1406116,1406116,1406116
unique,17,3,39,5,2,11,27,6
top,10,1,1,2,True,10,3,2
freq,707919,991679,406526,786172,866769,470904,567541,586357


In [7]:
# All rows whose 'Marca' code is 20.
data.loc[data['Marca'] == 20]

Unnamed: 0,Cliente,Volumen,Descuento,IngresoNeto,CapacidadEnvase,CategoriaMarca,Marca,Envase,Nevera,Distribuidor,Tipo,Categoria
381,14,0.016861,0.0,105.470130,9,1,20,3,True,3,1,2
503,15,0.016861,0.0,105.470130,9,1,20,3,True,2,3,2
512,15,0.033721,0.0,210.940205,9,1,20,3,True,2,3,2
516,15,0.016861,0.0,105.470130,9,1,20,3,True,2,3,2
562,15,0.014173,0.0,107.597118,6,1,20,1,True,2,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...
1403951,3998,0.016861,0.0,105.470130,9,1,20,3,True,10,3,2
1404025,12677,0.016861,0.0,105.470130,9,1,20,3,True,1,3,2
1404050,12677,0.016861,0.0,105.470130,9,1,20,3,True,1,3,2
1404106,6676,0.016861,0.0,105.470130,9,1,20,3,True,9,3,1


### Train and test datasets

In [12]:
def _select_product(frame, marca, envase, capacidad):
    """Return the rows of ``frame`` matching one Marca / Envase /
    CapacidadEnvase combination."""
    mask = ((frame['Marca'] == marca) &
            (frame['Envase'] == envase) &
            (frame['CapacidadEnvase'] == capacidad))
    return frame.loc[mask]


# One subset per product of interest.
marca1_data = _select_product(data, marca=20, envase=3, capacidad=9)
marca2_data = _select_product(data, marca=16, envase=2, capacidad=10)
marca3_data = _select_product(data, marca=9, envase=3, capacidad=12)
marca4_data = _select_product(data, marca=38, envase=2, capacidad=10)
marca5_data = _select_product(data, marca=39, envase=2, capacidad=10)

# Row counts per subset, then the combined total against the full dataset.
print('Number of total Marca_1: {}'.format(len(marca1_data)))
print('Number of total Marca_2: {}'.format(len(marca2_data)))
print('Number of total Marca_3: {}'.format(len(marca3_data)))
print('Number of total Marca_Inno1: {}'.format(len(marca4_data)))
print('Number of total Marca_Inno2: {}'.format(len(marca5_data)))
print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
entries_of_interest = sum(
    len(subset)
    for subset in (marca1_data, marca2_data, marca3_data,
                   marca4_data, marca5_data)
)
print('Number of total entries of interest:', entries_of_interest)
print('Number of total entries:', len(data))

Number of total Marca_1: 5347
Number of total Marca_2: 9676
Number of total Marca_3: 56898
Number of total Marca_Inno1: 9089
Number of total Marca_Inno2: 9089
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Number of total entries of interest: 90099
Number of total entries: 1406116


In [15]:
# Marca_1 rows where the boolean 'Nevera' flag is False.
marca1_data.loc[~marca1_data['Nevera']]

Unnamed: 0,Cliente,Volumen,Descuento,IngresoNeto,CapacidadEnvase,CategoriaMarca,Marca,Envase,Nevera,Distribuidor,Tipo,Categoria
3232,199,0.016861,0.0,105.470130,9,1,20,3,False,2,7,3
11447,480,0.033721,0.0,217.402716,9,1,20,3,False,2,11,2
13674,596,0.050582,0.0,326.104075,9,1,20,3,False,4,4,3
13974,614,0.016861,0.0,105.470130,9,1,20,3,False,4,8,2
14312,649,0.033721,0.0,210.940205,9,1,20,3,False,4,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...
1398738,4220,0.016861,0.0,105.470130,9,1,20,3,False,10,3,3
1399196,16711,0.084303,0.0,543.506736,9,1,20,3,False,8,10,3
1399204,16711,0.050582,0.0,326.104075,9,1,20,3,False,8,10,3
1400959,15778,0.050582,0.0,316.410391,9,1,20,3,False,8,6,1


In [25]:
# Do the clients who buy Inno1 also buy Inno2?
# For every Inno1 row, flag whether its client also appears in the Inno2
# subset, then summarise the flags.
marca4_data['Cliente'].isin(marca5_data['Cliente']).describe()
# Open question: does this mean anything? Perhaps the two products have the
# same purchase probability — not established here.

count     9089
unique       1
top       True
freq      9089
Name: Cliente, dtype: object