In [1]:
import pandas as pd
import re
from datetime import datetime as dt
from extract.uncompress_file import extract

extract()

# Basic data cleaning.

In [2]:
def search_nanull(data: pd.DataFrame) -> list:
    result = []
    for col in data.columns:
        result.append(data[col].isna().unique() +
                      data[col].isnull().unique())
    return result


def string_to_int(data: pd.DataFrame, 
                  cols: dict) -> pd.DataFrame:
    """convert cols (cols.keys()) in new_cols (cols.values())
    of data and drop the oldones
    Args:
        data: DataFreme to add new columns
        cols: names of the old and new columns
    Returns:
        DataFrame with changes
    """
    categorical2int = lambda x: int(''.join(re.findall('\d*',x)))

    for col, new_col in zip(cols.keys(), cols.values()):
        data[new_col] = data[col].apply(categorical2int)
        data = data.drop(col, axis=1)

    return data

In [None]:
# All dataclean process for now
# read
dataset1 = pd.read_csv('data/Input1_clientes_estructura.csv', sep=';')
dataset2 = pd.read_csv('data/Input2_clientes_venta.csv', sep=';')

# string data to int
dataset1 = dataset1.drop('Regional2', axis=1)
cols_dataset1 = {'Gerencia2': 'Distribuidor', 
        'SubCanal2': 'Tipo', 'Categoria':'Category'}
dataset1 = string_to_int(dataset1, cols_dataset1)

# dataset2 = dataset2.drop(['Año', 'Mes'], axis=1)
cols_dataset2 = {'CapacidadEnvase2':'CapacidadEnvase', 
        'SegmentoPrecio2':'CategoriaMarca',
        'Marca2':'Marca', 'Cupo2':'Envase'}
dataset2 = string_to_int(dataset2, cols_dataset2)

# Rename columns
cols_structure = {'Category': 'Categoria'}
dataset1 = dataset1.rename(columns=cols_structure)

cols_sell = {'disc': 'Descuento', 'nr': 'IngresoNeto'}
dataset2 = dataset2.rename(columns=cols_sell)

# merge data
data = dataset2.merge(dataset1)

# corret dtypes in dataset
data['Nevera'] = data['Nevera'].astype('bool')
category_cols = ['CapacidadEnvase','CategoriaMarca', 'Marca',
                 'Envase', 'Distribuidor', 'Tipo', 'Categoria']
# Corret datetime
data['date'] = data['Año'].astype('str') + '-' + data['Mes'].astype('str')
data['date'] = pd.to_datetime(data['date'], format='%Y-%m')
data = dataset.drop(['Año', 'Mes'], axis=1)

for col in category_cols:
    data[col] = data[col].astype('category')

# Search for NA or NULL data
msg = "Some data have na or null values"
assert not any(search_nanull(data)), msg

In [None]:
print ('Size of data: {}'.format(data.shape))
print ('Number of entries: {}'.format(data.shape[0]))
print ('Number of columns: {}'.format(data.shape[1]))

print ('\nList of columns in dataset:')

for col in data.columns:
    print (col)

In [None]:
data.dtypes

In [None]:
data.describe(include=['category','bool'])

In [None]:
data.describe()

In [None]:
filter_data = data.query('(Marca == 20 and Envase == 3 and CapacidadEnvase == 9) \
                            or (Marca == 16 and Envase == 2 and CapacidadEnvase == 10)\
                            or (Marca == 9 and Envase == 3 and CapacidadEnvase == 12)\
                            or (Marca == 38 and Envase == 2 and CapacidadEnvase == 10)\
                            or (Marca == 39 and Envase == 2 and CapacidadEnvase == 10)').index
data['Useful'] = data.index.isin(filter_data)

In [54]:
data.sample(10)

Unnamed: 0,Año,Mes,Cliente,Volumen,Descuento,IngresoNeto,CapacidadEnvase,CategoriaMarca,Marca,Envase,Nevera,Distribuidor,Tipo,Categoria,Usefull
264219,2019,8,11539,0.018705,-33.583425,293.241332,12,3,9,3,True,8,3,3,True
1058363,2020,7,159,0.034775,0.0,373.308966,10,2,12,1,False,6,3,1,False
335373,2020,9,14865,0.02213,0.0,408.261374,3,3,9,3,True,6,5,3,False
341600,2019,11,15229,0.36864,0.0,3138.609209,12,1,1,1,False,9,14,1,False
936389,2019,7,14868,0.130406,-153.152716,745.308525,10,1,4,2,True,8,3,4,False
981335,2020,1,17109,0.018705,0.0,330.000552,12,3,9,3,True,4,5,2,True
841852,2020,2,12238,0.021734,-38.049139,168.664808,10,1,5,2,False,8,6,2,False
821644,2019,5,11717,0.018705,0.0,311.956659,12,3,9,3,False,1,4,2,True
736578,2020,3,9378,0.003543,0.0,38.201137,6,2,3,1,True,10,11,3,False
1032724,2019,5,4931,0.021734,0.0,167.222836,10,1,5,2,False,10,5,3,False


### Train and test datasetsindex

In [38]:
marca1_data = data.loc[(data['Marca'] == 20) & 
                       (data['Envase'] == 3) & 
                       (data['CapacidadEnvase'] == 9)]

marca2_data = data.loc[(data['Marca'] == 16) &
                       (data['Envase'] == 2)  &
                       (data['CapacidadEnvase'] == 10)]

marca3_data = data.loc[(data['Marca'] == 9) &
                       (data['Envase'] == 3)  &
                       (data['CapacidadEnvase'] == 12)]

marca4_data = data.loc[(data['Marca'] == 38) &
                       (data['Envase'] == 2)  &
                       (data['CapacidadEnvase'] == 10)]

marca5_data = data.loc[(data['Marca'] == 39) &
                       (data['Envase'] == 2)  &
                       (data['CapacidadEnvase'] == 10)]

print ('Number of total Marca_1: {}'.format(marca1_data.shape[0]))
print ('Number of total Marca_2: {}'.format(marca2_data.shape[0]))
print ('Number of total Marca_3: {}'.format(marca3_data.shape[0]))
print ('Number of total Marca_Inno1: {}'.format(marca4_data.shape[0]))
print ('Number of total Marca_Inno2: {}'.format(marca5_data.shape[0]))
print ('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
print('Number of total entries of interest:', marca1_data.shape[0] 
      + marca2_data.shape[0] + marca3_data.shape[0] + marca4_data.shape[0] + marca5_data.shape[0])
print('Number of total entries:', data.shape[0])

Number of total Marca_1: 5347
Number of total Marca_2: 9676
Number of total Marca_3: 56898
Number of total Marca_Inno1: 9089
Number of total Marca_Inno2: 9089
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Number of total entries of interest: 90099
Number of total entries: 1406116


In [12]:
marca1_data[marca1_data.Nevera == False]

Unnamed: 0,Año,Mes,Cliente,Volumen,Descuento,IngresoNeto,CapacidadEnvase,CategoriaMarca,Marca,Envase,Nevera,Distribuidor,Tipo,Categoria
3232,2020,8,199,0.016861,0.0,105.470130,9,1,20,3,False,2,7,3
11447,2020,7,480,0.033721,0.0,217.402716,9,1,20,3,False,2,11,2
13674,2020,8,596,0.050582,0.0,326.104075,9,1,20,3,False,4,4,3
13974,2020,8,614,0.016861,0.0,105.470130,9,1,20,3,False,4,8,2
14312,2020,9,649,0.033721,0.0,210.940205,9,1,20,3,False,4,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1398738,2020,7,4220,0.016861,0.0,105.470130,9,1,20,3,False,10,3,3
1399196,2020,7,16711,0.084303,0.0,543.506736,9,1,20,3,False,8,10,3
1399204,2020,8,16711,0.050582,0.0,326.104075,9,1,20,3,False,8,10,3
1400959,2020,8,15778,0.050582,0.0,316.410391,9,1,20,3,False,8,6,1


In [13]:
# ¿Los mismos que compran inno1 compran inno2?
marca4_data.Cliente.isin(marca5_data.Cliente).describe()
# ¿Significa algo? :woman_shrugging: Tal vez la 
# probabilidad de estos perros sea igual no shé!

count     9089
unique       1
top       True
freq      9089
Name: Cliente, dtype: object