In [1]:
import pandas as pd
import re
from datetime import datetime as dt
from extract.uncompress_file import extract

# Run the project's extract() helper before any data file is read below.
# NOTE(review): presumably this unpacks the raw inputs under data/ -- confirm
# in extract/uncompress_file.
extract()

# Basic data cleaning.

In [2]:
def search_nanull(data: pd.DataFrame) -> list:
    """Report, column by column, whether any value is missing.

    Args:
        data: DataFrame to inspect.
    Returns:
        One plain ``bool`` per column, in column order: ``True`` when the
        column holds at least one NA/null value, ``False`` otherwise.
        ``any(search_nanull(data))`` is therefore ``True`` exactly when the
        frame contains a missing value somewhere.
    """
    # isna() and isnull() are aliases in pandas, so a single check suffices.
    # Reducing with .any() gives one scalar per column; the previous
    # .unique() arrays had two elements for partially-missing columns, which
    # made any(...) raise "truth value of an array ... is ambiguous".
    return data.isna().any().tolist()


def string_to_int(data: pd.DataFrame,
                  cols: dict) -> pd.DataFrame:
    """Replace string-coded columns with integer columns.

    For every ``old -> new`` pair in ``cols``, the digits found in each value
    of column ``old`` are concatenated and parsed as an ``int`` (for example
    ``'abc_12'`` becomes ``12``). The result is stored under ``new`` and
    column ``old`` is removed. ``old`` and ``new`` may be the same name.

    Args:
        data: DataFrame holding the string columns; it is left unmodified.
        cols: mapping of existing column name -> name of the integer column.
    Returns:
        DataFrame with the converted columns in place of the original ones.
        ``data`` itself is returned when ``cols`` is empty.
    Raises:
        ValueError: if a value contains no digits at all.
    """
    digit_runs = re.compile(r'\d+')

    def categorical2int(value):
        # Join every run of digits in the value and parse the result.
        return int(''.join(digit_runs.findall(value)))

    for col, new_col in cols.items():
        converted = data[col].apply(categorical2int)
        # drop() returns a new frame, so assigning the converted column
        # afterwards never writes into the caller's DataFrame, and a
        # same-name conversion is not dropped right after being created.
        data = data.drop(columns=col)
        data[new_col] = converted

    return data

In [6]:
# Complete data-cleaning pipeline so far.

# Load the two raw inputs (semicolon-separated CSV files).
dataset1 = pd.read_csv('data/Input1_clientes_estructura.csv', sep=';')
dataset2 = pd.read_csv('data/Input2_clientes_venta.csv', sep=';')

# First input: discard 'Regional2', then turn the string-coded columns into
# integer columns. 'Categoria' goes through a temporary 'Category' name that
# is renamed back further down.
cols_dataset1 = {
    'Gerencia2': 'Distribuidor',
    'SubCanal2': 'Tipo',
    'Categoria': 'Category',
}
dataset1 = string_to_int(dataset1.drop(columns='Regional2'), cols_dataset1)

# Second input: same string -> integer conversion.
cols_dataset2 = {
    'CapacidadEnvase2': 'CapacidadEnvase',
    'SegmentoPrecio2': 'CategoriaMarca',
    'Marca2': 'Marca',
    'Cupo2': 'Envase',
}
dataset2 = string_to_int(dataset2, cols_dataset2)

# Rename columns.
cols_structure = {'Category': 'Categoria'}
cols_sell = {'disc': 'Descuento', 'nr': 'IngresoNeto'}
dataset1 = dataset1.rename(columns=cols_structure)
dataset2 = dataset2.rename(columns=cols_sell)

# Merge both tables; no key is passed, so pandas joins on the columns the
# two frames have in common.
data = dataset2.merge(dataset1)

# Build a single datetime column from year ('Año') and month ('Mes'), then
# drop the two source columns.
year_month = data['Año'].astype('str') + '-' + data['Mes'].astype('str')
data['date'] = pd.to_datetime(year_month, format='%Y-%m')
data = data.drop(columns=['Año', 'Mes'])

# Correct the dtypes: one boolean flag, the coded columns as categoricals.
category_cols = ['CapacidadEnvase', 'CategoriaMarca', 'Marca',
                 'Envase', 'Distribuidor', 'Tipo', 'Categoria']
data['Nevera'] = data['Nevera'].astype('bool')
data = data.astype({name: 'category' for name in category_cols})

# Fail fast if any NA / NULL value is left in the merged data.
msg = "Some data have na or null values"
assert not any(search_nanull(data)), msg

In [7]:
# Report the dimensions of the cleaned dataset and list its columns.
n_rows, n_cols = data.shape
print('Size of data: {}'.format((n_rows, n_cols)))
print('Number of entries: {}'.format(n_rows))
print('Number of columns: {}'.format(n_cols))

print('\nList of columns in dataset:')

for column_name in data.columns:
    print(column_name)

Size of data: (1406116, 13)
Number of entries: 1406116
Number of columns: 13

List of columns in dataset:
Cliente
Volumen
Descuento
IngresoNeto
CapacidadEnvase
CategoriaMarca
Marca
Envase
Nevera
Distribuidor
Tipo
Categoria
date


In [8]:
# Show the dtype of every column after the conversions applied above.
data.dtypes

Cliente                     int64
Volumen                   float64
Descuento                 float64
IngresoNeto               float64
CapacidadEnvase          category
CategoriaMarca           category
Marca                    category
Envase                   category
Nevera                       bool
Distribuidor             category
Tipo                     category
Categoria                category
date               datetime64[ns]
dtype: object

In [9]:
# Summary statistics restricted to the categorical and boolean columns.
data.describe(include=['category','bool'])

Unnamed: 0,CapacidadEnvase,CategoriaMarca,Marca,Envase,Nevera,Distribuidor,Tipo,Categoria
count,1406116,1406116,1406116,1406116,1406116,1406116,1406116,1406116
unique,17,3,39,5,2,11,27,6
top,10,1,1,2,True,10,3,2
freq,707919,991679,406526,786172,866769,470904,567541,586357


In [10]:
# Summary statistics using describe()'s default column selection.
data.describe()

Unnamed: 0,Cliente,Volumen,Descuento,IngresoNeto
count,1406116.0,1406116.0,1406116.0,1406116.0
mean,8328.064,0.2294363,-133.5855,1704.834
std,4906.4,1.395798,1491.833,10470.79
min,1.0,0.0,-479456.7,0.0
25%,4148.0,0.02173441,-28.77037,177.8085
50%,8097.0,0.05268947,0.0,423.3184
75%,12318.0,0.1496381,0.0,1117.963
max,20580.0,300.0,0.0,2302965.0


In [11]:
# (Marca, Envase, CapacidadEnvase) code triples that mark a row as 'Useful'.
useful_products = [
    (20, 3, 9),
    (16, 2, 10),
    (9, 3, 12),
    (38, 2, 10),
    (39, 2, 10),
]

# Build the query from the list instead of one backslash-continued string
# literal: the continuations baked the indentation into the query and would
# break on any trailing whitespace after a backslash.
useful_query = ' or '.join(
    '(Marca == {} and Envase == {} and CapacidadEnvase == {})'.format(*triple)
    for triple in useful_products
)

# Index labels of the matching rows, then a boolean flag for every row.
filter_data = data.query(useful_query).index
data['Useful'] = data.index.isin(filter_data)

In [12]:
# Display 10 random rows; no random_state is passed, so the rows shown
# differ from run to run.
data.sample(10)

Unnamed: 0,Cliente,Volumen,Descuento,IngresoNeto,CapacidadEnvase,CategoriaMarca,Marca,Envase,Nevera,Distribuidor,Tipo,Categoria,date,Useful
254361,11071,0.018692,0.0,138.453567,14,1,1,1,False,9,3,1,2019-06-01,False
1378659,14718,0.056694,0.0,600.669611,6,3,7,1,True,1,11,2,2019-08-01,False
767155,10173,0.260813,0.0,1816.636263,10,1,4,2,True,10,3,6,2020-04-01,False
1105595,2164,0.018705,0.0,331.031968,12,3,9,3,False,9,3,2,2020-09-01,True
396197,1074,0.021734,-38.049139,168.1393,10,1,5,2,True,2,6,2,2019-10-01,False
1212523,6717,0.998904,0.0,5069.88538,17,1,5,2,True,6,5,3,2020-06-01,False
394233,995,1.130189,-332.573708,8877.809298,10,1,1,2,True,3,2,1,2019-06-01,False
513917,3952,0.004724,0.0,15.151502,6,3,23,1,False,10,3,3,2020-08-01,False
1046394,5006,0.301384,0.0,2363.490062,10,1,5,2,True,6,2,2,2019-07-01,False
91153,3939,0.326016,-580.94269,2575.210965,10,1,5,2,True,10,2,3,2020-09-01,False


### Train and test datasets index

In [13]:
def _product_rows(marca, envase, capacidad):
    """Return the rows of ``data`` matching the given Marca, Envase and CapacidadEnvase codes."""
    mask = ((data['Marca'] == marca)
            & (data['Envase'] == envase)
            & (data['CapacidadEnvase'] == capacidad))
    return data.loc[mask]


# One subset per product of interest.
marca1_data = _product_rows(20, 3, 9)
marca2_data = _product_rows(16, 2, 10)
marca3_data = _product_rows(9, 3, 12)
marca4_data = _product_rows(38, 2, 10)
marca5_data = _product_rows(39, 2, 10)

# Report template paired with the subset whose row count it prints.
subset_reports = [
    ('Number of total Marca_1: {}', marca1_data),
    ('Number of total Marca_2: {}', marca2_data),
    ('Number of total Marca_3: {}', marca3_data),
    ('Number of total Marca_Inno1: {}', marca4_data),
    ('Number of total Marca_Inno2: {}', marca5_data),
]

for template, subset in subset_reports:
    print(template.format(subset.shape[0]))
print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
print('Number of total entries of interest:',
      sum(subset.shape[0] for _, subset in subset_reports))
print('Number of total entries:', data.shape[0])

Number of total Marca_1: 5347
Number of total Marca_2: 9676
Number of total Marca_3: 56898
Number of total Marca_Inno1: 9089
Number of total Marca_Inno2: 9089
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Number of total entries of interest: 90099
Number of total entries: 1406116


In [14]:
# Rows of the Marca_1 subset whose 'Nevera' flag is False.
marca1_data.loc[marca1_data['Nevera'].eq(False)]

Unnamed: 0,Cliente,Volumen,Descuento,IngresoNeto,CapacidadEnvase,CategoriaMarca,Marca,Envase,Nevera,Distribuidor,Tipo,Categoria,date,Useful
3232,199,0.016861,0.0,105.470130,9,1,20,3,False,2,7,3,2020-08-01,True
11447,480,0.033721,0.0,217.402716,9,1,20,3,False,2,11,2,2020-07-01,True
13674,596,0.050582,0.0,326.104075,9,1,20,3,False,4,4,3,2020-08-01,True
13974,614,0.016861,0.0,105.470130,9,1,20,3,False,4,8,2,2020-08-01,True
14312,649,0.033721,0.0,210.940205,9,1,20,3,False,4,3,2,2020-09-01,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1398738,4220,0.016861,0.0,105.470130,9,1,20,3,False,10,3,3,2020-07-01,True
1399196,16711,0.084303,0.0,543.506736,9,1,20,3,False,8,10,3,2020-07-01,True
1399204,16711,0.050582,0.0,326.104075,9,1,20,3,False,8,10,3,2020-08-01,True
1400959,15778,0.050582,0.0,316.410391,9,1,20,3,False,8,6,1,2020-08-01,True


In [13]:
# Do the same clients that buy inno1 also buy inno2?
marca4_data['Cliente'].isin(marca5_data['Cliente']).describe()
# Open question: does this mean anything? Maybe the probability for these
# two products is the same -- not sure.

count     9089
unique       1
top       True
freq      9089
Name: Cliente, dtype: object

In [15]:
# Display the full frame, now including the 'Useful' column.
data

Unnamed: 0,Cliente,Volumen,Descuento,IngresoNeto,CapacidadEnvase,CategoriaMarca,Marca,Envase,Nevera,Distribuidor,Tipo,Categoria,date,Useful
0,10,0.112229,-30.590603,900.328567,12,1,1,1,True,3,1,2,2019-05-01,False
1,10,0.021734,0.000000,149.184463,10,1,2,2,True,3,1,2,2019-05-01,False
2,10,0.043469,0.000000,359.625828,10,2,3,2,True,3,1,2,2019-05-01,False
3,10,0.026345,-31.065261,134.748399,16,1,1,2,True,3,1,2,2019-05-01,False
4,10,0.086938,0.000000,496.901005,10,1,4,2,True,3,1,2,2019-05-01,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1406111,20577,0.039122,0.000000,544.150314,10,3,39,2,False,8,3,2,2020-08-01,True
1406112,20580,0.000000,0.000000,0.000000,10,1,38,2,False,10,5,6,2020-07-01,True
1406113,20580,0.058683,0.000000,759.218996,10,3,39,2,False,10,5,6,2020-07-01,True
1406114,20580,0.000000,0.000000,0.000000,10,1,38,2,False,10,5,6,2020-09-01,True


### Train and test

In [32]:
from sklearn.model_selection import train_test_split

# Features are every column except the target and 'date'.
# In the recorded run, the cell that drops 'date' (In [30]) executed before
# this one (In [32]) even though it sits further down the notebook. On a
# fresh top-to-bottom run 'date' would otherwise end up in x_train as a
# datetime column. errors='ignore' keeps this working when 'date' has
# already been removed from `data`.
features = data.drop(columns=['Useful', 'date'], errors='ignore')
target = data['Useful']

# 80/20 split, seeded, and stratified so both partitions keep the same
# proportion of 'Useful' rows.
x_train, x_test, y_train, y_test = train_test_split(
    features, target,
    test_size=0.2, random_state=0,
    stratify=target)

In [33]:
# Re-attach the target column to each partition.
train_data = pd.concat([x_train, y_train], axis=1)
test_data = pd.concat([x_test, y_test], axis=1)

# Export the test partition to CSV.
test_data.to_csv("data_TEST.csv")

# Size of each partition, absolute and as a share of the full dataset.
total_rows = data.shape[0]
for template, part in (('Train Nr. obs:\t{}\t==> {:.2%} of Data', train_data),
                       ('Test Nr. obs:\t{}\t==> {:.2%} of Data', test_data)):
    n_obs = part.shape[0]
    print(template.format(n_obs, n_obs / total_rows))

Train Nr. obs:	1124892	==> 80.00% of Data
Test Nr. obs:	281224	==> 20.00% of Data


In [30]:
# Remove the 'date' column from the working frame. errors='ignore' makes the
# cell safe to re-run: without it a second execution raises KeyError because
# the column is already gone.
data = data.drop(columns='date', errors='ignore')

In [37]:
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

rf = RandomForestRegressor(n_estimators=200)
rf = rf.fit(x_train, y_train )

In [43]:
import sklearn.model_selection as sk

# 3-fold cross-validation of `rf` on the training partition, using all cores.
# NOTE(review): no `scoring` is passed, so the estimator's default scorer is
# used; the "accuracy" labels printed below are only right if `rf` is a
# classifier -- confirm.
# NOTE(review): 'Useful' is derived from Marca, Envase and CapacidadEnvase,
# and only 'Useful' is removed from the features before the split, so a
# perfect score may simply reflect that leakage -- confirm.
scores = sk.cross_val_score(estimator=rf, X=x_train, y=y_train, cv=3, n_jobs=-1)
mean_score = scores.mean()
print("accuracies     = ",scores)
print("mean accuracy = %4.2f" % (mean_score))

accuracies     =  [1. 1. 1.]
mean accuracy = 1.00
