In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler, quantile_transform, QuantileTransformer, RobustScaler

In [2]:
# Locations of the competition CSV files under the Kaggle input directory.
CONFIG = {
    'TRAIN_PATH': '/kaggle/input/datathon-entel-2022-reto2/train.csv',
    'TEST_PATH': '/kaggle/input/datathon-entel-2022-reto2/test.csv',
    'SAMPLE_SUBMISSION': '/kaggle/input/datathon-entel-2022-reto2/test_sample.csv',
}

# Read the files in the same order as before: train, test, sample submission.
df_train, df_test, df_sub = (
    pd.read_csv(CONFIG[key])
    for key in ('TRAIN_PATH', 'TEST_PATH', 'SAMPLE_SUBMISSION')
)

## Number of smartphones sold per week

In [3]:
px.area(df_train.iloc[: , 5:].sum())

## The 10 weeks before our target

In [4]:
px.area(df_train.iloc[: , -20:-10].sum())

## We'll use the last 10 weeks as the target

In [5]:
px.area(df_train.iloc[: , -10:].sum())

## How is the distribution in the target?

In [6]:
px.scatter(df_train.iloc[:, -10:].stack().value_counts(normalize=True))

#### 84.5% of the data is 0, 7.19% is 1, 3% is 2 and 1.4% is 3 --> These values represent 96.17% of all data

### Is there a relationship between the previous 10 weeks and the next 10 weeks?

In [7]:
def previous_rmse(start, end, df=None):
    """Return the RMSE between a block of columns and the block that follows it.

    The columns ``start:end`` (positional, ``end`` exclusive) are compared
    element by element with the equally wide block
    ``end:end + (end - start)``.

    Parameters
    ----------
    start, end : int
        Positional column bounds of the first block.
    df : pandas.DataFrame, optional
        Frame to read the columns from. Defaults to the notebook-level
        ``df_train`` so existing two-argument calls keep working.

    Returns
    -------
    float
        Root mean squared error between the two flattened blocks.

    Raises
    ------
    ValueError
        If the two blocks are empty or do not hold the same number of values
        (for example when the second block runs past the last column).
    """
    print(f'START {start} - END {end}')
    if df is None:
        df = df_train

    width = end - start
    # Row-major flattening keeps every (row, week) pair aligned by position.
    first = df.iloc[:, start:end].to_numpy(dtype=float).ravel()
    second = df.iloc[:, end:end + width].to_numpy(dtype=float).ravel()
    if first.size == 0 or first.shape != second.shape:
        raise ValueError(
            f'Blocks {start}:{end} and {end}:{end + width} must be non-empty '
            f'and equally sized (got {first.size} and {second.size} values)'
        )

    # Computed with numpy on purpose: the ``squared=False`` flag of
    # sklearn's mean_squared_error is deprecated/removed in recent releases.
    return float(np.sqrt(np.mean((first - second) ** 2)))

In [8]:
# Compare each 10-column window with the 10 columns that follow it.
for window_start in range(5, 45, 10):
    print(previous_rmse(window_start, window_start + 10))

START 5 - END 15
5.657942955462724
START 15 - END 25
6.427028600162597
START 25 - END 35
6.2113736768435
START 35 - END 45
5.466776992892005


In [9]:
# RMSE between every column in 5..44 and the column right after it.
rmse_list = [previous_rmse(week, week + 1) for week in range(5, 45)]

START 5 - END 6
START 6 - END 7
START 7 - END 8
START 8 - END 9
START 9 - END 10
START 10 - END 11
START 11 - END 12
START 12 - END 13
START 13 - END 14
START 14 - END 15
START 15 - END 16
START 16 - END 17
START 17 - END 18
START 18 - END 19
START 19 - END 20
START 20 - END 21
START 21 - END 22
START 22 - END 23
START 23 - END 24
START 24 - END 25
START 25 - END 26
START 26 - END 27
START 27 - END 28
START 28 - END 29
START 29 - END 30
START 30 - END 31
START 31 - END 32
START 32 - END 33
START 33 - END 34
START 34 - END 35
START 35 - END 36
START 36 - END 37
START 37 - END 38
START 38 - END 39
START 39 - END 40
START 40 - END 41
START 41 - END 42
START 42 - END 43
START 43 - END 44
START 44 - END 45


In [10]:
# Exponentially weighted mean (com=1) of the week-to-week RMSE values.
# NOTE(review): despite its name, no logarithm is taken here, and the smoothed
# series is not what gets plotted below -- the raw ``rmse_list`` is.
rmse_log_list =  pd.Series(rmse_list).ewm(1).mean()
px.line(rmse_list)

#### Podemos multiplicar la variación entre las semanas para dar mayor fuerza a las semanas más altas y restar a las menores. Así también se apoya la idea de que en las siguientes semanas deberían subir las ventas.

In [11]:
# Quantile to inspect and the cap applied to individual values.
p = 0.999
x_base = 100

# Stack each 10-column window once so both printouts reuse the same values.
window_values = [
    df_train.iloc[:, first:first + 10].stack()
    for first in range(5, 55, 10)
]

# Quantile of the raw values, one line per window.
for values in window_values:
    print(values.quantile(p))

print('*' * 50)

# Same quantile after capping every value at ``x_base`` (vectorised clip
# instead of a per-element Python lambda).
for values in window_values:
    print(values.clip(upper=x_base).quantile(p))

69.0
58.0
63.271000000066124
60.0
49.0
**************************************************
69.0
58.0
63.271000000066124
60.0
49.0


In [12]:
# Share of the five most frequent values in each 10-column window,
# with a separator line between consecutive windows.
for position, first in enumerate(range(5, 55, 10)):
    if position > 0:
        print('*' * 50)
    shares = df_train.iloc[:, first:first + 10].stack().value_counts(normalize=True)
    print(shares[:5])

0    0.824529
1    0.077616
2    0.033125
3    0.017001
4    0.010279
dtype: float64
**************************************************
0    0.834013
1    0.071893
2    0.031009
3    0.016639
4    0.010372
dtype: float64
**************************************************
0    0.831355
1    0.073131
2    0.030458
3    0.016567
4    0.010167
dtype: float64
**************************************************
0    0.839934
1    0.069648
2    0.030155
3    0.015600
4    0.009569
dtype: float64
**************************************************
0    0.845486
1    0.071948
2    0.029377
3    0.014983
4    0.009181
dtype: float64


In [13]:
# Working copy of the training frame with the row-wise sum of columns 35:45
# stored as TOTAL, ordered by that total (ascending).
df_train_c = (
    df_train
    .assign(TOTAL=df_train.iloc[:, 35:45].sum(axis=1))
    .sort_values(by='TOTAL')
)

In [14]:
df_train_c['TOTAL'].value_counts(normalize=True)

0       0.587900
1       0.086363
2       0.052890
3       0.035677
4       0.026625
          ...   
318     0.000021
321     0.000021
322     0.000021
323     0.000021
4364    0.000021
Name: TOTAL, Length: 363, dtype: float64

In [15]:
px.line(df_train_c.T.iloc[5:-1, -10:])

In [16]:
_index = 3270

In [17]:
px.line(df_train.iloc[_index, 5:55])

In [18]:
px.line(df_train.iloc[_index, 5:55].diff())

In [19]:
px.line(df_train.iloc[_index, 5:55].ewm(1).mean())

In [20]:
df_train.iloc[:, 5:45].T.ewm(5).mean().T

Unnamed: 0,SEMANA_01,SEMANA_02,SEMANA_03,SEMANA_04,SEMANA_05,SEMANA_06,SEMANA_07,SEMANA_08,SEMANA_09,SEMANA_10,...,SEMANA_31,SEMANA_32,SEMANA_33,SEMANA_34,SEMANA_35,SEMANA_36,SEMANA_37,SEMANA_38,SEMANA_39,SEMANA_40
0,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.206733,0.165641,...,0.003030,0.002523,0.002102,0.001751,0.001458,0.001215,0.001012,0.000843,0.000703,0.000585
1,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.166803,0.138983
2,1.0,0.454545,0.274725,0.186289,0.13438,0.100706,0.077424,0.060609,0.048079,0.038523,...,0.000705,0.000587,0.000489,0.000407,0.000339,0.000283,0.000235,0.000196,0.000163,0.000136
3,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.166803,0.138983
4,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.166803,0.138983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47168,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
47169,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
47170,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
47171,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


### Relación entre la variable de volatilidad y el objetivo

In [21]:
# Volatility feature: square root of the summed squared logs of the
# exponentially weighted mean (com=1) over columns 40:45, one value per row
# of df_train. Assignment aligns on the index, so the sort order of
# df_train_c does not matter.
recent_ewm = df_train.iloc[:, 40:45].T.ewm(1).mean()
# Mask non-positive averages before the log so no -inf is produced; those
# entries contribute 0, exactly as the previous '-inf' replacement did.
log_ewm = np.log(recent_ewm.where(recent_ewm > 0)).fillna(0)
df_train_c['volatility'] = np.sqrt((log_ewm ** 2).sum())

# Row-wise spread and level of columns 35:45.
df_train_c['std'] = df_train_c.iloc[:, 35:45].std(axis=1)
df_train_c['mean'] = df_train_c.iloc[:, 35:45].mean(axis=1)

#### La volatilidad en las últimas 5 semanas puede ser una variable muy relacionada con el objetivo, si asumimos que la suma de las ventas en las 10 semanas es nuestro objetivo.

In [22]:
px.scatter(df_train_c, y='TOTAL', x='volatility')

In [23]:
# 99th percentile of the values in each 10-column window.
for first in range(5, 55, 10):
    print(df_train.iloc[:, first:first + 10].stack().quantile(0.99))

15.0
14.0
15.0
14.0
11.0


In [24]:
df_train[(df_train.iloc[:, 5:15].sum(axis=1) == 0)].iloc[:, 15:25].sum(axis=1).value_counts(normalize=True)[:5]

0    0.834931
1    0.077222
2    0.035390
3    0.017066
4    0.009958
dtype: float64

In [25]:
df_train[(df_train.iloc[:, 5:15].sum(axis=1) > 0)].iloc[:, 15:25].sum(axis=1).value_counts(normalize=True)[:5]

0    0.278621
1    0.117609
2    0.077629
3    0.055506
4    0.044246
dtype: float64

In [26]:
df_train[(df_train.iloc[:, 15:25].sum(axis=1) == 0)].iloc[:, 25:35].sum(axis=1).value_counts(normalize=True)[:5]

0    0.696851
1    0.105286
2    0.047602
3    0.028433
4    0.018601
dtype: float64

In [27]:
df_train[(df_train.iloc[:, 15:25].sum(axis=1) > 0)].iloc[:, 25:35].sum(axis=1).value_counts(normalize=True)[:5]

0    0.346490
1    0.113830
2    0.082044
3    0.058310
4    0.043785
dtype: float64

In [28]:
df_train[(df_train.iloc[:, 25:35].sum(axis=1) == 0)].iloc[:, 35:45].sum(axis=1).value_counts(normalize=True)[:5]

0    0.806676
1    0.075644
2    0.034713
3    0.018196
4    0.012855
dtype: float64

In [29]:
df_train[(df_train.iloc[:, 25:35].sum(axis=1) > 0)].iloc[:, 35:45].sum(axis=1).value_counts(normalize=True)[:5]

0    0.314248
1    0.099771
2    0.075627
3    0.057544
4    0.043850
dtype: float64

In [30]:
df_train[(df_train.iloc[:, 35:45].sum(axis=1) == 0)].iloc[:, 45:55].sum(axis=1).value_counts(normalize=True)[:5]

0    0.785490
1    0.081636
2    0.036996
3    0.020084
4    0.012512
dtype: float64

In [31]:
df_train[(df_train.iloc[:, 35:45].sum(axis=1) > 0)].iloc[:, 45:55].sum(axis=1).value_counts(normalize=True)[:5]

0    0.321039
1    0.115484
2    0.084002
3    0.056430
4    0.046759
dtype: float64

In [32]:
#### En promedio, solo el 20% de los registros vende al menos un celular si en las 10 semanas anteriores no ha vendido nada, mientras que el 80% se queda en 0
#### De los que venden al menos uno en 10 semanas, en las siguientes 10 semanas tienen una proporción del 30% de vender 0

In [33]:
# Keep only the rows whose TOTAL is strictly positive.
df_train_c_wot_o = df_train_c.loc[df_train_c['TOTAL'].gt(0)]

In [34]:
df_train_c_wot_o.iloc[:10, 35:45]

Unnamed: 0,SEMANA_31,SEMANA_32,SEMANA_33,SEMANA_34,SEMANA_35,SEMANA_36,SEMANA_37,SEMANA_38,SEMANA_39,SEMANA_40
29837,0,0,0,0,1,0,0,0,0,0
33446,0,0,0,0,0,0,0,1,0,0
36043,0,0,0,0,0,0,0,0,1,0
8061,0,1,0,0,0,0,0,0,0,0
33017,0,0,0,0,0,0,0,0,0,1
32461,0,0,0,0,0,0,0,0,0,1
32842,1,0,0,0,0,0,0,0,0,0
8056,0,0,1,0,0,0,0,0,0,0
32456,0,0,0,0,0,0,0,0,0,1
9020,0,0,0,1,0,0,0,0,0,0


In [35]:
df_train_c_wot_o.iloc[: , 35:45].T.ewm(1).mean()

Unnamed: 0,29837,33446,36043,8061,33017,32461,32842,8056,32456,9020,...,18277,28586,3275,19667,31402,31401,3270,19666,23408,23286
SEMANA_31,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,271.0,106.0,142.0,163.0,149.0,168.0,231.0,336.0,553.0,816.0
SEMANA_32,0.0,0.0,0.0,0.666667,0.0,0.0,0.333333,0.0,0.0,0.0,...,240.333333,112.666667,148.666667,109.0,176.333333,215.333333,229.666667,268.666667,501.0,720.0
SEMANA_33,0.0,0.0,0.0,0.285714,0.0,0.0,0.142857,0.571429,0.0,0.0,...,201.857143,100.285714,148.285714,203.857143,197.285714,215.142857,208.142857,348.857143,476.428571,634.857143
SEMANA_34,0.0,0.0,0.0,0.133333,0.0,0.0,0.066667,0.266667,0.0,0.533333,...,208.333333,138.0,171.6,176.2,277.666667,270.533333,221.4,264.666667,418.066667,464.8
SEMANA_35,0.516129,0.0,0.0,0.064516,0.0,0.0,0.032258,0.129032,0.0,0.258065,...,221.064516,226.258065,176.967742,173.516129,362.483871,324.451613,229.967742,215.290323,274.548387,307.483871
SEMANA_36,0.253968,0.0,0.0,0.031746,0.0,0.0,0.015873,0.063492,0.0,0.126984,...,203.253968,308.920635,206.952381,203.730159,262.174603,256.15873,243.698413,285.746032,203.15873,235.111111
SEMANA_37,0.125984,0.0,0.0,0.015748,0.0,0.0,0.007874,0.031496,0.0,0.062992,...,139.629921,234.88189,224.110236,232.086614,207.661417,208.708661,305.834646,282.346457,259.015748,261.76378
SEMANA_38,0.062745,0.501961,0.0,0.007843,0.0,0.0,0.003922,0.015686,0.0,0.031373,...,87.611765,152.619608,243.129412,245.596078,194.278431,219.396078,284.835294,312.792157,370.945098,268.407843
SEMANA_39,0.031311,0.250489,0.500978,0.003914,0.0,0.0,0.001957,0.007828,0.0,0.015656,...,52.236791,129.765166,258.093933,268.342466,198.146771,244.246575,274.39726,343.956947,521.266145,448.054795
SEMANA_40,0.01564,0.125122,0.250244,0.001955,0.500489,0.500489,0.000978,0.00391,0.500489,0.00782,...,35.101662,68.322581,271.560117,270.173021,194.069404,239.11828,288.71261,383.016618,544.155425,461.040078


In [36]:
df_train_c.iloc[:, 35:45]

Unnamed: 0,SEMANA_31,SEMANA_32,SEMANA_33,SEMANA_34,SEMANA_35,SEMANA_36,SEMANA_37,SEMANA_38,SEMANA_39,SEMANA_40
0,0,0,0,0,0,0,0,0,0,0
33840,0,0,0,0,0,0,0,0,0,0
33834,0,0,0,0,0,0,0,0,0,0
33810,0,0,0,0,0,0,0,0,0,0
33802,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
31401,168,239,215,319,375,190,162,230,269,234
3270,231,229,192,233,238,257,367,264,264,303
19666,336,235,409,191,169,354,279,343,375,422
23408,553,475,458,367,140,134,314,482,671,567


In [37]:
df_train_c.iloc[:, 45:55]

Unnamed: 0,SEMANA_41,SEMANA_42,SEMANA_43,SEMANA_44,SEMANA_45,SEMANA_46,SEMANA_47,SEMANA_48,SEMANA_49,SEMANA_50
0,0,0,0,0,0,0,0,0,0,0
33840,0,0,0,0,0,0,0,0,0,0
33834,0,0,0,0,0,0,0,0,0,0
33810,0,0,0,0,0,0,0,0,0,0
33802,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
31401,237,180,158,195,237,105,106,9,8,2
3270,213,120,157,165,163,124,110,20,101,119
19666,289,242,234,116,19,99,9,50,49,79
23408,550,426,366,272,122,81,50,16,67,53


In [38]:
df_train_c.iloc[:, 35:45].median(axis=1)

0          0.0
33840      0.0
33834      0.0
33810      0.0
33802      0.0
         ...  
31401    232.0
3270     247.5
19666    339.5
23408    466.5
23286    395.0
Length: 47173, dtype: float64

In [39]:
df_train_c.iloc[:, 45:55].mean(axis=1)

0          0.0
33840      0.0
33834      0.0
33810      0.0
33802      0.0
         ...  
31401    123.7
3270     129.2
19666    118.6
23408    200.3
23286    168.4
Length: 47173, dtype: float64

In [40]:
df_train_c.iloc[:, 5:45] # .ewm(1, axis=1).mean()

Unnamed: 0,SEMANA_01,SEMANA_02,SEMANA_03,SEMANA_04,SEMANA_05,SEMANA_06,SEMANA_07,SEMANA_08,SEMANA_09,SEMANA_10,...,SEMANA_31,SEMANA_32,SEMANA_33,SEMANA_34,SEMANA_35,SEMANA_36,SEMANA_37,SEMANA_38,SEMANA_39,SEMANA_40
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
33840,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33834,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33810,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33802,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31401,0,0,0,0,0,0,0,0,0,0,...,168,239,215,319,375,190,162,230,269,234
3270,65,80,193,369,690,1005,519,292,1201,1451,...,231,229,192,233,238,257,367,264,264,303
19666,0,10,26,13,10,22,8,37,82,137,...,336,235,409,191,169,354,279,343,375,422
23408,0,0,0,0,0,0,0,0,0,0,...,553,475,458,367,140,134,314,482,671,567


In [41]:
df_train_c.iloc[:, 5:45].ewm(40, axis=1).mean()

Unnamed: 0,SEMANA_01,SEMANA_02,SEMANA_03,SEMANA_04,SEMANA_05,SEMANA_06,SEMANA_07,SEMANA_08,SEMANA_09,SEMANA_10,...,SEMANA_31,SEMANA_32,SEMANA_33,SEMANA_34,SEMANA_35,SEMANA_36,SEMANA_37,SEMANA_38,SEMANA_39,SEMANA_40
0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.122397,0.108753,...,0.026487,0.025304,0.024197,0.023158,0.022182,0.021263,0.020397,0.019580,0.018807,0.018077
33840,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.083654,0.079918,0.076421,0.073140,0.070057,0.067155,0.064421,0.061839,0.059400,0.057091
33834,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.042343,0.040453,0.038682,0.037021,0.035461,0.033992,0.032608,0.031301,0.030067,0.028898
33810,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.042343,0.040453,0.038682,0.037021,0.035461,0.033992,0.032608,0.031301,0.030067,0.028898
33802,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.086830,0.082953,0.079322,0.075917,0.072717,0.069705,0.066867,0.064187,0.061655,0.059259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31401,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,27.304685,36.757307,44.558143,56.340869,69.772921,74.752267,78.305241,84.383409,91.666516,97.198252
3270,65.0,72.592593,113.723430,179.925456,287.039548,414.206084,430.308096,411.489108,508.122846,613.227214,...,385.978916,378.969491,370.786725,364.871070,359.523227,355.277108,355.754497,352.078041,348.603370,346.831013
19666,0.0,5.061728,12.214184,12.417973,11.910206,13.697330,12.821912,16.111721,24.176246,36.752931,...,209.485332,210.624613,219.306561,218.091264,216.021979,221.736501,224.068433,228.833831,234.600073,241.883299
23408,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,143.812050,158.600259,171.703563,180.088307,178.398513,176.559696,182.156654,194.170900,212.981763,226.740546


In [42]:
a = df_train_c.iloc[:, 35:45].std(axis=1)
b = df_train_c.iloc[:, 45:55].std(axis=1)

In [43]:
px.scatter(pd.concat([a, b], axis=1), x=0, y=1)

In [44]:
# Grand total per Z_MARCA group. ``numeric_only=True`` keeps string columns
# out of the aggregation: on recent pandas they would otherwise be
# concatenated and the row-wise sum that follows would fail.
df_train.groupby(['Z_MARCA']).sum(numeric_only=True).sum(axis=1)

Z_MARCA
0855cd73c3a9dc2c31a7c4dcaecfd53282238c6a457a5c771cbee045c0d9a521     50917
09c32757a9ca408c829114503523a5e6762dd1ba2b81067ad113837c1b30188b    129719
1d45ae99abcc02002be90eabecf61d0ce0613d1de5f0c37ddd7bbbd7e8198cf5      3151
22ea092e7643557ea91b74dea27589acac28a04fce976c577c422cd3ee1c9dc2      1549
285075a02b2679248a6b4636c3328bd3097626607c3e43c0c5498258fbfa9f29    611345
2aca20d3a48b13f1d2ee88e9746a4fb6d2846684084819272d6e5ff8809d62aa     10299
4b1acc0002585cda1245f0e07c6f58ff0e3b5a0e8d3187fa55d6e0ed43836c1d     11933
5132f94c2aebce767bd61d9e8f0d4f681d0809ca90cd2cd09be494f221367bb5    237625
53c759c773f2b832ddcc61534bdb919d44e03be212241e7dad49b286afa95340     39791
9fc7c2306f5afb4bfe47feefbd193f245633febbb1817113fdf0a768172285b5    575520
b870b7809747623def661ded7b0da0e52beddfc50235bbda65f1e8a34350ff7a      1777
c0dd56d5b73d72cbd9a3df3f70e0df98ce15417d0f8c3a678280beb763388a4f     11552
c25b14ec280222786bd605c2e1d072a98fc18612a2dfe094da1a269e436f4621      8110
df853f864c74fa85a

In [45]:
df_train.iloc[:, 45:55].stack().reset_index(drop=True)

0         0
1         0
2         0
3         0
4         0
         ..
471725    0
471726    0
471727    0
471728    0
471729    0
Length: 471730, dtype: int64

In [46]:
df_train.iloc[:, 45:55].stack().reset_index(drop=True).to_csv('entel_last.csv', index=False)

<a href='./entel_last.csv'>download</a>

In [47]:
df_train.iloc[:, 5: 45]

Unnamed: 0,SEMANA_01,SEMANA_02,SEMANA_03,SEMANA_04,SEMANA_05,SEMANA_06,SEMANA_07,SEMANA_08,SEMANA_09,SEMANA_10,...,SEMANA_31,SEMANA_32,SEMANA_33,SEMANA_34,SEMANA_35,SEMANA_36,SEMANA_37,SEMANA_38,SEMANA_39,SEMANA_40
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47168,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47169,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47170,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47171,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
# Number of columns in 5:45 with a strictly positive value, per row:
# non-positive entries are masked to NaN and the remaining ones counted
# (vectorised replacement for the element-wise stack/apply/unstack).
history_weeks = df_train.iloc[:, 5:45]
history_weeks.where(history_weeks > 0).count(axis=1)

0        1
1        1
2        1
3        1
4        1
        ..
47168    0
47169    0
47170    0
47171    0
47172    0
Length: 47173, dtype: int64

In [49]:
# Same per-row count of strictly positive columns in 5:45, obtained by
# summing the boolean mask directly instead of mapping each value to 0/1.
df_train.iloc[:, 5:45].gt(0).sum(axis=1)

0        1
1        1
2        1
3        1
4        1
        ..
47168    0
47169    0
47170    0
47171    0
47172    0
Length: 47173, dtype: int64

In [50]:
df_train.iloc[:, 45:55].sum(axis=1).value_counts(normalize=True)

0       0.594090
1       0.095584
2       0.056367
3       0.035062
4       0.026625
          ...   
415     0.000021
311     0.000021
301     0.000021
2003    0.000021
116     0.000021
Length: 329, dtype: float64

In [51]:
df_train_c.iloc[:, 5:45].ewm(5, axis=1).mean()

Unnamed: 0,SEMANA_01,SEMANA_02,SEMANA_03,SEMANA_04,SEMANA_05,SEMANA_06,SEMANA_07,SEMANA_08,SEMANA_09,SEMANA_10,...,SEMANA_31,SEMANA_32,SEMANA_33,SEMANA_34,SEMANA_35,SEMANA_36,SEMANA_37,SEMANA_38,SEMANA_39,SEMANA_40
0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.206733,0.165641,...,0.003030,0.002523,0.002102,0.001751,0.001458,0.001215,0.001012,0.000843,0.000703,0.000585
33840,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.177449,0.147787,0.123096,0.102538,0.085420,0.071163,0.059288,0.049397,0.041158,0.034293
33834,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.096790,0.080611,0.067143,0.055930,0.046592,0.038816,0.032339,0.026944,0.022450,0.018706
33810,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.096790,0.080611,0.067143,0.055930,0.046592,0.038816,0.032339,0.026944,0.022450,0.018706
33802,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.236169,0.196692,0.163830,0.136469,0.113686,0.094711,0.078907,0.065743,0.054777,0.045641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31401,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,75.080804,102.480828,121.279854,154.300296,191.145960,190.954697,186.123233,193.443200,206.046289,210.708413
3270,65.0,73.181818,120.582418,200.549925,336.935068,504.344204,507.732428,460.880842,613.887810,780.279733,...,241.339555,239.276928,231.378184,231.649037,232.709326,236.763491,258.495126,259.413505,260.178545,267.320313
19666,0.0,5.454545,13.582418,13.394933,12.448936,14.842319,13.260466,18.416088,31.560974,52.518980,...,263.168498,258.459973,283.611294,268.144657,251.592525,268.684551,270.405816,282.516712,297.943189,318.633401
23408,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,339.849911,362.441016,378.406436,376.501493,337.017732,303.133638,304.946830,334.484631,390.616355,420.033644


In [52]:
# Cap every value of columns 35:45 at ``x_base`` (vectorised clip), then take
# the exponentially weighted mean (com=10) along each row.
df_train_c.iloc[:, 35:45].clip(upper=x_base).ewm(10, axis=1).mean()

Unnamed: 0,SEMANA_31,SEMANA_32,SEMANA_33,SEMANA_34,SEMANA_35,SEMANA_36,SEMANA_37,SEMANA_38,SEMANA_39,SEMANA_40
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33840,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33834,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33810,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
31401,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
3270,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
19666,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
23408,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [53]:
# How many individual values in columns 35:45 exceed the cap (1) versus not (0).
# Uses ``x_base`` rather than a second hard-coded 100 so the count always
# matches the cap applied elsewhere in the notebook.
(df_train_c.iloc[:, 35:45].stack() > x_base).astype(int).value_counts(normalize=False)

0    471515
1       215
dtype: int64

In [54]:
df_train_c.iloc[:, 5:45].ewm(5, axis=1).mean()

Unnamed: 0,SEMANA_01,SEMANA_02,SEMANA_03,SEMANA_04,SEMANA_05,SEMANA_06,SEMANA_07,SEMANA_08,SEMANA_09,SEMANA_10,...,SEMANA_31,SEMANA_32,SEMANA_33,SEMANA_34,SEMANA_35,SEMANA_36,SEMANA_37,SEMANA_38,SEMANA_39,SEMANA_40
0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.206733,0.165641,...,0.003030,0.002523,0.002102,0.001751,0.001458,0.001215,0.001012,0.000843,0.000703,0.000585
33840,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.177449,0.147787,0.123096,0.102538,0.085420,0.071163,0.059288,0.049397,0.041158,0.034293
33834,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.096790,0.080611,0.067143,0.055930,0.046592,0.038816,0.032339,0.026944,0.022450,0.018706
33810,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.096790,0.080611,0.067143,0.055930,0.046592,0.038816,0.032339,0.026944,0.022450,0.018706
33802,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.236169,0.196692,0.163830,0.136469,0.113686,0.094711,0.078907,0.065743,0.054777,0.045641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31401,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,75.080804,102.480828,121.279854,154.300296,191.145960,190.954697,186.123233,193.443200,206.046289,210.708413
3270,65.0,73.181818,120.582418,200.549925,336.935068,504.344204,507.732428,460.880842,613.887810,780.279733,...,241.339555,239.276928,231.378184,231.649037,232.709326,236.763491,258.495126,259.413505,260.178545,267.320313
19666,0.0,5.454545,13.582418,13.394933,12.448936,14.842319,13.260466,18.416088,31.560974,52.518980,...,263.168498,258.459973,283.611294,268.144657,251.592525,268.684551,270.405816,282.516712,297.943189,318.633401
23408,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,339.849911,362.441016,378.406436,376.501493,337.017732,303.133638,304.946830,334.484631,390.616355,420.033644


In [55]:
# Column-to-column differences over 5:45 (first column filled with 0),
# flattened; the same series is placed in both output columns.
week_diffs = df_train_c.iloc[:, 5:45].diff(1, axis=1).fillna(0).stack()
pd.concat([week_diffs, week_diffs], axis=1).to_numpy()

array([[   0.,    0.],
       [   0.,    0.],
       [   0.,    0.],
       ...,
       [ -13.,  -13.],
       [ 352.,  352.],
       [-153., -153.]])

In [56]:
df_train_c.iloc[:, 5:45]

Unnamed: 0,SEMANA_01,SEMANA_02,SEMANA_03,SEMANA_04,SEMANA_05,SEMANA_06,SEMANA_07,SEMANA_08,SEMANA_09,SEMANA_10,...,SEMANA_31,SEMANA_32,SEMANA_33,SEMANA_34,SEMANA_35,SEMANA_36,SEMANA_37,SEMANA_38,SEMANA_39,SEMANA_40
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
33840,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33834,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33810,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33802,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31401,0,0,0,0,0,0,0,0,0,0,...,168,239,215,319,375,190,162,230,269,234
3270,65,80,193,369,690,1005,519,292,1201,1451,...,231,229,192,233,238,257,367,264,264,303
19666,0,10,26,13,10,22,8,37,82,137,...,336,235,409,191,169,354,279,343,375,422
23408,0,0,0,0,0,0,0,0,0,0,...,553,475,458,367,140,134,314,482,671,567


In [57]:
# Each value of columns 5:45 divided by the mean of the same column within
# its Z_PUNTO_VENTA group.
history_columns = df_train_c.iloc[:, 5:45].columns
point_of_sale_mean = (
    df_train_c
    .groupby(['Z_PUNTO_VENTA'])[history_columns]
    .transform('mean')
)
df_train_c.iloc[:, 5:45].stack() / point_of_sale_mean.stack()

0      SEMANA_01     0.000000
       SEMANA_02     0.000000
       SEMANA_03     0.000000
       SEMANA_04     0.000000
       SEMANA_05     0.000000
                      ...    
23286  SEMANA_36    16.770492
       SEMANA_37    30.523077
       SEMANA_38    28.727885
       SEMANA_39    49.886429
       SEMANA_40    41.596603
Length: 1886920, dtype: float64

In [58]:
df_train_c.groupby(['Z_PUNTO_VENTA'])[df_train_c.iloc[:, 5:45].columns].transform('mean')

Unnamed: 0,SEMANA_01,SEMANA_02,SEMANA_03,SEMANA_04,SEMANA_05,SEMANA_06,SEMANA_07,SEMANA_08,SEMANA_09,SEMANA_10,...,SEMANA_31,SEMANA_32,SEMANA_33,SEMANA_34,SEMANA_35,SEMANA_36,SEMANA_37,SEMANA_38,SEMANA_39,SEMANA_40
0,1.060606,1.163636,1.066667,1.042424,1.151515,1.224242,1.418182,1.496970,1.230303,1.169697,...,1.345455,1.187879,0.981818,0.987879,0.830303,0.848485,0.787879,0.975758,1.048485,0.872727
33840,0.443548,0.346774,0.508065,0.540323,0.467742,0.532258,0.854839,0.862903,1.153226,0.951613,...,0.854839,0.725806,0.596774,0.669355,0.556452,0.370968,0.395161,0.443548,0.540323,0.266129
33834,0.408696,0.321739,0.365217,0.434783,0.347826,0.400000,0.365217,0.417391,0.434783,0.530435,...,0.504348,0.495652,0.434783,0.417391,0.426087,0.278261,0.252174,0.226087,0.330435,0.260870
33810,1.687898,1.834395,2.006369,1.751592,1.426752,1.643312,1.668790,1.560510,1.464968,1.808917,...,1.445860,1.146497,1.286624,1.343949,1.305732,1.184713,1.152866,1.191083,1.292994,1.229299
33802,0.196721,0.147541,0.131148,0.098361,0.081967,0.049180,0.065574,0.196721,0.081967,0.131148,...,0.114754,0.081967,0.131148,0.114754,0.032787,0.065574,0.081967,0.081967,0.049180,0.049180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31401,10.754032,12.705645,12.774194,13.665323,11.870968,12.024194,9.088710,10.520161,14.096774,14.975806,...,17.745968,15.076613,12.911290,11.814516,10.286290,9.838710,9.435484,9.572581,12.568548,11.395161
3270,10.754032,12.705645,12.774194,13.665323,11.870968,12.024194,9.088710,10.520161,14.096774,14.975806,...,17.745968,15.076613,12.911290,11.814516,10.286290,9.838710,9.435484,9.572581,12.568548,11.395161
19666,10.754032,12.705645,12.774194,13.665323,11.870968,12.024194,9.088710,10.520161,14.096774,14.975806,...,17.745968,15.076613,12.911290,11.814516,10.286290,9.838710,9.435484,9.572581,12.568548,11.395161
23408,9.977477,10.585586,8.734234,9.797297,8.806306,11.463964,9.765766,10.418919,11.027027,12.855856,...,12.220721,10.513514,9.842342,10.671171,9.112613,8.608108,7.972973,9.909910,11.621622,10.797297


In [59]:
df_train_c.iloc[:, 5:45]# .shift(axis=1).fillna(0)

Unnamed: 0,SEMANA_01,SEMANA_02,SEMANA_03,SEMANA_04,SEMANA_05,SEMANA_06,SEMANA_07,SEMANA_08,SEMANA_09,SEMANA_10,...,SEMANA_31,SEMANA_32,SEMANA_33,SEMANA_34,SEMANA_35,SEMANA_36,SEMANA_37,SEMANA_38,SEMANA_39,SEMANA_40
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
33840,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33834,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33810,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33802,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31401,0,0,0,0,0,0,0,0,0,0,...,168,239,215,319,375,190,162,230,269,234
3270,65,80,193,369,690,1005,519,292,1201,1451,...,231,229,192,233,238,257,367,264,264,303
19666,0,10,26,13,10,22,8,37,82,137,...,336,235,409,191,169,354,279,343,375,422
23408,0,0,0,0,0,0,0,0,0,0,...,553,475,458,367,140,134,314,482,671,567
