In [73]:
import numpy as np
import pandas as pd

from warnings import filterwarnings

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.optimizers.schedules import ExponentialDecay

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler, normalize, LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold

filterwarnings('ignore')

In [74]:
# Kaggle input locations of the three CSV files this notebook reads.
CONFIG = dict(
    TRAIN_PATH='/kaggle/input/datathon-entel-2022-reto2/train.csv',
    TEST_PATH='/kaggle/input/datathon-entel-2022-reto2/test.csv',
    SAMPLE_SUBMISSION='/kaggle/input/datathon-entel-2022-reto2/test_sample.csv',
)

# Load each file into its own frame; df_train is the one every later cell slices.
df_train = pd.read_csv(CONFIG['TRAIN_PATH'])
df_test = pd.read_csv(CONFIG['TEST_PATH'])
df_sub = pd.read_csv(CONFIG['SAMPLE_SUBMISSION'])

In [75]:
# NOTE(review): `n` is not read by any code visible in this notebook; it looks like a
# label for the history length of the model trained next — confirm before relying on it.
n = 40
# Upper cap applied to the raw feature values inside `featuring` and to the training targets.
x_base = 100

In [76]:
def vol_col(x):
    """Return, per column of `x`, the sum of sqrt(log(value)) as a ``{column: total}`` dict.

    Cells whose natural log prints as ``-inf`` are replaced by 0 before the square root.
    Negative logs become NaN under the square root and are skipped by ``sum``.
    """
    log_values = np.log(x)
    long_form = log_values.stack()
    patched = long_form.apply(lambda value: 0 if str(value) == '-inf' else value)
    wide_form = patched.unstack(level=1)
    column_totals = np.sqrt(wide_form).sum()
    return column_totals.to_dict()

def vol(x):
    """Return, per column of `x`, the sum of sqrt(log(value)) as a Series (NaN totals -> 0).

    Cells whose natural log prints as ``-inf`` are replaced by 0 before the square root.
    Negative logs become NaN under the square root and are skipped by ``sum``.
    """
    log_values = np.log(x)
    long_form = log_values.stack()
    patched = long_form.apply(lambda value: 0 if str(value) == '-inf' else value)
    wide_form = patched.unstack(level=1)
    column_totals = np.sqrt(wide_form).sum()
    return column_totals.fillna(0)

In [77]:
def featuring(df, id_cols=None, cap=None):
    """Build the feature matrix for one window of value columns.

    Parameters
    ----------
    df : pandas.DataFrame
        The window of numeric columns to featurise, row-aligned (by index) with `id_cols`.
    id_cols : pandas.DataFrame, optional
        Identifier columns; must contain Z_PUNTO_VENTA, Z_MODELO, Z_GAMA, Z_MARCA and
        Z_DEPARTAMENTO. Defaults to the first five columns of the global ``df_train``.
    cap : number, optional
        Upper bound applied to the raw window values. Defaults to the global ``x_base``.

    Returns
    -------
    pandas.DataFrame
        Float frame with columns renumbered 0..k-1, in this order: capped window values,
        per-group max (5 keys), per-group sum (5 keys), two membership flags, five
        category frequencies, row max / sum / std / mean, "row sum > 0" flag, and the
        count of positive values per row.
    """
    if id_cols is None:
        id_cols = df_train.iloc[:, :5]
    if cap is None:
        cap = x_base

    n_ids = id_cols.shape[1]
    df = pd.concat([id_cols, df], axis=1)
    weeks = df.iloc[:, n_ids:]
    week_cols = weeks.columns

    # Group-level aggregates of every window column, broadcast back to each row.
    group_keys = ['Z_PUNTO_VENTA', 'Z_MODELO', 'Z_GAMA', 'Z_MARCA', 'Z_DEPARTAMENTO']
    group_max = [df.groupby(key)[week_cols].transform('max') for key in group_keys]
    group_sum = [df.groupby(key)[week_cols].transform('sum') for key in group_keys]

    # 0/1 flags for rows belonging to specific hard-coded identifiers.
    df_b_punto_venta = df['Z_PUNTO_VENTA'].isin(
        ['da45328ba820604eb99694768f2a430cd933d161601dcb8491b4a9b555232c59',
         'e1f2d2708f545ddc1d7266ba0cc5ccc88147b77fdf3450e68a974e93018ecf60']).astype(int)
    df_b_departameto = df['Z_DEPARTAMENTO'].isin(
        ['d6c21b948958417ca98b682a573eb8aa1084b292d32f760f253ef53da13e5589']).astype(int)

    # Frequency encoding: each category is replaced by its share of rows.
    # `map` is used instead of `replace(dict)`: it is a direct lookup (much faster on
    # many distinct categories) and always yields a float column.
    freq_keys = ['Z_MARCA', 'Z_GAMA', 'Z_MODELO', 'Z_DEPARTAMENTO', 'Z_PUNTO_VENTA']
    frequencies = [df[key].map(df[key].value_counts(normalize=True)) for key in freq_keys]

    # Row-wise statistics are computed on the uncapped values.
    df_max = weeks.max(axis=1)
    df_sum = weeks.sum(axis=1)
    df_std = weeks.std(axis=1)
    df_mean = weeks.mean(axis=1)

    df_total = (df_sum > 0).astype(int)      # 1 when the row total is positive
    df_count = (weeks > 0).sum(axis=1)       # number of positive values in the row

    # Vectorised cap of the raw values (replaces a per-cell stack/apply/unstack).
    features = weeks.clip(upper=cap)

    df_z = pd.concat(
        [features, *group_max, *group_sum,
         df_b_punto_venta, df_b_departameto,
         *frequencies,
         df_max, df_sum, df_std, df_mean,
         df_total, df_count],
        axis=1,
    )

    # Positional column labels make windows with different source column names line up.
    # astype(float) keeps the uniform float dtype the former double transpose produced.
    df_z.columns = range(df_z.shape[1])
    return df_z.astype(float)

In [78]:
# Training inputs: the 40 columns at positions 5-44; targets: the 10 columns that follow (45-54).
x_train = featuring(df_train.iloc[:, 5:45])
y_train = df_train.iloc[:, 45:55]

# Forecast inputs: the 40 columns at positions 15-54, i.e. the same window shifted by 10.
x_test = featuring(df_train.iloc[:, 15:55])

In [79]:
# Report the shapes of the feature matrices and targets, one per line.
print(
    f'X TRAIN SHAPE: {x_train.shape}',
    f'Y TRAIN SHAPE: {y_train.shape}',
    f'X TEST SHAPE: {x_test.shape}',
    sep='\n',
)

In [80]:
def data_sequence_to_models(train=None, test=None, threshold=0.95):
    """Drop highly correlated feature columns, then robust-scale train and test.

    Parameters
    ----------
    train, test : pandas.DataFrame, optional
        Feature frames sharing the same column labels. Default to the globals
        ``x_train`` and ``x_test``.
    threshold : float, optional
        A column is dropped when its absolute correlation with any *earlier* column
        of `train` exceeds this value. Defaults to 0.95.

    Returns
    -------
    tuple of numpy.ndarray
        ``(scaled_train, scaled_test)``; the scaler is fitted on `train` only.
    """
    if train is None:
        train = x_train
    if test is None:
        test = x_test

    correlation_matrix = train.corr().abs()

    # Strictly-lower triangle: row i is compared only with columns j < i, so the first
    # column of every correlated group is kept. NaN correlations compare as False.
    above = np.tril(correlation_matrix.to_numpy() > threshold, k=-1)
    correlated_features = correlation_matrix.columns[above.any(axis=1)]

    x_train_model = train.drop(columns=correlated_features)
    x_test_model = test.drop(columns=correlated_features)

    print(f'TRAIN SHAPE: {x_train_model.shape}')

    sc = RobustScaler()
    _x_train = sc.fit_transform(x_train_model)
    _x_test = sc.transform(x_test_model)

    return _x_train, _x_test

In [81]:
x_train_data, x_test_data = data_sequence_to_models()

# Cap the targets at x_base. np.minimum accepts both the DataFrame built earlier and the
# ndarray left by a previous run of this cell, so re-running the cell no longer fails.
y_train = np.minimum(np.asarray(y_train), x_base)

In [82]:
def MLP(l_extras_features):
    """Build and compile the tabular MLP regressor.

    Parameters
    ----------
    l_extras_features : int or sequence of int
        Number of input features (or an explicit input shape without the batch axis).

    Returns
    -------
    keras.Model
        Six Dense -> BatchNormalization -> Dropout(0.2) blocks (1024 down to 32 units)
        followed by a 10-unit ReLU output, compiled with Adam (lr 4e-4), MSE loss and
        an RMSE metric.
    """
    # Always hand Keras a tuple: newer Keras versions reject a bare int as a shape.
    if isinstance(l_extras_features, (int, np.integer)):
        input_shape = (int(l_extras_features),)
    else:
        input_shape = tuple(l_extras_features)

    tabular = keras.layers.Input(shape=input_shape)

    # Each block must consume the previous block's output. Feeding every Dense from
    # `tabular` leaves only the last (32-unit) block connected to the model output.
    out_tabular = tabular
    for n_hidden in [1024, 512, 256, 128, 64, 32]:
        out_tabular = keras.layers.Dense(n_hidden, activation='linear')(out_tabular)
        out_tabular = keras.layers.BatchNormalization()(out_tabular)
        out_tabular = keras.layers.Dropout(0.2)(out_tabular)

    # ReLU output keeps the ten predictions non-negative.
    out = keras.layers.Dense(10, activation='relu')(out_tabular)

    model = keras.Model(inputs = [tabular], outputs = out)

    mse = tf.keras.losses.MeanSquaredError()
    rmse = tf.keras.metrics.RootMeanSquaredError()
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0004), loss=mse, metrics=[rmse])

    return model

In [83]:
def training_MLP():
    """Train one MLP per fold of a shuffled 5-fold split and return the fitted models.

    Reads the module-level ``x_train_data`` and ``y_train`` arrays. Every fold gets a
    fresh model from ``MLP``, is trained with early stopping and learning-rate reduction
    (both watching the validation RMSE), and is scored with ``eval_model``.
    """
    EPOCH = 1000
    BATCH_SIZE = 512
    WATCHED_METRIC = 'val_root_mean_squared_error'

    splitter = KFold(n_splits=5, shuffle=True, random_state=2022)
    models = []

    for fold, (train_idx, val_idx) in enumerate(splitter.split(x_train_data, y_train)):

        print('-'*15, '>', f'Fold {fold+1}', '<', '-'*15)

        X_train = x_train_data[train_idx]
        X_valid = x_train_data[val_idx]
        Y_train = y_train[train_idx]
        Y_valid = y_train[val_idx]

        model = MLP(X_train.shape[-1])

        # Stop after 30 stagnant epochs (restoring the best weights); cut the learning
        # rate tenfold after 10 stagnant epochs, never going below 5e-7.
        fold_callbacks = [
            keras.callbacks.EarlyStopping(
                monitor=WATCHED_METRIC,
                min_delta=1e-05,
                patience=30,
                verbose=1,
                mode='min',
                restore_best_weights=True,
            ),
            keras.callbacks.ReduceLROnPlateau(
                monitor=WATCHED_METRIC,
                factor=0.1,
                patience=10,
                verbose=1,
                min_lr=5e-7,
                mode='min',
            ),
        ]

        model.fit(
            [X_train], Y_train,
            validation_data=([X_valid], Y_valid),
            epochs=EPOCH,
            batch_size=BATCH_SIZE,
            callbacks=fold_callbacks,
            verbose=1,
        )

        eval_model(model, [X_valid], Y_valid)
        models.append(model)

    return models

def eval_model(model, x_valid, y_valid):
    """Print and return the RMSE of `model`'s rounded predictions over all target cells.

    Parameters
    ----------
    model : object with a ``predict(x)`` method returning an array-like
    x_valid : input passed unchanged to ``model.predict``
    y_valid : array-like of true values, same shape as the predictions

    Returns
    -------
    float
        Root mean squared error over the flattened targets.

    Raises
    ------
    ValueError
        If predictions and targets do not contain the same number of values.
    """
    # Predictions are rounded to whole units before scoring.
    preds = np.round(np.asarray(model.predict(x_valid))).astype('int32').ravel()
    truth = np.asarray(y_valid, dtype=float).ravel()
    if preds.shape != truth.shape:
        raise ValueError(
            f'predictions ({preds.size} values) and targets ({truth.size} values) differ in size'
        )

    # Computed directly with numpy: the `squared=False` argument of sklearn's
    # mean_squared_error is not available in recent scikit-learn releases.
    rmse = float(np.sqrt(np.mean((truth - preds) ** 2)))
    print(f' RMSE --> {rmse}')
    return rmse

In [84]:
# Fit the fold models on the 40-column-window data currently held in x_train_data / y_train.
models = training_MLP()

In [85]:
# Collect the forecast of every fold model for the 40-column window.
preds_MLP_40 = []

for model in models:
    _pred = model.predict([x_test_data])
    preds_MLP_40.append(_pred)

# Element-wise mean over however many fold models exist (no hard-coded fold count).
pred_sub_MLP_40 = sum(preds_MLP_40) / len(preds_MLP_40)

# MODELO EN 20 SEMANAS

In [86]:
# NOTE(review): `n` is not read by any code visible in this notebook — presumably a label
# for the 20-column history window used below; confirm.
n = 20

In [87]:
# Three overlapping 20-column input windows (start positions 5, 15, 25), stacked row-wise.
x_train = pd.concat(
    [featuring(df_train.iloc[:, start:start + 20]) for start in (5, 15, 25)],
    axis=0,
).reset_index(drop=True)

# Matching targets: the 10 columns that immediately follow each input window.
y_train = pd.concat(
    [pd.DataFrame(df_train.iloc[:, start:start + 10].values) for start in (25, 35, 45)],
    axis=0,
).reset_index(drop=True)

# Forecast inputs: the 20 columns at positions 35-54.
x_test = featuring(df_train.iloc[:, 35:55])

In [88]:
# Report the shapes of the 20-column-window feature matrices and targets, one per line.
print(
    f'X TRAIN SHAPE: {x_train.shape}',
    f'Y TRAIN SHAPE: {y_train.shape}',
    f'X TEST SHAPE: {x_test.shape}',
    sep='\n',
)

In [89]:
x_train_data, x_test_data = data_sequence_to_models()

# Cap the targets at x_base. np.minimum accepts both the DataFrame built earlier and the
# ndarray left by a previous run of this cell, so re-running the cell no longer fails.
y_train = np.minimum(np.asarray(y_train), x_base)

In [90]:
# Shapes after correlation filtering and scaling, one per line.
print(
    f'X TRAIN SHAPE: {x_train_data.shape}',
    f'Y TRAIN SHAPE: {y_train.shape}',
    f'X TEST SHAPE: {x_test_data.shape}',
    sep='\n',
)

In [91]:
# Fit the fold models on the 20-column-window data currently held in x_train_data / y_train.
models_MLP_20 = training_MLP()

In [92]:
# Collect the forecast of every fold model for the 20-column window.
preds_MLP_20 = []

for model in models_MLP_20:
    _pred = model.predict([x_test_data])
    preds_MLP_20.append(_pred)

# Element-wise mean over however many fold models exist (no hard-coded fold count).
pred_sub_MLP_20 = sum(preds_MLP_20) / len(preds_MLP_20)

# MODELO BASADO EN 10 SEMANAS

In [93]:
# NOTE(review): `n` is not read by any code visible in this notebook — presumably a label
# for the 10-column history window used below; confirm.
n = 10

In [94]:
# Four consecutive 10-column input windows (start positions 5, 15, 25, 35), stacked row-wise.
x_train = pd.concat(
    [featuring(df_train.iloc[:, start:start + 10]) for start in (5, 15, 25, 35)],
    axis=0,
).reset_index(drop=True)

# Matching targets: the 10 columns that immediately follow each input window.
y_train = pd.concat(
    [pd.DataFrame(df_train.iloc[:, start:start + 10].values) for start in (15, 25, 35, 45)],
    axis=0,
).reset_index(drop=True)

# Forecast inputs: the 10 columns at positions 45-54.
x_test = featuring(df_train.iloc[:, 45:55])

In [95]:
# Report the shapes of the 10-column-window feature matrices and targets, one per line.
print(
    f'X TRAIN SHAPE: {x_train.shape}',
    f'Y TRAIN SHAPE: {y_train.shape}',
    f'X TEST SHAPE: {x_test.shape}',
    sep='\n',
)

In [96]:
x_train_data, x_test_data = data_sequence_to_models()

# Cap the targets at x_base. np.minimum accepts both the DataFrame built earlier and the
# ndarray left by a previous run of this cell, so re-running the cell no longer fails.
y_train = np.minimum(np.asarray(y_train), x_base)

In [97]:
# Fit the fold models on the 10-column-window data currently held in x_train_data / y_train.
models_MLP_10 = training_MLP()

In [98]:
# Collect the forecast of every fold model for the 10-column window.
preds_MLP_10 = []

for model in models_MLP_10:
    _pred = model.predict([x_test_data])
    preds_MLP_10.append(_pred)

# Element-wise mean over however many fold models exist (no hard-coded fold count).
pred_sub_MLP_10 = sum(preds_MLP_10) / len(preds_MLP_10)

# INTEGRACIÓN DE MODELOS

In [99]:
# Weighted blend of the three window models (40-, 20- and 10-column history); the weights sum to 1.
pred_sub = pred_sub_MLP_40 * 0.2 + pred_sub_MLP_20 * 0.5  + pred_sub_MLP_10 * 0.3

In [100]:
# Attach the blended predictions to the five identifier columns (row-aligned on the index)
# and label the ten prediction columns SEMANA_51 ... SEMANA_60.
df_submission = pd.merge(
    df_train.iloc[:, :5],
    pd.DataFrame(pred_sub),
    how='inner',
    left_index=True,
    right_index=True,
).rename(columns={position: f'SEMANA_{51 + position}' for position in range(10)})

# Row key: Z_MODELO | Z_PUNTO_VENTA | Z_GAMA.
df_submission['BASE_ID'] = (
    df_submission['Z_MODELO'].astype(str)
    + '|' + df_submission['Z_PUNTO_VENTA'].astype(str)
    + '|' + df_submission['Z_GAMA'].astype(str)
)

# Drop the five identifier columns, then reshape wide -> long: one row per (BASE_ID, week).
df_submission = (
    df_submission.iloc[:, 5:]
    .set_index('BASE_ID')
    .stack()
    .to_frame()
    .reset_index()
)

# Final ID = BASE_ID | week label; the helper column produced by stack() is then removed.
df_submission['BASE_ID'] = (
    df_submission['BASE_ID'].astype(str) + '|' + df_submission['level_1'].astype(str)
)
df_submission = df_submission.drop(['level_1'], axis=1)
df_submission.columns = ['ID', 'Demanda']

In [101]:
# Write the long-format submission (columns ID, Demanda) to the working directory, without the index.
df_submission.to_csv('entel_v1_mlp_2.csv', index=False)

In [102]:
# Sanity check: compare an extreme upper quantile of the submitted predictions with the
# same quantile of the capped targets last assigned to y_train (the 10-column-window set).
p = 0.9999
print(df_submission.Demanda.quantile(p))
print(pd.DataFrame(y_train).stack().quantile(p))
# 76.832
# NOTE(review): the figure above looks like a value noted from an earlier run; which of
# the two prints it refers to is not stated — confirm or remove.

<a href='./entel_v1_mlp_2.csv'>download</a>

In [103]:
# Share of submitted predictions at each rounded demand value.
np.round(df_submission.Demanda).value_counts(normalize=True)