In [13]:
import os

import pandas as pd
import numpy as np

from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

from tensorflow import keras

In [14]:
# Dataset locations when the notebook runs inside the Kaggle competition environment.
CONFIG_KAGGLE = {
    'TRAIN_PATH': '/kaggle/input/datathon-entel-2022-reto2/train.csv',
    'TEST_PATH': '/kaggle/input/datathon-entel-2022-reto2/test.csv',
    'SAMPLE_SUBMISSION': '/kaggle/input/datathon-entel-2022-reto2/test_sample.csv'
}

# Dataset locations for a local checkout (paths relative to the notebook).
CONFIG = {
    'TRAIN_PATH': '../data/train.csv',
    'TEST_PATH': '../data/test.csv',
    'SAMPLE_SUBMISSION': '../data/test_sample.csv'
}

# Use the Kaggle files when they are mounted; otherwise fall back to the local copies
# so the notebook also runs outside Kaggle. On Kaggle this resolves exactly as before.
ACTIVE_CONFIG = CONFIG_KAGGLE if os.path.exists(CONFIG_KAGGLE['TRAIN_PATH']) else CONFIG

df_train = pd.read_csv(ACTIVE_CONFIG['TRAIN_PATH'])
df_test = pd.read_csv(ACTIVE_CONFIG['TEST_PATH'])
df_sub = pd.read_csv(ACTIVE_CONFIG['SAMPLE_SUBMISSION'])

In [15]:
def featuring(df_train, df, x_base):
    """Build the model input matrix for one window of demand columns.

    The first five columns of ``df_train`` (expected to contain ``Z_PUNTO_VENTA``,
    ``Z_MODELO``, ``Z_GAMA``, ``Z_MARCA`` and ``Z_DEPARTAMENTO``) are placed next to
    ``df`` and used as grouping keys. ``df`` must share its index with ``df_train``.

    Column layout of the result, in order (``w`` = number of columns in ``df``):

    1. ``w`` columns: the window itself, capped at ``x_base``;
    2. ``5 * w`` columns: per-group maximum of every window column, grouped by
       punto de venta, modelo, gama, marca, departamento;
    3. ``5 * w`` columns: per-group sum, same key order;
    4. 2 columns: 0/1 flags for two specific points of sale and one department;
    5. 5 columns: relative frequency of marca, gama, modelo, departamento, punto de venta;
    6. 4 columns: row-wise max, sum, std and mean of the uncapped window;
    7. 2 columns: 0/1 flag "row sum is positive" and count of positive cells in the row.

    :param df_train: frame whose first five columns hold the key columns
    :param df: frame with the numeric window columns
    :param x_base: upper cap applied to the window values in group 1
    :return: float DataFrame with positional integer column labels ``0..N-1``
    """
    # The order of these lists fixes the column layout of the returned frame.
    group_keys = ['Z_PUNTO_VENTA', 'Z_MODELO', 'Z_GAMA', 'Z_MARCA', 'Z_DEPARTAMENTO']
    freq_keys = ['Z_MARCA', 'Z_GAMA', 'Z_MODELO', 'Z_DEPARTAMENTO', 'Z_PUNTO_VENTA']

    flagged_points = [
        'da45328ba820604eb99694768f2a430cd933d161601dcb8491b4a9b555232c59',
        'e1f2d2708f545ddc1d7266ba0cc5ccc88147b77fdf3450e68a974e93018ecf60',
    ]
    flagged_departments = [
        'd6c21b948958417ca98b682a573eb8aa1084b292d32f760f253ef53da13e5589',
    ]

    df = pd.concat([df_train.iloc[:, :5], df], axis=1)
    weeks = df.iloc[:, 5:]
    week_cols = list(weeks.columns)

    # Group-level max first, then group-level sum, each broadcast back to every row.
    group_blocks = [
        df.groupby(key)[week_cols].transform(agg)
        for agg in ('max', 'sum')
        for key in group_keys
    ]

    point_flag = df['Z_PUNTO_VENTA'].isin(flagged_points).astype(int)
    department_flag = df['Z_DEPARTAMENTO'].isin(flagged_departments).astype(int)

    # Frequency encoding: `map` is a direct lookup and always yields a float column,
    # unlike `replace` with a full value->frequency dict.
    freq_blocks = [df[key].map(df[key].value_counts(normalize=True)) for key in freq_keys]

    row_max = weeks.max(axis=1)
    row_sum = weeks.sum(axis=1)
    row_std = weeks.std(axis=1)
    row_mean = weeks.mean(axis=1)

    has_sales = (row_sum > 0).astype(int)
    positive_weeks = (weeks > 0).sum(axis=1)

    # Cap extreme values; missing cells stay missing regardless of pandas version.
    capped = weeks.clip(upper=x_base)

    df_z = pd.concat(
        [
            capped,
            *group_blocks,
            point_flag,
            department_flag,
            *freq_blocks,
            row_max,
            row_sum,
            row_std,
            row_mean,
            has_sales,
            positive_weeks,
        ],
        axis=1,
    )

    # Downstream code addresses columns by position, so replace the (duplicated)
    # labels with 0..N-1 and expose a single float dtype.
    df_z.columns = range(df_z.shape[1])

    return df_z.astype(float)

In [16]:
def data_sequence_to_models(x_train, x_test, n):
    """Drop highly correlated columns and robust-scale the train and test matrices.

    Correlations are computed on the training frame only, over the columns from
    position ``n`` onwards, so the first ``n`` columns are never dropped. A column is
    removed when its absolute correlation with any earlier column of that slice
    exceeds 0.95. The same columns are removed from ``x_test``. ``RobustScaler`` is
    fitted on the reduced training frame and applied to both.

    :param x_train: training feature DataFrame
    :param x_test: test feature DataFrame with the same columns as ``x_train``
    :param n: number of leading columns excluded from the correlation filter
    :return: tuple ``(scaled_train, scaled_test)`` as returned by the scaler
    """
    correlation_matrix = x_train.iloc[:, n:].corr()

    # Strict lower triangle: row i only looks at columns j < i, which is the same
    # pairing as a nested "for i / for j in range(i)" scan, without per-cell lookups.
    too_correlated = np.tril(np.abs(correlation_matrix.to_numpy()) > 0.95, k=-1)
    correlated_features = correlation_matrix.columns[too_correlated.any(axis=1)]

    x_train_model = x_train.drop(labels=correlated_features, axis=1)
    x_test_model = x_test.drop(labels=correlated_features, axis=1)

    print(f'TRAIN SHAPE: {x_train_model.shape}')

    sc = RobustScaler()
    _x_train = sc.fit_transform(x_train_model)
    _x_test = sc.transform(x_test_model)

    return _x_train, _x_test


In [17]:
def eval_model(model, x_valid, y_valid):
    """Print and return the RMSE of the rounded predictions on a validation fold.

    Predictions are rounded to the nearest integer before scoring. Predictions and
    targets are both flattened to one long series so every (row, column) cell
    counts equally.

    :param model: fitted model exposing ``predict``
    :param x_valid: validation inputs in the form the model expects
    :param y_valid: 2-D array-like of validation targets
    :return: the RMSE value that was printed
    """
    preds = pd.DataFrame(np.round(model.predict(x_valid)).astype('int32')).stack().reset_index(drop=True)
    y_valid = pd.DataFrame(y_valid).stack().reset_index(drop=True)

    # Take the square root explicitly: the `squared=False` switch of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_valid, preds))
    print(f' RMSE --> {rmse}')

    return rmse

In [18]:
# Upper bound on epochs per fold; passed as `epochs` to model.fit() in training(),
# where early stopping can end a fold sooner.
EPOCH = 1000
# Mini-batch size; passed as `batch_size` to model.fit() in training().
BATCH_SIZE = 512


def data(x_train_data, y_train, t_idx, v_idx, model_type, n=0):
    """Split one fold and build a fresh model of the requested type.

    :param x_train_data: 2-D numpy array of inputs
    :param y_train: numpy array of targets, row-aligned with ``x_train_data``
    :param t_idx: row indices of the training part of the fold
    :param v_idx: row indices of the validation part of the fold
    :param model_type: ``'mlp'`` or ``'lstm'``
    :param n: for ``'lstm'``, number of leading columns fed to the recurrent branch
        (reshaped to ``(rows, n, 1)``); the remaining columns feed the tabular branch.
        Ignored for ``'mlp'``.
    :return: ``(train_inputs, valid_inputs, y_train_fold, y_valid_fold, model)`` where
        the input entries are lists of arrays, one per model input
    :raises ValueError: if ``model_type`` is not supported
    """
    # Fail here with a clear message instead of returning None, which the caller
    # would only notice as an unpacking error.
    if model_type not in ('mlp', 'lstm'):
        raise ValueError(f"Unsupported model_type {model_type!r}; expected 'mlp' or 'lstm'")

    _x_train, _x_valid = x_train_data[t_idx], x_train_data[v_idx]
    _y_train, _y_valid = y_train[t_idx], y_train[v_idx]

    if model_type == 'mlp':
        model = mlp(_x_train.shape[-1])

        return [_x_train], [_x_valid], _y_train, _y_valid, model

    # 'lstm': leading n columns become a (rows, n, 1) sequence, the rest stay tabular.
    x_t_features, x_v_features = _x_train[:, :n], _x_valid[:, :n]
    x_t_features = np.reshape(x_t_features, (x_t_features.shape[0], x_t_features.shape[1], 1))
    x_v_features = np.reshape(x_v_features, (x_v_features.shape[0], x_v_features.shape[1], 1))

    x_t_extras, x_v_extras = _x_train[:, n:], _x_valid[:, n:]

    l_fet = x_t_features.shape[-2:]
    l_ext = x_t_extras.shape[-1]

    model = lstm(l_fet, l_ext)

    return [x_t_features, x_t_extras], [x_v_features, x_v_extras], _y_train, _y_valid, model


def training(model_type, x_train_data, y_train, n):
    """
    Fit one model per stratified fold and return all fitted models.

    Folds are stratified on the per-row target total, capped at 15.

    :param model_type: str forwarded to ``data`` to select the architecture
    :param x_train_data: numpy array of inputs
    :param y_train: numpy array of targets (2-D; summed along axis 1 for stratification)
    :param n: number of leading columns used as the sequence input, forwarded to ``data``
    :return: list of fitted tensorflow models, one per fold
    """

    def _cap_total(total):
        return total if total < 15 else 15

    strata = pd.Series(y_train.sum(axis=1)).apply(_cap_total).values
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)

    fitted_models = []

    for fold_no, (train_idx, val_idx) in enumerate(splitter.split(x_train_data, strata), start=1):
        print('-' * 15, '>', f'Fold {fold_no}', '<', '-' * 15)

        fold_x, fold_x_val, fold_y, fold_y_val, model = data(
            x_train_data, y_train, train_idx, val_idx, model_type, n
        )

        monitored = 'val_root_mean_squared_error'
        fold_callbacks = [
            # Stop once the validation RMSE stalls and roll back to the best weights.
            keras.callbacks.EarlyStopping(
                monitor=monitored,
                min_delta=1e-05,
                patience=30,
                verbose=1,
                mode='min',
                restore_best_weights=True,
            ),
            # Shrink the learning rate tenfold after 10 epochs without improvement.
            keras.callbacks.ReduceLROnPlateau(
                monitor=monitored,
                factor=0.1,
                patience=10,
                verbose=1,
                min_lr=5e-7,
                mode='min',
            ),
        ]

        model.fit(
            fold_x,
            fold_y,
            validation_data=(fold_x_val, fold_y_val),
            epochs=EPOCH,
            batch_size=BATCH_SIZE,
            callbacks=fold_callbacks,
            verbose=1,
        )

        eval_model(model, fold_x_val, fold_y_val)

        fitted_models.append(model)

    return fitted_models


In [19]:
def lstm(l_features, l_extras_features):
    """Build and compile the two-branch (sequence + tabular) Keras model.

    The sequence branch stacks three LSTM layers followed by two linear Dense layers
    ending in 32 units. The tabular branch is a 512-256-128-64-32 Dense stack with
    batch normalisation and dropout. Both branches end in 32 units so they can be
    combined element-wise with ``Multiply`` before the 10-unit ReLU output layer.

    :param l_features: shape tuple of one sequence sample, e.g. ``(timesteps, 1)``
    :param l_extras_features: number of tabular features (an int) or a shape tuple
    :return: compiled ``keras.Model`` with inputs ``[sequence, tabular]``, MSE loss
        and an RMSE metric
    """
    # Always hand Keras a tuple shape; a bare int is not accepted by every Keras version.
    if isinstance(l_extras_features, int):
        tabular_shape = (l_extras_features,)
    else:
        tabular_shape = tuple(l_extras_features)

    features = keras.layers.Input(shape=l_features)
    tabular = keras.layers.Input(shape=tabular_shape)

    out_features = keras.layers.LSTM(250, return_sequences=True)(features)
    out_features = keras.layers.Dropout(0.2)(out_features)
    out_features = keras.layers.LSTM(150, return_sequences=True)(out_features)
    out_features = keras.layers.Dropout(0.2)(out_features)
    # The last LSTM returns a single vector per sample, so no Flatten is needed.
    out_features = keras.layers.LSTM(100)(out_features)

    out_features = keras.layers.Dense(50, activation='linear')(out_features)
    out_features = keras.layers.Dropout(0.2)(out_features)
    out_features = keras.layers.Dense(32, activation='linear')(out_features)

    # Chain the hidden layers: each one must consume the previous layer's output.
    # Feeding `tabular` to every layer would leave only the last one connected.
    out_tabular = tabular
    for n_hidden in [512, 256, 128, 64, 32]:
        out_tabular = keras.layers.Dense(n_hidden, activation='relu')(out_tabular)
        out_tabular = keras.layers.BatchNormalization()(out_tabular)
        out_tabular = keras.layers.Dropout(0.2)(out_tabular)

    out = keras.layers.Multiply()([out_features, out_tabular])
    out = keras.layers.Dense(10, activation='relu')(out)

    model = keras.Model(inputs=[features, tabular], outputs=out)

    mse = keras.losses.MeanSquaredError()
    rmse = keras.metrics.RootMeanSquaredError()
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0004), loss=mse, metrics=[rmse])

    return model


def mlp(l_extras_features):
    """Build and compile the tabular-only Keras model.

    A 1024-512-256-128-64-32 Dense stack, each layer followed by batch normalisation
    and dropout, feeds a 10-unit ReLU output layer.

    :param l_extras_features: number of input features (an int) or a shape tuple
    :return: compiled ``keras.Model`` with MSE loss and an RMSE metric
    """
    # Always hand Keras a tuple shape; a bare int is not accepted by every Keras version.
    if isinstance(l_extras_features, int):
        tabular_shape = (l_extras_features,)
    else:
        tabular_shape = tuple(l_extras_features)

    tabular = keras.layers.Input(shape=tabular_shape)

    # Chain the hidden layers: each one must consume the previous layer's output.
    # Feeding `tabular` to every layer would leave only the last one connected.
    # NOTE(review): hidden activations are linear, so the stack stays linear apart
    # from batch norm/dropout — confirm this is intended.
    out_tabular = tabular
    for n_hidden in [1024, 512, 256, 128, 64, 32]:
        out_tabular = keras.layers.Dense(n_hidden, activation='linear')(out_tabular)
        out_tabular = keras.layers.BatchNormalization()(out_tabular)
        out_tabular = keras.layers.Dropout(0.2)(out_tabular)

    out = keras.layers.Dense(10, activation='relu')(out_tabular)

    model = keras.Model(inputs=[tabular], outputs=out)

    mse = keras.losses.MeanSquaredError()
    rmse = keras.metrics.RootMeanSquaredError()
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0004), loss=mse, metrics=[rmse])

    return model


In [20]:
def lstm_test_csv(x_test, models, n, name):
    """Average the predictions of the fold models and write them via ``result_to_csv``.

    :param x_test: 2-D numpy array; the first ``n`` columns are reshaped to
        ``(rows, n, 1)`` for the sequence input, the rest form the tabular input
    :param models: non-empty list of fitted two-input models
    :param n: number of leading sequence columns
    :param name: output name forwarded to ``result_to_csv``
    :raises ValueError: if ``models`` is empty
    """
    if not models:
        raise ValueError('models must contain at least one fitted model')

    x_test_features = x_test[:, :n]
    x_test_features = np.reshape(x_test_features, (x_test_features.shape[0], x_test_features.shape[1], 1))

    x_test_extras = x_test[:, n:]

    predictions = [model.predict([x_test_features, x_test_extras]) for model in models]

    # Plain mean over however many fold models were trained, not a fixed five.
    sub_predictions = sum(predictions) / len(predictions)

    result_to_csv(sub_predictions, name)


def mlp_test_csv(x_test, models, n, name):
    """Average the predictions of the fold models and write them via ``result_to_csv``.

    :param x_test: numpy array of test inputs
    :param models: non-empty list of fitted single-input models
    :param n: unused; kept so the signature mirrors ``lstm_test_csv``
    :param name: output name forwarded to ``result_to_csv``
    :raises ValueError: if ``models`` is empty
    """
    if not models:
        raise ValueError('models must contain at least one fitted model')

    predictions = [model.predict([x_test]) for model in models]

    # Plain mean over however many fold models were trained, not a fixed five.
    sub_predictions = sum(predictions) / len(predictions)

    result_to_csv(sub_predictions, name)


def result_to_csv(predictions, name):
    """Write predictions as a long-format ``ID,Demanda`` CSV named ``<name>.csv``.

    Predictions are joined by index to the first five columns of the global
    ``df_train``. Prediction columns 0..9 are labelled ``SEMANA_51``..``SEMANA_60``.
    Each output row has the ID ``Z_MODELO|Z_PUNTO_VENTA|Z_GAMA|SEMANA_xx``.

    :param predictions: 2-D array-like, one row per ``df_train`` row
    :param name: output path without the ``.csv`` extension
    """
    week_labels = {position: f'SEMANA_{51 + position}' for position in range(10)}

    wide = pd.merge(
        df_train.iloc[:, :5],
        pd.DataFrame(predictions),
        how='inner',
        left_index=True,
        right_index=True,
    ).rename(columns=week_labels)

    id_parts = [wide[col].astype(str) for col in ('Z_MODELO', 'Z_PUNTO_VENTA', 'Z_GAMA')]
    wide['BASE_ID'] = id_parts[0] + '|' + id_parts[1] + '|' + id_parts[2]

    # Drop the five key columns, then go from one column per week to one row per week.
    long = wide.iloc[:, 5:].set_index('BASE_ID').stack().to_frame().reset_index()
    long['BASE_ID'] = long['BASE_ID'].astype(str) + '|' + long['level_1'].astype(str)

    long = long.drop(['level_1'], axis=1)
    long.columns = ['ID', 'Demanda']

    long.to_csv(f'{name}.csv', index=False)

# Modelos LSTM

In [21]:

def model_lstm_based_40():
    """Train the LSTM models on a single 40-column window and write ``lstm_40.csv``.

    Training inputs are ``df_train`` columns 5:45 and the targets are columns 45:55.
    The prediction inputs are columns 15:55. Targets are capped at ``x_base``.
    """
    n, x_base = 40, 100

    def _cap(value):
        return value if value < x_base else x_base

    x_train = featuring(df_train, df_train.iloc[:, 5:45], x_base)
    x_test = featuring(df_train, df_train.iloc[:, 15:55], x_base)

    x_train_data, x_test_data = data_sequence_to_models(x_train, x_test, n)

    targets = df_train.iloc[:, 45:55]
    y_train = targets.stack().apply(_cap).unstack(level=1).values

    fold_models = training('lstm', x_train_data, y_train, n)

    lstm_test_csv(x_test_data, fold_models, n, 'lstm_40')


def model_lstm_based_20():
    """Train the LSTM models on three stacked 20-column windows and write ``lstm_20.csv``.

    Windows start at ``df_train`` columns 5, 15 and 25. Each is paired with the
    10 columns that follow it as targets. The prediction inputs are columns 35:55.
    Targets are capped at ``x_base``.
    """
    n, x_base = 20, 100
    window_starts = (5, 15, 25)

    def _cap(value):
        return value if value < x_base else x_base

    feature_frames = [
        featuring(df_train, df_train.iloc[:, start:start + 20], x_base)
        for start in window_starts
    ]
    target_frames = [
        pd.DataFrame(df_train.iloc[:, start + 20:start + 30].values)
        for start in window_starts
    ]

    x_train = pd.concat(feature_frames, axis=0).reset_index(drop=True)
    y_train = pd.concat(target_frames, axis=0).reset_index(drop=True)

    x_test = featuring(df_train, df_train.iloc[:, 35:55], x_base)

    x_train_data, x_test_data = data_sequence_to_models(x_train, x_test, n)

    y_train = y_train.stack().apply(_cap).unstack(level=1).values

    fold_models = training('lstm', x_train_data, y_train, n)

    lstm_test_csv(x_test_data, fold_models, n, 'lstm_20')
    
def model_lstm_based_10():
    """Train the LSTM models on four stacked 10-column windows and write ``lstm_10.csv``.

    Windows start at ``df_train`` columns 5, 15, 25 and 35. Each is paired with the
    10 columns that follow it as targets. The prediction inputs are columns 45:55.
    Targets are capped at ``x_base``.
    """
    n, x_base = 10, 100
    window_starts = (5, 15, 25, 35)

    def _cap(value):
        return value if value < x_base else x_base

    feature_frames = [
        featuring(df_train, df_train.iloc[:, start:start + 10], x_base)
        for start in window_starts
    ]
    target_frames = [
        pd.DataFrame(df_train.iloc[:, start + 10:start + 20].values)
        for start in window_starts
    ]

    x_train = pd.concat(feature_frames, axis=0).reset_index(drop=True)
    y_train = pd.concat(target_frames, axis=0).reset_index(drop=True)

    x_test = featuring(df_train, df_train.iloc[:, 45:55], x_base)

    x_train_data, x_test_data = data_sequence_to_models(x_train, x_test, n)

    y_train = y_train.stack().apply(_cap).unstack(level=1).values

    fold_models = training('lstm', x_train_data, y_train, n)

    lstm_test_csv(x_test_data, fold_models, n, 'lstm_10')

In [22]:
# Train the three LSTM variants one after another; each call writes its own CSV.
for _run_variant in (model_lstm_based_40, model_lstm_based_20, model_lstm_based_10):
    _run_variant()

# Modelos MLP

In [23]:
def model_mlp_based_40():
    """Train the MLP models on three stacked 20-column windows and write ``mlp_40.csv``.

    Windows start at ``df_train`` columns 5, 15 and 25. Each is paired with the
    10 columns that follow it as targets. The prediction inputs are columns 35:55.
    Targets are capped at ``x_base``.

    NOTE(review): despite the ``_40`` suffix, this variant uses 20-column windows
    (the same slices as ``model_lstm_based_20``) — confirm the naming is intended,
    because the blending weights are assigned per output file name.
    """
    n, x_base = 0, 100
    window_starts = (5, 15, 25)

    def _cap(value):
        return value if value < x_base else x_base

    feature_frames = [
        featuring(df_train, df_train.iloc[:, start:start + 20], x_base)
        for start in window_starts
    ]
    target_frames = [
        pd.DataFrame(df_train.iloc[:, start + 20:start + 30].values)
        for start in window_starts
    ]

    x_train = pd.concat(feature_frames, axis=0).reset_index(drop=True)
    y_train = pd.concat(target_frames, axis=0).reset_index(drop=True)

    x_test = featuring(df_train, df_train.iloc[:, 35:55], x_base)

    x_train_data, x_test_data = data_sequence_to_models(x_train, x_test, n)

    y_train = y_train.stack().apply(_cap).unstack(level=1).values

    fold_models = training('mlp', x_train_data, y_train, n)

    mlp_test_csv(x_test_data, fold_models, n, 'mlp_40')


def model_mlp_based_20():
    """Train the MLP models on four stacked 10-column windows and write ``mlp_20.csv``.

    Windows start at ``df_train`` columns 5, 15, 25 and 35. Each is paired with the
    10 columns that follow it as targets. The prediction inputs are columns 45:55.
    Targets are capped at ``x_base``.

    NOTE(review): despite the ``_20`` suffix, this variant uses 10-column windows
    (the same slices as ``model_lstm_based_10``) — confirm the naming is intended,
    because the blending weights are assigned per output file name.
    """
    n, x_base = 0, 100
    window_starts = (5, 15, 25, 35)

    def _cap(value):
        return value if value < x_base else x_base

    feature_frames = [
        featuring(df_train, df_train.iloc[:, start:start + 10], x_base)
        for start in window_starts
    ]
    target_frames = [
        pd.DataFrame(df_train.iloc[:, start + 10:start + 20].values)
        for start in window_starts
    ]

    x_train = pd.concat(feature_frames, axis=0).reset_index(drop=True)
    y_train = pd.concat(target_frames, axis=0).reset_index(drop=True)

    x_test = featuring(df_train, df_train.iloc[:, 45:55], x_base)

    x_train_data, x_test_data = data_sequence_to_models(x_train, x_test, n)

    y_train = y_train.stack().apply(_cap).unstack(level=1).values

    fold_models = training('mlp', x_train_data, y_train, n)

    mlp_test_csv(x_test_data, fold_models, n, 'mlp_20')


def model_mlp_based_10():
    """Train the MLP models on a single 40-column window and write ``mlp_10.csv``.

    Training inputs are ``df_train`` columns 5:45 and the targets are columns 45:55.
    The prediction inputs are columns 15:55. Targets are capped at ``x_base``.

    NOTE(review): despite the ``_10`` suffix, this variant uses the 40-column window
    (the same slices as ``model_lstm_based_40``) — confirm the naming is intended,
    because the blending weights are assigned per output file name.
    """
    n, x_base = 0, 100

    def _cap(value):
        return value if value < x_base else x_base

    x_train = featuring(df_train, df_train.iloc[:, 5:45], x_base)
    x_test = featuring(df_train, df_train.iloc[:, 15:55], x_base)

    x_train_data, x_test_data = data_sequence_to_models(x_train, x_test, n)

    targets = df_train.iloc[:, 45:55]
    y_train = targets.stack().apply(_cap).unstack(level=1).values

    fold_models = training('mlp', x_train_data, y_train, n)

    mlp_test_csv(x_test_data, fold_models, n, 'mlp_10')

In [24]:
# Train the three MLP variants one after another; each call writes its own CSV.
for _run_variant in (model_mlp_based_40, model_mlp_based_20, model_mlp_based_10):
    _run_variant()

# Integración

In [None]:
# Naive baseline: the last ten observed columns, flattened row by row into one
# 'Demanda' column. Flattening the array (instead of stack()) keeps exactly
# rows x 10 entries even when a cell is missing, so the file stays positionally
# aligned with the model prediction files it is blended with.
last_observed = df_train.iloc[:, 45:55]
pd.DataFrame({'Demanda': last_observed.to_numpy().ravel()}).to_csv('entel_last.csv', index=False)

In [None]:
# Weighted blend of the three LSTM submissions (weights sum to 1).
r1 = pd.read_csv('lstm_40.csv')
r2 = pd.read_csv('lstm_20.csv')
r3 = pd.read_csv('lstm_10.csv')

# The blend is positional, so refuse to combine files whose rows do not line up.
assert r1['ID'].equals(r2['ID']) and r1['ID'].equals(r3['ID']), \
    'LSTM prediction files are not row-aligned'

r1['Demanda'] = r1['Demanda'] * 0.30 + r2['Demanda'] * 0.50 + r3['Demanda'] * 0.2

r1.to_csv('lstm_final.csv', index=False)

In [None]:
# Weighted blend of the three MLP submissions (weights sum to 1).
r1 = pd.read_csv('mlp_40.csv')
r2 = pd.read_csv('mlp_20.csv')
r3 = pd.read_csv('mlp_10.csv')

# The blend is positional, so refuse to combine files whose rows do not line up.
assert r1['ID'].equals(r2['ID']) and r1['ID'].equals(r3['ID']), \
    'MLP prediction files are not row-aligned'

r1['Demanda'] = r1['Demanda'] * 0.30 + r2['Demanda'] * 0.50 + r3['Demanda'] * 0.2

r1.to_csv('mlp_final.csv', index=False)

In [None]:
# Final submission: LSTM blend, MLP blend and the last-observed baseline,
# combined with weights 0.5 / 0.3 / 0.2 and rounded to whole units.
r1 = pd.read_csv('lstm_final.csv')
r2 = pd.read_csv('mlp_final.csv')
r3 = pd.read_csv('entel_last.csv')

# The blend is positional. The baseline file has no ID column, so only its
# length can be checked.
assert r1['ID'].equals(r2['ID']), 'LSTM and MLP blends are not row-aligned'
assert len(r3) == len(r1), 'baseline file does not have one row per submission ID'

r1['Demanda'] = np.round(r1['Demanda'] * 0.50 + \
                         r2['Demanda'] * 0.30 + \
                         r3['Demanda'] * 0.20)

r1.to_csv('entel_final.csv', index=False)
