In [1]:
import numpy as np
import pandas as pd

from warnings import filterwarnings

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.optimizers.schedules import ExponentialDecay

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler, normalize, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GroupKFold, KFold

filterwarnings('ignore')

In [2]:
# Locations of the competition files inside the Kaggle input directory.
CONFIG = dict(
    TRAIN_PATH='/kaggle/input/datathon-entel-2022-reto2/train.csv',
    TEST_PATH='/kaggle/input/datathon-entel-2022-reto2/test.csv',
    SAMPLE_SUBMISSION='/kaggle/input/datathon-entel-2022-reto2/test_sample.csv',
)

In [3]:
# Read the three CSV files named in CONFIG, in the order train, test,
# sample submission.
df_train, df_test, df_sub = (
    pd.read_csv(CONFIG[path_key])
    for path_key in ('TRAIN_PATH', 'TEST_PATH', 'SAMPLE_SUBMISSION')
)

In [4]:
# Preview the first five training rows.
df_train.head(n=5)

Unnamed: 0,Z_MARCA,Z_GAMA,Z_MODELO,Z_DEPARTAMENTO,Z_PUNTO_VENTA,SEMANA_01,SEMANA_02,SEMANA_03,SEMANA_04,SEMANA_05,...,SEMANA_41,SEMANA_42,SEMANA_43,SEMANA_44,SEMANA_45,SEMANA_46,SEMANA_47,SEMANA_48,SEMANA_49,SEMANA_50
0,f223faa96f22916294922b171a2696d868fd1f9129302e...,de88c121a82a06352bf1aaceba20578356408a334ba046...,f0465138ce3c092d78c1e33657fe604564d40cdc8cb196...,591c0a0133cb5fcd00af7bbf046f094256901239749fb7...,d2c888e1a77f2eb0732555cf018c3ca71bbcb32c73778c...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5132f94c2aebce767bd61d9e8f0d4f681d0809ca90cd2c...,76df0c6db32d4e04e0ef6a3a6a1e1686677e34308d9435...,85ac1d5351fa6c551dcaf4e9440939949f59fd6986c5ee...,d6c21b948958417ca98b682a573eb8aa1084b292d32f76...,99af5fbe4f1ce1a2d5c05d8d50543bb993dd621f259c8e...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,f223faa96f22916294922b171a2696d868fd1f9129302e...,de88c121a82a06352bf1aaceba20578356408a334ba046...,ab82a1d82b6fbd27ffe90900dffa4e8018745ef082fdf2...,d6c21b948958417ca98b682a573eb8aa1084b292d32f76...,4d9927f9d9ebe9b3742b20cf87b37b72c526f39d160289...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,285075a02b2679248a6b4636c3328bd3097626607c3e43...,8563abec343968034b0624650aed7254081e9e39e6b32e...,3f7d19feb71e55fc12b796a4cda0fbcec00511a039e758...,d6c21b948958417ca98b682a573eb8aa1084b292d32f76...,2fc9e521d966b9a311c1d3fc70abafa98dd0f37a51c71a...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,df853f864c74fa85acd3a25cd0afed68f1f1d0ab249e17...,8563abec343968034b0624650aed7254081e9e39e6b32e...,dbd49c8cda7f4bbbfc2a8b337b5aa79dc8067b46ca1bf5...,d6c21b948958417ca98b682a573eb8aa1084b292d32f76...,729be1d813198ffecf16f8c581e474cd58ee5aa7d11cdf...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### En lugar de solo decir que el valor es mayor a 100, podemos crear una nueva variable que indique si es mayor o menor a 100, para darle mayor fuerza a los datos

In [16]:
class ENTEL_DATASET:
    """Turn the wide weekly-sales frame into model-ready training and test arrays.

    The frame is sliced purely by column position (see the class constants), so
    it is assumed -- TODO confirm against the CSV -- to hold the five ``Z_*``
    categorical columns first, followed by the weekly ``SEMANA_*`` counts.

    Attributes:
        df: the source frame; it is never modified by this class.
        x_train, x_test: capped input windows as DataFrames (kept unscaled).
        y_train: capped targets (the last ``HORIZON`` columns) as a NumPy array.
        scaler, last10_scaler: scalers fitted by ``data_train`` and reused by
            ``data_test``.
        x_extras: standardized tabular features produced by ``extras``.
    """

    OUTLIER_CAP = 100             # weekly values above this are clipped to it
    TRAIN_WINDOW = slice(25, 45)  # column positions of the training input window
    TEST_START = 35               # test input window: this position to the end
    HORIZON = 10                  # number of trailing columns used as targets
    LAST_N = 10                   # trailing weeks of a window summarized as extras
    EXTRAS_END = 45               # extras() only looks at columns before this one

    CATEGORICAL_COLS = ('Z_MARCA', 'Z_GAMA', 'Z_MODELO',
                        'Z_DEPARTAMENTO', 'Z_PUNTO_VENTA')

    # Hand-picked categories that get their own 0/1 indicator feature.
    BEST_POINTS_OF_SALE = (
        'da45328ba820604eb99694768f2a430cd933d161601dcb8491b4a9b555232c59',
        'e1f2d2708f545ddc1d7266ba0cc5ccc88147b77fdf3450e68a974e93018ecf60',
    )
    BEST_DEPARTMENTS = (
        'd6c21b948958417ca98b682a573eb8aa1084b292d32f760f253ef53da13e5589',
    )

    def __init__(self, df):
        """Store ``df`` and immediately slice/cap the windows via ``featuring``."""
        self.df = df
        self.x_train = None
        self.y_train = None
        self.x_test = None

        self.scaler = None
        self.last10_scaler = None
        self.x_extras = None

        self.featuring()

    def featuring(self):
        """Slice the input/target windows out of ``self.df`` and cap outliers."""

        def remove_outliers(frame):
            # Vectorized cap; returns a new frame, so self.df stays untouched.
            return frame.clip(upper=self.OUTLIER_CAP)

        self.x_train = remove_outliers(self.df.iloc[:, self.TRAIN_WINDOW])
        self.y_train = remove_outliers(self.df.iloc[:, -self.HORIZON:]).values
        self.x_test = remove_outliers(self.df.iloc[:, self.TEST_START:])

    @classmethod
    def _last_n_features(cls, window):
        """Return an (n_rows, 2) array: sum of the last ``LAST_N`` weeks and a 0/1 'any sales' flag."""
        recent_total = window.iloc[:, -cls.LAST_N:].sum(axis=1)
        has_sales = (recent_total > 0).astype(int)
        return np.column_stack([recent_total.to_numpy(), has_sales.to_numpy()])

    @staticmethod
    def _as_sequence(scaled):
        """Reshape (n_rows, n_weeks) to (n_rows, 1, n_weeks)."""
        return np.reshape(scaled, (scaled.shape[0], 1, scaled.shape[1]))

    def data_train(self):
        """Fit the scalers and build the training arrays.

        Safe to call repeatedly: ``self.x_train`` is left unscaled, so every
        call fits on the same capped values.

        Returns:
            tuple: ``(x_seq, x_extras, y)`` where ``x_seq`` has shape
            (n_rows, 1, n_weeks), ``x_extras`` is the 2-D tabular feature
            matrix and ``y`` is ``self.y_train``.
        """
        window = self.x_train

        self.last10_scaler = StandardScaler()
        recent = self.last10_scaler.fit_transform(self._last_n_features(window))

        # Fit on plain arrays: the train and test windows cover different
        # columns, and scalers fitted on a DataFrame reject mismatching names.
        self.scaler = StandardScaler()
        x_seq = self._as_sequence(self.scaler.fit_transform(window.to_numpy()))

        x_extras_train = np.hstack([self.extras(), recent])

        return x_seq, x_extras_train, self.y_train

    def data_test(self):
        """Build the test arrays with the scalers fitted by ``data_train``.

        Returns:
            tuple: ``(x_seq, x_extras)`` shaped like the first two outputs of
            ``data_train``.

        Raises:
            RuntimeError: if ``data_train`` has not been called yet.
        """
        if self.scaler is None or self.last10_scaler is None or self.x_extras is None:
            raise RuntimeError('data_train() must be called before data_test()')

        window = self.x_test

        recent = self.last10_scaler.transform(self._last_n_features(window))
        x_seq = self._as_sequence(self.scaler.transform(window.to_numpy()))

        # The categorical extras describe the same rows, so they are reused as-is.
        x_extras_test = np.hstack([self.x_extras, recent])

        return x_seq, x_extras_test

    def extras(self):
        """Compute standardized per-row tabular features from the categorical columns.

        For each column in ``CATEGORICAL_COLS`` (in that order) the features are:
        the category's row count; the mean over weeks of the category's summed
        weekly values; and the category's maximum single weekly value.  Two 0/1
        flags for the hand-picked points of sale and departments are appended.

        Note: the brand "max" feature previously summed per category before
        taking the maximum, unlike its four siblings; it now follows the same
        max-of-max rule.

        The result is stored in ``self.x_extras`` and returned.
        """
        head = self.df.iloc[:, :self.EXTRAS_END]
        # Aggregate numeric columns only, so string columns can never leak into
        # the sum/max regardless of pandas' ``numeric_only`` defaults.
        weeks = head.select_dtypes(include='number')

        counts, means, maxima = [], [], []
        for col in self.CATEGORICAL_COLS:
            keys = head[col]
            by_category = weeks.groupby(keys)
            counts.append(keys.map(keys.value_counts()))
            means.append(keys.map(by_category.sum().mean(axis=1)))
            maxima.append(keys.map(by_category.max().max(axis=1)))

        best_point = head['Z_PUNTO_VENTA'].isin(self.BEST_POINTS_OF_SALE).astype(int)
        best_department = head['Z_DEPARTAMENTO'].isin(self.BEST_DEPARTMENTS).astype(int)

        features = pd.concat(counts + means + maxima + [best_point, best_department],
                             axis=1)

        scaler_extra = StandardScaler()
        x_extras_train = scaler_extra.fit_transform(features.to_numpy(dtype=float))

        self.x_extras = x_extras_train

        return x_extras_train

In [10]:
# Build the dataset helper, then derive the training arrays followed by the
# test arrays (data_test reuses the scalers fitted inside data_train).
entel_dataset = ENTEL_DATASET(df=df_train)
(x_train_data, x_extra_data, y_train_data) = entel_dataset.data_train()
(x_test_data, x_test_extra_data) = entel_dataset.data_test()

In [11]:
class ENTEL_MODELS:
    """Build, cross-validate and evaluate the two-input (sequence + tabular) Keras model."""

    def __init__(self):
        print('ENTEL MODELS')

    def LSTM(self, l_features, l_extras_features):
        """Build and compile the two-branch network.

        Args:
            l_features: shape of one sequence sample (without the batch axis),
                e.g. ``(1, n_weeks)``.
            l_extras_features: number of tabular features, given either as an
                int or as a shape tuple.

        Returns:
            The compiled ``keras.Model`` (also stored on ``self.model``); it
            outputs 10 non-negative values per sample.
        """
        sequence_shape = tuple(l_features)
        # Input() needs a shape tuple; accept a bare feature count as well.
        if isinstance(l_extras_features, (int, np.integer)):
            tabular_shape = (int(l_extras_features),)
        else:
            tabular_shape = tuple(l_extras_features)

        features = keras.layers.Input(shape=sequence_shape)
        tabular = keras.layers.Input(shape=tabular_shape)

        # Sequence branch: two stacked LSTMs followed by a small dense head.
        out_features = keras.layers.LSTM(150, return_sequences=True)(features)
        out_features = keras.layers.Dropout(0.2)(out_features)
        out_features = keras.layers.LSTM(100, return_sequences=True)(out_features)
        out_features = keras.layers.Flatten()(out_features)

        out_features = keras.layers.Dense(50, activation='linear')(out_features)
        out_features = keras.layers.Dropout(0.2)(out_features)
        out_features = keras.layers.Dense(32, activation='linear')(out_features)

        # Tabular branch: each layer consumes the previous layer's output.
        # (Feeding every layer from `tabular` would leave only the last one
        # connected to the model.)
        out_tabular = tabular
        for n_hidden in [128, 64, 32]:
            out_tabular = keras.layers.Dense(n_hidden, activation='relu')(out_tabular)
            out_tabular = keras.layers.BatchNormalization()(out_tabular)
            out_tabular = keras.layers.Dropout(0.2)(out_tabular)

        # Both branches end with 32 units, so they can be combined element-wise.
        out = tf.keras.layers.Multiply()([out_features, out_tabular])
        out = keras.layers.Dense(10, activation='relu')(out)

        model = keras.Model(inputs=[features, tabular], outputs=out)

        mse = tf.keras.losses.MeanSquaredError()
        rmse = tf.keras.metrics.RootMeanSquaredError()
        model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0004),
                      loss=mse, metrics=[rmse])

        self.model = model

        return model

    def training(self, model_type='LSTM', x_data=None, x_extra=None, y=None):
        """Train one model per fold of a 5-fold shuffled KFold split.

        Args:
            model_type: currently unused; the LSTM model is always built.
            x_data: sequence inputs. Defaults to the notebook-level
                ``x_train_data``.
            x_extra: tabular inputs. Defaults to the notebook-level
                ``x_extra_data``.
            y: targets. Defaults to the notebook-level ``y_train_data``.

        Returns:
            list: the fitted models, one per fold, in fold order.
        """
        # Fall back to the notebook globals so `training()` keeps working.
        if x_data is None:
            x_data = x_train_data
        if x_extra is None:
            x_extra = x_extra_data
        if y is None:
            y = y_train_data

        EPOCH = 1000
        BATCH_SIZE = 512
        MONITOR = 'val_root_mean_squared_error'

        models = []

        kf = KFold(n_splits=5, shuffle=True, random_state=2022)

        for fold, (train_idx, val_idx) in enumerate(kf.split(x_data, y)):

            print('-'*15, '>', f'Fold {fold+1}', '<', '-'*15)

            X_train, X_valid = x_data[train_idx], x_data[val_idx]
            X_extra_train, X_extra_valid = x_extra[train_idx], x_extra[val_idx]
            Y_train, Y_valid = y[train_idx], y[val_idx]

            l_fet = X_train.shape[-2:]
            l_ext = X_extra_train.shape[-1]

            model = self.LSTM(l_fet, l_ext)

            # Stop after 30 epochs without improvement and keep the best weights.
            es = keras.callbacks.EarlyStopping(monitor=MONITOR,
                                               min_delta=1e-05,
                                               patience=30,
                                               verbose=0,
                                               mode='min',
                                               restore_best_weights=True)
            # Cut the learning rate tenfold after 10 stagnant epochs.
            plateau = keras.callbacks.ReduceLROnPlateau(monitor=MONITOR,
                                                        factor=0.1,
                                                        patience=10,
                                                        verbose=0,
                                                        min_lr=5e-7,
                                                        mode='min')

            model.fit([X_train, X_extra_train], Y_train,
                      validation_data=([X_valid, X_extra_valid], Y_valid),
                      epochs=EPOCH,
                      batch_size=BATCH_SIZE,
                      callbacks=[es, plateau],
                      verbose=2)

            self.eval_model(model, [X_valid, X_extra_valid], Y_valid)

            models.append(model)

        return models

    def eval_model(self, model, x_valid, y_valid):
        """Print and return the RMSE of integer-rounded predictions over all outputs.

        Args:
            model: any object whose ``predict(x_valid)`` returns an array.
            x_valid: inputs forwarded unchanged to ``model.predict``.
            y_valid: true targets, same shape as the predictions.

        Returns:
            float: root mean squared error over the flattened values.

        Raises:
            ValueError: if predictions and targets differ in size.
        """
        preds = np.round(model.predict(x_valid)).astype('int32').ravel()
        truth = np.asarray(y_valid, dtype=float).ravel()
        if preds.shape != truth.shape:
            raise ValueError(
                f'predictions ({preds.size}) and targets ({truth.size}) differ in size')

        # Computed with NumPy so it does not depend on the `squared` argument
        # of sklearn's mean_squared_error, which newer releases removed.
        rmse = float(np.sqrt(np.mean((truth - preds) ** 2)))
        print(f' RMSE --> {rmse}')
        return rmse

In [12]:
# Instantiate the model helper and run the cross-validated training loop;
# `models` receives the list returned by training().
entel_models = ENTEL_MODELS()
models = entel_models.training(model_type='LSTM')

ENTEL MODELS
--------------- > Fold 1 < ---------------
Epoch 1/1000
74/74 - 7s - loss: 11.5720 - root_mean_squared_error: 3.4018 - val_loss: 8.1787 - val_root_mean_squared_error: 2.8598
Epoch 2/1000
74/74 - 2s - loss: 9.6478 - root_mean_squared_error: 3.1061 - val_loss: 7.0831 - val_root_mean_squared_error: 2.6614
Epoch 3/1000
74/74 - 2s - loss: 8.2160 - root_mean_squared_error: 2.8664 - val_loss: 6.0991 - val_root_mean_squared_error: 2.4696
Epoch 4/1000
74/74 - 2s - loss: 7.5666 - root_mean_squared_error: 2.7507 - val_loss: 5.4625 - val_root_mean_squared_error: 2.3372
Epoch 5/1000
74/74 - 2s - loss: 6.6718 - root_mean_squared_error: 2.5830 - val_loss: 4.5574 - val_root_mean_squared_error: 2.1348
Epoch 6/1000
74/74 - 2s - loss: 5.7452 - root_mean_squared_error: 2.3969 - val_loss: 4.1980 - val_root_mean_squared_error: 2.0489
Epoch 7/1000
74/74 - 2s - loss: 5.4570 - root_mean_squared_error: 2.3360 - val_loss: 3.6218 - val_root_mean_squared_error: 1.9031
Epoch 8/1000
74/74 - 2s - loss: 4

In [13]:
# One prediction array per fold model, all computed on the same test inputs.
test_inputs = [x_test_data, x_test_extra_data]
preds = [fold_model.predict(test_inputs) for fold_model in models]

In [14]:
# Ensemble by averaging the fold predictions; works for any number of models
# instead of assuming exactly five.
assert len(preds) > 0, 'no fold predictions to average'
pred_sub = np.mean(preds, axis=0)

# Forecast columns are named SEMANA_51, SEMANA_52, ... in prediction order.
FIRST_FORECAST_WEEK = 51
week_names = [f'SEMANA_{FIRST_FORECAST_WEEK + offset}'
              for offset in range(pred_sub.shape[1])]

# Predictions are row-aligned with df_train; passing the index explicitly makes
# a row-count mismatch fail loudly instead of being silently truncated by an
# inner join.
df_pred = pd.DataFrame(pred_sub, columns=week_names, index=df_train.index)

# Row identifier: model | point of sale | range.
base_id = (df_train['Z_MODELO'].astype(str) + '|'
           + df_train['Z_PUNTO_VENTA'].astype(str) + '|'
           + df_train['Z_GAMA'].astype(str))

# Wide -> long: one row per (BASE_ID, week); the week name lands in 'level_1'.
df_submission = (
    df_pred.assign(BASE_ID=base_id)
    .set_index('BASE_ID')
    .stack()
    .rename('Demanda')
    .reset_index()
)
df_submission['ID'] = (df_submission['BASE_ID'].astype(str) + '|'
                       + df_submission['level_1'].astype(str))
df_submission = df_submission[['ID', 'Demanda']]

In [17]:
# Write the submission to the working directory, without the index column.
df_submission.to_csv(path_or_buf='entel_v1.0.csv', index=False)

<a href='./entel_v1.0.csv'>download</a>