In [0]:
!pip install tensorflow
!pip install mlflow

In [0]:
##PACKAGES#######
# For dataframe
import pandas as pd
import numpy as np
from functools import reduce
# For date
from datetime import datetime, date
from dateutil.relativedelta import relativedelta
import time
# For data transformation
from scipy import stats
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
# For neural network
import tensorflow as tf
from tensorflow import keras
import mlflow.tensorflow
from tensorflow.keras import layers
# For auto logging
mlflow.tensorflow.autolog()


### Functions ####
def transform_feature(data, feature_start, feature_end, predict_start=None, predict_end=None):
    """Aggregate order lines into one feature row per customer.

    Orders dated within ``[feature_start, feature_end]`` (both inclusive) are
    grouped by ``cust`` and summarised. ``avg_orderintake`` is log-transformed
    and the columns listed in ``transformcols`` are min-max scaled with a
    scaler fitted on this call's rows only.

    Parameters
    ----------
    data : pandas.DataFrame
        Order lines. Must provide ``cust``, ``date`` and the columns listed in
        ``numeric_cols``. ``date`` is compared against timestamps, so it is
        assumed to be datetime-typed already (the callers in this file convert
        it) -- TODO confirm for other callers. The caller's frame is not
        modified; cleaning happens on derived copies.
    feature_start, feature_end
        Bounds of the feature window; anything ``pd.to_datetime`` accepts.
    predict_start, predict_end
        Bounds of the target window. When BOTH are ``None`` no target is built.

    Returns
    -------
    (feature_data_raw, feature_data) when both predict bounds are ``None``:
        the unscaled per-customer table (``cust`` as a column) and the
        transformed table indexed by ``cust``.
    (X, y) otherwise:
        the transformed features indexed by ``cust`` and the summed
        ``monetary`` per customer inside the target window (0 for customers
        without orders in that window), named ``target_orderintake``.
    """
    # TODO (carried over from the original): categorical features are not used yet.
    numeric_cols = ['order_quant', 'monetary',
                    'running_second_at_order',
                    'order_age', 'cylinders',
                    'bore_size']

    # Drop columns that are entirely empty, zero-fill the remaining gaps and
    # force the measure columns to numeric dtypes. Each step returns a new
    # frame, so the argument passed by the caller stays untouched.
    data = data.dropna(axis=1, how='all')
    data = data.fillna(0)
    data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric)

    # ---- Feature engineering on the feature window -------------------------
    # Parse the bounds once instead of once per customer row.
    window_start = pd.to_datetime(feature_start)
    window_end = pd.to_datetime(feature_end)
    features_data = data.loc[(data.date >= window_start)
                             & (data.date <= window_end), :]

    by_cust = features_data.groupby('cust')
    first_order = by_cust['date'].min()
    last_order = by_cust['date'].max()

    # Days between a customer's first and last order in the window.
    recency = (last_order - first_order).dt.days.rename('recency')
    # Number of order lines in the window.
    frequency = by_cust['date'].count().rename('frequency')
    # Days from the first order to the end of the window.
    t = (window_end - first_order).dt.days.rename('t')
    time_between = (t / frequency).rename('time_between')

    total_orderintake = by_cust['monetary'].sum().rename('total_orderintake')
    avg_orderintake = (total_orderintake / frequency).rename('avg_orderintake')

    total_orderquant = by_cust['order_quant'].sum().rename('total_orderquant')
    avg_orderquant = (total_orderquant / frequency).rename('avg_orderquant')

    total_running_second_at_order = by_cust['running_second_at_order']\
        .sum()\
        .rename('total_running_second_at_order')
    avg_running_second_at_order = (total_running_second_at_order / frequency)\
        .rename('avg_running_second_at_order')

    order_age_mean = by_cust['order_age'].mean().rename('order_age_mean')

    # Cylinder / bore statistics are computed over the distinct
    # (cust, cylinders, bore_size) combinations, not over every order line.
    duplicated_features = features_data[['cust', 'cylinders', 'bore_size']]\
        .drop_duplicates()
    total_cylinders = duplicated_features.groupby('cust')['cylinders']\
        .sum()\
        .rename('total_cylinders')
    avg_cylinders = (total_cylinders / frequency).rename('avg_cylinders')
    mean_boresize = duplicated_features.groupby('cust')['bore_size']\
        .mean()\
        .rename('mean_boresize')

    # Column order is kept as in the original implementation; the totals for
    # order intake and order quantity are intentionally left out.
    feature_data_raw = pd.concat([recency,
                                  frequency,
                                  t,
                                  time_between,
                                  avg_orderintake,
                                  avg_orderquant,
                                  avg_running_second_at_order,
                                  order_age_mean,
                                  total_cylinders,
                                  avg_cylinders,
                                  mean_boresize],
                                 axis=1).reset_index()
    feature_data_raw = feature_data_raw.fillna(0)
    # Keep only customers with at least one order in the window. This has to
    # be done on the RAW frequency (see the note on scaling below).
    feature_data_raw = feature_data_raw.loc[feature_data_raw['frequency'] > 0, :]

    # ---- Log transformation of the average order intake --------------------
    feature_data = feature_data_raw.copy()
    with np.errstate(divide='ignore', invalid='ignore'):
        log_intake = np.log(feature_data['avg_orderintake'])
    # log(0) is -inf and log(negative) is NaN. fillna() below only catches
    # NaN, so map infinities to NaN first; both cases then end up as 0.
    feature_data['avg_orderintake'] = log_intake.replace([np.inf, -np.inf], np.nan)

    # ---- Min-max scaling ---------------------------------------------------
    scaler = MinMaxScaler()
    transformcols = ['avg_running_second_at_order',
                     'frequency',
                     'recency',
                     't',
                     'time_between',
                     'avg_orderquant',
                     'order_age_mean',
                     'total_cylinders',
                     'avg_cylinders',
                     'mean_boresize']
    feature_data[transformcols] = scaler.fit_transform(feature_data[transformcols])
    feature_data = feature_data.fillna(0)

    # NOTE: after scaling, the smallest order count maps to exactly 0. The
    # previous implementation filtered on ``frequency > 0`` again at this
    # point and thereby discarded every customer with the minimum order
    # count. Customers without orders were already removed above, so no
    # further filtering on the scaled column is done.

    if predict_start is None and predict_end is None:
        # Prediction mode: return raw table and model input (X_predict).
        return feature_data_raw, feature_data.set_index('cust')

    # ---- Target: summed order intake inside the target window --------------
    target_data = data.loc[(data.date >= pd.to_datetime(predict_start))
                           & (data.date <= pd.to_datetime(predict_end)), :]
    target_orderintake = target_data.groupby(['cust'])['monetary']\
        .sum()\
        .rename('target_orderintake')\
        .reset_index()

    # Left join: only customers that have features are modelled. Customers
    # without orders in the target window get a target of 0.
    model_data = feature_data.merge(target_orderintake, on='cust', how='left')
    model_data = model_data.fillna(0).set_index('cust')

    # Return X_train, y_train or X_test, y_test.
    return model_data.drop(columns='target_orderintake'), model_data['target_orderintake']
    



def get_train_test(
        data,
        feature_start=None,
        feature_end=None,
        predict_start=None,
        predict_end=None):
    """Build the training and test sets from the order history.

    All default windows are anchored on ``d``, the latest order date in
    ``data``:

    * training features: ``d - 24 months`` .. ``d - 6 months``
    * training target:   day after the training feature end .. ``d``
    * test features:     ``d - 30 months`` .. ``d - 12 months``
    * test target:       day after the test feature end, plus 6 months

    Parameters
    ----------
    data : pandas.DataFrame
        Order lines with a ``date`` column parseable as ``%Y-%m-%d``. The
        frame is not modified; date normalisation happens on a derived copy.
    feature_start, feature_end, predict_start, predict_end
        Optional overrides for the TRAINING windows. Each bound that is left
        as ``None`` falls back to its default individually, so a partial
        override no longer forwards ``None`` to ``transform_feature`` (which
        returns a different pair of objects when both predict bounds are
        ``None``). The test windows always use the defaults.

    Returns
    -------
    X_train, y_train, X_test, y_test

    Raises
    ------
    ValueError
        If ``data`` contains no valid order date.
    """
    # Parse the dates and truncate them to midnight, without touching the
    # caller's frame.
    parsed = pd.to_datetime(data.date, format='%Y-%m-%d')
    data = data.assign(date=pd.to_datetime(parsed.dt.date))

    # Anchor date: the most recent order in the data.
    latest = data['date'].max()
    if pd.isna(latest):
        raise ValueError('data contains no valid order dates')
    d = latest.to_pydatetime()

    # Training windows: explicit bounds win, missing ones use the defaults.
    if feature_start is None:
        train_feature_start = d + relativedelta(months=-6*4)
    else:
        train_feature_start = feature_start

    if feature_end is None:
        train_feature_end = d + relativedelta(months=-6)
    else:
        train_feature_end = feature_end

    if predict_start is None:
        # The target window starts the day after the feature window ends.
        train_predict_start = pd.to_datetime(train_feature_end) + relativedelta(days=+1)
    else:
        train_predict_start = predict_start

    if predict_end is None:
        train_predict_end = d
    else:
        train_predict_end = predict_end

    X_train, y_train = transform_feature(data,
                                         feature_start=train_feature_start,
                                         feature_end=train_feature_end,
                                         predict_start=train_predict_start,
                                         predict_end=train_predict_end)

    # Test windows: same lengths as the default training windows, shifted
    # six months further into the past.
    test_feature_start = d + relativedelta(months=-6*5)
    test_feature_end = d + relativedelta(months=-6*2)
    test_predict_start = test_feature_end + relativedelta(days=+1)
    test_predict_end = test_predict_start + relativedelta(months=+6)

    X_test, y_test = transform_feature(data,
                                       feature_start=test_feature_start,
                                       feature_end=test_feature_end,
                                       predict_start=test_predict_start,
                                       predict_end=test_predict_end)

    print('X_train, y_train, X_test, y_test in that order are generated:')
    print('Training features from ' + str(train_feature_start) + ' to ' + str(train_feature_end))
    print('Training target from ' + str(train_predict_start) + ' to ' + str(train_predict_end))
    print('Test features from ' + str(test_feature_start) + ' to ' + str(test_feature_end))
    print('Test target from ' + str(test_predict_start) + ' to ' + str(test_predict_end))
    return X_train, y_train, X_test, y_test
                      
  
def get_predict(
        data,
        feature_start=None,
        feature_end=None):
    """Build the feature table used for prediction.

    Parameters
    ----------
    data : pandas.DataFrame
        Order lines with a ``date`` column parseable as ``%Y-%m-%d``. The
        frame is not modified; date normalisation happens on a derived copy.
    feature_start, feature_end
        Optional bounds of the feature window, e.g. ``'2020-10-20'``. With
        ``d`` being the latest order date in ``data``, a missing start
        defaults to ``d - 18 months`` and a missing end defaults to ``d``.
        Each bound falls back individually, so supplying only one of them no
        longer forwards ``None`` to ``transform_feature``.

    Returns
    -------
    X_raw, X_predict
        The unscaled per-customer table and the transformed model input, as
        returned by ``transform_feature`` when no target window is given.

    Raises
    ------
    ValueError
        If ``data`` contains no valid order date.
    """
    # Parse the dates and truncate them to midnight, without touching the
    # caller's frame.
    parsed = pd.to_datetime(data.date, format='%Y-%m-%d')
    data = data.assign(date=pd.to_datetime(parsed.dt.date))

    # Anchor date: the most recent order in the data.
    latest = data['date'].max()
    if pd.isna(latest):
        raise ValueError('data contains no valid order dates')
    d = latest.to_pydatetime()

    if feature_start is None:
        predict_feature_start = d + relativedelta(months=-6*3)
    else:
        predict_feature_start = feature_start

    if feature_end is None:
        predict_feature_end = d
    else:
        predict_feature_end = feature_end

    # The target window is only reported in the log output below; it is not
    # passed to transform_feature.
    # NOTE(review): this window spans 12 months, while get_train_test builds
    # 6-month targets by default -- confirm which horizon is intended.
    predict_target_start = d + relativedelta(days=+1)
    predict_target_end = d + relativedelta(months=+6*2)

    X_raw, X_predict = transform_feature(data,
                                         feature_start=predict_feature_start,
                                         feature_end=predict_feature_end,
                                         predict_start=None,
                                         predict_end=None)

    print('X_raw, X_predict in that order are generated:')
    print('Predict features from ' + str(predict_feature_start) + ' to ' + str(predict_feature_end))
    print('Predict target from ' + str(predict_target_start) + ' to ' + str(predict_target_end))
    return X_raw, X_predict
          
def DNN_model(X_train,
              y_train,
              X_test,
              y_test,
              X_predict,
              epochs=2000,
              batch_size=16,
              validation_split=0.2,
              optimizer=0.001
             ):
    """Fit a dense regression network and predict for ``X_predict``.

    The network stacks Dense(256) -> Dense(64) -> Dense(32) -> Dense(16) ->
    Dense(1) with ReLU activations on the hidden layers and Dropout(0.3)
    after each of the first three. It is trained with MSE loss.

    Parameters
    ----------
    X_train, y_train : training features (a frame with ``.columns``) and target.
    X_test, y_test   : hold-out features and target used for the fit check.
    X_predict        : features to score once the fit check passes.
    epochs, batch_size, validation_split : forwarded to ``model.fit``.
    optimizer        : despite its name, the value handed to the Adam
                       optimizer as its first argument (the learning rate).

    Returns
    -------
    ``(model, train_result, test_result, predict_result)`` when the absolute
    difference between test R2 and train R2 is at most 0.1. Otherwise
    ``'Overfit'`` is printed and ``None`` is returned, so callers must check
    the result before unpacking it.
    """
    # Idea noted by the original author: instead of using all historical
    # data, generate the test set with an MCMC simulation after training on
    # history, re-run the model on it, then predict.

    n_features = len(X_train.columns)
    network = keras.Sequential([
        layers.Dense(256, activation='relu', input_shape=[n_features, ]),
        layers.Dropout(0.3),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(16, activation='relu'),
        layers.Dense(1),
    ])

    adam = tf.keras.optimizers.Adam(optimizer)
    network.compile(loss='mse',
                    optimizer=adam,
                    metrics=['mae', 'mse'])

    network.fit(X_train,
                y_train,
                epochs=epochs,
                batch_size=batch_size,
                validation_split=validation_split)

    # Compare in-sample and hold-out fit quality.
    train_result = network.predict(X_train).ravel()
    test_result = network.predict(X_test).ravel()
    r2_test = r2_score(y_test, test_result)
    r2_train = r2_score(y_train, train_result)
    r2_gap = abs(r2_test - r2_train)

    # TODO (from the original author): val_loss should be checked as well.
    # Written as "not (gap <= 0.1)" so that a NaN gap is rejected too.
    if not (r2_gap <= 0.1):
        print('Overfit')
        return None

    print('Not overfit')
    predict_result = network.predict(X_predict).ravel()
    return network, train_result, test_result, predict_result

def get_result(input_data=None):
    """Run the whole pipeline and return per-customer predictions.

    Builds the prediction features and the train/test sets, fits the network
    via ``DNN_model`` and joins the predictions with the raw (unscaled)
    customer features.

    Parameters
    ----------
    input_data : pandas.DataFrame, optional
        Order lines to use. When omitted, the notebook-level variable
        ``data`` is used, which keeps ``get_result()`` working as before.

    Returns
    -------
    pandas.DataFrame
        One row per customer with ``cust``, ``dnn_monetary_pred`` and the raw
        feature columns, sorted by ``dnn_monetary_pred`` in descending order.

    Raises
    ------
    RuntimeError
        If ``DNN_model`` rejects the fit (it returns ``None`` in that case),
        so no predictions are available.
    """
    # Fall back to the global frame only when no explicit input was given.
    source = data if input_data is None else input_data

    X_raw, X_predict = get_predict(source,
                                   feature_start=None,
                                   feature_end=None)

    X_train, y_train, X_test, y_test = get_train_test(
        source,
        feature_start=None,
        feature_end=None,
        predict_start=None,
        predict_end=None)

    fitted = DNN_model(X_train,
                       y_train,
                       X_test,
                       y_test,
                       X_predict,
                       epochs=1000,
                       batch_size=16,
                       validation_split=0.2,
                       optimizer=0.001)

    # DNN_model returns None when it reports 'Overfit'; unpacking that would
    # fail with an unhelpful TypeError, so stop with a clear message instead.
    if fitted is None:
        raise RuntimeError('DNN_model rejected the fit as overfit; '
                           'no predictions were produced')
    _model, _train_result, _test_result, predict_result = fitted

    # predict_result is positionally aligned with the rows of X_predict,
    # whose index holds the customer ids.
    Xy_preds = pd.DataFrame({'cust': X_predict.index,
                             'dnn_monetary_pred': predict_result})

    Xy_preds_final = pd.merge(left=Xy_preds,
                              right=X_raw,
                              left_on='cust',
                              right_on='cust')

    # sort_values returns a new frame; the sorted result has to be kept.
    Xy_preds_final = Xy_preds_final.sort_values(by='dnn_monetary_pred',
                                                ascending=False)
    return Xy_preds_final.reset_index(drop=True)
                          

In [0]:
if __name__ == '__main__':
    mlflow.set_experiment("CLV_NN_Python")

    # Date stamp for the output file name: day_month_year, not zero-padded.
    t = time.localtime()
    date_string = str(t.tm_mday) + '_' + str(t.tm_mon) + '_' + str(t.tm_year)

    # Using the run as a context manager guarantees it is closed even when
    # loading, training or saving raises.
    with mlflow.start_run(nested=True):
        # Load input. `data` is a notebook-level variable on purpose:
        # get_result() reads it when called without arguments.
        data = pd.read_parquet('CLV_input_10_11_2020.parquet', engine="pyarrow")

        # Run
        result = get_result()

        # Save to DBFS
        print('Save result to DBFS')
        result.to_parquet('/CLV_output_{}.parquet'.format(date_string),
                          engine='pyarrow',
                          compression='snappy')