### Import Libraries

In [23]:
import os
from pathlib import Path

import keras
import numpy as np
import pandas as pd
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import mlflow
from mlflow.models import infer_signature

In [24]:
# Location of the prepared dataset. Set the CLV_DATA_PATH environment variable
# to run this notebook on another machine or against another copy of the file;
# when it is unset, the original home-relative location is used unchanged.
DATA_PATH = Path(
    os.environ.get(
        "CLV_DATA_PATH",
        "~/Personal_project/Customer-Lifetime-Value-Prediction/data/final_dataset.csv",
    )
).expanduser()  # resolve a leading "~" explicitly rather than relying on the reader

# Read the whole CSV into memory and preview the first five rows.
data = pd.read_csv(DATA_PATH)
data.head(5)

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,UnitPrice,CustomerID,TotalPrice,Recency,Frequency,Monetary,...,desc_topic_16,desc_topic_17,desc_topic_18,desc_topic_19,desc_topic_20,InvoiceYear,InvoiceMonth,InvoiceDay,InvoiceHour,Weekday
0,541431.0,23166.0,74215,2011-01-18 10:01:00,1.04,12346.0,77183.6,0.0,1,77183.6,...,-0.003681,-0.012714,-0.015699,0.055104,-0.003688,2011,1,18,10,1
1,537626.0,85116.0,12,2010-12-07 14:57:00,2.1,12347.0,25.2,0.0,162,3653.45,...,-0.05809,0.034467,-0.090115,0.036342,0.050075,2010,12,7,14,1
2,537626.0,22375.0,4,2010-12-07 14:57:00,4.25,12347.0,17.0,0.0,162,3653.45,...,-0.031094,0.066135,-0.042782,0.062202,-0.060593,2010,12,7,14,1
3,537626.0,71477.0,12,2010-12-07 14:57:00,3.25,12347.0,39.0,0.0,162,3653.45,...,-0.037794,0.041952,-0.082633,0.030501,0.078181,2010,12,7,14,1
4,537626.0,22492.0,36,2010-12-07 14:57:00,0.65,12347.0,23.4,0.0,162,3653.45,...,0.001612,0.074616,-0.024091,-0.016158,-0.066717,2010,12,7,14,1


In [29]:
# Show the distinct values of the Quantity column, in order of first appearance.
quantity_column = data["Quantity"]
quantity_column.unique()

array([74215,    12,     4,    36,     6,    10,    24,     3,   240,
           8,     2,    16,    48,    20,    18,    72,   120,   144,
          80,    96,     1,    25,     5,    32,    60,    40,    15,
           9,    30,    28,   180,    13,   288,   100,    50,   256,
         108,    11,    64,   192,   576,   160,   168,   200,   480,
         384,   128,   216,   432,   320,   600,   400,    84,   336,
         720,  1152,   250,   960,    43,    75,     7,    42,    21,
         125,    44,    14,    70,   150,   360,    56,   300,   912,
          90,    94,    66,    17,    27,    52,    45,   102,   109,
         132,   234,   244,  1488,    19,  2040,   864,    29,    35,
        1728,   291,   462,  1200,   227,  2700,   222,   228,   246,
         420,    54,    78,    33,  1788,  4800,   774,    41,   280,
         270,   220,   350,   348,  1900,  2880,   116,   968,   276,
         700,   456,    37,   648,   198,  1440,    22,    26,  4300,
          63,    23,

### Understanding the Dataset

In [25]:
# Report the dimensions of the loaded frame: columns first, then rows.
row_count, column_count = data.shape
print("Shape of dataset :")
print("Number of columns present : ", column_count)
print("Number of rows present: ", row_count)


Shape of dataset :
Number of columns present :  37
Number of rows present:  363119


In [26]:
# Count missing entries per column (isna is pandas' alias of isnull).
null_counts = data.isna().sum()
print("Number of null values:\n", null_counts)

Number of null values:
 InvoiceNo                 0
StockCode                 0
Quantity                  0
InvoiceDate               0
UnitPrice                 0
CustomerID                0
TotalPrice                0
Recency                   0
Frequency                 0
Monetary                  0
customer_lifetime_days    0
Country_Label             0
desc_topic_1              0
desc_topic_2              0
desc_topic_3              0
desc_topic_4              0
desc_topic_5              0
desc_topic_6              0
desc_topic_7              0
desc_topic_8              0
desc_topic_9              0
desc_topic_10             0
desc_topic_11             0
desc_topic_12             0
desc_topic_13             0
desc_topic_14             0
desc_topic_15             0
desc_topic_16             0
desc_topic_17             0
desc_topic_18             0
desc_topic_19             0
desc_topic_20             0
InvoiceYear               0
InvoiceMonth              0
InvoiceDay              

In [27]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
summary_stats = data.describe()
print("About data__:\n", summary_stats)

About data__:
            InvoiceNo      StockCode       Quantity      UnitPrice  \
count  363119.000000  363119.000000  363119.000000  363119.000000   
mean   560820.303099   26967.327190      13.129062       2.886098   
std     13076.895436   15676.370832     188.527851       4.361971   
min    536365.000000   10002.000000       1.000000       0.000000   
25%    549547.000000   21955.000000       2.000000       1.250000   
50%    562150.000000   22603.000000       6.000000       1.700000   
75%    572237.000000   23171.000000      12.000000       3.750000   
max    581587.000000   90208.000000   80995.000000     649.500000   

          CustomerID     TotalPrice        Recency      Frequency  \
count  363119.000000  363119.000000  363119.000000  363119.000000   
mean    15295.738347      22.073616       1.529623     594.085011   
std      1711.946809     321.862986      12.708105    1293.994833   
min     12346.000000       0.000000       0.000000       1.000000   
25%     13969.0000

In [28]:
# Print the frame's structural summary: index range, each column's name,
# non-null count and dtype. InvoiceDate is reported as `object` here, i.e. it
# was read as plain strings rather than parsed timestamps.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 363119 entries, 0 to 363118
Data columns (total 37 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   InvoiceNo               363119 non-null  float64
 1   StockCode               363119 non-null  float64
 2   Quantity                363119 non-null  int64  
 3   InvoiceDate             363119 non-null  object 
 4   UnitPrice               363119 non-null  float64
 5   CustomerID              363119 non-null  float64
 6   TotalPrice              363119 non-null  float64
 7   Recency                 363119 non-null  float64
 8   Frequency               363119 non-null  int64  
 9   Monetary                363119 non-null  float64
 10  customer_lifetime_days  363119 non-null  int64  
 11  Country_Label           363119 non-null  int64  
 12  desc_topic_1            363119 non-null  float64
 13  desc_topic_2            363119 non-null  float64
 14  desc_topic_3        