### Import Libraries

In [28]:
import keras
import numpy as np
import pandas as pd
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import mlflow
from mlflow.models import infer_signature

In [29]:
# Location of the prepared, feature-engineered dataset on the author's machine.
DATASET_CSV = "~/Personal_project/Customer-Lifetime-Value-Prediction/data/final_dataset.csv"

# Read the CSV into the working frame and preview its first five rows.
data = pd.read_csv(DATASET_CSV)
data.head()

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,UnitPrice,CustomerID,TotalPrice,Recency,Frequency,Monetary,...,desc_topic_16,desc_topic_17,desc_topic_18,desc_topic_19,desc_topic_20,InvoiceYear,InvoiceMonth,InvoiceDay,InvoiceHour,Weekday
0,541431.0,23166.0,74215,2011-01-18 10:01:00,1.04,12346.0,77183.6,0.0,2,0.0,...,0.032051,-0.062761,0.067446,0.034499,-0.057299,2011,1,18,10,1
1,537626.0,85116.0,12,2010-12-07 14:57:00,2.1,12347.0,25.2,0.0,182,4310.0,...,-0.032339,0.023959,-0.022264,0.033915,0.013968,2010,12,7,14,1
2,537626.0,22375.0,4,2010-12-07 14:57:00,4.25,12347.0,17.0,0.0,182,4310.0,...,-0.088507,0.119232,0.051086,0.038123,-0.035789,2010,12,7,14,1
3,537626.0,71477.0,12,2010-12-07 14:57:00,3.25,12347.0,39.0,0.0,182,4310.0,...,-0.066149,-0.013725,0.000939,0.012026,0.097659,2010,12,7,14,1
4,537626.0,22492.0,36,2010-12-07 14:57:00,0.65,12347.0,23.4,0.0,182,4310.0,...,-0.052948,0.0872,0.02829,0.025431,-0.012222,2010,12,7,14,1


In [30]:
# List the distinct order quantities that occur in the dataset.
quantity = data["Quantity"]
quantity.unique()

array([74215,    12,     4,    36,     6,    30,     3,    24,    10,
         240,     8,     2,    18,    16,    48,    20,    72,   120,
         144,     1,    80,    96,    25,     5,    32,    60,    40,
          15,     9,    28,   180,    64,    13,   288,   100,    50,
         256,     7,   108,    11,   272,   192,   576,   160,   168,
         200,   480,   384,   128,   216,   432,   320,   600,   400,
          84,   336,   720,  1152,   250,   960,    21,    43,    75,
          33,    42,   125,    44,    14,    70,   360,   150,    56,
         300,   912,    90,    94,    66,    17,   183,    27,    52,
         408,   224,   378,    45,    22,   102,   109,   132,   234,
         244,  1488,    19,  2040,   864,    29,    35,  1728,   291,
         462,  1200,   227,  2700,   222,   228,   246,   420,    54,
          78,  1788,  4800,   774,    41,   280,   270,   220,   350,
         348,  1900,  2880,   116,   968,   276,   700,   456,    37,
         648,   198,

### Understanding Dataset

In [31]:
# Report the frame's dimensions: shape is (rows, columns).
n_rows, n_cols = data.shape
print("Shape of dataset :")
print("Number of columns present : ", n_cols)
print("Number of rows present: ", n_rows)


Shape of dataset :
Number of columns present :  37
Number of rows present:  397924


In [32]:
# Count missing entries per column and print the resulting series.
null_counts = data.isnull().sum()
print("Number of null values:\n", null_counts)

Number of null values:
 InvoiceNo                     0
StockCode                 34805
Quantity                      0
InvoiceDate                   0
UnitPrice                     0
CustomerID                    0
TotalPrice                    0
Recency                       0
Frequency                     0
Monetary                      0
customer_lifetime_days        0
Country_Label                 0
desc_topic_1                  0
desc_topic_2                  0
desc_topic_3                  0
desc_topic_4                  0
desc_topic_5                  0
desc_topic_6                  0
desc_topic_7                  0
desc_topic_8                  0
desc_topic_9                  0
desc_topic_10                 0
desc_topic_11                 0
desc_topic_12                 0
desc_topic_13                 0
desc_topic_14                 0
desc_topic_15                 0
desc_topic_16                 0
desc_topic_17                 0
desc_topic_18                 0
desc_topic_19   

In [33]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
summary_stats = data.describe()
print("About data__:\n", summary_stats)

About data__:
            InvoiceNo      StockCode       Quantity      UnitPrice  \
count  397924.000000  363119.000000  397924.000000  397924.000000   
mean   560617.126645   26967.327190      13.021823       3.116174   
std     13106.167695   15676.370832     180.420210      22.096788   
min    536365.000000   10002.000000       1.000000       0.000000   
25%    549234.000000   21955.000000       2.000000       1.250000   
50%    561893.000000   22603.000000       6.000000       1.950000   
75%    572090.000000   23171.000000      12.000000       3.750000   
max    581587.000000   90208.000000   80995.000000    8142.750000   

          CustomerID     TotalPrice        Recency      Frequency  \
count  397924.000000  397924.000000  397924.000000  397924.000000   
mean    15294.315171      22.394749       1.347167     675.249616   
std      1713.169877     309.055588      12.017406    1473.496014   
min     12346.000000       0.000000       0.000000       1.000000   
25%     13969.0000

In [34]:
# Print each column's name, non-null count and dtype for the working frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397924 entries, 0 to 397923
Data columns (total 37 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   InvoiceNo               397924 non-null  float64
 1   StockCode               363119 non-null  float64
 2   Quantity                397924 non-null  int64  
 3   InvoiceDate             397924 non-null  object 
 4   UnitPrice               397924 non-null  float64
 5   CustomerID              397924 non-null  float64
 6   TotalPrice              397924 non-null  float64
 7   Recency                 397924 non-null  float64
 8   Frequency               397924 non-null  int64  
 9   Monetary                397924 non-null  float64
 10  customer_lifetime_days  397924 non-null  int64  
 11  Country_Label           397924 non-null  int64  
 12  desc_topic_1            397924 non-null  float64
 13  desc_topic_2            397924 non-null  float64
 14  desc_topic_3        

In [35]:
# Show the full list of column labels (the head() preview elides the middle ones).
data.columns

Index(['InvoiceNo', 'StockCode', 'Quantity', 'InvoiceDate', 'UnitPrice',
       'CustomerID', 'TotalPrice', 'Recency', 'Frequency', 'Monetary',
       'customer_lifetime_days', 'Country_Label', 'desc_topic_1',
       'desc_topic_2', 'desc_topic_3', 'desc_topic_4', 'desc_topic_5',
       'desc_topic_6', 'desc_topic_7', 'desc_topic_8', 'desc_topic_9',
       'desc_topic_10', 'desc_topic_11', 'desc_topic_12', 'desc_topic_13',
       'desc_topic_14', 'desc_topic_15', 'desc_topic_16', 'desc_topic_17',
       'desc_topic_18', 'desc_topic_19', 'desc_topic_20', 'InvoiceYear',
       'InvoiceMonth', 'InvoiceDay', 'InvoiceHour', 'Weekday'],
      dtype='object')

### Feature Scaling 

In [36]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Numeric columns that are standardized (per-column zero mean, unit variance).
original_cols = [
    'Monetary',
    'TotalPrice',
    'Frequency',
    'Recency',
    'Quantity',
    'UnitPrice',
    'customer_lifetime_days',
]

# Fit one scaler over all listed columns, then overwrite those columns in `data`
# with their standardized values. The fitted `scaler` stays available so the
# transformation can be inverted later.
# NOTE(review): 'Monetary' is later used as the target (y = data['Monetary']), so the
# target is scaled together with the features, and the statistics are learned on the
# full frame before any train/test split - confirm this is intended.
scaler = StandardScaler()
scaler.fit(data[original_cols])
data[original_cols] = scaler.transform(data[original_cols])

In [37]:
# Preview the first five rows to check the standardized columns.
data.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,UnitPrice,CustomerID,TotalPrice,Recency,Frequency,Monetary,...,desc_topic_16,desc_topic_17,desc_topic_18,desc_topic_19,desc_topic_20,InvoiceYear,InvoiceMonth,InvoiceDay,InvoiceHour,Weekday
0,541431.0,23166.0,411.273612,2011-01-18 10:01:00,-0.093958,12346.0,249.668037,-0.112101,-0.456907,-0.365153,...,0.032051,-0.062761,0.067446,0.034499,-0.057299,2011,1,18,10,1
1,537626.0,85116.0,-0.005664,2010-12-07 14:57:00,-0.045987,12347.0,0.009077,-0.112101,-0.334748,-0.222337,...,-0.032339,0.023959,-0.022264,0.033915,0.013968,2010,12,7,14,1
2,537626.0,22375.0,-0.050005,2010-12-07 14:57:00,0.051312,12347.0,-0.017456,-0.112101,-0.334748,-0.222337,...,-0.088507,0.119232,0.051086,0.038123,-0.035789,2010,12,7,14,1
3,537626.0,71477.0,-0.005664,2010-12-07 14:57:00,0.006056,12347.0,0.053729,-0.112101,-0.334748,-0.222337,...,-0.066149,-0.013725,0.000939,0.012026,0.097659,2010,12,7,14,1
4,537626.0,22492.0,0.127359,2010-12-07 14:57:00,-0.111608,12347.0,0.003253,-0.112101,-0.334748,-0.222337,...,-0.052948,0.0872,0.02829,0.025431,-0.012222,2010,12,7,14,1


### Data Split - Train and Test

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
# Separate the prediction target from the remaining columns.
target_col = 'Monetary'
y = data[target_col]
X = data.drop(columns=[target_col])

In [40]:
# Preview the feature frame (target column removed).
X.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,UnitPrice,CustomerID,TotalPrice,Recency,Frequency,customer_lifetime_days,...,desc_topic_16,desc_topic_17,desc_topic_18,desc_topic_19,desc_topic_20,InvoiceYear,InvoiceMonth,InvoiceDay,InvoiceHour,Weekday
0,541431.0,23166.0,411.273612,2011-01-18 10:01:00,-0.093958,12346.0,249.668037,-0.112101,-0.456907,-1.918277,...,0.032051,-0.062761,0.067446,0.034499,-0.057299,2011,1,18,10,1
1,537626.0,85116.0,-0.005664,2010-12-07 14:57:00,-0.045987,12347.0,0.009077,-0.112101,-0.334748,0.976813,...,-0.032339,0.023959,-0.022264,0.033915,0.013968,2010,12,7,14,1
2,537626.0,22375.0,-0.050005,2010-12-07 14:57:00,0.051312,12347.0,-0.017456,-0.112101,-0.334748,0.976813,...,-0.088507,0.119232,0.051086,0.038123,-0.035789,2010,12,7,14,1
3,537626.0,71477.0,-0.005664,2010-12-07 14:57:00,0.006056,12347.0,0.053729,-0.112101,-0.334748,0.976813,...,-0.066149,-0.013725,0.000939,0.012026,0.097659,2010,12,7,14,1
4,537626.0,22492.0,0.127359,2010-12-07 14:57:00,-0.111608,12347.0,0.003253,-0.112101,-0.334748,0.976813,...,-0.052948,0.0872,0.02829,0.025431,-0.012222,2010,12,7,14,1


In [None]:
# Build candidate CLV targets from the customer-level RFM columns.
#
# The notebook's frame is named `data` (there is no `df`), and the RFM columns in it
# were standardized in the feature-scaling cell. Ratios of z-scores are meaningless,
# so work on a copy whose scaled columns are mapped back to their original units.
# Assumes the scaling cell ran exactly once, so `scaler` still holds the raw-data
# statistics.
df = data.copy()
df[original_cols] = scaler.inverse_transform(data[original_cols])

# Guard the divisions below: customer_lifetime_days can presumably be 0 (e.g. all
# purchases on one day - TODO confirm); treat that as a 1-day lifetime so the rates
# stay finite. round() removes float noise introduced by the inverse transform.
lifetime_days = df['customer_lifetime_days'].round().clip(lower=1)

# Option 1A: Use Monetary as CLV (simplest) - total customer spend so far.
# Already available as df['Monetary']; no extra column needed.

# Option 1B: Annualized CLV - spend weighted by purchases per day, scaled to a year.
df['CLV'] = df['Monetary'] * (df['Frequency'] / lifetime_days) * 365

# Option 1C: Forward-looking CLV - average order value times yearly purchase rate.
df['Average_Order_Value'] = df['Monetary'] / df['Frequency']
df['Purchase_Rate'] = df['Frequency'] / lifetime_days
df['Predicted_CLV'] = df['Average_Order_Value'] * df['Purchase_Rate'] * 365

# Pick ONE target. The original cell assigned y three times in a row, so only the
# last assignment (Option 1C) ever took effect; that choice is kept here.
# Alternatives: y = df['Monetary'] (1A) or y = df['CLV'] (1B).
y = df['Predicted_CLV']