### Import Libraries

In [38]:
import keras
import numpy as np
import pandas as pd
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import mlflow
from mlflow.models import infer_signature

In [39]:
data = pd.read_csv(r"D:\Melbin\SELF\Customer_Lifetime_Value_Prediction\data\final_dataset.csv")
data.head(5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice,Recency,Frequency,Monetary
0,541431,23166,MEDIUM CERAMIC TOP STORAGE JAR,74215,2011-01-18 10:01:00,1.04,12346.0,United Kingdom,77183.6,,2,154367.2
1,C541433,23166,MEDIUM CERAMIC TOP STORAGE JAR,74215,2011-01-18 10:17:00,1.04,12346.0,United Kingdom,77183.6,0.0,2,154367.2
2,537626,85116,BLACK CANDELABRA T-LIGHT HOLDER,12,2010-12-07 14:57:00,2.1,12347.0,Iceland,25.2,,182,4310.0
3,537626,22375,AIRLINE BAG VINTAGE JET SET BROWN,4,2010-12-07 14:57:00,4.25,12347.0,Iceland,17.0,0.0,182,4310.0
4,537626,71477,COLOUR GLASS. STAR T-LIGHT HOLDER,12,2010-12-07 14:57:00,3.25,12347.0,Iceland,39.0,0.0,182,4310.0


In [40]:
data['Quantity'].unique()

array([74215,    12,     4,    36,     6,    30,     3,    24,    10,
         240,     8,     2,    18,    16,    48,    20,    72,   120,
         144,     1,    80,    96,    25,     5,    32,    60,    40,
          15,     9,    28,   180,    64,    13,   288,   100,    50,
         256,     7,   108,    11,   272,   192,   576,   160,   168,
         200,   480,   384,   128,   216,   432,   320,    86,   600,
         400,    84,   336,   720,  1152,   250,   960,    27,    21,
          43,    75,    33,    17,    42,    14,   125,    44,    70,
         360,   150,    56,   300,   912,    90,    94,    66,   183,
          52,   408,   224,   378,    45,    22,   102,   109,   132,
         234,   244,  1488,   624,    19,  2040,   864,    29,    35,
        1728,   291,   462,  1200,   227,  2700,   222,   228,   246,
         420,   164,    54,    78,  1788,  4800,   774,    41,   280,
         270,   220,   350,   348,  1900,  2880,   116,   968,   140,
         252,   110,

### Understanding Dataset

In [41]:
print("Shape of dataset :")
print("Number of columns present : ", data.shape[1])
print("Number of rows present: ", data.shape[0])


Shape of dataset :
Number of columns present :  12
Number of rows present:  406829


In [42]:
print("Number of null values:\n", data.isnull().sum())

Number of null values:
 InvoiceNo         0
StockCode         0
Description       0
Quantity          0
InvoiceDate       0
UnitPrice         0
CustomerID        0
Country           0
TotalPrice        0
Recency        4372
Frequency         0
Monetary          0
dtype: int64


In [43]:
data = data.dropna(subset=['StockCode'])

In [44]:
print("About data__:\n", data.describe())

About data__:
             Quantity      UnitPrice     CustomerID     TotalPrice  \
count  406829.000000  406829.000000  406829.000000  406829.000000   
mean       13.412279       3.460471   15287.690570      23.407255   
std       248.624170      69.315162    1713.600303     427.437730   
min         1.000000       0.000000   12346.000000       0.000000   
25%         2.000000       1.250000   13953.000000       4.680000   
50%         5.000000       1.950000   15152.000000      11.800000   
75%        12.000000       3.750000   16791.000000      19.800000   
max     80995.000000   38970.000000   18287.000000  168469.600000   

             Recency      Frequency       Monetary  
count  402457.000000  406829.000000  406829.000000  
mean        1.433656     673.695688   12088.958782  
std        12.198087    1471.805936   32495.300306  
min         0.000000       1.000000       0.000000  
25%         0.000000      92.000000    1119.810000  
50%         0.000000     206.000000    2740.4

In [45]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406829 entries, 0 to 406828
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    406829 non-null  object 
 1   StockCode    406829 non-null  object 
 2   Description  406829 non-null  object 
 3   Quantity     406829 non-null  int64  
 4   InvoiceDate  406829 non-null  object 
 5   UnitPrice    406829 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      406829 non-null  object 
 8   TotalPrice   406829 non-null  float64
 9   Recency      402457 non-null  float64
 10  Frequency    406829 non-null  int64  
 11  Monetary     406829 non-null  float64
dtypes: float64(5), int64(2), object(5)
memory usage: 37.2+ MB


In [46]:
data.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country', 'TotalPrice', 'Recency',
       'Frequency', 'Monetary'],
      dtype='object')

In [47]:
data.drop(columns=['InvoiceDate'], inplace =True)

data.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'UnitPrice',
       'CustomerID', 'Country', 'TotalPrice', 'Recency', 'Frequency',
       'Monetary'],
      dtype='object')

### Feature Scaling 

In [48]:
import numpy as np
from sklearn.preprocessing import StandardScaler

original_cols = ['Monetary', 'TotalPrice', 'Frequency', 'Recency', 'Quantity', 'UnitPrice']
scaler = StandardScaler()

# Standardize all columns at once (more efficient)
data[original_cols] = scaler.fit_transform(data[original_cols])

In [49]:
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,UnitPrice,CustomerID,Country,TotalPrice,Recency,Frequency,Monetary
0,541431,23166,MEDIUM CERAMIC TOP STORAGE JAR,298.449177,-0.03492,12346.0,United Kingdom,180.518195,,-0.456376,4.378431
1,C541433,23166,MEDIUM CERAMIC TOP STORAGE JAR,298.449177,-0.03492,12346.0,United Kingdom,180.518195,-0.117531,-0.456376,4.378431
2,537626,85116,BLACK CANDELABRA T-LIGHT HOLDER,-0.00568,-0.019627,12347.0,Iceland,0.004194,,-0.334077,-0.239387
3,537626,22375,AIRLINE BAG VINTAGE JET SET BROWN,-0.037858,0.01139,12347.0,Iceland,-0.01499,-0.117531,-0.334077,-0.239387
4,537626,71477,COLOUR GLASS. STAR T-LIGHT HOLDER,-0.00568,-0.003036,12347.0,Iceland,0.03648,-0.117531,-0.334077,-0.239387


### Data Split - Spliting the data into training, validation and test sets

In [50]:
from sklearn.model_selection import train_test_split

In [51]:
# X = data.drop(columns=['Monetary'])
# y = data['Monetary']

In [52]:
# X.head()

In [53]:
# # Option 1A: Use Monetary as CLV (simplest)
# y = df['Monetary']  # Total customer spend so far

# # Option 1B: Calculate more sophisticated CLV
# df['CLV'] = df['Monetary'] * (df['Frequency'] / df['customer_lifetime_days']) * 365
# # This estimates annual customer value
# y = df['CLV']

# # Option 1C: Future CLV prediction
# # If you want to predict future value, create a forward-looking metric
# df['Average_Order_Value'] = df['Monetary'] / df['Frequency']
# df['Purchase_Rate'] = df['Frequency'] / df['customer_lifetime_days']
# df['Predicted_CLV'] = df['Average_Order_Value'] * df['Purchase_Rate'] * 365
# y = df['Predicted_CLV']

In [54]:
train, test= train_test_split(data,test_size= 0.25, random_state= 42)

In [55]:
train

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,UnitPrice,CustomerID,Country,TotalPrice,Recency,Frequency,Monetary
169442,556776,47590B,PINK HAPPY BIRTHDAY BUNTING,-0.045902,0.028703,14730.0,United Kingdom,-0.029261,-0.117531,-0.117336,-0.310319
132961,561534,84077,WORLD WAR 2 GLIDERS ASSTD DESIGNS,0.332179,-0.045740,14307.0,United Kingdom,0.010371,-0.117531,-0.335436,-0.279833
308208,554853,22386,JUMBO BAG PINK POLKADOT,-0.013725,-0.019916,16839.0,United Kingdom,-0.006100,-0.117531,-0.255262,0.176701
402338,555124,21159,MOODY BOY DOOR HANGER,-0.045902,-0.029005,18204.0,United Kingdom,-0.047977,-0.117531,-0.320488,-0.310226
244808,543996,21533,RETROSPOT LARGE MILK JUG,-0.029813,0.021489,15811.0,United Kingdom,0.014722,-0.117531,-0.363293,-0.302291
...,...,...,...,...,...,...,...,...,...,...,...
259178,563444,85123A,WHITE HANGING HEART T-LIGHT HOLDER,-0.029813,-0.007365,16033.0,United Kingdom,-0.013352,-0.117531,0.324978,-0.097976
365838,566732,22572,ROCKING HORSE GREEN CHRISTMAS,-0.005680,-0.037661,17716.0,United Kingdom,-0.030899,-0.117531,-0.285157,-0.193344
131932,560844,23237,SET OF 4 KNICK KNACK TINS LEAVES,-0.005680,0.004177,14298.0,United Kingdom,0.050517,-0.117531,0.656544,1.234125
146867,569524,23210,WHITE ROCKING HORSE HAND PAINTED,-0.049924,-0.031890,14506.0,United Kingdom,-0.051837,-0.117531,-0.243032,-0.310100


In [56]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 305121 entries, 169442 to 121958
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    305121 non-null  object 
 1   StockCode    305121 non-null  object 
 2   Description  305121 non-null  object 
 3   Quantity     305121 non-null  float64
 4   UnitPrice    305121 non-null  float64
 5   CustomerID   305121 non-null  float64
 6   Country      305121 non-null  object 
 7   TotalPrice   305121 non-null  float64
 8   Recency      301852 non-null  float64
 9   Frequency    305121 non-null  float64
 10  Monetary     305121 non-null  float64
dtypes: float64(7), object(4)
memory usage: 27.9+ MB


In [57]:
## Train dataset
train_x = train.drop(['Monetary'], axis=1).values
train_y = train[['Monetary']].values.ravel()

## Test dataset
test_x =test.drop(['Monetary'], axis=1).values
test_y =test[['Monetary']].values.ravel()

##  Splitting this train data inot train and validation

train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.20, random_state=42)

signature=infer_signature(train_x, train_y)

### ANN Model

In [58]:
def train_model(param, epochs, train_x, train_y, valid_x, valid_y, test_x, test_y):
    
    ## Define model Architecture
    mean= np.mean(train_x, axis=0)                              ##Normalization
    var=np.var(train_x,axis=0)
    
    model=keras.Sequential(
    [
        keras.Input([train_x.shape[1]]),                        ##Input shape {Number of columns}
        keras.layers.Normalization(mean=mean, variance=var),    ## Normalization
        keras.layers.Dense(64, activation ='relu'),             ## Hidden Neurons {64  layers}
        keras.layers.Dense(1)                                   ## Output Layer {1 output layer}
    ])
    
    ## Compile the model
    model.compile(optimizer=keras.optimizers.SGD(
        learning_rate =param["lr"],                             ## train with differernt learning rate 
        momentum =param['momentum']                             ## train with multile momentum
    ),
                  loss="mean_squared_error",
                  metrics=[keras.metrics.RootMeanSquaredError()]
    )
    
    ## Trean the ANN model with lr and momentum parameters with MLFlow trackering
    with mlflow.start_run(nested = True):
        model.fit(train_x, train_y, validation_data= (valid_x, valid_y),
                 epochs=epochs,
                 batch_size=64
                 )
        ## Evaluate the model
        eval_result = model.evaluate(valid_x, valid_y, batch_size=64)
        eval_rmse = eval_result[1]
        
        ## Log the parameters and results
        mlflow.log_params(param)
        mlflow.log_metric("eval_rmse", eval_rmse)
        
        ## Log the model
        mlflow.tensorflow.log_model(model, "model", signature= signature)
        
        return {"loss": eval_rmse, "status": STATUS_OK, "model": model}

In [None]:
def objective(params):
    ## MLFlow will track the parameters and results for each run
    result = train_model(
        params,
        epochs =3,
        train_x = np.array(train_x),
        valid_x = np.array(valid_x),
        test_x = np.array(test_x),
        train_y = np.array(train_y),
        valid_y = np.array(valid_y),
        test_y = np.array(test_y),


    )
    return result

In [60]:
## Set all parameters
space={
    "lr": hp.loguniform("lr",np.log(1e-5),np.log(1e-1)),
    "momentum": hp.uniform("momentum", 0.0, 1.0)
}


In [61]:
mlflow.set_experiment("/customer-life-value-predicion")
with mlflow.start_run():
    # Conduct the hyperparameter search using Hyperopt
    trials= Trials()
    best=fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=4,
        trials=trials
    )
    
    
    # featch the details of the best run
    best_run = sorted(trials.results, key=lambda x:x["loss"])[0]
    
    # Log the best parameters, loss, and model
    mlflow.log_params(best)
    mlflow.log_metric("eval_rmse", best_run["loss"])
    mlflow.tensorflow.log_model(best_run["model"], "model", signature= signature)
    mlflow.set_tracking_uri("http://127.0.0.1:5000")
    
    # Print out the best parameters and corresponding loss
    print(f"Best parameters: {best}")
    print(f"Best eval rmse: {best_run['loss']}")
    

  0%|          | 0/4 [00:00<?, ?trial/s, best loss=?]

job exception: unsupported operand type(s) for /: 'str' and 'int'



  0%|          | 0/4 [04:20<?, ?trial/s, best loss=?]


TypeError: unsupported operand type(s) for /: 'str' and 'int'