### Import Libraries

In [27]:
import keras
import numpy as np
import pandas as pd
from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import mlflow
from mlflow.models import infer_signature

In [28]:
# Read final_dataset.csv into a DataFrame and preview its first five rows.
dataset_path = "~/Personal_project/Customer_Life_value/Customer-Lifetime-Value-Prediction/data/final_dataset.csv"
data = pd.read_csv(dataset_path)
data.head(5)

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,UnitPrice,CustomerID,TotalPrice,Recency,Frequency,Monetary,...,desc_topic_16,desc_topic_17,desc_topic_18,desc_topic_19,desc_topic_20,InvoiceYear,InvoiceMonth,InvoiceDay,InvoiceHour,Weekday
0,541431.0,23166.0,74215,2011-01-18 10:01:00,1.04,12346.0,77183.6,0.0,1,77183.6,...,-0.003681,-0.012714,-0.015699,0.055104,-0.003688,2011,1,18,10,1
1,537626.0,85116.0,12,2010-12-07 14:57:00,2.1,12347.0,25.2,0.0,162,3653.45,...,-0.05809,0.034467,-0.090115,0.036342,0.050075,2010,12,7,14,1
2,537626.0,22375.0,4,2010-12-07 14:57:00,4.25,12347.0,17.0,0.0,162,3653.45,...,-0.031094,0.066135,-0.042782,0.062202,-0.060593,2010,12,7,14,1
3,537626.0,71477.0,12,2010-12-07 14:57:00,3.25,12347.0,39.0,0.0,162,3653.45,...,-0.037794,0.041952,-0.082633,0.030501,0.078181,2010,12,7,14,1
4,537626.0,22492.0,36,2010-12-07 14:57:00,0.65,12347.0,23.4,0.0,162,3653.45,...,0.001612,0.074616,-0.024091,-0.016158,-0.066717,2010,12,7,14,1


In [29]:
# Inspect the distinct values present in the Quantity column.
data['Quantity'].unique()

array([74215,    12,     4,    36,     6,    10,    24,     3,   240,
           8,     2,    16,    48,    20,    18,    72,   120,   144,
          80,    96,     1,    25,     5,    32,    60,    40,    15,
           9,    30,    28,   180,    13,   288,   100,    50,   256,
         108,    11,    64,   192,   576,   160,   168,   200,   480,
         384,   128,   216,   432,   320,   600,   400,    84,   336,
         720,  1152,   250,   960,    43,    75,     7,    42,    21,
         125,    44,    14,    70,   150,   360,    56,   300,   912,
          90,    94,    66,    17,    27,    52,    45,   102,   109,
         132,   234,   244,  1488,    19,  2040,   864,    29,    35,
        1728,   291,   462,  1200,   227,  2700,   222,   228,   246,
         420,    54,    78,    33,  1788,  4800,   774,    41,   280,
         270,   220,   350,   348,  1900,  2880,   116,   968,   276,
         700,   456,    37,   648,   198,  1440,    22,    26,  4300,
          63,    23,

### Understanding Dataset

In [30]:
# Report the dataset dimensions (DataFrame.shape is (rows, columns)).
n_rows, n_cols = data.shape
print("Shape of dataset :")
print("Number of columns present : ", n_cols)
print("Number of rows present: ", n_rows)


Shape of dataset :
Number of columns present :  37
Number of rows present:  363119


In [31]:
# Count the missing values in every column and print the per-column totals.
null_counts = data.isna().sum()
print("Number of null values:\n", null_counts)

Number of null values:
 InvoiceNo                 0
StockCode                 0
Quantity                  0
InvoiceDate               0
UnitPrice                 0
CustomerID                0
TotalPrice                0
Recency                   0
Frequency                 0
Monetary                  0
customer_lifetime_days    0
Country_Label             0
desc_topic_1              0
desc_topic_2              0
desc_topic_3              0
desc_topic_4              0
desc_topic_5              0
desc_topic_6              0
desc_topic_7              0
desc_topic_8              0
desc_topic_9              0
desc_topic_10             0
desc_topic_11             0
desc_topic_12             0
desc_topic_13             0
desc_topic_14             0
desc_topic_15             0
desc_topic_16             0
desc_topic_17             0
desc_topic_18             0
desc_topic_19             0
desc_topic_20             0
InvoiceYear               0
InvoiceMonth              0
InvoiceDay              

In [32]:
# Keep only the rows whose StockCode is present.
data = data.dropna(axis=0, subset=["StockCode"])

In [33]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
summary_stats = data.describe()
print("About data__:\n", summary_stats)

About data__:
            InvoiceNo      StockCode       Quantity      UnitPrice  \
count  363119.000000  363119.000000  363119.000000  363119.000000   
mean   560820.303099   26967.327190      13.129062       2.886098   
std     13076.895436   15676.370832     188.527851       4.361971   
min    536365.000000   10002.000000       1.000000       0.000000   
25%    549547.000000   21955.000000       2.000000       1.250000   
50%    562150.000000   22603.000000       6.000000       1.700000   
75%    572237.000000   23171.000000      12.000000       3.750000   
max    581587.000000   90208.000000   80995.000000     649.500000   

          CustomerID     TotalPrice        Recency      Frequency  \
count  363119.000000  363119.000000  363119.000000  363119.000000   
mean    15295.738347      22.073616       1.529623     594.085011   
std      1711.946809     321.862986      12.708105    1293.994833   
min     12346.000000       0.000000       0.000000       1.000000   
25%     13969.0000

In [34]:
# Show each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 363119 entries, 0 to 363118
Data columns (total 37 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   InvoiceNo               363119 non-null  float64
 1   StockCode               363119 non-null  float64
 2   Quantity                363119 non-null  int64  
 3   InvoiceDate             363119 non-null  object 
 4   UnitPrice               363119 non-null  float64
 5   CustomerID              363119 non-null  float64
 6   TotalPrice              363119 non-null  float64
 7   Recency                 363119 non-null  float64
 8   Frequency               363119 non-null  int64  
 9   Monetary                363119 non-null  float64
 10  customer_lifetime_days  363119 non-null  int64  
 11  Country_Label           363119 non-null  int64  
 12  desc_topic_1            363119 non-null  float64
 13  desc_topic_2            363119 non-null  float64
 14  desc_topic_3        

In [35]:
# List the column names currently in the DataFrame.
data.columns

Index(['InvoiceNo', 'StockCode', 'Quantity', 'InvoiceDate', 'UnitPrice',
       'CustomerID', 'TotalPrice', 'Recency', 'Frequency', 'Monetary',
       'customer_lifetime_days', 'Country_Label', 'desc_topic_1',
       'desc_topic_2', 'desc_topic_3', 'desc_topic_4', 'desc_topic_5',
       'desc_topic_6', 'desc_topic_7', 'desc_topic_8', 'desc_topic_9',
       'desc_topic_10', 'desc_topic_11', 'desc_topic_12', 'desc_topic_13',
       'desc_topic_14', 'desc_topic_15', 'desc_topic_16', 'desc_topic_17',
       'desc_topic_18', 'desc_topic_19', 'desc_topic_20', 'InvoiceYear',
       'InvoiceMonth', 'InvoiceDay', 'InvoiceHour', 'Weekday'],
      dtype='object')

In [36]:
# Remove the InvoiceDate column (dtype object, i.e. not numeric); the
# InvoiceYear / InvoiceMonth / InvoiceDay / InvoiceHour / Weekday columns stay.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises KeyError.
data = data.drop(columns=['InvoiceDate'], errors='ignore')

data.columns

Index(['InvoiceNo', 'StockCode', 'Quantity', 'UnitPrice', 'CustomerID',
       'TotalPrice', 'Recency', 'Frequency', 'Monetary',
       'customer_lifetime_days', 'Country_Label', 'desc_topic_1',
       'desc_topic_2', 'desc_topic_3', 'desc_topic_4', 'desc_topic_5',
       'desc_topic_6', 'desc_topic_7', 'desc_topic_8', 'desc_topic_9',
       'desc_topic_10', 'desc_topic_11', 'desc_topic_12', 'desc_topic_13',
       'desc_topic_14', 'desc_topic_15', 'desc_topic_16', 'desc_topic_17',
       'desc_topic_18', 'desc_topic_19', 'desc_topic_20', 'InvoiceYear',
       'InvoiceMonth', 'InvoiceDay', 'InvoiceHour', 'Weekday'],
      dtype='object')

### Feature Scaling 

In [37]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Columns that are standardized to zero mean and unit variance.
original_cols = [
    'Monetary',
    'TotalPrice',
    'Frequency',
    'Recency',
    'Quantity',
    'UnitPrice',
]

# NOTE(review): 'Monetary' is used as the prediction target later in this
# notebook, and the scaler is fitted on the full dataset before the
# train/test split, so test rows influence the scaling statistics.
# Confirm this is intended.
scaler = StandardScaler()
scaler.fit(data[original_cols])

# Overwrite the selected columns with their standardized values.
data[original_cols] = scaler.transform(data[original_cols])

In [38]:
# Preview the first rows after standardization.
data.head()

Unnamed: 0,InvoiceNo,StockCode,Quantity,UnitPrice,CustomerID,TotalPrice,Recency,Frequency,Monetary,customer_lifetime_days,...,desc_topic_16,desc_topic_17,desc_topic_18,desc_topic_19,desc_topic_20,InvoiceYear,InvoiceMonth,InvoiceDay,InvoiceHour,Weekday
0,541431.0,23166.0,393.586266,-0.423226,12346.0,239.734409,-0.120366,-0.458337,2.308879,0,...,-0.003681,-0.012714,-0.015699,0.055104,-0.003688,2011,1,18,10,1
1,537626.0,85116.0,-0.005989,-0.180216,12347.0,0.009713,-0.120366,-0.333916,-0.229629,365,...,-0.05809,0.034467,-0.090115,0.036342,0.050075,2010,12,7,14,1
2,537626.0,22375.0,-0.048423,0.312681,12347.0,-0.015763,-0.120366,-0.333916,-0.229629,365,...,-0.031094,0.066135,-0.042782,0.062202,-0.060593,2010,12,7,14,1
3,537626.0,71477.0,-0.005989,0.083426,12347.0,0.052589,-0.120366,-0.333916,-0.229629,365,...,-0.037794,0.041952,-0.082633,0.030501,0.078181,2010,12,7,14,1
4,537626.0,22492.0,0.121313,-0.512635,12347.0,0.004121,-0.120366,-0.333916,-0.229629,365,...,0.001612,0.074616,-0.024091,-0.016158,-0.066717,2010,12,7,14,1


### Data Split - Splitting the data into training, validation and test sets

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
# X = data.drop(columns=['Monetary'])
# y = data['Monetary']

In [41]:
# X.head()

In [42]:
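# NOTE: the sketches below are unused alternatives for defining the CLV target.
# They refer to `df`, whereas the DataFrame in this notebook is named `data`, and
# Options 1B/1C divide by `customer_lifetime_days`, which is 0 for some rows (see
# the previews above), so the zero case must be handled before enabling them.

# # Option 1A: Use Monetary as CLV (simplest)
# y = df['Monetary']  # Total customer spend so far

# # Option 1B: Calculate more sophisticated CLV
# df['CLV'] = df['Monetary'] * (df['Frequency'] / df['customer_lifetime_days']) * 365
# # This estimates annual customer value
# y = df['CLV']

# # Option 1C: Future CLV prediction
# # If you want to predict future value, create a forward-looking metric
# df['Average_Order_Value'] = df['Monetary'] / df['Frequency']
# df['Purchase_Rate'] = df['Frequency'] / df['customer_lifetime_days']
# df['Predicted_CLV'] = df['Average_Order_Value'] * df['Purchase_Rate'] * 365
# y = df['Predicted_CLV']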

In [43]:
# Hold out 25% of the rows as the test set; the fixed seed makes the split repeatable.
train, test = train_test_split(data, random_state=42, test_size=0.25)

In [44]:
# Display the training split (pandas truncates the middle rows in the rendered output).
train

Unnamed: 0,InvoiceNo,StockCode,Quantity,UnitPrice,CustomerID,TotalPrice,Recency,Frequency,Monetary,customer_lifetime_days,...,desc_topic_16,desc_topic_17,desc_topic_18,desc_topic_19,desc_topic_20,InvoiceYear,InvoiceMonth,InvoiceDay,InvoiceHour,Weekday
314837,546625.0,22621.0,-0.053727,-0.329232,17542.0,-0.055066,-0.120366,-0.442881,-0.350880,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,2011,3,15,11,1
361402,542647.0,22699.0,-0.037814,0.014650,18245.0,-0.013588,-0.120366,-0.337007,-0.274234,347,...,-0.219872,0.416395,0.454158,0.308120,-0.000782,2011,1,31,11,0
117650,571653.0,22603.0,-0.005989,-0.496588,14298.0,-0.041737,-0.120366,0.584945,1.124096,352,...,-0.107827,0.086027,-0.077787,0.095582,0.059368,2011,10,18,12,1
264842,537794.0,21670.0,-0.037814,-0.375083,16713.0,-0.045279,-0.120366,0.001480,-0.123757,347,...,0.175698,-0.226734,-0.041067,0.128047,-0.150300,2010,12,8,13,2
76486,549729.0,22386.0,-0.053727,-0.184802,13634.0,-0.049194,-0.120366,-0.331598,-0.306260,293,...,-0.002461,-0.023381,-0.096533,-0.108392,0.033132,2011,4,11,16,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,573160.0,22945.0,-0.005989,-0.466785,14359.0,-0.036890,-0.120366,-0.417379,-0.324785,68,...,-0.076524,-0.101654,0.075370,-0.034572,0.049475,2011,10,28,8,4
259178,569148.0,21259.0,-0.059031,0.702413,16613.0,-0.031609,-0.120366,-0.437471,-0.336212,0,...,-0.063681,-0.024505,0.008244,0.034817,0.167170,2011,9,30,15,4
131932,564304.0,22507.0,-0.053727,0.473159,14527.0,-0.022443,-0.120366,0.220956,-0.086764,366,...,-0.189179,0.228391,-0.333141,-0.077197,0.027160,2011,8,24,12,2
146867,553176.0,48138.0,-0.064336,1.160922,14689.0,-0.043881,-0.120366,-0.451382,-0.352439,0,...,0.024366,0.057480,-0.094853,-0.019024,-0.094111,2011,5,15,11,6


In [45]:
# Show each column's dtype and non-null count for the training split.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 272339 entries, 314837 to 121958
Data columns (total 36 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   InvoiceNo               272339 non-null  float64
 1   StockCode               272339 non-null  float64
 2   Quantity                272339 non-null  float64
 3   UnitPrice               272339 non-null  float64
 4   CustomerID              272339 non-null  float64
 5   TotalPrice              272339 non-null  float64
 6   Recency                 272339 non-null  float64
 7   Frequency               272339 non-null  float64
 8   Monetary                272339 non-null  float64
 9   customer_lifetime_days  272339 non-null  int64  
 10  Country_Label           272339 non-null  int64  
 11  desc_topic_1            272339 non-null  float64
 12  desc_topic_2            272339 non-null  float64
 13  desc_topic_3            272339 non-null  float64
 14  desc_topic_4        

In [46]:
## Train dataset
train_x = train.drop(['Monetary'], axis=1).values
train_y = train[['Monetary']].values.ravel()

## Test dataset
test_x =test.drop(['Monetary'], axis=1).values
test_y =test[['Monetary']].values.ravel()

##  Splitting this train data inot train and validation

train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.20, random_state=42)

signature=infer_signature(train_x, train_y)

### ANN Model

In [47]:
def train_model(param, epochs, train_x, train_y, valid_x, valid_y, test_x, test_y,
                log_model_threshold=0.5):
    """
    Train a neural network model with given hyperparameters

    Builds a single-hidden-layer regression network (Normalization -> Dense ->
    Dropout -> Dense(1)), fits it with early stopping inside a nested MLflow
    run, and logs the hyperparameters and metrics to that run.

    Args:
        param: dict of hyperparameters. "lr" is required; "momentum" is
            required when the optimizer is "sgd" (the default). Optional keys:
            "optimizer" ("sgd", "adam"; any other value selects RMSprop),
            "hidden_units" (default 64), "dropout_rate" (default 0.2) and
            "batch_size" (default 64). Values must be the actual settings,
            not Hyperopt choice indices.
        epochs: maximum number of training epochs (early stopping may end
            training sooner).
        train_x, train_y: training features (2-D array) and targets.
        valid_x, valid_y: validation data used for early stopping and for the
            reported RMSE.
        test_x, test_y: accepted for interface compatibility; currently unused.
        log_model_threshold: the trained model is logged to MLflow only when
            the validation RMSE is below this value.

    Returns:
        dict with "loss" (validation RMSE as a float) and "status"
        (STATUS_OK), the result format consumed by Hyperopt's fmin.
    """
    # This function is called once per Hyperopt trial; reset Keras' global
    # state so graphs/layers from earlier trials do not pile up in memory.
    keras.backend.clear_session()

    # Per-feature statistics of the training data for the Normalization layer
    mean = np.mean(train_x, axis=0)
    var = np.var(train_x, axis=0)

    # Define model Architecture
    model = keras.Sequential([
        keras.Input([train_x.shape[1]]),  # Input shape {Number of columns}
        keras.layers.Normalization(mean=mean, variance=var),  # Normalization
        keras.layers.Dense(param.get('hidden_units', 64), activation='relu'),  # Hidden layer
        keras.layers.Dropout(param.get('dropout_rate', 0.2)),  # Dropout for regularization
        keras.layers.Dense(1)  # Output layer
    ])

    # Compile the model
    optimizer_name = param.get('optimizer', 'sgd')
    if optimizer_name == 'sgd':
        optimizer = keras.optimizers.SGD(
            learning_rate=param["lr"],
            momentum=param['momentum']
        )
    elif optimizer_name == 'adam':
        optimizer = keras.optimizers.Adam(learning_rate=param["lr"])
    else:
        # Any other value falls back to RMSprop
        optimizer = keras.optimizers.RMSprop(learning_rate=param["lr"])

    model.compile(
        optimizer=optimizer,
        loss="mean_squared_error",
        metrics=[keras.metrics.RootMeanSquaredError()]
    )

    # Train the ANN model with hyperparameters and MLflow tracking
    with mlflow.start_run(nested=True):
        # Early stopping to prevent overfitting
        early_stopping = keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True
        )

        history = model.fit(
            train_x, train_y,
            validation_data=(valid_x, valid_y),
            epochs=epochs,
            batch_size=param.get('batch_size', 64),
            callbacks=[early_stopping],
            verbose=0  # Reduce output during optimization
        )

        # Evaluate the model; evaluate() returns [loss, rmse] for the compiled
        # loss and single metric. Cast to a plain float for Hyperopt/MLflow.
        eval_result = model.evaluate(valid_x, valid_y, batch_size=64, verbose=0)
        eval_rmse = float(eval_result[1])

        train_losses = history.history['loss']
        val_losses = history.history['val_loss']

        # Log the parameters and results
        mlflow.log_params(param)
        mlflow.log_metric("eval_rmse", eval_rmse)
        mlflow.log_metric("final_train_loss", train_losses[-1])
        mlflow.log_metric("final_val_loss", val_losses[-1])
        # The "final_*" values are from the last epoch run; with
        # restore_best_weights=True the kept weights may come from an earlier
        # epoch, so also record the best validation loss and the epoch count.
        mlflow.log_metric("best_val_loss", min(val_losses))
        mlflow.log_metric("epochs_trained", len(val_losses))

        # Log the model (only for the best runs to save space)
        if eval_rmse < log_model_threshold:
            # Signature is only needed when the model is actually logged
            signature = infer_signature(train_x, train_y)
            mlflow.tensorflow.log_model(model, "model", signature=signature)

        return {"loss": eval_rmse, "status": STATUS_OK}

In [48]:
def objective(params):
    """
    Hyperopt objective: run one training trial for the sampled hyperparameters.

    Trains for up to 50 epochs (early stopping applies) on the notebook-level
    train/validation/test arrays and returns train_model's result dict.
    """
    data_splits = {
        "train_x": train_x,
        "train_y": train_y,
        "valid_x": valid_x,
        "valid_y": valid_y,
        "test_x": test_x,
        "test_y": test_y,
    }
    return train_model(params, epochs=50, **data_splits)

In [49]:
# Expanded hyperparameter space
# Hyperparameter search space sampled by Hyperopt.
# Discrete options for the hp.choice parameters:
optimizer_options = ["sgd", "adam", "rmsprop"]
hidden_unit_options = [32, 64, 128, 256]
batch_size_options = [32, 64, 128]

space = dict(
    lr=hp.loguniform("lr", np.log(1e-5), np.log(1e-1)),  # log-uniform over [1e-5, 1e-1]
    momentum=hp.uniform("momentum", 0.0, 1.0),
    optimizer=hp.choice("optimizer", optimizer_options),
    hidden_units=hp.choice("hidden_units", hidden_unit_options),
    dropout_rate=hp.uniform("dropout_rate", 0.0, 0.5),
    batch_size=hp.choice("batch_size", batch_size_options),
)


In [None]:
# Set up MLflow: point at the local tracking server and select the experiment
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("customer-life-value-prediction")

# Main optimization loop: the parent run groups the nested per-trial runs
with mlflow.start_run():
    # Conduct the hyperparameter search using Hyperopt
    trials = Trials()
    best_indices = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=20,  # Increased evaluations
        trials=trials,
        verbose=True
    )

    # fmin reports hp.choice parameters as the *index* of the chosen option
    # (e.g. optimizer=1, hidden_units=3), not the option itself. Map them back
    # to real values so the final model is built, and the parameters are
    # logged, with the settings that actually won the search.
    best = space_eval(space, best_indices)

    # Fetch the details of the best run (lowest validation RMSE)
    best_run = min(trials.results, key=lambda result: result["loss"])

    # Log the best parameters and loss
    mlflow.log_params(best)
    mlflow.log_metric("best_eval_rmse", best_run["loss"])

    # Train final model with best parameters
    print("Training final model with best parameters...")
    final_result = train_model(
        best,
        epochs=100,  # More epochs for final model
        train_x=train_x,
        train_y=train_y,
        valid_x=valid_x,
        valid_y=valid_y,
        test_x=test_x,
        test_y=test_y
    )

    # Print results
    print(f"Best parameters: {best}")
    print(f"Best validation RMSE: {best_run['loss']:.4f}")
    print(f"Final model validation RMSE: {final_result['loss']:.4f}")

  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]

2025-06-10 09:01:38.925688: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


🏃 View run popular-gnat-619 at: http://127.0.0.1:5000/#/experiments/657684567043828443/runs/585d9fd81d224c939ba86d6910a0ad88

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/657684567043828443

  5%|▌         | 1/20 [06:22<2:01:09, 382.63s/trial, best loss: 0.6599171161651611]




🏃 View run redolent-crab-827 at: http://127.0.0.1:5000/#/experiments/657684567043828443/runs/eac8cb1a6b9c44068e830cebc30892ce

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/657684567043828443      

🏃 View run unruly-moth-740 at: http://127.0.0.1:5000/#/experiments/657684567043828443/runs/b541e6a08b234ababe50268e53656fb4

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/657684567043828443       

🏃 View run bustling-crow-857 at: http://127.0.0.1:5000/#/experiments/657684567043828443/runs/7606cb8cde68436396518548b57ddbff

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/657684567043828443       

🏃 View run carefree-owl-226 at: http://127.0.0.1:5000/#/experiments/657684567043828443/runs/bcdd5d580d25490094cad07710d88bc7

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/657684567043828443     

🏃 View run amusing-pug-245 at: http://127.0.0.1:5000/#/experiments/657684567043828443/runs/b35f37df585f450eaf2e63610fb3e180

🧪 View experiment at: http://127.




🏃 View run exultant-hog-803 at: http://127.0.0.1:5000/#/experiments/657684567043828443/runs/498ec0fe803d48d4ba62844888369641

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/657684567043828443       

🏃 View run bedecked-auk-804 at: http://127.0.0.1:5000/#/experiments/657684567043828443/runs/7b5113636dbf481fb9d3c1de880273a7

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/657684567043828443       

🏃 View run dashing-doe-887 at: http://127.0.0.1:5000/#/experiments/657684567043828443/runs/25da4cc748144dd6a399690ad08a743d

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/657684567043828443       

🏃 View run learned-crane-311 at: http://127.0.0.1:5000/#/experiments/657684567043828443/runs/4a2c41f2148e433b86036a80aec7c508

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/657684567043828443      

🏃 View run overjoyed-grouse-146 at: http://127.0.0.1:5000/#/experiments/657684567043828443/runs/8eaf34d223e044bfb1d485ca2c2e719d

🧪 View experiment at: http: