# Customer Lifetime Value (CLV) Prediction

The objective of this project is to analyze customer purchase behavior and predict Customer Lifetime Value (CLV) using data science techniques such as regression, cohort analysis, and revenue segmentation.

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

#Load Dataset
df = pd.read_csv("OnlineRetail.csv", encoding='latin1')
df.head()
#Dataset Overview
df.shape
df.info()
#Checking Missing Values
df.isnull().sum()

#DATA CLEANING
#Removing Missing Customer IDs
df = df.dropna(subset=['CustomerID'])
#Removing Negative Quantity & Price
df = df[(df['Quantity'] > 0) & (df['UnitPrice'] > 0)]
#removing duplicate values
df = df.drop_duplicates()

#FEATURE CREATION
#Creating Total Revenue Column
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']
#Convert Date Format
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [5]:
#CUSTOMER LEVEL AGGREGATION

#Create Customer Summary Table
customer_df = df.groupby('CustomerID').agg({'InvoiceDate': ['min', 'max', 'count'],'TotalPrice': 'sum'})

#Renaming columns
customer_df.columns = ['FirstPurchase', 'LastPurchase', 'Frequency', 'TotalRevenue']
customer_df = customer_df.reset_index()

#Calculate Customer Lifespan
customer_df['Lifespan'] = (customer_df['LastPurchase'] - customer_df['FirstPurchase']).dt.days + 1


In [6]:
#CLV CALCULATION

#Calculate Average Purchase Value
customer_df['AvgPurchaseValue'] = customer_df['TotalRevenue'] / customer_df['Frequency']

#Calculate CLV
customer_df['CLV'] = customer_df['AvgPurchaseValue'] * customer_df['Frequency'] * (customer_df['Lifespan'] / 365)


In [None]:
#RECENCY
latest_date = df['InvoiceDate'].max()

customer_df['Recency'] = (latest_date - customer_df['LastPurchase']).dt.days

#revenueperday
customer_df['RevenuePerDay'] = (customer_df['TotalRevenue'] / customer_df['Lifespan'])

#tenure
customer_df['TenureMonths'] = customer_df['Lifespan'] / 30

#Regression Model
#Prepare Features & Target
X = customer_df[['Frequency', 'AvgPurchaseValue', 'Lifespan', 'Recency', 'RevenuePerDay', 'TenureMonths']]
y = customer_df['CLV']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
df['InvoiceMonth'] = df['InvoiceDate'].dt.to_period('M')

df['CohortMonth'] = df.groupby('CustomerID')['InvoiceMonth'].transform('min')

df['CohortIndex'] = (df['InvoiceMonth'].dt.year - df['CohortMonth'].dt.year) * 12 + (df['InvoiceMonth'].dt.month - df['CohortMonth'].dt.month) + 1

cohort_data = df.groupby(['CohortMonth', 'CohortIndex'])['CustomerID'].nunique().reset_index()

cohort_pivot = cohort_data.pivot(index='CohortMonth',columns='CohortIndex',values='CustomerID')

In [None]:
customer_df['CLV_Segment'] = pd.qcut(customer_df['CLV'],q=3,labels=['Low Value', 'Medium Value', 'High Value'])

segment_summary = customer_df.groupby('CLV_Segment').agg({'CustomerID': 'count','TotalRevenue': 'mean','CLV': 'mean'})



In [11]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))

MAE: 1805.511303649756
MSE: 99498981.25406545
R2: 0.02151189696711664


In [12]:
import joblib
joblib.dump(model, "clv_model.pkl")

['clv_model.pkl']

In [13]:
customer_df.to_csv("customer_data.csv", index=False)