# Customer Lifetime Value (LTV) Prediction Project
This project predicts customer Lifetime Value (LTV) using historical transaction data.
- **Objective**: Forecast LTV to support targeted marketing.
- **Tools**: Python, Pandas, XGBoost, Scikit-learn
- **Metrics**: MAE, RMSE


In [None]:
from datetime import datetime
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# Read the raw customer and transaction tables, converting each table's
# date column to pandas datetimes immediately after loading.
customers_df = pd.read_csv("data/customer_data.csv").assign(
    signup_date=lambda frame: pd.to_datetime(frame["signup_date"])
)
transactions_df = pd.read_csv("data/transactions.csv").assign(
    purchase_date=lambda frame: pd.to_datetime(frame["purchase_date"])
)

# Show the first rows of both tables as the cell output.
(customers_df.head(), transactions_df.head())

In [None]:
# Fixed "as of" date against which recency (days since last purchase) is measured.
reference_date = datetime(2025, 5, 1)

# Per-customer aggregates. The `recency` slot first receives the customer's
# most recent purchase date via the built-in "max" reducer (vectorized, unlike
# a per-group Python lambda); it is converted to a day count just below, so the
# final column order stays frequency, recency, total_value, avg_order_value.
agg_df = transactions_df.groupby("customer_id").agg(
    frequency=("purchase_date", "count"),
    recency=("purchase_date", "max"),
    total_value=("amount", "sum"),
    avg_order_value=("amount", "mean"),
).reset_index()

# Whole days between the reference date and each customer's last purchase.
agg_df["recency"] = (pd.Timestamp(reference_date) - agg_df["recency"]).dt.days

# Define LTV as the customer's total historical spend.
# NOTE(review): total_value is the sum of `amount`, which (absent missing
# values) equals frequency * avg_order_value, so a model given those two
# columns as features can reconstruct this target almost exactly. Confirm
# whether the target should instead come from a later time window.
agg_df["ltv"] = agg_df["total_value"]
agg_df.head()

In [None]:
# Feature matrix and target taken from the per-customer aggregate table.
feature_columns = ["frequency", "recency", "avg_order_value"]
X = agg_df[feature_columns]
y = agg_df["ltv"]

# Hold out 20% of customers for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a gradient-boosted regressor with a squared-error objective.
model = xgb.XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
)
model.fit(X_train, y_train)

# Score the held-out set: MAE, and RMSE as the square root of the MSE.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}")

In [None]:
# Make sure the output directory exists; joblib.dump and to_csv do not
# create missing parent directories and would fail on a fresh checkout.
Path("outputs").mkdir(parents=True, exist_ok=True)

# Persist the fitted model.
joblib.dump(model, "outputs/model.pkl")

# Score every customer (training and test rows alike) and export the
# aggregate table together with its predictions.
agg_df["predicted_ltv"] = model.predict(X)
agg_df.to_csv("outputs/ltv_predictions.csv", index=False)

In [None]:
# Feature importance plot.
# Make sure the plot directory exists before saving into it.
Path("outputs/plots").mkdir(parents=True, exist_ok=True)

# Create the axes explicitly and hand them to plot_importance: when no `ax`
# is passed it opens its own figure, which would leave a separately created
# figure empty and its figsize unused.
fig, ax = plt.subplots(figsize=(8, 6))
xgb.plot_importance(model, ax=ax)
ax.set_title("Feature Importance")
fig.tight_layout()
fig.savefig("outputs/plots/feature_importance.png")
plt.show()

In [None]:
# Segment customers into three equal-sized LTV tiers by predicted value.
# NOTE(review): pd.qcut raises ValueError by default when quantile edges
# coincide (e.g. many identical predictions) — confirm that is acceptable.
agg_df["ltv_segment"] = pd.qcut(
    agg_df["predicted_ltv"],
    q=3,
    labels=["Low", "Medium", "High"],
)

# value_counts() orders by count, which would shuffle the tiers; sorting the
# categorical index restores the Low -> Medium -> High order on the x-axis.
segment_counts = agg_df["ltv_segment"].value_counts().sort_index()
segment_counts.plot(kind="bar", title="Customer Segments")
plt.show()