In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import os

# Seed both random sources: the data generation below draws from the stdlib
# `random` module as well as from `np.random`.
np.random.seed(42)
random.seed(42)


Data generation

In [None]:
def random_date(start, end):
    """Return `start` shifted forward by a random whole number of days.

    The offset is drawn with ``random.randint`` from 0 up to and including the
    number of whole days between `start` and `end`, so either endpoint can be
    returned.
    """
    span_days = (end - start).days
    offset_days = random.randint(0, span_days)
    return start + timedelta(days=offset_days)


In [None]:
# Size of the synthetic data set: number of customers and the inclusive bounds
# on how many transactions each customer receives.
NUM_CUSTOMERS = 5000
MIN_TXN_PER_CUSTOMER = 5
MAX_TXN_PER_CUSTOMER = 40

# Transaction dates are drawn between these two dates (both inclusive).
START_DATE = datetime(2023, 1, 1)
END_DATE = datetime(2023, 6, 30)

# Where the generated transactions are written as CSV.
OUTPUT_PATH = "data/transactions.csv"


In [None]:
# Build a flat list of synthetic transaction records, one dict per transaction.
transactions = []
txn_id = 1  # running counter used to mint ids "T1", "T2", ...

for customer_id in range(1, NUM_CUSTOMERS + 1):
    
    # Each customer gets a random number of transactions within the configured bounds.
    num_txns = random.randint(MIN_TXN_PER_CUSTOMER, MAX_TXN_PER_CUSTOMER)
    
    for _ in range(num_txns):
        # Exponentially distributed amount (scale 1200), rounded to 2 decimals
        # and floored at 50.
        amount = max(50, round(np.random.exponential(scale=1200), 2))
        
        transactions.append({
            "transaction_id": f"T{txn_id}",
            "customer_id": customer_id,
            "transaction_date": random_date(START_DATE, END_DATE),
            "total_amount": amount,
            "quantity": random.randint(1, 6),
            "num_products": random.randint(1, 5),
            "promotion_used": random.choice([0, 1])
        })
        
        txn_id += 1

df = pd.DataFrame(transactions)
# Sort by customer, then by date. Ids were assigned in generation order while
# dates are random, so ids appear out of numeric order within a customer.
df = df.sort_values(["customer_id", "transaction_date"]).reset_index(drop=True)

df.head()


Unnamed: 0,transaction_id,customer_id,transaction_date,total_amount,quantity,num_products,promotion_used
0,T7,1,2023-01-02,71.81,2,4,1
1,T1,1,2023-01-07,563.12,6,3,0
2,T11,1,2023-01-12,50.0,6,4,0
3,T4,1,2023-01-24,1095.53,2,2,0
4,T9,1,2023-01-27,1102.9,1,4,0


In [None]:
# Create the directory that OUTPUT_PATH actually points into (instead of a
# hard-coded "data"), so the save keeps working if OUTPUT_PATH is changed.
# `or "."` covers an OUTPUT_PATH that has no directory component.
os.makedirs(os.path.dirname(OUTPUT_PATH) or ".", exist_ok=True)
df.to_csv(OUTPUT_PATH, index=False)

# Short summary of what was written.
print("✅ Data saved to:", OUTPUT_PATH)
print("👥 Unique customers:", df["customer_id"].nunique())
print("🧾 Total transactions:", len(df))


✅ Data saved to: data/transactions.csv
👥 Unique customers: 5000
🧾 Total transactions: 112503


In [None]:
import pandas as pd

# Reload the saved transactions from disk. Dates come back as plain strings
# (object dtype) and are converted to datetimes in a later cell.
df = pd.read_csv("data/transactions.csv")
df.head()


Unnamed: 0,transaction_id,customer_id,transaction_date,total_amount,quantity,num_products,promotion_used
0,T7,1,2023-01-02,71.81,2,4,1
1,T1,1,2023-01-07,563.12,6,3,0
2,T11,1,2023-01-12,50.0,6,4,0
3,T4,1,2023-01-24,1095.53,2,2,0
4,T9,1,2023-01-27,1102.9,1,4,0


In [None]:
# Structural overview of the loaded frame: dimensions first, then the column
# dtypes and the per-column null counts, each under its own heading.
print("Shape:", df.shape)
for heading, summary in (
    ("\nData Types:", df.dtypes),
    ("\nMissing Values:", df.isna().sum()),
):
    print(heading)
    print(summary)


Shape: (112503, 7)

Data Types:
transaction_id       object
customer_id           int64
transaction_date     object
total_amount        float64
quantity              int64
num_products          int64
promotion_used        int64
dtype: object

Missing Values:
transaction_id      0
customer_id         0
transaction_date    0
total_amount        0
quantity            0
num_products        0
promotion_used      0
dtype: int64


In [None]:
# Turn the date strings read from the CSV into real datetime values.
df = df.assign(transaction_date=pd.to_datetime(df["transaction_date"]))


In [None]:
# Keep only rows whose amount and quantity are both strictly positive.
has_positive_amount = df["total_amount"] > 0
has_positive_quantity = df["quantity"] > 0
df = df[has_positive_amount & has_positive_quantity]


In [None]:
# Order chronologically within each customer and renumber the rows from 0.
df = df.sort_values(by=["customer_id", "transaction_date"], ignore_index=True)


In [None]:
# Sanity check after cleaning: row/column count and the covered date span.
first_txn_date = df["transaction_date"].min()
last_txn_date = df["transaction_date"].max()

print("After cleaning:")
print("Shape:", df.shape)
print("Date range:")
print(first_txn_date, "to", last_txn_date)


After cleaning:
Shape: (112503, 7)
Date range:
2023-01-01 00:00:00 to 2023-06-30 00:00:00


In [None]:
from datetime import timedelta

# Length, in days, of the window after the cutoff that defines the target.
PREDICTION_WINDOW_DAYS = 30

# Time-based cutoff placed at the 70th percentile of all transaction dates.
CUTOFF_DATE = df["transaction_date"].quantile(q=0.70)

print(f"Cutoff date: {CUTOFF_DATE}")


Cutoff date: 2023-05-07 00:00:00


In [None]:
# Split transactions around the cutoff: everything up to and including the
# cutoff is "past"; "future" is the PREDICTION_WINDOW_DAYS that follow it
# (cutoff excluded, window end included).
txn_dates = df["transaction_date"]
window_end = CUTOFF_DATE + timedelta(days=PREDICTION_WINDOW_DAYS)

on_or_before_cutoff = txn_dates <= CUTOFF_DATE
inside_window = (txn_dates > CUTOFF_DATE) & (txn_dates <= window_end)

past_txn = df[on_or_before_cutoff]
future_txn = df[inside_window]

print("Past transactions:", past_txn.shape)
print("Future transactions:", future_txn.shape)


Past transactions: (79085, 7)
Future transactions: (18513, 7)


In [None]:
def _days_since_last_purchase(dates):
    """Whole days between a customer's latest past transaction and CUTOFF_DATE."""
    return (CUTOFF_DATE - dates.max()).days


def _active_span_days(dates):
    """Whole days between a customer's first and last past transaction."""
    return (dates.max() - dates.min()).days


# Output column -> (source column, aggregation). The dict order is the column
# order of the resulting frame.
feature_spec = {
    # ---- RFM ----
    "recency_days": ("transaction_date", _days_since_last_purchase),
    "frequency": ("transaction_id", "count"),
    "monetary": ("total_amount", "sum"),
    # ---- Order behavior ----
    "avg_order_value": ("total_amount", "mean"),
    "max_order_value": ("total_amount", "max"),
    "std_order_value": ("total_amount", "std"),
    # ---- Quantity behavior ----
    "total_quantity": ("quantity", "sum"),
    "avg_quantity": ("quantity", "mean"),
    # ---- Product diversity ----
    "avg_products_per_order": ("num_products", "mean"),
    # ---- Promotion behavior ----
    "promo_usage_ratio": ("promotion_used", "mean"),
    # ---- Temporal ----
    "customer_lifetime_days": ("transaction_date", _active_span_days),
}

# One row of features per customer, computed from past transactions only.
features = past_txn.groupby("customer_id").agg(**feature_spec).reset_index()


In [None]:
# Replace missing feature values with 0 (the sample std is undefined for a
# customer with a single past transaction).
features = features.fillna(0)
features.head()


Unnamed: 0,customer_id,recency_days,frequency,monetary,avg_order_value,max_order_value,std_order_value,total_quantity,avg_quantity,avg_products_per_order,promo_usage_ratio,customer_lifetime_days
0,1,30,10,14794.28,1479.428,4204.27,1476.749032,29,2.9,3.5,0.5,95
1,2,8,15,14233.1,948.873333,4044.76,1029.692787,47,3.133333,3.266667,0.533333,110
2,3,2,7,6450.74,921.534286,2880.51,970.977759,29,4.142857,3.285714,0.428571,92
3,4,2,28,38532.39,1376.156786,5200.98,1351.16543,98,3.5,2.607143,0.464286,110
4,5,9,15,15530.16,1035.344,2857.52,857.573538,41,2.733333,2.6,0.6,111


In [None]:
# Target: total amount each customer spent inside the prediction window.
target = (
    future_txn.groupby("customer_id")["total_amount"]
    .sum()
    .rename("future_spend_30d")
    .reset_index()
)

target.head()


Unnamed: 0,customer_id,future_spend_30d
0,1,1783.64
1,2,4697.29
2,3,448.26
3,4,6785.4
4,5,2168.33


In [None]:
# Left-join so every customer with features is kept; customers with no
# transaction in the prediction window get a target of 0.
dataset = (
    features
    .merge(target, how="left", on="customer_id")
    .fillna({"future_spend_30d": 0})
)

dataset.head()


Unnamed: 0,customer_id,recency_days,frequency,monetary,avg_order_value,max_order_value,std_order_value,total_quantity,avg_quantity,avg_products_per_order,promo_usage_ratio,customer_lifetime_days,future_spend_30d
0,1,30,10,14794.28,1479.428,4204.27,1476.749032,29,2.9,3.5,0.5,95,1783.64
1,2,8,15,14233.1,948.873333,4044.76,1029.692787,47,3.133333,3.266667,0.533333,110,4697.29
2,3,2,7,6450.74,921.534286,2880.51,970.977759,29,4.142857,3.285714,0.428571,92,448.26
3,4,2,28,38532.39,1376.156786,5200.98,1351.16543,98,3.5,2.607143,0.464286,110,6785.4
4,5,9,15,15530.16,1035.344,2857.52,857.573538,41,2.733333,2.6,0.6,111,2168.33


In [None]:
# Dimensions of the modelling table (one row per customer_id from `features`).
dataset.shape

(5000, 13)

In [None]:
# Per-column count of missing values remaining after the fills above.
dataset.isnull().sum()

customer_id               0
recency_days              0
frequency                 0
monetary                  0
avg_order_value           0
max_order_value           0
std_order_value           0
total_quantity            0
avg_quantity              0
avg_products_per_order    0
promo_usage_ratio         0
customer_lifetime_days    0
future_spend_30d          0
dtype: int64

In [None]:
# Number of rows that are exact duplicates of an earlier row.
dataset.duplicated().sum()

np.int64(0)

In [None]:
# Summary statistics for every numeric column, including the target.
dataset.describe()

Unnamed: 0,customer_id,recency_days,frequency,monetary,avg_order_value,max_order_value,std_order_value,total_quantity,avg_quantity,avg_products_per_order,promo_usage_ratio,customer_lifetime_days,future_spend_30d
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,2500.5,10.0544,15.817,18900.680232,1195.025783,3833.652252,1105.529372,55.3536,3.496804,3.009154,0.501262,106.1134,4433.959222
std,1443.520003,13.16713,7.677219,10361.833961,363.183753,1641.898837,433.768588,27.697545,0.525743,0.439206,0.151496,19.867196,3819.246669
min,1.0,0.0,1.0,50.0,50.0,50.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,1250.75,2.0,9.0,10369.1275,961.563591,2709.0925,823.163208,32.0,3.181818,2.75,0.416667,100.0,1456.9225
50%,2500.5,6.0,16.0,18132.795,1165.984208,3617.53,1061.577739,54.0,3.5,3.0,0.5,112.0,3573.675
75%,3750.25,13.0,22.0,26426.995,1391.400333,4767.1025,1325.974937,78.0,3.8125,3.277778,0.592593,119.0,6539.055
max,5000.0,118.0,35.0,63325.83,3668.57,14089.65,4422.882206,141.0,6.0,5.0,1.0,126.0,27883.52


Train test split


In [None]:
# The target is the spend in the prediction window; the inputs are all
# remaining columns except the customer identifier.
y = dataset["future_spend_30d"]
X = dataset.drop(columns=["customer_id", "future_spend_30d"])

print("Feature shape:", X.shape)
print("Target shape:", y.shape)


Feature shape: (5000, 11)
Target shape: (5000,)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the customers for testing, with a fixed seed so the split is
# the same on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


Train size: (4000, 11)
Test size: (1000, 11)


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline random forest; hyperparameters are collected in one place.
rf_settings = {
    "n_estimators": 200,
    "max_depth": 12,
    "random_state": 42,
    "n_jobs": -1,
}
model = RandomForestRegressor(**rf_settings)

model.fit(X_train_scaled, y_train)


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Score the baseline model on the held-out customers.
y_pred = model.predict(X_test_scaled)

# RMSE is the square root of the mean squared error. The R² label previously
# contained a mis-encoded superscript.
print("MAE :", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R²  :", r2_score(y_test, y_pred))


MAE : 2606.399971703792
RMSE: 3376.023699240185
R²  : 0.21259028633973742


In [None]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np


In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit `model` on the training data and score it on the test data.

    `model` is fitted in place, so it is trained after this call returns.

    Returns:
        A tuple ``(mae, rmse, r2)``: mean absolute error, root mean squared
        error and R² of the test-set predictions.
    """
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)

    squared_error = mean_squared_error(y_test, y_hat)
    return (
        mean_absolute_error(y_test, y_hat),
        np.sqrt(squared_error),
        r2_score(y_test, y_hat),
    )


In [None]:
# Candidate regressors keyed by display name. The pair order is the order in
# which they are evaluated.
models = dict(
    [
        ("Linear Regression", LinearRegression()),
        ("Ridge Regression", Ridge(alpha=1.0)),
        (
            "Random Forest",
            RandomForestRegressor(
                n_estimators=200, max_depth=12, random_state=42, n_jobs=-1
            ),
        ),
        (
            "Gradient Boosting",
            GradientBoostingRegressor(
                n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42
            ),
        ),
    ]
)


In [None]:
# Fit and score every candidate on the same scaled split. The loop variable is
# deliberately not called `model`, so the notebook-level `model` fitted earlier
# is not silently rebound to whichever candidate happens to be evaluated last.
results = []

for candidate_name, candidate in models.items():
    mae, rmse, r2 = evaluate_model(
        candidate,
        X_train_scaled,
        y_train,
        X_test_scaled,
        y_test
    )

    results.append({
        "Model": candidate_name,
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2
    })

# Lowest MAE first. The original integer index is kept, so it still reflects
# the insertion order of `models`.
results_df = pd.DataFrame(results).sort_values(by="MAE")
results_df


Unnamed: 0,Model,MAE,RMSE,R2
3,Gradient Boosting,2575.659405,3359.050867,0.220488
2,Random Forest,2606.399972,3376.023699,0.21259
0,Linear Regression,2612.885994,3402.295505,0.200288
1,Ridge Regression,2612.94343,3402.206243,0.20033


In [None]:
# results_df is sorted by ascending MAE, so its first row is the candidate
# with the lowest MAE.
best_model = results_df.iloc[0]

print("✅ Best Model Based on MAE")
print("Model:", best_model["Model"])
print("MAE :", round(best_model["MAE"], 2))
print("RMSE:", round(best_model["RMSE"], 2))
print("R2  :", round(best_model["R2"], 3))


✅ Best Model Based on MAE
Model: Gradient Boosting
MAE : 2575.66
RMSE: 3359.05
R2  : 0.22


In [None]:
from sklearn.ensemble import RandomForestRegressor

from sklearn.base import clone

# Train the final model from the winner of the model comparison. The estimator
# is looked up by the name stored in `best_model` rather than hard-coded, so
# the model that is evaluated and saved is the one reported as best.
# `clone` returns an unfitted copy with the same hyperparameters.
final_model = clone(models[best_model["Model"]])

final_model.fit(X_train_scaled, y_train)


In [None]:
# Held-out performance of the final model, one metric per line.
y_pred_final = final_model.predict(X_test_scaled)

print("FINAL MODEL PERFORMANCE")
for label, score in (
    ("MAE :", mean_absolute_error(y_test, y_pred_final)),
    ("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_final))),
    ("R2  :", r2_score(y_test, y_pred_final)),
):
    print(label, score)


FINAL MODEL PERFORMANCE
MAE : 2606.399971703792
RMSE: 3376.023699240185
R2  : 0.21259028633973742


In [None]:
import pickle
import os

# Persist the trained model together with the scaler it was trained behind;
# both are needed to score new inputs.
os.makedirs("models", exist_ok=True)

with open("models/spend_model.pkl", "wb") as f:
    pickle.dump(final_model, f)

with open("models/scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

print("✅ Model and scaler saved successfully")


✅ Model and scaler saved successfully


In [None]:
# Read both artifacts back from disk to confirm they round-trip.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open("models/spend_model.pkl", "rb") as model_file, \
        open("models/scaler.pkl", "rb") as scaler_file:
    loaded_model = pickle.load(model_file)
    loaded_scaler = pickle.load(scaler_file)


In [None]:
def predict_next_30_days_spend_streamlit(
    recency_days,
    frequency,
    monetary,
    avg_order_value,
    max_order_value,
    std_order_value,
    total_quantity,
    avg_quantity,
    avg_products_per_order,
    promo_usage_ratio,
    customer_lifetime_days,
    model,
    scaler
):
    """Predict one customer's spend over the next 30 days.

    The eleven engineered feature values (e.g. entered in a UI) are placed in
    a single-row DataFrame whose columns follow the parameter order, passed
    through ``scaler.transform`` and then through ``model.predict``.

    Args:
        recency_days ... customer_lifetime_days: the engineered features.
        model: object exposing ``predict``.
        scaler: object exposing ``transform``.

    Returns:
        The first (and only) element of the model's prediction.
    """
    feature_names = (
        "recency_days",
        "frequency",
        "monetary",
        "avg_order_value",
        "max_order_value",
        "std_order_value",
        "total_quantity",
        "avg_quantity",
        "avg_products_per_order",
        "promo_usage_ratio",
        "customer_lifetime_days",
    )
    feature_values = (
        recency_days,
        frequency,
        monetary,
        avg_order_value,
        max_order_value,
        std_order_value,
        total_quantity,
        avg_quantity,
        avg_products_per_order,
        promo_usage_ratio,
        customer_lifetime_days,
    )

    # One record -> one-row frame; the column order equals feature_names.
    single_row = pd.DataFrame([dict(zip(feature_names, feature_values))])

    scaled_row = scaler.transform(single_row)
    return model.predict(scaled_row)[0]


In [None]:
# Smoke test: score one hand-written customer with the reloaded artifacts.
sample_customer = dict(
    recency_days=5,
    frequency=20,
    monetary=30000,
    avg_order_value=1500,
    max_order_value=4000,
    std_order_value=500,
    total_quantity=80,
    avg_quantity=4,
    avg_products_per_order=3,
    promo_usage_ratio=0.3,
    customer_lifetime_days=150,
)

test_prediction = predict_next_30_days_spend_streamlit(
    **sample_customer,
    model=loaded_model,
    scaler=loaded_scaler,
)

print("Predicted spend in next 30 days:", round(test_prediction, 2))


Predicted spend in next 30 days: 6338.35


In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
import pickle

@st.cache_resource
def load_artifacts():
    """Load the pickled model and scaler from the ``models/`` directory.

    Each file is opened in a ``with`` block so its handle is closed as soon as
    the object has been unpickled, instead of being left open.

    Returns:
        A ``(model, scaler)`` tuple.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open("models/spend_model.pkl", "rb") as model_file:
        model = pickle.load(model_file)
    with open("models/scaler.pkl", "rb") as scaler_file:
        scaler = pickle.load(scaler_file)
    return model, scaler

# Module-level artifacts read by predict_next_30_days_spend below.
model, scaler = load_artifacts()

def predict_next_30_days_spend(
    recency_days,
    frequency,
    monetary,
    avg_order_value,
    max_order_value,
    std_order_value,
    total_quantity,
    avg_quantity,
    avg_products_per_order,
    promo_usage_ratio,
    customer_lifetime_days,
):
    """Predict one customer's spend over the next 30 days.

    Builds a single-row DataFrame from the eleven feature values (columns in
    parameter order), scales it with the module-level ``scaler`` and returns
    the first element of the module-level ``model``'s prediction.
    """
    feature_names = (
        "recency_days",
        "frequency",
        "monetary",
        "avg_order_value",
        "max_order_value",
        "std_order_value",
        "total_quantity",
        "avg_quantity",
        "avg_products_per_order",
        "promo_usage_ratio",
        "customer_lifetime_days",
    )
    feature_values = (
        recency_days,
        frequency,
        monetary,
        avg_order_value,
        max_order_value,
        std_order_value,
        total_quantity,
        avg_quantity,
        avg_products_per_order,
        promo_usage_ratio,
        customer_lifetime_days,
    )

    single_row = pd.DataFrame([dict(zip(feature_names, feature_values))])
    return model.predict(scaler.transform(single_row))[0]

st.title("Customer Spend Prediction (Next 30 Days)")

# One input widget per model feature. The positional arguments after the
# label are (min_value, max_value, default value).
recency_days = st.number_input("Recency (days)", 0, 365, 5)
frequency = st.number_input("Frequency", 1, 100, 20)
monetary = st.number_input("Total Spend", 0.0, 1e7, 30000.0)
avg_order_value = st.number_input("Avg Order Value", 0.0, 1e6, 1500.0)
max_order_value = st.number_input("Max Order Value", 0.0, 1e6, 4000.0)
std_order_value = st.number_input("Std Order Value", 0.0, 1e6, 500.0)
total_quantity = st.number_input("Total Quantity", 0, 1000, 80)
avg_quantity = st.number_input("Avg Quantity", 0.0, 100.0, 4.0)
avg_products_per_order = st.number_input("Avg Products/Order", 1.0, 20.0, 3.0)
promo_usage_ratio = st.slider("Promo Usage Ratio", 0.0, 1.0, 0.3)
customer_lifetime_days = st.number_input("Customer Lifetime (days)", 1, 1000, 150)

# Only score when the user asks for it.
if st.button("Predict"):
    prediction = predict_next_30_days_spend(
        recency_days,
        frequency,
        monetary,
        avg_order_value,
        max_order_value,
        std_order_value,
        total_quantity,
        avg_quantity,
        avg_products_per_order,
        promo_usage_ratio,
        customer_lifetime_days,
    )
    # The rupee sign was previously mis-encoded in this message.
    st.success(f"Predicted Spend in Next 30 Days: ₹ {prediction:,.2f}")


Overwriting app.py
