# Create Data:
Reference (RFM segmentation overview): https://www.optimove.com/resources/learning-center/rfm-segmentation

In [None]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import datetime, timedelta

# Initialize Faker and seed every random source for reproducibility.
# Faker keeps its own generator, which random.seed() and np.random.seed()
# do not affect, so it is seeded explicitly as well.
fake = Faker()
Faker.seed(42)
random.seed(42)
np.random.seed(42)

# Number of records to generate
num_records = 50000

# Helper functions
def random_date(start, end):
    """Return ``start`` moved forward by a random whole number of days.

    The offset is drawn uniformly from 0 up to the number of whole days
    between ``start`` and ``end`` (both ends inclusive), so the result never
    falls after ``end``.

    Args:
        start: Earliest value that may be returned.
        end: Upper bound; only the whole days separating it from ``start``
            are considered.

    Returns:
        ``start`` plus the randomly chosen number of days.
    """
    span_in_days = (end - start).days
    offset = random.randint(0, span_in_days)
    return start + timedelta(days=offset)

# Setting today's date for reference.
# Truncated to midnight so the generated dates do not all carry the
# wall-clock time of the run; with the fixed seeds, runs made on the same
# day then yield the same dates.
today = datetime.today().replace(hour=0, minute=0, second=0, microsecond=0)

# Create distributions: one tier label per record, drawn with the given
# probabilities (lower-numbered tiers are the most likely in each case).
recency_distribution = np.random.choice(
    [1, 2, 3], size=num_records, p=[0.6, 0.3, 0.1]
)  # recency tiers
frequency_distribution = np.random.choice(
    [1, 2, 3, 4], size=num_records, p=[0.5, 0.3, 0.15, 0.05]
)  # frequency tiers
monetary_distribution = np.random.choice(
    [1, 2, 3, 4], size=num_records, p=[0.4, 0.3, 0.2, 0.1]
)  # monetary tiers

# Base data columns: sequential IDs zero-padded to five digits ("C00001", ...)
customer_ids = [f"C{n:05d}" for n in range(1, num_records + 1)]

# Create realistic 'Last Purchase Date' based on recency tiers.
# Each window is an (earliest, latest) pair relative to today:
#   tier 1 -> 30 to 365 days ago, tier 2 -> 365 to 730 days ago,
#   any other tier -> 730 to 1460 days ago.
_purchase_windows = {
    1: (today - timedelta(days=365), today - timedelta(days=30)),
    2: (today - timedelta(days=730), today - timedelta(days=365)),
}
_oldest_purchase_window = (
    today - timedelta(days=1460),
    today - timedelta(days=730),
)

last_purchase_dates = [
    random_date(*_purchase_windows.get(tier, _oldest_purchase_window))
    for tier in recency_distribution
]

# Create realistic 'Total Number of Purchases' based on frequency tiers.
# Tiers 1 and 2 have their own inclusive integer range; every other tier
# falls back to the top range of 7-10 purchases.
# NOTE(review): this means tiers 3 and 4 are indistinguishable here.
_purchase_count_ranges = {1: (1, 3), 2: (4, 6)}
total_purchases = [
    random.randint(*_purchase_count_ranges.get(tier, (7, 10)))
    for tier in frequency_distribution
]

# Create realistic 'Total Spend (Monetary)' based on monetary tiers.
# Same scheme with continuous ranges: tier 1 -> 50-200, tier 2 -> 200-500,
# any other tier -> 500-1000.
_spend_ranges = {1: (50, 200), 2: (200, 500)}
total_spend = [
    random.uniform(*_spend_ranges.get(tier, (500, 1000)))
    for tier in monetary_distribution
]

# Account created date: skewed to recent years, with some long-term customers
def _draw_account_created_date():
    """Draw one account creation date from three age bands.

    About 70% of draws land in the last 730 days. Of the remainder, 90%
    (27% overall) land 730-1460 days back and the rest (3% overall) land
    1460-1825 days back.
    """
    if random.random() < 0.7:
        return random_date(today - timedelta(days=730), today)
    # Second, independent draw that splits the remaining cases.
    if random.random() < 0.9:
        return random_date(today - timedelta(days=1460), today - timedelta(days=730))
    return random_date(today - timedelta(days=1825), today - timedelta(days=1460))


# NOTE(review): these dates are drawn independently of the last purchase
# dates, so an account can appear newer than its last purchase - confirm
# whether that matters downstream.
account_created_dates = [_draw_account_created_date() for _ in range(num_records)]

# Introduce some missing data and outliers.
# The number of affected rows follows a randomly chosen stride through the
# records: one missing value per 1000-3000 rows, one outlier per 4000-7000.
n_missing = len(range(0, num_records, random.randint(1000, 3000)))
n_outliers = len(range(0, num_records, random.randint(4000, 7000)))

# Never request more positions than there are records.
n_missing = min(n_missing, num_records)
n_outliers = min(n_outliers, num_records - n_missing)

# Draw all affected positions in one sample so the two groups are disjoint
# (an outlier can never overwrite a missing value) and are scattered at
# random instead of sitting at evenly spaced indices that both begin at row 0.
affected = random.sample(range(num_records), n_missing + n_outliers)

for i in affected[:n_missing]:
    total_spend[i] = np.nan  # some missing spend data

# Add outliers
for i in affected[n_missing:]:
    total_spend[i] = random.uniform(1000, 5000)  # outlier spend

# Create the dataframe: one row per customer, one column per generated list
data = pd.DataFrame({
    "Customer ID": customer_ids,
    "Last Purchase Date": last_purchase_dates,
    "Total Number of Purchases": total_purchases,
    "Total Spend (Monetary Value)": total_spend,
    "Account Created Date": account_created_dates
})

# Display the generated data to the user.
# NOTE(review): ace_tools does not appear to be a published package (it looks
# specific to a hosted notebook sandbox). Fall back to a plain preview when
# it cannot be imported so the script does not fail after building the data.
try:
    import ace_tools as tools
except ImportError:
    print(data.head())
else:
    tools.display_dataframe_to_user(name="Fake RFM Dataset", dataframe=data)
