# Step A — Use LLM to DESIGN the DATA LOGIC (Once)

### Example Prompt
Design realistic rules for simulating customer purchase behavior and churn risk...


# Step B — IMPLEMENT That Logic in Python

### 1️⃣ Imports & Setup

In [1]:
import pandas as pd
import numpy as np
import random

np.random.seed(42)
random.seed(42)

### 2️⃣ Helper Functions (LLM-Guided)

> **Major Update**: 
> 1. Churn is probabilistic (Risk Score based).
> 2. Added "Karens" (Active customers with negative reviews) to lower model accuracy to realistic levels.

In [2]:
def get_purchase_behavior(income):
    """
    Returns (total_orders, avg_order_value) based on income level.
    """
    if income > 100000:
        # High income: High freq
        orders = random.randint(15, 50)
        value = round(random.uniform(150, 500), 2)
    elif income > 60000:
        # Med income
        orders = random.randint(5, 40)
        value = round(random.uniform(80, 300), 2)
    else:
        # Low income: LOWER freq to increase Churn Risk overlap
        orders = random.randint(1, 15) 
        value = round(random.uniform(20, 150), 2)
    return orders, value

def get_sentiment_and_churn(days_since_last_purchase, total_orders):
    """
    Returns (sentiment category, churn_label).
    """
    # 1. Determine Sentiment (NOISY)
    if total_orders > 30 and days_since_last_purchase < 60:
        # Loyal: Mostly Positive
        sentiment = random.choices(["Positive", "Neutral"], weights=[0.8, 0.2])[0]
    elif days_since_last_purchase > 180:
        # Churn Risk: Mix of Negative and Neutral (Ambiguous)
        sentiment = random.choices(["Negative", "Neutral"], weights=[0.6, 0.4])[0]
    else:
        # Active middling: Mostly Neutral, occasional "Karen" (Negative)
        sentiment = random.choices(["Neutral", "Negative"], weights=[0.9, 0.1])[0]
        
    # 2. Calculate Churn Probability (Risk Score)
    risk_score = 0
    
    # Factor A: Recency (Hidden from model)
    if days_since_last_purchase > 180:
        risk_score += 2
    elif days_since_last_purchase > 90:
        risk_score += 1
        
    # Factor B: Frequency (Visible)
    if total_orders < 5:
        risk_score += 1
        
    # Factor C: Sentiment (Visible)
    if sentiment == "Negative":
        risk_score += 1
        
    # Threshold for Churn
    churn = 1 if risk_score >= 3 else 0
    
    return sentiment, churn

def get_review_text(sentiment):
    """
    Returns a review string based on sentiment category.
    """
    positive_reviews = [
        "Very satisfied with the service", "Fast delivery and great quality",
        "Excellent shopping experience", "Highly recommended", "Will definitely buy again"
    ]
    neutral_reviews = [
        "It was okay", "Average experience", "Product is acceptable",
        "Nothing special", "Decent service"
    ]
    negative_reviews = [
        "Very disappointed", "Poor customer service", "Delivery was slow",
        "Product quality was bad", "Not worth the money"
    ]
    
    if sentiment == "Positive":
        return random.choice(positive_reviews)
    elif sentiment == "Negative":
        return random.choice(negative_reviews)
    else:
        return random.choice(neutral_reviews)

### 3️⃣ Main Data Generation Loop

In [3]:
NUM_RECORDS = 1200
data = []

for i in range(NUM_RECORDS):
    # 1. Generate Info (Linked Age & Income)
    age = random.randint(18, 65)
    
    # Increased Low Income pool to create more ambiguous data points
    if age < 35: 
        income = random.randint(30000, 60000)
    elif age < 50:
        income = random.randint(50000, 100000)
    else:
        income = random.randint(60000, 150000)
        
    days_since_last_purchase = random.randint(1, 365)
    
    # 2. Apply Logic
    total_orders, avg_order_value = get_purchase_behavior(income)
    sentiment, churn = get_sentiment_and_churn(days_since_last_purchase, total_orders)
    review_text = get_review_text(sentiment)

    # 3. Store Record
    data.append([
        i + 1,
        age,
        income,
        total_orders,
        avg_order_value,
        days_since_last_purchase,
        review_text,
        churn
    ])

### 4️⃣ Create DataFrame & Verify

In [4]:
columns = [
    "customer_id", "age", "income", "total_orders", 
    "avg_order_value", "days_since_last_purchase", "review_text", "churn"
]

df = pd.DataFrame(data, columns=columns)

# Minimal display to check structure
print(df.info())
print(df["churn"].value_counts(normalize=True))  # check balance
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               1200 non-null   int64  
 1   age                       1200 non-null   int64  
 2   income                    1200 non-null   int64  
 3   total_orders              1200 non-null   int64  
 4   avg_order_value           1200 non-null   float64
 5   days_since_last_purchase  1200 non-null   int64  
 6   review_text               1200 non-null   object 
 7   churn                     1200 non-null   int64  
dtypes: float64(1), int64(6), object(1)
memory usage: 75.1+ KB
None
churn
0    0.651667
1    0.348333
Name: proportion, dtype: float64


Unnamed: 0,customer_id,age,income,total_orders,avg_order_value,days_since_last_purchase,review_text,churn
0,1,58,74592,22,133.88,13,It was okay,0
1,2,61,131482,42,161.12,45,Fast delivery and great quality,0
2,3,50,138907,50,219.59,14,Will definitely buy again,0
3,4,44,64446,22,258.07,230,Poor customer service,1
4,5,62,115392,32,204.42,175,Delivery was slow,0


### 5️⃣ Save Dataset (Freeze)

In [5]:
df.to_csv("../data/synthetic_customers_raw.csv", index=False)
print("Dataset saved to data/synthetic_customers_raw.csv")

Dataset saved to data/synthetic_customers_raw.csv


# Step C — DOCUMENT AI USAGE (THIS IS WHAT MARKERS LOOK FOR)

> **Use of Generative AI for Dataset Simulation**
>
> A Large Language Model (LLM) was used to design realistic data generation rules, including **income vs. spending habits** and **probabilistic churn logic**.
>
> Based on the LLM’s guidance, rule-based logic was implemented in Python (using helper functions) to simulate 1,200 realistic customer records. 
> **Update**: We introduced probabilistic noise into the `churn` definition (based on risk scores rather than hard rules) to simulate real-world data variability and prevent target leakage.