In [None]:
# %pip install be-great  # uncomment to install into the kernel's environment

### Importing the libraries

In [None]:
import pandas as pd
import random
import numpy as np
from rich import print
from be_great import GReaT
# Reached only if every import above succeeded. `print` here is rich's print
# (imported above), which shadows the built-in for the rest of the notebook.
print("Everything imported succesfully✅")

### Initial configuration and Setting the parameters

In [None]:
# Seed both random number generators used by the data-generation cell
# (Python's `random` and NumPy's legacy global RNG) so that the initial
# dataset is identical on every Restart & Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Number of rows in the randomly generated initial customer dataset.
num_rows = 5000

# GReaT model built on the `distilgpt2` language model, trained for 5 epochs
# with a batch size of 32.
# NOTE(review): save_steps=400000 is presumably chosen to be larger than the
# total number of training steps so no intermediate checkpoints are written —
# confirm against the be_great documentation.
model = GReaT(llm='distilgpt2', batch_size=32, epochs=5, save_steps=400000)



### Generating the Initial data



In [None]:
# Define helper functions
def random_income():
    """Return one income bracket ('Low', 'Medium' or 'High') picked at random."""
    income_levels = ['Low', 'Medium', 'High']
    return random.choice(income_levels)

def random_location():
    """Return one city name picked at random from the fixed list of eight locations."""
    cities = [
        'New York', 'Los Angeles', 'Chicago', 'Houston',
        'Phoenix', 'London', 'Berlin', 'Mumbai',
    ]
    return random.choice(cities)

def random_occupation():
    """Return one occupation picked at random from the fixed list of eight job titles."""
    occupations = [
        'Engineer', 'Doctor', 'Teacher', 'Sales',
        'Artist', 'Technician', 'Manager', 'Clerk',
    ]
    return random.choice(occupations)

def random_payment_method():
    """Return one payment method picked at random from the four supported options."""
    payment_methods = ['Credit Card', 'Debit Card', 'Online Wallet', 'Cash']
    return random.choice(payment_methods)

def random_product_category():
    """Return one product category picked at random from the fixed list of eight categories."""
    categories = [
        'Electronics', 'Apparel', 'Groceries', 'Furniture',
        'Toys', 'Books', 'Sports', 'Beauty',
    ]
    return random.choice(categories)

def random_loyalty_program():
    """Return 'Yes' or 'No' at random to flag loyalty-program membership."""
    membership_flags = ['Yes', 'No']
    return random.choice(membership_flags)

def random_discount_sensitivity():
    """Return a random integer from 0 to 50 inclusive: the percentage of purchases on discount."""
    # randrange excludes its stop value, so 51 keeps 50 reachable.
    return random.randrange(0, 51)

# Generate data
# Columns are built in three themed groups and merged in the order written, so
# the final column order — and the order in which the `random` and `np.random`
# streams are consumed — stays fixed for a given seed.
# Reminder: np.random.randint excludes its upper bound.

# Who the customer is.
demographic_columns = {
    'Customer_ID': [f"CUST_{n}" for n in range(1, num_rows + 1)],
    'Age': np.random.randint(18, 70, size=num_rows),  # 18..69
    'Gender': np.random.choice(['Male', 'Female', 'Other'], size=num_rows),
    'Income_Level': [random_income() for _ in range(num_rows)],
    'Occupation': [random_occupation() for _ in range(num_rows)],
    'Location': [random_location() for _ in range(num_rows)],
    'Marital_Status': np.random.choice(['Single', 'Married', 'Divorced'], size=num_rows),
    'Family_Size': np.random.randint(1, 6, size=num_rows),  # 1..5
    'Education_Level': np.random.choice(['High School', 'Bachelor’s', 'Master’s'], size=num_rows),
    'Housing_Status': np.random.choice(['Own', 'Rent'], size=num_rows),
}

# How the customer buys.
purchasing_columns = {
    'Purchase_Frequency': np.random.randint(1, 30, size=num_rows),  # 1..29
    'Recency_of_Purchase': np.random.randint(1, 365, size=num_rows),  # 1..364
    'Monetary_Value': np.random.uniform(100, 5000, size=num_rows).round(2),
    'Average_Order_Value': np.random.uniform(20, 500, size=num_rows).round(2),
    'Product_Categories': [random_product_category() for _ in range(num_rows)],
    'Preferred_Payment_Method': [random_payment_method() for _ in range(num_rows)],
    'Loyalty_Program': [random_loyalty_program() for _ in range(num_rows)],
    'Discount_Sensitivity': [random_discount_sensitivity() for _ in range(num_rows)],
    'Return_Frequency': np.random.randint(0, 5, size=num_rows),  # 0..4
    'Seasonal_Purchases': np.random.choice(['Yes', 'No'], size=num_rows),
}

# How the customer responds to marketing and support.
engagement_columns = {
    'Marketing_Channel': np.random.choice(['Email', 'SMS', 'App Notification'], size=num_rows),
    'Campaign_Response': np.random.choice(['Yes', 'No'], size=num_rows),
    'Customer_Reviews': np.random.uniform(1, 5, size=num_rows).round(1),
    'Support_Interactions': np.random.randint(0, 10, size=num_rows),  # 0..9
}

# Merge the groups into the single column mapping used to build the frame.
data = {**demographic_columns, **purchasing_columns, **engagement_columns}

# Create dataframe
df = pd.DataFrame(data)
print("Initial dataset created successfully!")

### Generating further synthetic data using an LLM

In [None]:
# Fitting the model to understand the internal pattern
# Trains the GReaT model created in the configuration cell on the generated
# DataFrame `df`. Because this call is the last expression in the cell, the
# value returned by fit() is shown as the cell output.
model.fit(df)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.927


<be_great.great_trainer.GReaTTrainer at 0x7c4549345cf0>

In [None]:
# Generating the data
# Draw 5000 synthetic rows from the fitted model.
# max_length caps the number of tokens generated per sample; be_great's default
# (100) is too short for a 24-column row, so rows get cut off, are rejected as
# incomplete, and the progress bar never advances. 512 leaves ample room while
# staying within distilgpt2's context window.
synthetic_df = model.sample(n_samples=5000, max_length=512)

  0%|          | 0/5000 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  0%|          | 0/5000 [00:01<?, ?it/s]

In [None]:
# Display the sampled synthetic DataFrame as the cell's rich output.
synthetic_df

Unnamed: 0,Customer_ID,Age,Gender,Income_Level,Occupation,Location,Marital_Status,Family_Size,Education_Level,Housing_Status,...,Product_Categories,Preferred_Payment_Method,Loyalty_Program,Discount_Sensitivity,Return_Frequency,Seasonal_Purchases,Marketing_Channel,Campaign_Response,Customer_Reviews,Support_Interactions


In [None]:
# Persist the generated dataset to CSV. index=False keeps the positional index
# out of the file; otherwise it is written as an unnamed first column and shows
# up as "Unnamed: 0" when the file is read back.
# NOTE(review): this saves the initial `df`, not `synthetic_df` — confirm which
# frame is meant to be exported.
df.to_csv("Customer_data.csv", index=False)

In [None]:
# Display the initial DataFrame as the cell's rich output.
# NOTE(review): the saved output under this cell lists columns such as
# name/age/country that the frame built in this notebook does not contain —
# the output appears stale; re-run the notebook before sharing.
df

Unnamed: 0,name,age,gender,education,income,country,purchase_frequency,spending
0,Teresa Williams MD,42,Female,High School,53936,Slovenia,0.9,13227.120
1,Christine Myers,49,Female,Master,82468,Aruba,0.6,12674.040
2,Dwayne Moreno,55,Male,Bachelor,56941,Cyprus,0.3,5354.115
3,Amy Norton,24,Female,Bachelor,60651,Palau,0.2,2606.510
4,Tonya Adams,64,Male,Master,81884,Zambia,0.9,18984.780
...,...,...,...,...,...,...,...,...
995,Jason Haas,42,Female,PhD,98170,Bosnia and Herzegovina,0.7,17435.950
996,Sharon Valdez,39,Male,High School,59174,Eritrea,0.9,11662.830
997,Andrew Taylor,50,Male,Master,59808,Serbia,0.4,7196.160
998,Jessica Johnson,50,Female,Master,58272,Cyprus,0.7,13939.520
